# Playing with Vectorizers

In [1]:
import pandas as pd
import ftfy
import numpy as np
import matplotlib.pyplot as plt
import string
%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

In [4]:
raw_data = pd.read_csv('artist_lyrics.csv').drop('Unnamed: 0', axis=1)
raw_data.head()

Unnamed: 0,artist,song,link,lyrics,genre
0,Mos Def,Mathematics prod. by DJ Premier,http://genius.com/Yasiin-bey-mathematics-lyrics,"Beats by Su-Primo for all of my people, negroe...",rap
1,Mos Def,Back Home by A$AP Rocky (Ft. Acyde & A$AP Yam...,http://genius.com/A-ap-rocky-back-home-lyrics,"\nGotta find my way back home, I've been away ...",rap
2,Mos Def,Ms. Fat Booty prod. by Ayatollah,http://genius.com/Yasiin-bey-ms-fat-booty-lyrics,"I know I can't afford to stop\nFor one moment,...",rap
3,Mos Def,Respiration by Black Star (Ft. Common) prod. ...,http://genius.com/Black-star-respiration-lyrics,"""What'd you do last night?""\n""We did umm, two ...",rap
4,Mos Def,Two Words by Kanye West (Ft. Freeway) prod. b...,http://genius.com/Kanye-west-two-words-lyrics,Half-Hook: Kanye West]\nNow throw ya hands up ...,rap


In [5]:
working_data = raw_data.copy()

In [6]:
working_data.shape

(540, 5)

In [7]:
working_data.dropna(inplace=True)

In [8]:
working_data.shape

(533, 5)

In [12]:
X = working_data.copy()['lyrics']
y = [1 if v=='rap' else 0 for v in working_data['genre']]

In [15]:
def vectorizer_testing(X, y):
    vecs = (TfidfVectorizer(), TfidfVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS),
            CountVectorizer(), CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS))
    
    for vec in vecs:
        data = vec.fit_transform(X).todense()
        X_train, X_test, y_train, y_test = train_test_split(data, y)
        rf = RandomForestClassifier()
        nb = BernoulliNB()
        rf.fit(X_train, y_train)
        nb.fit(X_train, y_train)
        print vec
        print 'RF:', rf.score(X_test, y_test), 'NB:', nb.score(X_test, y_test)

In [16]:
vectorizer_testing(X, y)

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
RF: 0.858208955224 NB: 0.888059701493
TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words=frozenset(['all', 'six', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'four', 'not', 'own', 'through', 'yourselves', 'fify', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how'

### Looks like the best way to do this is with a TFIDF Vectorizer, with Stop Word removal

In [None]:
rf.