In [None]:
reset -sf

In [1]:
import pandas as pd
import ftfy
import numpy as np
import matplotlib.pyplot as plt
import string
%matplotlib inline

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier

In [3]:
df = pd.read_csv('artist_lyrics.csv').drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,artist,song,link,lyrics,genre
0,Mos Def,Mathematics prod. by DJ Premier,http://genius.com/Yasiin-bey-mathematics-lyrics,"Beats by Su-Primo for all of my people, negroe...",rap
1,Mos Def,Back Home by A$AP Rocky (Ft. Acyde & A$AP Yam...,http://genius.com/A-ap-rocky-back-home-lyrics,"\nGotta find my way back home, I've been away ...",rap
2,Mos Def,Ms. Fat Booty prod. by Ayatollah,http://genius.com/Yasiin-bey-ms-fat-booty-lyrics,"I know I can't afford to stop\nFor one moment,...",rap
3,Mos Def,Respiration by Black Star (Ft. Common) prod. ...,http://genius.com/Black-star-respiration-lyrics,"""What'd you do last night?""\n""We did umm, two ...",rap
4,Mos Def,Two Words by Kanye West (Ft. Freeway) prod. b...,http://genius.com/Kanye-west-two-words-lyrics,Half-Hook: Kanye West]\nNow throw ya hands up ...,rap


In [None]:
# df.info()

### Messing with a song I know well 

In [5]:
math = df['lyrics'][0]
math

'Beats by Su-Primo for all of my people, negroes and latinos\nAnd even the gringos\n\n\nYo, check it\nOne for Charlie Hustle, 2 for Steady Rock\n3 for the forthcoming live future shock\nIt\'s 5 dimensions, 6 senses\n7 firmaments of heaven and hell,8 Million Stories to tell\n9 planets faithfully keep in orbit with the probable tenth\nThe universe expands length\nThe body of my text possess extra strength\nPower-lift the powerless up out of this towering inferno\nMy ink so hot it burn through the journal\nI\'m blacker than midnight on Broadway and Myrtle\nHip-Hop passed all your tall social hurdles\nLike the nationwide project-prison-industry complex\nWorking-class poor: better keep your alarm set\nStreets too loud to ever hear freedom ring\nSay evacuate your sleep, it\'s dangerous to dream\nFor ch-ching, cats get the "cha-pow!" You dead now\nKilling fields need blood to graze the cash cow\nIt\'s a numbers game, but shit don\'t add up somehow\nLike I got, 16 to 32 bars to rock it\nBut on

In [6]:
# making a function to get rid of punctuation, newlines
def song_cleaner(song):    
    song = song.lower()
    clean_song = ""
    for i, char in enumerate(song):
        if char not in string.punctuation:
            clean_song += char
        else:
            clean_song += " "
    return clean_song.replace('\n', ' ')

In [7]:
# making a function to get rid of weird double spaces
def double_space_remove(math):
    i = 0
    while i < len(math)-1:
        if (math[i] == ' ') & (math[i+1] == ' '):
            math = math[:i] + math[i+1:]
            i+=2
        else:
            i+=1
    return math

In [9]:
# example of "cleaned" song 
" ".join(song_cleaner(double_space_remove(math)).split())

'beats by su primo for all of my people negroes and latinos and even the gringos yo check it one for charlie hustle 2 for steady rock 3 for the forthcoming live future shock it s 5 dimensions 6 senses 7 firmaments of heaven and hell 8 million stories to tell 9 planets faithfully keep in orbit with the probable tenth the universe expands length the body of my text possess extra strength power lift the powerless up out of this towering inferno my ink so hot it burn through the journal i m blacker than midnight on broadway and myrtle hip hop passed all your tall social hurdles like the nationwide project prison industry complex working class poor better keep your alarm set streets too loud to ever hear freedom ring say evacuate your sleep it s dangerous to dream for ch ching cats get the cha pow you dead now killing fields need blood to graze the cash cow it s a numbers game but shit don t add up somehow like i got 16 to 32 bars to rock it but only 15 of profits ever see my pockets like 6

In [10]:
# checking to see if songs are NaN's 

drops = []
for i, song in enumerate(df['lyrics']):
    if type(song) != str:
        drops.append(i)

In [12]:
# they're all NaNs
df['lyrics'][drops]

25     NaN
335    NaN
338    NaN
368    NaN
429    NaN
520    NaN
522    NaN
Name: lyrics, dtype: object

In [13]:
df = df.drop(drops)

In [14]:
print df.shape
print df['lyrics'].shape

(533, 5)
(533,)


In [15]:
df.head()

Unnamed: 0,artist,song,link,lyrics,genre
0,Mos Def,Mathematics prod. by DJ Premier,http://genius.com/Yasiin-bey-mathematics-lyrics,"Beats by Su-Primo for all of my people, negroe...",rap
1,Mos Def,Back Home by A$AP Rocky (Ft. Acyde & A$AP Yam...,http://genius.com/A-ap-rocky-back-home-lyrics,"\nGotta find my way back home, I've been away ...",rap
2,Mos Def,Ms. Fat Booty prod. by Ayatollah,http://genius.com/Yasiin-bey-ms-fat-booty-lyrics,"I know I can't afford to stop\nFor one moment,...",rap
3,Mos Def,Respiration by Black Star (Ft. Common) prod. ...,http://genius.com/Black-star-respiration-lyrics,"""What'd you do last night?""\n""We did umm, two ...",rap
4,Mos Def,Two Words by Kanye West (Ft. Freeway) prod. b...,http://genius.com/Kanye-west-two-words-lyrics,Half-Hook: Kanye West]\nNow throw ya hands up ...,rap


In [None]:
df.info()

In [64]:
# figuring out which songs can't be correctly encoded to unicode
# for some reason running this over and over, the list gets shorter and shorter !?!?!?
trouble_songs = []
for i, song in enumerate(df['lyrics']):
    try:
        df['lyrics'][i] = unicode(song)
    except:
        trouble_songs.append(i)

In [65]:
print trouble_songs

[8, 27, 28, 31, 32, 35, 37, 43, 49, 54, 57, 59, 63, 73, 74, 76, 81, 84, 87, 88, 89, 90, 91, 92, 96, 97, 99, 102, 105, 106, 107, 108, 110, 120, 128, 129, 135, 136, 137, 139, 140, 144, 150, 161, 165, 166, 172, 182, 187, 189, 193, 194, 200, 205, 206, 209, 211, 215, 218, 219, 220, 222, 225, 228, 229, 230, 233, 235, 236, 238, 239, 240, 248, 249, 251, 253, 259, 260, 263, 264, 266, 269, 273, 274, 276, 284, 289, 296, 305, 383, 392, 396, 400, 406, 408, 415, 419, 420, 421, 435, 438, 440, 444, 452, 459, 464, 466, 468, 469, 471, 473, 478, 490, 492, 496, 503, 504, 515, 519, 521, 527, 534, 537]


In [34]:
for i, song in enumerate(df['lyrics'][trouble_songs]):
    for word in song.split():
        try:
            unicode(word)
        except:
            print df['artist'][trouble_songs[i]], '---', word
            print

Mos Def --- hoéy

Mos Def --- SUV’s,red



In [35]:
for song in trouble_songs:
    print type(df['lyrics'][song])
    print df['lyrics'][song]
    print

<type 'str'>
Uh huh uh huh uh huh
Oh this takes me home
It makes me think about sitting outside of my old home when I was younger and singing something like

(K'Naan in Somali)
Gabaryaray shuxaano
Maro Shabeel eh xirato
Maro qafiif eh huwato
Magacaaga ii sheeg
Magacayga waa Sharaf
Sharaf, Xaaji weyaan
Aqalada xiriirta
Dhinac baa ka jooga
Ala yaa usheega?
Tinta ushaleeya?
Naa hoéy Zamzamey
Sabaax nuurey
Adoo kilkiley
Iyo
Kaloon badanay
Adoo xajka jira
Xasuus badhanay!
Saxiibtaa
Caasho, cashaq baa dhilay
Ugu dhambaystina
Aniga iyo geel uba
Ugu banaan bixin
Waa aniga orodneey
Nabad dheynee
Mareekan waa laga soo waayey
Mareekan waa laga soo waayey
Mareekan waa laga soo waayey



There are certain things fresh and certain things mish
I got my own sound I don't sound like the rest
And even my attire and my choice of dress
And not long ago I don't spoke English
My point is police pull me over a lot
They wonder what kind of rap sheet I got
And sometimes I take a young girl out to eat
And hold 

In [None]:
# weird windows encoding
# not sure this is relevant
ftfy.guess_bytes('I\xe2\u20ac\u2122m')

### Going to try some different models here

In [36]:
sws = stop_words.ENGLISH_STOP_WORDS

In [41]:
cv = CountVectorizer(binary=True, stop_words=sws)
tfidf = TfidfVectorizer(stop_words=sws)

In [42]:
d = cv.fit_transform(df['lyrics'])
td = tfidf.fit_transform(df['lyrics'])

In [43]:
# how is this possible?
# its somehow regaining the dropped rows?
df['lyrics'].shape

(540,)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 533 entries, 0 to 539
Data columns (total 5 columns):
artist    533 non-null object
song      533 non-null object
link      533 non-null object
lyrics    533 non-null object
genre     533 non-null object
dtypes: object(5)
memory usage: 25.0+ KB


In [49]:
print df.shape
print df['lyrics'].shape
print df['genre'].shape

(533, 5)
(540,)
(533,)


In [46]:
df2 = pd.DataFrame(d.todense(), columns=cv.get_feature_names())

In [47]:
df2.head()

Unnamed: 0,000,02,03,10,100,100k,103,106,10x,11,...,zone,zones,zonin,zoning,zoo,zoogang,zoom,zorro,zuckerbergs,zulu
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 540 entries, 0 to 539
Columns: 10898 entries, 000 to zulu
dtypes: int64(10898)
memory usage: 44.9 MB


In [68]:
bnb = BernoulliNB()
mnb = MultinomialNB()
gnb = GaussianNB()

In [66]:
X_train, X_test, y_train, y_test = train_test_split(d.todense(), df['genre'])

In [69]:
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [70]:
# already got a ~90% accurate model
# try CV now
bnb.score(X_test, y_test)

0.88888888888888884

In [71]:
# cross-val scores
print cross_val_score(bnb, d.todense(), df['genre'], cv=10).mean()

0.844444444444


In [72]:
for model in [bnb, mnb, gnb]:
    print model
    print cross_val_score(model, d.todense(), df['genre'], cv=10).mean()

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
0.844444444444
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.818518518519
GaussianNB()
0.674074074074


In [None]:
# adding stop words actually lowers accuracy by ~5%

In [None]:
# now think about stemming, removing stop words, mojibake, tf-idf
# cram into decision tree to get the most important features
# etc

In [100]:
def print_importants(model, top=10):
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    important_words = df2.columns[indices]
    print important_words[:top]

In [77]:
X = d.todense()
y = df['genre']

In [92]:
y = [1 if v=='rap' else 0 for v in df['genre']]

### Messing with some ExtremeTrees

In [73]:
erf = ExtraTreesClassifier(n_estimators=250, random_state=0)

In [102]:
print_importants(erf)

Index([u'niggas', u'shit', u'nigga', u'ain', u'yo', u'fuck', u'em', u'ya',
       u'gon', u'gotta'],
      dtype='object')


In [82]:
# Extreme Trees RF does better than NB --- expected result
print cross_val_score(erf, X, y, cv=10).mean()

0.894444444444


### Trying with a plain RandomForest

In [103]:
rf = RandomForestClassifier()

In [104]:
print cross_val_score(rf, X, y, cv=10).mean()

0.864814814815


In [101]:
print_importants(rf)

Index([u'gotta', u'ya', u'ass', u'ain', u'black', u'fuck', u'shit', u'hood',
       u'man', u'say'],
      dtype='object')


### Now try with Gradient Boosting

In [96]:
gbr = GradientBoostingClassifier()

In [97]:
gbr.fit(X_train, y_train)

GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [98]:
gbr.score(X_test, y_test)

0.96296296296296291

In [106]:
print cross_val_score(gbr, X, y, cv=10, n_jobs=-1).mean()

0.862962962963


### Next up

In [None]:
# could try doing SVM, might need to grid search parameters
# would consider doing non-binary counts for data!