# Real-World Python Machine Learing Tut w/ Scikit Learn

## Load data


In [109]:
class Genre:
    HIPHOP = 'Hip Hop'
    ROCK = 'Rock'
    POP = 'Pop'
    UNKOWN = 'unknown'

class Lyric:
    
    def __init__(self, words:str, genre:str) -> None:
        self.words = words
        self.genre = genre

    def __repr__(self) -> str:
        return f'Lyric({self.words[0:5]}, {self.genre})'

    def get_genre(self) -> str:
        if self.genre == 'Hip Hop':
            return Genre.HIPHOP
        elif self.genre == 'Rock':
            return Genre.ROCK
        elif self.genre == 'Pop': 
            return Genre.POP
        else: # genre is unkown
            return Genre.UNKOWN 
        

In [110]:
import pandas as pd

df = pd.read_csv('data.csv')

unfiltered_lyrics = list(map(lambda x:Lyric(x[0], x[1]), df.values.tolist()))

unk_lyrics = [l for l in unfiltered_lyrics if l.get_genre() == Genre.UNKOWN]

lyrics = list(set(unfiltered_lyrics) - set(unk_lyrics))

unk_lyrics

[Lyric(Feels, unknown),
 Lyric(Shado, unknown),
 Lyric(Slave, unknown),
 Lyric(You'v, unknown),
 Lyric(Magic, unknown),
 Lyric(Intro, unknown),
 Lyric(Hey s, unknown),
 Lyric(Can't, unknown),
 Lyric(I am , unknown),
 Lyric(I for, unknown),
 Lyric(Hwach, unknown),
 Lyric(Here , unknown),
 Lyric(If I , unknown),
 Lyric(Well , unknown),
 Lyric(anyti, unknown),
 Lyric([Hook, unknown),
 Lyric(See t, unknown),
 Lyric(There, unknown),
 Lyric([Chor, unknown),
 Lyric(Chop , unknown),
 Lyric(Hey g, unknown),
 Lyric(There, unknown),
 Lyric(Rose , unknown),
 Lyric(Sent , unknown),
 Lyric(Somet, unknown),
 Lyric(I wis, unknown),
 Lyric(He co, unknown),
 Lyric(Take , unknown),
 Lyric(I fee, unknown),
 Lyric((you , unknown),
 Lyric(Smoot, unknown),
 Lyric(I kno, unknown),
 Lyric(They , unknown),
 Lyric(One M, unknown),
 Lyric(Buzzi, unknown),
 Lyric(Save , unknown),
 Lyric(Close, unknown),
 Lyric(You c, unknown),
 Lyric(We're, unknown),
 Lyric((Kany, unknown),
 Lyric(I kno, unknown),
 Lyric(Here , un

## Prep Data


In [111]:
# of lyrics : 40000 -> train ; 10000 -> estimate/test ; 5000 in unk_lyrics -> is the unkown lyrics for acc test
from sklearn.model_selection import train_test_split

training, test = train_test_split(lyrics, test_size=0.2, random_state=42)
print(len(training), len(test))

40000 10000


In [112]:
train_x = [x.words for x in training]
train_y = [x.genre for x in training]

test_x = [x.words for x in test]
test_y = [x.genre for x in test]

In [128]:
# for unk lyrics
unk_x = [x.words for x in unk_lyrics]
unk_y = [x.genre for x in unk_lyrics]

Bag of Words vectorization

In [113]:
from sklearn.feature_extraction.text import CountVectorizer 

vectorizer = CountVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

In [129]:
# for unk lyrics
unk_x_vectors = vectorizer.transform(unk_x)

In [114]:
train_x_vectors
train_y
#* above is the data that we will fit a model around for machine learning algo

['Pop',
 'Pop',
 'Rock',
 'Pop',
 'Hip Hop',
 'Rock',
 'Rock',
 'Pop',
 'Rock',
 'Hip Hop',
 'Rock',
 'Hip Hop',
 'Hip Hop',
 'Hip Hop',
 'Rock',
 'Pop',
 'Pop',
 'Hip Hop',
 'Pop',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Pop',
 'Rock',
 'Rock',
 'Pop',
 'Hip Hop',
 'Pop',
 'Rock',
 'Hip Hop',
 'Hip Hop',
 'Pop',
 'Hip Hop',
 'Rock',
 'Rock',
 'Hip Hop',
 'Rock',
 'Rock',
 'Hip Hop',
 'Rock',
 'Rock',
 'Rock',
 'Pop',
 'Pop',
 'Pop',
 'Rock',
 'Pop',
 'Rock',
 'Rock',
 'Rock',
 'Pop',
 'Hip Hop',
 'Hip Hop',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Pop',
 'Pop',
 'Pop',
 'Pop',
 'Rock',
 'Rock',
 'Hip Hop',
 'Pop',
 'Pop',
 'Pop',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Pop',
 'Hip Hop',
 'Rock',
 'Pop',
 'Pop',
 'Rock',
 'Hip Hop',
 'Pop',
 'Pop',
 'Rock',
 'Pop',
 'Pop',
 'Hip Hop',
 'Rock',
 'Rock',
 'Pop',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Rock',
 'Hip Hop',
 'Rock',
 'Hip Hop',
 'Rock',
 'Rock',
 'Hip Hop',
 'Rock',
 'Pop',
 'Hip Hop',
 'P

## Classification

In [115]:
test_x[0], test_y[0]

("At home. Drawing pictures. Of mountain tops. With him on top. Lemon yellow sun. Arms raised in a v. And the dead lay in pools of maroon below. Daddy didn't give attention. To the fact. That mommy didn't care. King jeremy the wicked. Ruled his world. Jeremy spoke in class today. Jeremy spoke in class today. Clearly I remember. Picking on the boy. Seemed a harmless little fuck. But we unleashed a lion. Gnashed his teeth and. Bit the recess ladies breast. How could I forget?. And he hit me with a surprise left. My jaw left hurting. Dropped wide open. Just like the day. Like the day I heard. Daddy didn't give affection. And the boy was something. That mommy wouldn't wear. King jeremy the wicked. Ruled his world. Jeremy spoke in class today. Jeremy spoke in class today. Jeremy spoke in class today. Try to forget this. Try to erase this. From the black board. Jeremy spoke in class today. Jeremy spoke in class today. Jeremy spoke in spoke in. Jeremy spoke in spoke in. Jeremy spoke in class 

#### Naive Bayes

In [116]:
from sklearn.naive_bayes import MultinomialNB

clf_mnb = MultinomialNB()
clf_mnb.fit(train_x_vectors, train_y)

MultinomialNB()

In [117]:
print(test_x[0], test_y[0])
clf_mnb.predict(train_x_vectors[0])

At home. Drawing pictures. Of mountain tops. With him on top. Lemon yellow sun. Arms raised in a v. And the dead lay in pools of maroon below. Daddy didn't give attention. To the fact. That mommy didn't care. King jeremy the wicked. Ruled his world. Jeremy spoke in class today. Jeremy spoke in class today. Clearly I remember. Picking on the boy. Seemed a harmless little fuck. But we unleashed a lion. Gnashed his teeth and. Bit the recess ladies breast. How could I forget?. And he hit me with a surprise left. My jaw left hurting. Dropped wide open. Just like the day. Like the day I heard. Daddy didn't give affection. And the boy was something. That mommy wouldn't wear. King jeremy the wicked. Ruled his world. Jeremy spoke in class today. Jeremy spoke in class today. Jeremy spoke in class today. Try to forget this. Try to erase this. From the black board. Jeremy spoke in class today. Jeremy spoke in class today. Jeremy spoke in spoke in. Jeremy spoke in spoke in. Jeremy spoke in class to

array(['Pop'], dtype='<U7')

#### Linear SVM

In [118]:
# from sklearn import svm

# clf_svm = svm.SVC(kernel='linear')

# clf_svm.fit(train_x_vectors, train_y)

# test_x[0]
# # test_x_vectors[0]



In [119]:
# clf_svm.predict(test_x_vectors[0])

#### Descision Tree

In [120]:
# from sklearn.tree import DecisionTreeClassifier

# clf_dec = DecisionTreeClassifier()
# clf_dec.fit(train_x_vectors, train_y)


In [121]:
# clf_dec.predict(test_x_vectors[0])

#### Logistic Regression

In [122]:
# from sklearn.linear_model import LogisticRegression

# clf_log = LogisticRegression()
# clf_log.fit(train_x_vectors, train_y)


In [123]:
# clf_log.predict(train_x_vectors[0])

## Evaluation

In [131]:
print(clf_mnb.score(test_x_vectors, test_y))


0.6516


In [125]:
from sklearn.metrics import f1_score

f1_score(test_y, clf_mnb.predict(test_x_vectors), average=None, labels=[Genre.HIPHOP, Genre.POP, Genre.ROCK])

array([0.63720799, 0.50783699, 0.74778981])

In [126]:
train_y.count(Genre.HIPHOP)

7474

In [127]:
from sklearn.metrics import accuracy_score

pred = clf_mnb.predict(test_x_vectors)
estimated_accuracy = accuracy_score(test_y, pred)
pd.Series(estimated_accuracy).to_csv('ea.csv', index=False, header=False)

# Predicting Testing Set

#### ! This will be a bigger part of my acc doc

In [130]:
pred = clf_mnb.predict(unk_x_vectors)
pred = pd.Series(pred).to_csv('pred.csv', index=False, header=False)