In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm_notebook

In [2]:
df = pd.read_csv("lyrics.csv").set_index("index")

In [3]:
df = df.dropna()
df = df.query("genre != 'Not Available'")
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
df.head(10)

Unnamed: 0,song,year,artist,genre,lyrics
0,open-my-eyes-that-i-may-see,2014,deadbeat,Electronic,"1. Open my eyes, that I may see\nGlimpses of t..."
1,feeling-blue,2006,finley-quaye,Pop,I'm right at home\nMust be beneficial yeah\nI'...
2,everything-i-wanted,2007,ashley-monroe,Country,-Girl\nWaking up with you makes me feel alive....
3,you-re-for-me,2007,buck-owens,Country,Well when I saw you walkin' at avenue then I k...
4,can-t-help-myself,2007,gary-moore,Rock,"In this world of make believe,\nI will try to ..."
5,throw-it-away,2003,black-lips,Rock,I see you walking around my block\nyou think y...
6,dropped,2013,atoms-for-peace,Rock,It slipped\nOut of my hands\nWent deep down\nW...
7,forever-yours,2007,blank-logic,R&B,intro: instrumental\nverse 1:\nThis ride is go...
8,samadhi,2009,epica,Metal,Solve mentem a molestis\nMentem ad concretum d...
9,i-o-i-o,2006,b3,Pop,"[Chorus]\nI E O,I I E I I I I E O,I O, I E O, ..."


In [5]:
set(df['genre'].values)

{'Country',
 'Electronic',
 'Folk',
 'Hip-Hop',
 'Indie',
 'Jazz',
 'Metal',
 'Other',
 'Pop',
 'R&B',
 'Rock'}

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [7]:
vectorizer = CountVectorizer(min_df=100)

In [8]:
result = vectorizer.fit_transform(df['lyrics'].values)

In [9]:
result.shape

(242615, 12273)

In [10]:
encoder = OneHotEncoder(handle_unknown='ignore')

In [11]:
label = encoder.fit_transform(df[['genre']].values).toarray()

In [12]:
from sklearn.naive_bayes import MultinomialNB

In [13]:
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(result, df[['genre']].values):
    train_X = result[train]
    train_y = df['genre'].values[train]
    
    model = MultinomialNB()
    model.fit(train_X, train_y)
    
    test_X = result[test]
    test_y = df['genre'].values[test]
    
    prediction = model.predict(test_X)
    accuracy = 0
    for I in range(len(prediction)):
        if prediction[I] == test_y[I]:
            accuracy += 1
    print(accuracy / len(test_y))

0.37929400127763924
0.37921440936443823
0.38245780351585845
0.3808787766374016
0.38689997732846926


In [81]:
tfidf_vectorizer = TfidfVectorizer(min_df=100)

In [82]:
tfidf_result = tfidf_vectorizer.fit_transform(df['lyrics'].values)

In [83]:
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(tfidf_result, df[['genre']].values):
    train_X = tfidf_result[train]
    train_y = df['genre'].values[train]
    
    model = MultinomialNB()
    model.fit(train_X, train_y)
    
    test_X = tfidf_result[test]
    test_y = df['genre'].values[test]
    
    prediction = model.predict(test_X)
    accuracy = 0
    for I in range(len(prediction)):
        if prediction[I] == test_y[I]:
            accuracy += 1
    print(accuracy / len(test_y))

0.5499824839779917
0.5501813535570027
0.5491828617356718
0.5526565269362351
0.5514128485747851


In [17]:
title = np.array([" ".join(I.split("-")) for I in df["song"].values])

In [18]:
title_vectorizer = TfidfVectorizer(min_df=5)
title_result = tfidf_vectorizer.fit_transform(title)

In [19]:
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(result, df[['genre']].values):
    train_X = title_result[train]
    train_y = df['genre'].values[train]
    
    model = MultinomialNB()
    model.fit(train_X, train_y)
    
    test_X = title_result[test]
    test_y = df['genre'].values[test]
    
    prediction = model.predict(test_X)
    accuracy = 0
    for I in range(len(prediction)):
        if prediction[I] == test_y[I]:
            accuracy += 1
    print(accuracy / len(test_y))

0.47136645578749975
0.4738067760283571
0.47130226902705935
0.47483615679485597
0.47517467383911455


In [20]:
1 / 90165

1.1090778018077969e-05

In [79]:
W = np.random.normal(size=(12273, 11))
b = np.random.normal(size=(11,))

In [85]:
def softmax(inputs):
    exp_result =np.exp(inputs)
    return exp_result / float(max(sum(exp_result), 0.00001))

def model(X, W, b):
    y_linear = np.dot(X, W) + b
    return softmax(y_linear)

def calculate_l2(X, b, delta):
    l2 = b
    l2 += X.sum()
    return l2 * delta

In [35]:
def sgd_logistic_regression(X, y, W_aksen, b_aksen, lr, delta):
    prediction = model(X, W_aksen, b_aksen)
    shape = W_aksen[:,0].shape[0]
    error = prediction - y
    for kelas in range(len(prediction)):
        l2 = calculate_l2(W_aksen[:,kelas], b_aksen[kelas], delta)
        b_aksen[kelas] -= lr * (error[kelas] + l2)
        W_aksen[:, kelas] -= lr * (X * error[kelas] + l2)

In [86]:
for row in tqdm_notebook(range(10000)):
    sgd_logistic_regression(np.array(tfidf_result[row].todense())[0], label[row], W, b, 0.01, 1 / 20000)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [25]:
result.shape

(242615, 12273)

In [99]:
W[:,0]

array([-0.44678461,  1.01035552, -1.00354298, ...,  1.11696896,
       -0.60314249,  0.6883406 ])

In [28]:
W_boom = W.copy()

In [87]:
W.shape[0]

12273

In [97]:
removedWord = []
for I in range(W.shape[0]):
    if np.sum(np.abs(W[I,:]) > 1) < 0:
        removedWord.append(I)

In [98]:
len(removedWord)

0