In [1]:
import pandas as pd
from scipy import sparse
import numpy as np
import random
from tqdm import tqdm_notebook

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

In [3]:
# Load the lyrics table, using the CSV's "index" column as the row index.
df = pd.read_csv("lyrics.csv", index_col="index")

In [4]:
# Drop incomplete rows and songs without a genre, then shuffle and renumber the index.
# NOTE(review): sample() is called without random_state, so the row order
# (and everything derived from it) differs on every run.
df = (
    df.dropna()
    .query("genre != 'Not Available'")
    .sample(frac=1)
    .reset_index(drop=True)
)

In [5]:
# Number of rows left after dropping NaNs and 'Not Available' genres.
len(df)

242615

In [6]:
# Preview the first ten rows of the cleaned, shuffled frame.
df.head(10)

Unnamed: 0,song,year,artist,genre,lyrics
0,you-re-mine,2006,dag-nasty,Rock,The last time I spoke to you\nI was seconds aw...
1,in-your-head,2007,the-berlin-project,Rock,I'm not one who can complain\nbecause no one's...
2,these-foolish-things,2016,count-basie,Jazz,Oh! Will you never let me be?\nOh! Will you ne...
3,till-i-see-ya,2004,beverley-knight,Pop,Just one moment\nIn the blink of an eye\nDid I...
4,hospital-food,2006,eels,Rock,Coming through the alley trying to walk withou...
5,kiss-of-death,2007,buried-alive,Metal,Infected by our callous acts.\nOur blood flows...
6,at-the-close-of-every-day,2006,arid,Rock,At the close of every day theres a woman passi...
7,be-my-love,2009,bobby-valentino,Hip-Hop,Baby girl how you do? 'Cause I think you can't...
8,white-trash-party-remix,2012,eminem,Hip-Hop,"Yeah Oh. get up, I said get up lets go\nBetter..."
9,we-let-her-down,2009,chris-isaak,Rock,"Her mother, her father\nNever showed her any a..."


In [7]:
# Flatten every lyric onto a single line by turning newlines into spaces.
# The list is kept under its own name because the sentiment cell iterates over it.
lyricsList = [text.replace("\n"," ") for text in df['lyrics'].values]

df['lyrics'] = lyricsList

In [8]:
# Preview the frame again to confirm the lyrics no longer contain newlines.
df.head(5)

Unnamed: 0,song,year,artist,genre,lyrics
0,you-re-mine,2006,dag-nasty,Rock,The last time I spoke to you I was seconds awa...
1,in-your-head,2007,the-berlin-project,Rock,I'm not one who can complain because no one's ...
2,these-foolish-things,2016,count-basie,Jazz,Oh! Will you never let me be? Oh! Will you nev...
3,till-i-see-ya,2004,beverley-knight,Pop,Just one moment In the blink of an eye Did I m...
4,hospital-food,2006,eels,Rock,Coming through the alley trying to walk withou...


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [10]:
# Bag-of-words counter; min_df=25 ignores terms that appear in fewer than 25 documents.
vectorizer = CountVectorizer(min_df=25)

In [11]:
# Learn the vocabulary from all lyrics and build the document-term count matrix.
result = vectorizer.fit_transform(df['lyrics'].values)

In [12]:
# (number of songs, vocabulary size) of the count matrix.
result.shape

(242615, 29883)

In [13]:
# One-hot encoder for the genre column; handle_unknown='ignore' means categories
# not seen during fit are encoded as all zeros instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')

In [14]:
# Dense one-hot genre matrix, one row per song; the SGD cell reads it row by row as the target.
label = encoder.fit_transform(df[['genre']].values).toarray()

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on raw term counts.
# NOTE(review): no random_state is set, so the folds and printed scores vary between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(result, df[['genre']].values):
    train_X, train_y = result[train], df['genre'].values[train]
    test_X, test_y = result[test], df['genre'].values[test]

    model = MultinomialNB()
    model.fit(train_X, train_y)
    prediction = model.predict(test_X)

    # Count the correct predictions with one vectorized comparison.
    accuracy = int(np.count_nonzero(prediction == test_y))
    print(accuracy / len(test_y))

0.41379026109176337
0.41682466408375235
0.413103064526101
0.4164708791888216
0.4110142418434016


In [17]:
# TF-IDF weighted vectorizer with the same min_df=25 cut-off as the count vectorizer.
tfidf_vectorizer = TfidfVectorizer(min_df=25)

In [18]:
# TF-IDF document-term matrix for the lyrics.
tfidf_result = tfidf_vectorizer.fit_transform(df['lyrics'].values)

In [19]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on lyric TF-IDF features.
# NOTE(review): no random_state is set, so the folds and printed scores vary between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(tfidf_result, df[['genre']].values):
    train_X, train_y = tfidf_result[train], df['genre'].values[train]
    test_X, test_y = tfidf_result[test], df['genre'].values[test]

    model = MultinomialNB()
    model.fit(train_X, train_y)
    prediction = model.predict(test_X)

    # Count the correct predictions with one vectorized comparison.
    accuracy = int(np.count_nonzero(prediction == test_y))
    print(accuracy / len(test_y))

0.548375131370165
0.5502225702745034
0.547905117160934
0.5474836156794856
0.5471258682165749


In [20]:
# (number of songs, vocabulary size) of the TF-IDF matrix.
tfidf_result.shape

(242615, 29883)

In [21]:
# Song names are hyphen-separated slugs; turn the hyphens into spaces so the
# vectorizer sees individual words.
title = np.array([slug.replace("-", " ") for slug in df["song"].values])

In [22]:
# Fit a dedicated TF-IDF vectorizer on the song titles.
# The lyrics vectorizer (tfidf_vectorizer) must not be reused here: calling
# fit_transform on it would overwrite the vocabulary it learned from the lyrics
# and leave title_vectorizer unfitted.
title_vectorizer = TfidfVectorizer(min_df=25)
title_result = title_vectorizer.fit_transform(title)

In [23]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on title TF-IDF features.
# The folds are generated from title_result itself, the matrix that is actually
# indexed below, rather than from the unrelated lyric count matrix.
# NOTE(review): no random_state is set, so the folds and printed scores vary between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(title_result, df[['genre']].values):
    train_X, train_y = title_result[train], df['genre'].values[train]
    test_X, test_y = title_result[test], df['genre'].values[test]

    model = MultinomialNB()
    model.fit(train_X, train_y)
    prediction = model.predict(test_X)

    # Count the correct predictions with one vectorized comparison.
    accuracy = int(np.count_nonzero(prediction == test_y))
    print(accuracy / len(test_y))

0.4683784285037196
0.470200313247053
0.46794303732250686
0.4693128889988047
0.46754879531729837


In [24]:
# Randomly initialise the softmax-regression parameters.
# Sizes are taken from the data the SGD cell actually consumes (TF-IDF rows and
# one-hot genre targets) instead of a hard-coded class count.
# NOTE(review): np.random is not seeded, so the starting weights differ per run.
n_features = tfidf_result.shape[1]
n_classes = label.shape[1]
W = np.random.normal(size=(n_features, n_classes))
b = np.random.normal(size=(n_classes,))

In [25]:
def softmax(inputs):
    """Return the softmax of a 1-D vector of scores.

    The maximum score is subtracted before exponentiating, so large scores no
    longer overflow to ``inf`` (which produced NaNs) and very negative scores
    no longer underflow to an un-normalised result.

    Parameters
    ----------
    inputs : array-like
        1-D sequence of real-valued scores.

    Returns
    -------
    numpy.ndarray
        Non-negative values of the same length that sum to 1. An empty input
        yields an empty array.
    """
    logits = np.asarray(inputs, dtype=float)
    if logits.size == 0:
        return logits
    # Shifting by the max leaves the softmax unchanged but keeps exp() in range;
    # the largest term becomes exp(0) == 1, so the denominator is never zero.
    exp_result = np.exp(logits - np.max(logits))
    return exp_result / exp_result.sum()

def model(X, W, b):
    """Return softmax probabilities for the affine scores ``dot(X, W) + b``.

    NOTE(review): several cross-validation cells rebind the global name
    ``model`` to a MultinomialNB instance, so this function has to be
    re-defined before the SGD cell can be re-run after them.
    """
    return softmax(np.dot(X, W) + b)

def calculate_l2(X, b, delta):
    """Return the regularisation term ``(b + sum(X)) * delta``.

    The result is built from a fresh expression, so ``b`` is never modified,
    even when the caller passes a NumPy array (the previous ``l2 = b; l2 += ...``
    form updated such an array in place).

    NOTE(review): this is a scaled sum of the weights and bias, not the
    gradient of a squared-norm penalty - confirm that is the intended term.

    Parameters
    ----------
    X : numpy.ndarray
        Weight values to sum (the caller passes one column of the weight matrix).
    b : float or numpy.ndarray
        Bias value added to the weight sum.
    delta : float
        Regularisation strength multiplying the total.
    """
    return (b + X.sum()) * delta

In [26]:
def sgd_logistic_regression(X, y, W_aksen, b_aksen, lr, delta):
    """Apply one SGD step of softmax regression for a single sample, in place.

    Parameters
    ----------
    X : numpy.ndarray
        Feature vector of the sample (the caller passes a dense 1-D row).
    y : numpy.ndarray
        Target vector with one entry per class (the caller passes a one-hot row).
    W_aksen : numpy.ndarray
        Weight matrix indexed as ``[feature, class]``; updated in place.
    b_aksen : numpy.ndarray
        Bias vector with one entry per class; updated in place.
    lr : float
        Learning rate.
    delta : float
        Regularisation strength forwarded to ``calculate_l2``.

    Returns
    -------
    None
    """
    prediction = model(X, W_aksen, b_aksen)
    error = prediction - y
    for class_idx in range(len(prediction)):
        # The penalty is computed from the pre-update column and bias, then
        # added to both the bias step and every weight step of this class.
        l2 = calculate_l2(W_aksen[:, class_idx], b_aksen[class_idx], delta)
        b_aksen[class_idx] -= lr * (error[class_idx] + l2)
        W_aksen[:, class_idx] -= lr * (X * error[class_idx] + l2)

In [27]:
# One SGD pass over the first 30000 songs, one sample per step
# (learning rate 0.01, regularisation strength 1/20000).
for row in tqdm_notebook(range(30000)):
    # Densify a single sparse TF-IDF row into a 1-D feature vector.
    dense_row = tfidf_result[row].toarray()[0]
    sgd_logistic_regression(dense_row, label[row], W, b, 0.01, 1 / 20000)

HBox(children=(IntProgress(value=0, max=30000), HTML(value='')))




In [28]:
# Keep every vocabulary index whose weight row has more than one class weight
# with absolute value above 1.
# NOTE(review): W starts as standard-normal noise, so |w| > 1 is common even
# without training - confirm this threshold selects meaningful features.
strong_weight_counts = (np.abs(W) > 1).sum(axis=1)
insertedWord = np.flatnonzero(strong_weight_counts > 1).tolist()

In [29]:
# How many vocabulary columns passed the weight filter.
len(insertedWord)

27147

In [30]:
# Keep only the selected vocabulary columns of the TF-IDF matrix.
tfidf_result_filtered = tfidf_result[:, insertedWord]

In [31]:
# Display the filtered matrix summary (shape, dtype, stored elements, format).
tfidf_result_filtered

<242615x27147 sparse matrix of type '<class 'numpy.float64'>'
	with 19586574 stored elements in Compressed Sparse Row format>

In [32]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the filtered TF-IDF columns.
# NOTE(review): no random_state is set, so the folds and printed scores vary between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(tfidf_result_filtered, df[['genre']].values):
    train_X, train_y = tfidf_result_filtered[train], df['genre'].values[train]
    test_X, test_y = tfidf_result_filtered[test], df['genre'].values[test]

    model = MultinomialNB()
    model.fit(train_X, train_y)
    prediction = model.predict(test_X)

    # Count the correct predictions with one vectorized comparison.
    accuracy = int(np.count_nonzero(prediction == test_y))
    print(accuracy / len(test_y))

0.5467059575081913
0.5466779325694502
0.548750077282938
0.5477515353860105
0.5436014757105464


In [33]:
# VADER sentiment analyzer from NLTK.
# NOTE(review): presumably needs the `vader_lexicon` resource to be downloaded first - confirm.
sid = SentimentIntensityAnalyzer()

In [34]:
# Score every flattened lyric with VADER; each entry is the dict returned by polarity_scores.
sentimentScore = [sid.polarity_scores(text) for text in tqdm_notebook(lyricsList)]

HBox(children=(IntProgress(value=0, max=242615), HTML(value='')))




In [35]:
# Two indicator columns per song: [1, 0] when the compound score is strictly
# positive, [0, 1] otherwise (zero and negative scores share the second column).
sentimentLabel = [
    [1,0] if score['compound'] > 0 else [0,1]
    for score in tqdm_notebook(sentimentScore)
]

HBox(children=(IntProgress(value=0, max=242615), HTML(value='')))




In [36]:
# Turn the list of indicator pairs into a 2-D array so it can be stacked next to the sparse features.
sentimentLabel = np.array(sentimentLabel)

In [38]:
# Append the two sentiment indicator columns to the filtered TF-IDF features.
combined = sparse.hstack((tfidf_result_filtered,sentimentLabel))

In [39]:
# Convert to CSR so the cross-validation cell can select rows by fold index
# (the hstack result is presumably COO, which does not support row indexing - confirm).
combined = combined.tocsr()

In [40]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the current `combined` matrix.
# NOTE(review): no random_state is set, so the folds and printed scores vary between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(combined, df[['genre']].values):
    train_X, train_y = combined[train], df['genre'].values[train]
    test_X, test_y = combined[test], df['genre'].values[test]

    model = MultinomialNB()
    model.fit(train_X, train_y)
    prediction = model.predict(test_X)

    # Count the correct predictions with one vectorized comparison.
    accuracy = int(np.count_nonzero(prediction == test_y))
    print(accuracy / len(test_y))

0.5500649123168545
0.5475640919957134
0.5497392988891866
0.548637731338362
0.5488365382633608


In [41]:
# Full (unfiltered) lyric TF-IDF features plus the two sentiment indicator
# columns, converted to CSR for row indexing.
combined = sparse.hstack((tfidf_result,sentimentLabel)).tocsr()

In [42]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the current `combined` matrix.
# NOTE(review): no random_state is set, so the folds and printed scores vary between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(combined, df[['genre']].values):
    train_X, train_y = combined[train], df['genre'].values[train]
    test_X, test_y = combined[test], df['genre'].values[test]

    model = MultinomialNB()
    model.fit(train_X, train_y)
    prediction = model.predict(test_X)

    # Count the correct predictions with one vectorized comparison.
    accuracy = int(np.count_nonzero(prediction == test_y))
    print(accuracy / len(test_y))

0.5519607641107013
0.550531695655758
0.5509346083300702
0.5509047442397264
0.5499701148003875


In [43]:
# Title features plus the two sentiment indicator columns, converted to CSR
# for row indexing.
combined = sparse.hstack((title_result,sentimentLabel)).tocsr()

In [44]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the current `combined` matrix.
# NOTE(review): no random_state is set, so the folds and printed scores vary between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(combined, df[['genre']].values):
    train_X, train_y = combined[train], df['genre'].values[train]
    test_X, test_y = combined[test], df['genre'].values[test]

    model = MultinomialNB()
    model.fit(train_X, train_y)
    prediction = model.predict(test_X)

    # Count the correct predictions with one vectorized comparison.
    accuracy = int(np.count_nonzero(prediction == test_y))
    print(accuracy / len(test_y))

0.46835782141900384
0.47053004698705797
0.46932382581456217
0.46879765879394913
0.46944495970650674
