In [1]:
import pandas as pd
from scipy import sparse
import numpy as np
import random
from tqdm import tqdm_notebook

In [2]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

In [3]:
# Load the lyrics dataset and use its "index" column as the row index.
df = pd.read_csv("lyrics.csv").set_index("index")

In [4]:
# Keep only fully populated rows with a known genre, then shuffle the rows
# (the shuffle is unseeded, so the order differs on every run) and renumber them.
df = (
    df.dropna()
      .query("genre != 'Not Available'")
      .sample(frac=1)
      .reset_index(drop=True)
)

In [5]:
len(df)

242615

In [6]:
df.head(10)

Unnamed: 0,song,year,artist,genre,lyrics
0,i-wish,2007,drain-s-t-h,Rock,Am I too corrosive?\nAm I just too weak?\nAm I...
1,real-life,2009,alice-peacock,Pop,Met her at the high school dance\nDown at the ...
2,we-rot-the-voodoo,2007,the-dingees,Rock,"We rot the voodoo, for we do not fear\nSpirit ..."
3,for-all-that-you-want,1999,gary-barlow,Pop,"I can't sit and wait,\nfor you to say,\nIf the..."
4,i-ve-had-enough,2016,emmylou-harris,Country,"Love it's not I who didn't try\nHard enough, h..."
5,rudolph-the-red-nose-reindeer,2007,dean-martin,Jazz,Rudolph the red-nosed reindeer\nHad a very shi...
6,world-keeps-on-turning,2007,fleetwood-mac,Rock,"I don't look for no worries, worries and troub..."
7,good,2005,better-than-ezra,Rock,Looking around the house\nHidden behind the wi...
8,20th-century,2006,alphaville,Pop,In the beginning\nThere was no light\nNo teena...
9,deep-deep-ocean,2014,belinda-carlisle,Pop,I heard you call my name\nWhy do you sit in si...


In [7]:
# Flatten every lyric onto a single line by turning newlines into spaces.
# `lyricsList` is kept as a plain list because the sentiment cell iterates over it.
lyricsList = [text.replace("\n", " ") for text in df['lyrics'].values]

df['lyrics'] = lyricsList

In [8]:
df.head(5)

Unnamed: 0,song,year,artist,genre,lyrics
0,i-wish,2007,drain-s-t-h,Rock,Am I too corrosive? Am I just too weak? Am I t...
1,real-life,2009,alice-peacock,Pop,Met her at the high school dance Down at the A...
2,we-rot-the-voodoo,2007,the-dingees,Rock,"We rot the voodoo, for we do not fear Spirit o..."
3,for-all-that-you-want,1999,gary-barlow,Pop,"I can't sit and wait, for you to say, If there..."
4,i-ve-had-enough,2016,emmylou-harris,Country,"Love it's not I who didn't try Hard enough, ha..."


In [58]:
set(df['genre'].values)

{'Country',
 'Electronic',
 'Folk',
 'Hip-Hop',
 'Indie',
 'Jazz',
 'Metal',
 'Other',
 'Pop',
 'R&B',
 'Rock'}

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold

In [10]:
# Bag-of-words vectorizer; min_df=25 ignores terms that occur in fewer than 25 documents.
vectorizer = CountVectorizer(min_df=25)

In [11]:
result = vectorizer.fit_transform(df['lyrics'].values)

In [12]:
result.shape

(242615, 29883)

In [13]:
encoder = OneHotEncoder(handle_unknown='ignore')

In [14]:
# Dense one-hot genre matrix (one column per genre); the SGD training cell reads
# one row of it per sample as the target vector.
label = encoder.fit_transform(df[['genre']].values).toarray()

In [15]:
from sklearn.naive_bayes import MultinomialNB

In [16]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the raw term counts.
genres = df['genre'].values  # 1-D labels, used for both stratification and fitting
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(result, genres):
    train_X, train_y = result[train], genres[train]
    test_X, test_y = result[test], genres[test]

    # Bound to `nb_model` rather than `model`: this notebook also defines a
    # function called `model`, and rebinding that name would break the SGD cells.
    nb_model = MultinomialNB()
    nb_model.fit(train_X, train_y)

    prediction = nb_model.predict(test_X)
    # Fold accuracy = fraction of test rows whose predicted genre matches the true one.
    print(np.mean(prediction == test_y))

0.41354297607517465
0.41676283900750144
0.4154936834078684
0.4173158567247846
0.4099837177188318


In [51]:
# Mean of the five fold accuracies printed by the cross-validation cell above.
# NOTE: the values are pasted in by hand, so this goes stale whenever the folds are re-run.
sum([0.41354297607517465, 0.41676283900750144, 0.4154936834078684, 0.4173158567247846, 0.4099837177188318]) / 5

0.41461981458683217

In [17]:
tfidf_vectorizer = TfidfVectorizer(min_df=25)

In [18]:
tfidf_result = tfidf_vectorizer.fit_transform(df['lyrics'].values)

In [None]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the lyric TF-IDF features.
genres = df['genre'].values  # 1-D labels, used for both stratification and fitting
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(tfidf_result, genres):
    train_X, train_y = tfidf_result[train], genres[train]
    test_X, test_y = tfidf_result[test], genres[test]

    # Bound to `nb_model` rather than `model`: this notebook also defines a
    # function called `model`, and rebinding that name would break the SGD cells.
    nb_model = MultinomialNB()
    nb_model.fit(train_X, train_y)

    prediction = nb_model.predict(test_X)
    # Fold accuracy = fraction of test rows whose predicted genre matches the true one.
    print(np.mean(prediction == test_y))

0.5495909493683928
0.5490066771082351
0.5477402468932259
0.5482255471744776
0.546919763391661


In [52]:
# Mean of the five fold accuracies printed by the cross-validation cell above.
# NOTE: the values are pasted in by hand, so this goes stale whenever the folds are re-run.
sum([0.5495909493683928, 0.5490066771082351, 0.5477402468932259, 0.5482255471744776, 0.546919763391661]) / 5

0.5482966367871984

In [None]:
tfidf_result.shape

(242615, 29883)

In [None]:
# Song slugs are hyphen-separated (e.g. "real-life"); turn the hyphens into spaces
# so each slug reads as a whitespace-delimited title.
title = np.array([slug.replace("-", " ") for slug in df["song"].values])

In [None]:
# Titles get their own TF-IDF vectorizer. Fitting `title_vectorizer` (instead of
# re-fitting `tfidf_vectorizer`) leaves the lyrics vectorizer's vocabulary intact.
title_vectorizer = TfidfVectorizer(min_df=25)
title_result = title_vectorizer.fit_transform(title)

In [None]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the title TF-IDF features.
genres = df['genre'].values  # 1-D labels, used for both stratification and fitting
kf = StratifiedKFold(n_splits=5, shuffle=True)
# Split on the matrix that is actually indexed below, not on the lyric count matrix.
for train, test in kf.split(title_result, genres):
    train_X, train_y = title_result[train], genres[train]
    test_X, test_y = title_result[test], genres[test]

    # Bound to `nb_model` rather than `model`: this notebook also defines a
    # function called `model`, and rebinding that name would break the SGD cells.
    nb_model = MultinomialNB()
    nb_model.fit(train_X, train_y)

    prediction = nb_model.predict(test_X)
    # Fold accuracy = fraction of test rows whose predicted genre matches the true one.
    print(np.mean(prediction == test_y))

0.4687081418591712
0.4697469293545462
0.46728355625167445
0.46766415234326697
0.46895030812671323


In [53]:
# Mean of the five fold accuracies printed by the cross-validation cell above.
# NOTE: the values are pasted in by hand, so this goes stale whenever the folds are re-run.
sum([0.4687081418591712, 0.4697469293545462, 0.46728355625167445, 0.46766415234326697, 0.46895030812671323]) / 5

0.4684706175870744

In [None]:
# Randomly initialised softmax-regression parameters: one weight column and one
# bias per genre. Sizes are read from the data the SGD cell trains on, instead of
# a hard-coded class count and an unrelated matrix.
n_features = tfidf_result.shape[1]  # rows of `tfidf_result` are the training inputs
n_classes = label.shape[1]          # width of the one-hot genre matrix
W = np.random.normal(size=(n_features, n_classes))
b = np.random.normal(size=(n_classes,))

In [None]:
def softmax(inputs):
    """Return the softmax of a score vector, normalised along the last axis.

    The largest score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (which produced NaN) or underflowing to all zeros.

    Parameters
    ----------
    inputs : array-like
        Raw scores. A 1-D vector gives one distribution; for higher-rank
        input each slice along the last axis is normalised independently.

    Returns
    -------
    numpy.ndarray
        Non-negative values with the same shape as ``inputs`` that sum to 1
        along the last axis. An empty input is returned as an empty array.
    """
    scores = np.asarray(inputs, dtype=float)
    if scores.size == 0:
        # Nothing to normalise.
        return scores
    # After the shift the largest exponent is exp(0) == 1, so the denominator
    # is always >= 1 and needs no lower-bound guard.
    shifted = scores - np.max(scores, axis=-1, keepdims=True)
    exp_result = np.exp(shifted)
    return exp_result / np.sum(exp_result, axis=-1, keepdims=True)

def model(X, W, b):
    """Softmax-regression forward pass: ``softmax(np.dot(X, W) + b)``.

    X, W and b are passed to ``np.dot`` and ``+`` unchanged; the SGD cell in
    this notebook calls this with a 1-D feature vector, a 2-D weight matrix
    and a 1-D bias vector.
    """
    logits = np.dot(X, W)
    logits = logits + b
    return softmax(logits)

def calculate_l2(X, b, delta):
    """Return the penalty term ``(b + X.sum()) * delta``.

    Despite the name, the value is built from the plain sum of ``X`` plus
    ``b``, not from a squared norm.

    The result is computed without modifying ``b``. An in-place ``+=`` on an
    alias of ``b`` would silently change the caller's bias whenever ``b`` is a
    NumPy array rather than a scalar.

    Parameters
    ----------
    X : numpy.ndarray
        Weights to penalise (anything exposing ``.sum()``).
    b : scalar or numpy.ndarray
        Bias term added to the weight sum.
    delta : float
        Multiplier applied to the sum.
    """
    return (b + X.sum()) * delta

In [None]:
def sgd_logistic_regression(X, y, W_aksen, b_aksen, lr, delta):
    """Apply one single-sample SGD step to ``W_aksen`` and ``b_aksen`` in place.

    Relies on the notebook-level functions ``model`` (forward pass) and
    ``calculate_l2`` (penalty term).

    Parameters
    ----------
    X : numpy.ndarray
        Feature vector of the sample.
    y : numpy.ndarray
        Target vector, subtracted element-wise from the prediction.
    W_aksen : numpy.ndarray
        2-D weight matrix with one column per class; updated in place.
    b_aksen : numpy.ndarray
        Bias vector with one entry per class; updated in place.
    lr : float
        Learning rate.
    delta : float
        Penalty multiplier forwarded to ``calculate_l2``.

    Returns
    -------
    None
    """
    prediction = model(X, W_aksen, b_aksen)
    error = prediction - y
    for kelas in range(len(prediction)):
        # The penalty is taken from the pre-update column and bias of this class.
        l2 = calculate_l2(W_aksen[:, kelas], b_aksen[kelas], delta)
        b_aksen[kelas] -= lr * (error[kelas] + l2)
        W_aksen[:, kelas] -= lr * (X * error[kelas] + l2)

In [None]:
# One pass of single-sample SGD over the first 30000 rows of the TF-IDF matrix.
n_train_rows = 30000
learning_rate = 0.01
penalty_delta = 1 / 20000
for row in tqdm_notebook(range(n_train_rows)):
    features = tfidf_result[row].toarray()[0]  # dense 1-D copy of this sparse row
    sgd_logistic_regression(features, label[row], W, b, learning_rate, penalty_delta)

HBox(children=(IntProgress(value=0, max=30000), HTML(value='')))




In [None]:
# Keep the vocabulary indices whose weight exceeds 1 in absolute value for more
# than one class.
strong_class_count = (np.abs(W) > 1).sum(axis=1)
insertedWord = np.flatnonzero(strong_class_count > 1).tolist()

In [None]:
len(insertedWord)

27128

In [None]:
tfidf_result_filtered = tfidf_result[:, insertedWord]

In [None]:
tfidf_result_filtered

<242615x27128 sparse matrix of type '<class 'numpy.float64'>'
	with 20133191 stored elements in Compressed Sparse Row format>

In [None]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the filtered TF-IDF features.
genres = df['genre'].values  # 1-D labels, used for both stratification and fitting
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(tfidf_result_filtered, genres):
    train_X, train_y = tfidf_result_filtered[train], genres[train]
    test_X, test_y = tfidf_result_filtered[test], genres[test]

    # Bound to `nb_model` rather than `model`, so the `model` function used by
    # `sgd_logistic_regression` is not overwritten and the SGD cell can be re-run.
    nb_model = MultinomialNB()
    nb_model.fit(train_X, train_y)

    prediction = nb_model.predict(test_X)
    # Fold accuracy = fraction of test rows whose predicted genre matches the true one.
    print(np.mean(prediction == test_y))

0.5467059575081913
0.5483266012694749
0.5464418935350246
0.5459997526895016
0.5457861868546343


In [54]:
# Mean of the five fold accuracies printed by the cross-validation cell above.
# NOTE: the values are pasted in by hand, so this goes stale whenever the folds are re-run.
sum([0.5467059575081913, 0.5483266012694749, 0.5464418935350246, 0.5459997526895016, 0.5457861868546343]) / 5

0.5466520783713653

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
# Polarity-score dict from the VADER analyzer for every lyric, in `lyricsList` order.
sentimentScore = [sid.polarity_scores(lyric) for lyric in tqdm_notebook(lyricsList)]

HBox(children=(IntProgress(value=0, max=242615), HTML(value='')))




In [None]:
# Two indicator columns per song: [1, 0] when the compound score is strictly
# positive, [0, 1] otherwise (a compound score of exactly 0 falls in the second group).
sentimentLabel = [
    [1, 0] if score['compound'] > 0 else [0, 1]
    for score in tqdm_notebook(sentimentScore)
]

HBox(children=(IntProgress(value=0, max=242615), HTML(value='')))

In [None]:
sentimentLabel = np.array(sentimentLabel)

In [44]:
combined = sparse.hstack((tfidf_result_filtered,sentimentLabel))

In [45]:
combined = combined.tocsr()

In [46]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the current `combined` matrix.
genres = df['genre'].values  # 1-D labels, used for both stratification and fitting
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(combined, genres):
    train_X, train_y = combined[train], genres[train]
    test_X, test_y = combined[test], genres[test]

    # Bound to `nb_model` rather than `model`, so the `model` function used by
    # `sgd_logistic_regression` is not overwritten and the SGD cell can be re-run.
    nb_model = MultinomialNB()
    nb_model.fit(train_X, train_y)

    prediction = nb_model.predict(test_X)
    # Fold accuracy = fraction of test rows whose predicted genre matches the true one.
    print(np.mean(prediction == test_y))

0.5460877449667195
0.5506965625257605
0.5496156461884055
0.5502040311611228
0.549475463220594


In [55]:
# Mean of the five fold accuracies printed by the cross-validation cell above.
# NOTE: the values are pasted in by hand, so this goes stale whenever the folds are re-run.
sum([0.5460877449667195, 0.5506965625257605, 0.5496156461884055, 0.5502040311611228, 0.549475463220594]) / 5

0.5492158896125205

In [47]:
# Append the two sentiment indicator columns to the full TF-IDF matrix and convert
# to CSR so the cross-validation cell can index it by row.
combined = sparse.hstack((tfidf_result, sentimentLabel)).tocsr()

In [48]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the current `combined` matrix.
genres = df['genre'].values  # 1-D labels, used for both stratification and fitting
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(combined, genres):
    train_X, train_y = combined[train], genres[train]
    test_X, test_y = combined[test], genres[test]

    # Bound to `nb_model` rather than `model`, so the `model` function used by
    # `sgd_logistic_regression` is not overwritten and the SGD cell can be re-run.
    nb_model = MultinomialNB()
    nb_model.fit(train_X, train_y)

    prediction = nb_model.predict(test_X)
    # Fold accuracy = fraction of test rows whose predicted genre matches the true one.
    print(np.mean(prediction == test_y))

0.5524759412285943
0.5524070563020361
0.5525833110071513
0.5496888009562673
0.5490220326057833


In [56]:
# Mean of the five fold accuracies printed by the cross-validation cell above.
# NOTE: the values are pasted in by hand, so this goes stale whenever the folds are re-run.
sum([0.5524759412285943, 0.5524070563020361, 0.5525833110071513, 0.5496888009562673, 0.5490220326057833]) / 5

0.5512354284199665

In [49]:
# Append the two sentiment indicator columns to the title TF-IDF matrix and convert
# to CSR so the cross-validation cell can index it by row.
combined = sparse.hstack((title_result, sentimentLabel)).tocsr()

In [50]:
# 5-fold stratified cross-validation of Multinomial Naive Bayes on the current `combined` matrix.
genres = df['genre'].values  # 1-D labels, used for both stratification and fitting
kf = StratifiedKFold(n_splits=5, shuffle=True)
for train, test in kf.split(combined, genres):
    train_X, train_y = combined[train], genres[train]
    test_X, test_y = combined[test], genres[test]

    # Bound to `nb_model` rather than `model`, so the `model` function used by
    # `sgd_logistic_regression` is not overwritten and the SGD cell can be re-run.
    nb_model = MultinomialNB()
    nb_model.fit(train_X, train_y)

    prediction = nb_model.predict(test_X)
    # Fold accuracy = fraction of test rows whose predicted genre matches the true one.
    print(np.mean(prediction == test_y))

0.4692027118923486
0.4697057126370456
0.4685406920429487
0.46910679691686247
0.4699396112863002


In [57]:
# Mean of the five fold accuracies printed by the cross-validation cell above.
# NOTE: the values are pasted in by hand, so this goes stale whenever the folds are re-run.
sum([0.4692027118923486, 0.4697057126370456, 0.4685406920429487, 0.46910679691686247, 0.4699396112863002]) / 5

0.46929910495510113