In [202]:
#Helpful Articles:

#https://towardsdatascience.com/beyond-the-lyrics-the-intersection-of-music-and-data-visualization-4a71039f447c
#https://towardsdatascience.com/how-we-used-nltk-and-nlp-to-predict-a-songs-genre-from-its-lyrics-54e338ded537

#http://cs229.stanford.edu/proj2017/final-reports/5241796.pdf

#https://algorithmia.com/blog/using-machine-learning-for-sentiment-analysis-a-deep-dive
#https://towardsdatascience.com/naive-bayes-document-classification-in-python-e33ff50f937e
#https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34
#https://colab.research.google.com/drive/1ixOZTKLz4aAa-MtC6dy_sAvc9HujQmHN#scrollTo=ytg-0FTHG7ik
#https://perun.pmf.uns.ac.rs/radovanovic/dmsem/cd/install/Weka/doc/pubs/2004/KibriyaAI04-MultinomialNBRevisited.pdf

# Predicting Song Genre from Song Lyrics

The data is an accumulation of 380,000+ lyrics from the website MetroLyrics.com. We have multiple artists in 10 varying genres ranging from Pop & Rock to Indie & Folk. The songs range from years 1960 to 2016. 

The goal of the project was, as said in title, to predict genre based on the lyrics. In order to tackle this problem, this involved doing natural language processing in order to effectively analyze the lyrics. 

In [608]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import seaborn as sns
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing

In [699]:
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Manda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Manda\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Manda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Manda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [713]:
df1 = pd.read_csv('lyrics.csv', delimiter=',')
df1.dataframeName = 'lyrics.csv'
emptyLyrics = len(df1)
df1 = df1[df1['lyrics']!='instrumental'].dropna()
emptyLyrics -= len(df1)
print(str(emptyLyrics) + " rows dropped (no lyrics)")

#drop 'Not Available' genre
df1 = df1[df1['genre']!='Not Available'].dropna()
df1['lyrics'].dropna(inplace=True)
df1['lyrics'].dropna(inplace=True)

#Change all the text to lower case. 
df1['lyrics'] = [entry.lower() for entry in df1['lyrics']]

95761 rows dropped (no lyrics)


In [714]:
df1 = df1.sample(frac = 0.02)

### Work Tokenization

Basically, what word tokenization does is splits the text into individual words. For instance, for the lyrics "oh baby, how you doing?" into a list with the tokens ['oh','baby', ",",'how','you','doing','?']. As seen here, it splits punctuations into their own tokens which will be removed later.

In [715]:
df1['lyrics']= [word_tokenize(entry) for entry in df1['lyrics']]

### Lemmatizing the Lyrics and Removing Stop Words from Lyrics

##### Stop Words:
As most of us are aware, there are words in songs that have no significance to the listener for understanding the song. Some examples include "la la", "oh", "ooo", "na", etc. It is important that we remove these words along with other words that the NLTK package deems as stopwords such as "the", "an", "a", etc.

In [718]:
#reset the index (because we took random sample of data) for next function
df1 = df1.drop(columns = ['index'])
df1 = df1.reset_index(drop = True)

In [719]:
customStopWords = ["'s", "n't", "'m", "'re", "'ll","'ve","...", "ä±", "''", '``',\
                  '--', "'d", 'el', 'la', 'chorus', 'verse', 'oh', 'la', 'ya', 'na', 'wo', 'wan', 'Chorus', 'Verse',
                  'ca', 'cuz', '[Verse 1:]', '[Intro:]', '[Chorus]', '\n', 's', 't', 'n', 'don',
                  'ya','aah','ye','hey','ba','da','buh','duh','doo','oh','ooh','woo','uh','hoo','ah','yeah',
                   'oo','la','chorus','beep','ha']

stopWords = stopwords.words('english') + customStopWords

###### Lemmatization:
WordNetLemmatizer() enables the process of converting a word to its base form. This involves grouping together different inflected forms of a word, while still keeping the context of the word. Words with multiple variations, but with similar meanings, can be analysed as a single item. An example of this is "better" to "good", "saying" to "say", and "heard" to "hear".

WordNetLemmatizer() takes a part of speech parameter: here it is "pos_tag". This is important for lemmatization so the computer can recognize the context of the word to lemmatize it properly. 

If a speech parameter is not supplied, the default is "noun." This means that an attempt will be made to find the closest noun, which can create trouble.

In [720]:
#labeling words as their respective parts of speech
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

#segmenting by parts of speech
for index,entry in enumerate(df1['lyrics']):
    Final_words = []
    verb_words = []
    adv_words = []
    noun_words = []
    adj_words = []
    #WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word not in stopWords and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
        df1.loc[index,'lyrics_final'] = str(Final_words)
        
        if word not in stopwords.words('english') and word not in stopWords and word.isalpha() and 'NN' in tag:
            noun_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            noun_words.append(noun_Final)
        df1.loc[index,'lyrics_noun'] = str(noun_words)


### Split Data for Train & Test

In [799]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(df1['lyrics_final'],df1['genre'],test_size=0.3)

# What is TFIDF Vectorizer - Measuring Originality

### TFIDF - term-frequency x inverse document-frequency

TFIDF takes an occurence of a token (i.e. a lemmatized word) in the data and scales down the impact of these tokens that occur very frequently in our data. In our example, this means a lemmatized word would be "less informative" if it occured more frequently across all genres. A key example of this would be the word "love".

The tf-idf is a statistic that increases with the number of times a word appears in the document (lyrics of a song), penalized by the number of genres in the data that contain that word.

However, if a lemmatized word appeared more frequently in a particular genre it would be more valuable/original, and we would  capture that originality which will help us predict the genre of that that group of words. 


------------------

max_df - ignoring terms that have a document frequency higher than 0.5   
.fit() - fits the vocabulary and term frequencies of word-vector   
.transform() - using "fit()" calculation parameters, apply the transformation to a dataset.  
  

In [910]:
Tfidf_vect = TfidfVectorizer(sublinear_tf=True, max_features=500, max_df=0.5) #igoring
Tfidf_vect.fit(df1['lyrics_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

### What is a TFIDF Vector?

First Column: Document index (song lyrics) 

Second Column: Specific word-vector index (lemmatized word in lyrics)  

Third Column: TFIDF score ("originality score") for word in lyrics

----------------

Here is an example of what a tfidf sparse matrix looks like:

In [933]:
print(Test_X_Tfidf[:,199:200])

  (63, 0)	0.29619966120734137
  (88, 0)	0.2329065782522921
  (142, 0)	0.1118067687214563
  (193, 0)	0.24542051581260635
  (227, 0)	0.15556056436107607
  (270, 0)	0.260991868315882
  (364, 0)	0.36506060953644315
  (374, 0)	0.3806046422433339
  (420, 0)	0.08462105943768704
  (443, 0)	0.08600433945050576
  (473, 0)	0.09718290825405236
  (493, 0)	0.09454518568139618
  (562, 0)	0.11171370566583995
  (609, 0)	0.14494775945472493
  (619, 0)	0.17205466904809663
  (631, 0)	0.08207772112174122
  (674, 0)	0.1726111973479978
  (723, 0)	0.14199423484291174
  (850, 0)	0.41936202081384105
  (855, 0)	0.17222275569383147
  (922, 0)	0.11386297191683073
  (939, 0)	0.09092201587821115
  (961, 0)	0.20846012864704794
  (1011, 0)	0.11263148123692943
  (1029, 0)	0.09753028472012273
  (1101, 0)	0.2031039363502327
  (1121, 0)	0.15286874128829503
  (1375, 0)	0.138341401157753


We see here a more tangible look at the tfidf sparse matrix using .get_feature_names(). get_features_names() shows, in alphabetical order, the words of the word-vectors that would be in column 2.

In [938]:
#these are examples of what the second column represents
Tfidf_vect.fit(df1['lyrics_final'].head(2)).get_feature_names()

['acerca',
 'activo',
 'aleja',
 'alto',
 'amarte',
 'amigo',
 'arena',
 'arrepentidos',
 'arte',
 'atenciones',
 'atento',
 'ayudar',
 'aãºn',
 'besa',
 'bien',
 'botella',
 'buena',
 'buenos',
 'caballos',
 'camino',
 'carros',
 'ciegas',
 'conmigo',
 'conocer',
 'correr',
 'corridos',
 'cry',
 'cualquier',
 'cuidado',
 'dedos',
 'deja',
 'deportivos',
 'elegante',
 'ella',
 'empinarle',
 'encuentro',
 'entre',
 'eres',
 'es',
 'escuchar',
 'esfuerzo',
 'esta',
 'estatura',
 'existe',
 'familia',
 'fue',
 'gano',
 'gente',
 'girl',
 'go',
 'guerrero',
 'gusta',
 'gustan',
 'gusto',
 'hacerla',
 'hay',
 'hombre',
 'importante',
 'las',
 'le',
 'lo',
 'logrado',
 'los',
 'love',
 'luis',
 'luna',
 'luz',
 'madre',
 'manos',
 'marea',
 'mirada',
 'moldearla',
 'moriria',
 'mucha',
 'muchas',
 'mujeres',
 'muy',
 'mãºsica',
 'nadie',
 'never',
 'nivel',
 'obra',
 'olvidare',
 'olvido',
 'otra',
 'padre',
 'para',
 'parte',
 'pasar',
 'pedro',
 'pendiente',
 'pero',
 'piel',
 'present',
 

# Naive Bayes - Multinomial

In [876]:
#TFIDF
Naive = naive_bayes.MultinomialNB(alpha = .01)
Naive.fit(Train_X_Tfidf,Train_Y)
predictions_NB = Naive.predict(Test_X_Tfidf)
print("Naive Bayes Accuracy Score TFIDF:",accuracy_score(predictions_NB, Test_Y)*100)


Naive Bayes Accuracy Score TFIDF: 50.824175824175825


In [871]:
#SVM
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score: ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score:  53.57142857142857
