# SD201 : MINING OF LARGE DATASETS

## MUSIC GENRE CLASSIFICATION USING SONG LYRICS

In [2]:
# initial imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import re

from langdetect import detect

### 1. CONSTRUCTION OF THE DATASET

The dataset comes from a CSV file produced by scraping song lyrics from [GENIUS](https://genius.com/).

#### 1.1 RAW DATA EXPLORATION

In [3]:
# Load the scraped lyrics from the CSV file.
# The file uses '#' as its field separator instead of a comma.
data = pd.read_csv("lyrics.csv", sep='#')

In [4]:
# List the column names of the raw dataset
data.columns

Index(['artist', 'title', 'lyrics', 'genre', 'url'], dtype='object')

In [5]:
# Inspect the first rows of the raw dataset
data.head()

Unnamed: 0,artist,title,lyrics,genre,url
0,Eminem,Rap God,"Rap God Lyrics\r\n""Look, I was gonna go easy o...",rap,https://genius.com/Eminem-rap-god-lyrics
1,Cardi B,WAP,WAP Lyrics\r\nWhores in this house\r\nThere's ...,rap,https://genius.com/Cardi-b-wap-lyrics
2,Kendrick Lamar,HUMBLE.,HUMBLE. Lyrics\r\nNobody pray for me\r\nIt bee...,rap,https://genius.com/Kendrick-lamar-humble-lyrics
3,Migos,Bad and Boujee,"Bad and Boujee Lyrics\r\nYou know, young rich ...",rap,https://genius.com/Migos-bad-and-boujee-lyrics
4,Drake,God's Plan,God’s Plan Lyrics\r\nAnd they wishin' and wish...,rap,https://genius.com/Drake-gods-plan-lyrics


In [6]:
# Size of the raw dataset as (rows, columns)
data.shape

(6858, 5)

In [7]:
# Data type of each column
data.dtypes

artist    object
title     object
lyrics    object
genre     object
url       object
dtype: object

#### 1.2 RAW DATA CLEANING

In [8]:
# Remove the columns that are not used for genre classification.
# errors="ignore" keeps this cell re-runnable: when it is executed a second
# time the columns are already gone and a plain drop would raise a KeyError.
data = data.drop(columns=['artist', 'title', 'url'], errors='ignore')

In [9]:
# Count the missing values per column and show only the columns that have some
missing_counts = data.isna().sum()
missing_counts[missing_counts > 0]

lyrics    25
dtype: int64

In [10]:
# Drop every row that has a missing value, then renumber the rows from 0
data = data.dropna().reset_index(drop=True)

In [11]:
# Check the missing values again (verifies that the drop went right)
data.isna().sum()


lyrics    0
genre     0
dtype: int64

In [12]:
# Clean every song's lyrics:
#   1. keep only the songs detected as English,
#   2. cut the trailing "[number]Embed" footer,
#   3. cut the leading "[title] Lyrics" header,
#   4. lowercase what is left.
from langdetect.lang_detect_exception import LangDetectException


def _is_english(text):
    """Return True when langdetect classifies `text` as English."""
    try:
        return detect(text) == 'en'
    except LangDetectException:
        # Raised when the text offers nothing to detect (e.g. only digits or
        # punctuation); such a song cannot be confirmed as English.
        return False


def _strip_genius_boilerplate(text):
    """Remove the header and footer from one song's lyrics and lowercase it."""
    # Footer: everything from the first "<digits>...Embed" match onwards.
    footer = re.search(r'\d+.*Embed.*', text)
    if footer:
        text = text[:footer.start()]

    # Header: everything up to and including the first "Lyrics".
    # maxsplit=1 keeps the whole remainder; without it a song that contains
    # the word "Lyrics" again would be truncated at the second occurrence.
    if re.search(r'\bLyrics\b', text):
        text = text.split('Lyrics', 1)[1]

    return text.lower()


# DataFrame.drop returns a new frame, so a bare `data.drop([i])` removes
# nothing. Filter with a boolean mask instead and renumber the rows so that
# the index stays 0..n-1 for the cells that look rows up by position.
is_english = data['lyrics'].apply(_is_english)
data = data.loc[is_english].reset_index(drop=True)

# Column-wise assignment avoids the chained `data['lyrics'][i] = ...` writes.
data['lyrics'] = data['lyrics'].apply(_strip_genius_boilerplate)

In [13]:
# Spot-check the data cleaning on one song (the row labelled 1)
print(data['lyrics'][1])


whores in this house
there's some whores in this house
there's some whores in this house
there's some whores in this house (hol' up)
i said certified freak, seven days a week
wet-ass pussy, make that pullout game weak, woo (ah)

yeah, yeah, yeah, yeah
yeah, you fuckin' with some wet-ass pussy
bring a bucket and a mop for this wet-ass pussy
give me everything you got for this wet-ass pussy

beat it up, nigga, catch a charge
extra large and extra hard
put this pussy right in your face
swipe your nose like a credit card
hop on top, i wanna ride
i do a kegel while it's inside
spit in my mouth, look in my eyes
this pussy is wet, come take a dive
tie me up like i'm surprised
let's roleplay, i'll wear a disguise
i want you to park that big mack truck right in this little garage
make it cream, make me scream
out in public, make a scene
i don't cook, i don't clean
but let me tell you how i got this ring (ayy, ayy)
you might also like
gobble me, swallow me, drip dow

In [None]:
# Ten most frequent words per genre.
# str.split().explode() yields one row per word; unlike split(expand=True)
# it does not build a dense (songs x longest-song) table first.
for the_genre in data.genre.unique():
    print(the_genre)
    genre_words = data.loc[data['genre'] == the_genre, 'lyrics'].str.split().explode()
    print(genre_words.value_counts().head(10))

rap


In [None]:
# Removing punctuation and stopwords from lyrics

from nltk import word_tokenize

from nltk.corpus import stopwords


# Custom stopword list, used instead of NLTK's built-in English list
# (stopwords.words('english')). Besides common English function words it
# holds contractions, slang and filler words ("gonna", "wanna", "yeah",
# "ooh"), all in lowercase.
# NOTE: assigning to `stopwords` rebinds the name imported from nltk.corpus
# in this cell, so stopwords.words(...) cannot be called after this line.
# A few entries appear twice ("ll", "there's"); this is harmless because the
# list is only used for membership tests.
stopwords = ["'d","'m","'s","'ve","'re","'ll","'cause","'bout", 
             "a", "able", "about", "above", "across", "actually", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "all", "almost", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "an", "and", "any", "anybody", "anyhow",  "apart", "are", "aren", "arent", "aren't", "around", "as", "at",  "aw", "away",
             "b", "back", "be" , "became", "because", "become", "becomes", "becoming", "been", "before", "behind", "being", "below", "beside", "besides", "best", "better", "between", "beyond","both", "bottom", "but", "by", 
             "c", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch",  "cit", "clearly", "c'mon", "cn", "co", "com", "come", "comes","could", "couldn", "couldnt", "couldn't", "currently", 
             "d", "date", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "do", "does", "doesn", "doesn't", "doing","doin'", "don", "done", "don't", "down", "due", "during", 
             "e", "each", "ei", "eight", "eighty", "either", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "especially",  "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly",  "except", "ey", 
             "f", "far", "fc", "few", "fifteen", "fifth", "fify", "fill", "first", "five", "fix", "fo", "for", "former", "formerly", "forth", "forty", "found", "four",  "from", "front", "fu", "full", "further", "furthermore",
             "g", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "go", "goes", "going","goin'", "gon" ,"gon'" , "gonna" , "gone", "got", "gotten", "gr", 
             "h", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll",  "her", "here", "heres", "here's", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his",  "ho",  "how",  "however", "how's", 
             "i", "ia", "i'd", "if", "ignored", "ih", "ii", "i'll", "im", "i'm", "in", "inc", "indeed",  "inner", "instead", "into", "is", "isn", "isn't", "it", "it'd", "it'll", "its", "it's", "itself", "iv", "i've",
             "j", "just", 
             "k", "ke", "keep", "keeps", "kept","know", "known", "knows", "ko", 
             "l", "la","ll", "last", "lately", "later", "latter", "latterly", "least", "les", "less", "lest", "let", "lets", "let's", "like", "liked", "likely", "line", "little", "ll", "look", "looking", "looks", "los", 
             "m", "ma", "made", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms","much", "mug", "must", "mustn", "mustn't", "my", "myself", 
             "n", "n't", "na", "name", "near", "nearly", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nn", "no", "nobody", "non", "none", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere",  
             "o", "oa", "obviously", "of" , "off", "often", "oh" , "ooh" , "ok", "okay","old",  "on", "once", "one", "ones", "only", "onto", "oo", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", 
             "p","part", "perhaps", "probably", 
             "q", "quickly", "quite", 
             "r", "rather", "readily", "really", "right", "run",
             "s", "sa", "said", "same", "saw", "say", "saying", "says", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty","so" , "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere","still",  "such", 
             "t", "take", "taken", "taking","to","tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there","there's", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "together", "too", "took", "top", "toward", "towards",  "truly",  "twelve", "twenty", "twice", "two",  
             "u", "uh", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", 
             "v", "va","very",
             "w", "wa", "want", "wanna" , "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", 
             "x", "xo",
             "y", "yes", "yet", "yeah" , "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", 
             "z", "zero"]

# Replace punctuation characters with a space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave every punctuation
# mark in place. The raw string keeps the backslashes of the character class
# without relying on invalid string escapes.
PUNCTUATION_PATTERN = r"[-\?.,\/#!$%\^&\*;:{}=\_~()\`]"
data['lyrics'] = data['lyrics'].str.replace(PUNCTUATION_PATTERN, ' ', regex=True)

# Drop the stopwords. A set gives constant-time membership tests, whereas the
# list would be scanned once for every word of every song.
stopword_set = set(stopwords)
data['lyrics'] = data['lyrics'].apply(
    lambda lyrics: ' '.join(word for word in lyrics.split() if word not in stopword_set)
)



In [None]:
# After cleaning: ten most frequent words per genre.
# str.split().explode() yields one row per word; unlike split(expand=True)
# it does not build a dense (songs x longest-song) table first.
for the_genre in data.genre.unique():
    print(the_genre)
    genre_words = data.loc[data['genre'] == the_genre, 'lyrics'].str.split().explode()
    print(genre_words.value_counts().head(10))

In [None]:
# Lemmatize lyrics = reduce each word to its dictionary form (e.g. “car” from “cars”)
import nltk

from nltk.stem import WordNetLemmatizer

def lemmatize_lyrics(lyrics):
    """Lemmatize every whitespace-separated token of one song's lyrics.

    Parameters
    ----------
    lyrics : str
        The lyrics of a single song.

    Returns
    -------
    str
        The lemmatized tokens joined back together with single spaces.
    """
    # NOTE(review): lemmatize() is called without a part-of-speech argument;
    # confirm that its default reduces verb forms the way the analysis expects.
    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    return " ".join(map(lemmatizer.lemmatize, lyrics.split()))


In [None]:
# Lemmatize the lyrics of every song
data["lyrics"] = data["lyrics"].apply(lemmatize_lyrics)

In [None]:
# After lemmatizing: ten most frequent words per genre.
# str.split().explode() yields one row per word; unlike split(expand=True)
# it does not build a dense (songs x longest-song) table first.
for the_genre in data.genre.unique():
    print(the_genre)
    genre_words = data.loc[data['genre'] == the_genre, 'lyrics'].str.split().explode()
    print(genre_words.value_counts().head(10))

#### 1.3 CLEANED RAW DATA VISUALISATION

In [None]:
# Number of songs per genre, first as a table and then as a bar chart.
print("Number of songs grouped by genre of music:",data.groupby('genre').count()['lyrics'])

# plt.subplots() returns a (figure, axes) pair; unpack it and hand the axes to
# seaborn explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots()
sns.countplot(x="genre", data=data, palette="Set1", ax=ax)
ax.set_title("Number of songs by genre")
plt.show()

**Word clouds by genre:**

In [None]:
# Distinct genres, in order of first appearance in the dataset
data_genre = data.genre.unique()

In [None]:
# One list of lyrics per genre, in the same order as `data_genre`.
# Selecting with a boolean mask scans the frame once per genre and does not
# depend on the row labels being exactly 0..n-1, which label lookups such as
# data.genre[i] require.
data_classified = [
    data.loc[data['genre'] == genre_lyrics, 'lyrics'].tolist()
    for genre_lyrics in data_genre
]

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Draw one word cloud per genre; `data_genre` and `data_classified` are
# parallel sequences, so they are walked together.
for genre_name, lyrics_of_genre in zip(data_genre, data_classified):
    genre_text = " ".join(lyrics_of_genre) + " "
    wordcloud = WordCloud(
        width=300,
        height=300,
        background_color='white',
        min_font_size=10,
    ).generate(genre_text)

    # Show the cloud on its own square figure, titled with the genre
    plt.figure(figsize=(5, 5), facecolor=None)
    plt.imshow(wordcloud)
    plt.title(genre_name)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
    # Optional: save the image in the img folder
    #wordcloud.to_file("img"+genre_name+"_words.png")

### 2. DATA MODELING

In [None]:
#imports for modeling
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from nltk.tokenize import RegexpTokenizer


In [None]:
# Target labels: the genre of every song
y = data.genre.values

# Features: bag-of-words counts of the cleaned lyrics.
# Tokens are runs of ASCII letters and digits; any other character
# (apostrophes included) acts as a separator.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
# ngram_range=(1,1) keeps single words only (no bigrams)
cv = CountVectorizer(ngram_range = (1,1),tokenizer = token.tokenize)
# NOTE(review): the vocabulary is fitted on all songs, before the train/test
# split made in a later cell; confirm this is acceptable for the evaluation.
x= cv.fit_transform(data['lyrics'])

In [None]:
# Inspect the document-term matrix produced by the vectorizer
print(x)

In [None]:
# Hold out 20% of the songs for testing; random_state=0 makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
from sklearn.naive_bayes import BernoulliNB,MultinomialNB

# Fit each naive Bayes variant on the training split and report its accuracy
# on the held-out test split.
for report_template, classifier in (
    ("Bernoulli naive bayes: %s", BernoulliNB()),
    ("Multinomial naive bayes: %s", MultinomialNB()),
):
    test_accuracy = classifier.fit(x_train, y_train).score(x_test, y_test)
    print(report_template % test_accuracy)


## Another approach

### Resizing the dataset

In [None]:
# Second dataset: "scrapped lyrics from 6 genres", published on Kaggle at
#   https://www.kaggle.com/datasets/neisse/scrapped-lyrics-from-6-genres
# That address is the dataset's web page, not a CSV file, and Kaggle downloads
# need a logged-in account, so pd.read_csv cannot read it directly. Download
# the dataset by hand and place the lyrics file next to this notebook.
# NOTE(review): "lyrics-data.csv" is the expected name of the lyrics file in
# that dataset; confirm it after downloading.
df_lyrics = pd.read_csv("lyrics-data.csv")