In [3]:
#importing all the needed libraries for cleaning the lyrics and training/evaluating the genre classifiers
import pandas as pd
from collections import Counter
from wordcloud import STOPWORDS
import string
import re
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

In [4]:
#load the cleaned training lyrics CSV into the working DataFrame (the file is decoded as latin1)
LYRICS_CSV_PATH = '../DATA/cleaned_train_lyrics.csv'
df = pd.read_csv(LYRICS_CSV_PATH, encoding='latin1')
#show the first rows as a quick check that the load worked
df.head()

Unnamed: 0.1,Unnamed: 0,Lyric,genre
0,0,"See me, ancient one! Dismal Tuat, Nergal unsaf...",Metal
1,1,Feels like Im covered in lies so turn off the ...,Metal
2,2,"Works of art, painted black Magniloquent, blee...",Metal
3,3,Into the cage like an animal You must survive ...,Metal
4,4,Paralysed in pleasure I hear you call Lost my ...,Metal


In [5]:
## Prelim Work: Remove Custom Words and Clean Lyrics (lower case and remove punctuation)
#translation table that deletes every character listed in string.punctuation.
#str.translate is used instead of a regex character class because string.punctuation
#contains a backslash followed by ']': inside a character class that pair is read as an
#escaped bracket, so the backslash itself would never be removed from the lyrics.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

#function to convert lyrics to lowercase, remove all punctuation, split lyrics and remove common stopwords
def clean_lyrics(lyrics, stopwords=None):
    """Turn one raw lyric string into a list of lower-case words.

    Steps: lower-case the text, delete every ``string.punctuation`` character,
    split on whitespace, then drop any word found in ``stopwords``.

    Parameters
    ----------
    lyrics : str
        Raw lyric text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell) is treated as an empty lyric.
    stopwords : collection of str, optional
        Words to drop. Defaults to the module-level ``STOPWORDS`` when omitted,
        which is the behaviour existing callers rely on.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    #missing lyrics have no words; returning [] avoids an AttributeError on .lower()
    if not isinstance(lyrics, str):
        return []
    #resolve the default at call time so the global is only needed when no set is passed in
    if stopwords is None:
        stopwords = STOPWORDS
    text = lyrics.lower().translate(_PUNCTUATION_TABLE)
    #the stopword check runs after punctuation removal, so entries that contain
    #apostrophes in the stopword collection can no longer match at this point
    return [word for word in text.split() if word not in stopwords]
#function that cleans a lyric with clean_lyrics and then drops every word listed in custom_words
def remove_custom_words(lyrics, custom_words):
    """Return the cleaned lyric as one space-separated string without the custom words.

    ``lyrics`` is first passed through ``clean_lyrics``; each resulting word is
    kept only if it is not contained in ``custom_words``. The kept words are
    joined with single spaces, preserving their order.
    """
    kept_words = []
    for token in clean_lyrics(lyrics):
        #skip anything the caller asked to remove
        if token in custom_words:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)
#extra words to strip from the lyrics in addition to the stopwords removed by clean_lyrics
custom_words_to_remove = [
    'he', 'her', 'it', 'and', 'the', 'you', 'i', 'we', 'im', 'dont', 'got',
    'verse', 'chorus', 'youre', 'oh', 'ill', '1', '2',
]

#'cleaned_lyrics' holds every 'Lyric' value after remove_custom_words has been applied to it
df['cleaned_lyrics'] = df['Lyric'].apply(remove_custom_words, args=(custom_words_to_remove,))
#'word_count' is the number of whitespace-separated words left in each cleaned lyric
df['word_count'] = [len(lyric.split()) for lyric in df['cleaned_lyrics']]

#flatten all cleaned lyrics into one list of words, in row order
cleaned_words = [token for lyric in df['cleaned_lyrics'] for token in lyric.split()]
#tally how many times each word occurs across all lyrics
cleaned_word_counts = Counter(cleaned_words)
#keep the 50 most frequent words together with their counts
cleaned_common_words = cleaned_word_counts.most_common(50)

print(cleaned_common_words)


[('know', 580937), ('love', 480302), ('now', 458553), ('time', 390728), ('will', 379832), ('one', 378435), ('see', 373051), ('never', 364395), ('go', 355086), ('cant', 301345), ('back', 292851), ('life', 280511), ('yeah', 273786), ('come', 269401), ('way', 265705), ('cause', 264817), ('take', 262438), ('make', 256300), ('say', 252583), ('let', 244015), ('want', 242396), ('aint', 238216), ('away', 214665), ('feel', 214508), ('man', 210734), ('ive', 210314), ('right', 208098), ('baby', 201397), ('well', 198980), ('thats', 198818), ('day', 189445), ('night', 189391), ('need', 189309), ('world', 186944), ('heart', 184775), ('gonna', 183131), ('tell', 176110), ('still', 172695), ('wanna', 170217), ('us', 167388), ('think', 166142), ('theres', 158045), ('keep', 154695), ('eyes', 154109), ('every', 153406), ('good', 152118), ('mind', 151396), ('give', 149525), ('little', 147629), ('said', 142507)]


In [6]:
## Feature Columns: Adding Binary Indicator Columns for the 50 most commonly appearing words
#list of common words to create binary feature columns from prelim results
common_words = ['know', 'love', 'now', 'time', 'will', 'one', 'see', 'never', 'go', 'cant', 'back', 'life', 'yeah', 
                'come', 'way', 'cause', 'take', 'make', 'say', 'let', 'want', 'aint', 'away', 'feel', 'man', 'ive', 
                'right', 'baby', 'well', 'thats', 'day', 'night', 'need', 'world', 'heart', 'gonna', 'tell', 'still', 
                'wanna', 'us', 'think', 'theres', 'keep', 'eyes', 'every', 'good', 'mind', 'give', 'little', 'said']
#set copy of the list so each lyric can be intersected with it in a single call
common_word_set = set(common_words)
#split every cleaned lyric exactly once and record one row of flags per lyric, in common_words order:
#1 if the word is present in the cleaned lyrics, 0 otherwise
#(splitting inside a per-word loop would repeat the same split once for every feature column)
word_flag_rows = []
for lyric in df['cleaned_lyrics']:
    words_found = common_word_set.intersection(lyric.split())
    word_flag_rows.append([1 if word in words_found else 0 for word in common_words])
#int64 matrix with one row per lyric and one column per common word;
#the explicit reshape keeps it two-dimensional even when df has no rows
word_flags = np.array(word_flag_rows, dtype=np.int64).reshape(len(word_flag_rows), len(common_words))
#create a binary column for each common word
for column, word in enumerate(common_words):
    df[word] = word_flags[:, column]
#display first few rows of df to check if it worked correctly
df.head()

Unnamed: 0.1,Unnamed: 0,Lyric,genre,cleaned_lyrics,word_count,know,love,now,time,will,...,think,theres,keep,eyes,every,good,mind,give,little,said
0,0,"See me, ancient one! Dismal Tuat, Nergal unsaf...",Metal,see ancient one dismal tuat nergal unsafe spre...,37,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,Feels like Im covered in lies so turn off the ...,Metal,feels covered lies turn light closing eyes fly...,88,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,2,"Works of art, painted black Magniloquent, blee...",Metal,works art painted black magniloquent bleeding ...,91,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,Into the cage like an animal You must survive ...,Metal,cage animal must survive kill die learning cri...,93,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Paralysed in pleasure I hear you call Lost my ...,Metal,paralysed pleasure hear call lost cognitive co...,100,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
## Analysis: Random Forest Classifier 
#the binary common-word columns are the model inputs and the genre label is the target
features = common_words
X, y = df[features], df['genre']
#hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#build a 40-tree Random Forest with a fixed seed and train it on the training rows
#(fit returns the estimator itself, so the fitted model is what gets bound to rf_model)
rf_model = RandomForestClassifier(n_estimators=40, random_state=42).fit(X_train, y_train)
#predict a genre for every held-out row
y_pred = rf_model.predict(X_test)
#share of held-out rows whose genre was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
#precision, recall and F1-score broken down by genre
print(classification_report(y_test, y_pred))


Accuracy: 0.4558
              precision    recall  f1-score   support

       Metal       0.46      0.56      0.50     20030
     country       0.51      0.55      0.53     20044
         pop       0.30      0.25      0.28     20069
         rap       0.66      0.69      0.68     19866
        rock       0.28      0.23      0.25     19991

    accuracy                           0.46    100000
   macro avg       0.44      0.46      0.45    100000
weighted avg       0.44      0.46      0.45    100000



In [23]:
## Benchmark 1: Dummy Classifier 
#baseline that always predicts the most frequent training genre, giving a floor to compare the other models against
features = common_words

X = df[features]  
y = df['genre']   

#same features, test_size and random_state as the Random Forest cell
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#init a Dummy Classifier using the 'most_frequent' strategy
dummy_model = DummyClassifier(strategy='most_frequent')
dummy_model.fit(X_train, y_train)
#make predictions on the test data using the Dummy Classifier
y_dummy_pred = dummy_model.predict(X_test)
#calculate and print the accuracy of the Dummy Classifier
dummy_accuracy = accuracy_score(y_test, y_dummy_pred)
print(f"Dummy Classifier Accuracy: {dummy_accuracy}")
#print a detailed classification report for the Dummy Classifier
print("Dummy Classifier Classification Report:")
#the dummy model only ever predicts one genre, so precision is undefined for the others;
#zero_division=0 reports those scores as 0 without emitting an undefined-metric warning each time
print(classification_report(y_test, y_dummy_pred, zero_division=0))


Dummy Classifier Accuracy: 0.19866
Dummy Classifier Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

       Metal       0.00      0.00      0.00     20030
     country       0.00      0.00      0.00     20044
         pop       0.00      0.00      0.00     20069
         rap       0.20      1.00      0.33     19866
        rock       0.00      0.00      0.00     19991

    accuracy                           0.20    100000
   macro avg       0.04      0.20      0.07    100000
weighted avg       0.04      0.20      0.07    100000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [8]:
## Benchmark 2: KNN Classifier 
#this benchmark uses a single input column: the word count of each cleaned lyric
features = ['word_count']
X, y = df[features], df['genre']

#80/20 train/test split with the same test_size and random_state as the other model cells
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#build a K-Nearest Neighbors classifier with 3 neighbors and fit it on the training rows
#(fit returns the estimator itself)
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
#predict a genre for every test row
y_knn_pred = knn_model.predict(X_test)

#overall accuracy of the KNN predictions
knn_accuracy = accuracy_score(y_test, y_knn_pred)
print(f"KNN Classifier Accuracy: {knn_accuracy}")
#per-genre precision, recall and F1-score
print("KNN Classifier Classification Report:")
print(classification_report(y_test, y_knn_pred))


KNN Classifier Accuracy: 0.30263
KNN Classifier Classification Report:
              precision    recall  f1-score   support

       Metal       0.24      0.45      0.31     20030
     country       0.23      0.27      0.25     20044
         pop       0.22      0.17      0.19     20069
         rap       0.71      0.53      0.61     19866
        rock       0.22      0.10      0.14     19991

    accuracy                           0.30    100000
   macro avg       0.32      0.30      0.30    100000
weighted avg       0.32      0.30      0.30    100000



In [22]:
## Cross Validation Report 

#define the feature matrix and target explicitly for this cell: the KNN benchmark cell rebinds X
#to the single 'word_count' column, so reusing whatever X is left in the kernel would make a
#top-to-bottom run cross-validate the Random Forest on the wrong features
X = df[common_words]
y = df['genre']

#init a KFold cross-validation with 5 splits, shuffling data and setting a random state of 42
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies = []
#loop through each split in the KFold cross-validation
for train_index, test_index in kf.split(X):
    #split the dataset into training and testing sets for each fold (positional indices, hence iloc)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    #train the Random Forest model on the current fold's training set
    #NOTE: this refits rf_model in place, so after the loop it holds the model from the last fold
    rf_model.fit(X_train, y_train)
    #make predictions on the current fold's test set
    y_pred = rf_model.predict(X_test)
    #calculate the accuracy for the current fold
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)
    #print the classification report for the current fold
    print(f"Fold Classification Report:\n{classification_report(y_test, y_pred)}\n")
#calculate the mean accuracy across all folds and print the accuracy across the 5 folds
mean_accuracy = np.mean(accuracies)
print(f"Mean Accuracy across {kf.n_splits} folds: {mean_accuracy}")


Fold Classification Report:
              precision    recall  f1-score   support

       Metal       0.46      0.56      0.50     20030
     country       0.51      0.55      0.53     20044
         pop       0.31      0.25      0.28     20069
         rap       0.66      0.69      0.68     19866
        rock       0.28      0.23      0.25     19991

    accuracy                           0.46    100000
   macro avg       0.44      0.46      0.45    100000
weighted avg       0.44      0.46      0.45    100000


Fold Classification Report:
              precision    recall  f1-score   support

       Metal       0.46      0.57      0.51     20065
     country       0.51      0.55      0.53     19999
         pop       0.30      0.25      0.27     19935
         rap       0.66      0.70      0.68     19905
        rock       0.28      0.22      0.25     20096

    accuracy                           0.46    100000
   macro avg       0.44      0.46      0.45    100000
weighted avg       0