# Genre Prediction

## What are the 10 most popular genres per decade on Spotify?

### Import Libraries

In [12]:
import random

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

### Load Dataset

In [2]:
# Load the Billboard lyrics / Spotify audio-feature table from a path relative to the notebook.
df_sptfy = pd.read_csv(filepath_or_buffer='data/billboard-lyrics-spotify.csv')

In [13]:
# Preview the first five rows to check which columns were loaded.
df_sptfy.head(n=5)

Unnamed: 0,artist_all,artist_base,rank,song,year,artist_featured,song_clean,artist_clean,lyrics,acousticness,...,speechiness,tempo,time_signature,valence,duration_min,num_words,words_per_sec,num_uniq_words,decade,uniq_ratio
0,percy faith,percy faith,1,theme from a summer place,1960,,theme from a summer place,percy faith,theres a summer place where it may rain or sto...,0.631,...,0.0253,92.631,4.0,0.749,2.414883,104.0,0.717771,58.0,1960,1.793103
1,jim reeves,jim reeves,2,he'll have to go,1960,,hell have to go,jim reeves,put your sweet lips a little closer to the pho...,0.909,...,0.0379,81.181,3.0,0.2,2.310667,152.0,1.096365,69.0,1960,2.202899
2,the everly brothers,the everly brothers,3,cathy's clown,1960,,cathys clown,the everly brothers,dont want your love any more dont want your k...,0.412,...,0.0339,119.809,4.0,0.866,2.400217,121.0,0.840202,64.0,1960,1.890625
3,johnny preston,johnny preston,4,running bear,1960,,running bear,johnny preston,on the bank of the river stood running bear yo...,0.854,...,0.053,119.987,4.0,0.822,2.636667,220.0,1.390645,89.0,1960,2.47191
4,mark dinning,mark dinning,5,teen angel,1960,,teen angel,mark dinning,teen angel teen angel teen angel that fateful ...,0.936,...,0.0459,101.517,4.0,0.282,2.664883,109.0,0.681706,73.0,1960,1.493151


This is not scientifically validated, but as a simplification I'm adding a `sentiment` column that is either positive (1) or negative (0), depending on the track's valence: a valence above 0.5 is labelled positive, and a valence of 0.5 or below is labelled negative. Valence is a measure provided by Spotify's API, on a scale from 0.0 to 1.0, that describes the musical positiveness conveyed by a track. Tracks with high valence sound more positive (happy, cheerful, euphoric), while tracks with low valence sound more negative (sad, depressed, angry).

In [19]:
# Binary sentiment label: 1 when valence is strictly above 0.5, otherwise 0.
# A missing valence compares as False and therefore also becomes 0.
df_sptfy['sentiment'] = df_sptfy['valence'].gt(0.5).astype('int64')

In [4]:
# Headline statistics for the loaded dataset. Each label names the statistic that is
# actually computed: min -> shortest / slowest, max -> longest / fastest.
print('There are', len(df_sptfy), 'songs in the dataset')
print('The earliest year in the dataset is from', df_sptfy['year'].min())
print('The latest year in the dataset is from', df_sptfy['year'].max())
print('The shortest lyrics are', df_sptfy['num_words'].min(), 'words in length')
print('The longest lyrics are', df_sptfy['num_words'].max(), 'words in length')
print('The fastest tempo in the dataset is', df_sptfy['tempo'].max())
print('The slowest tempo in the dataset is', df_sptfy['tempo'].min())
print('The average tempo in the dataset is', df_sptfy['tempo'].mean())
print('There are', df_sptfy['artist_clean'].nunique(), 'unique artists')

There are 5566 songs in the dataset
The earliest year in the dataset is from 1960
The latest year in the dataset is from 2017
The shortest lyrics are 0.0 words in length
The shortest lyrics are 1143.0 words in length
The fastest tempo in the dataset is 233.429
The shortest tempo in the dataset is 50.975
The average tempo in the dataset is 119.35458858413638
There are 2283 unique artists


In [5]:
# Number of rows per chart year, most frequent year first.
df_sptfy['year'].value_counts(sort=True, ascending=False)

year
1969    101
1960    100
1973    100
1961    100
1988    100
1986    100
1985    100
1984    100
1981    100
1978    100
1977    100
1975    100
1974    100
1989    100
1972    100
1967    100
1971    100
1965    100
1963    100
1968    100
1970    100
1979     99
1992     99
1982     99
1983     99
1966     99
1964     99
1987     99
1976     99
1962     99
1980     99
1991     99
1990     98
1999     97
2003     97
1993     95
2000     95
2002     94
2005     94
1994     93
2015     93
2012     92
1996     92
2001     92
2004     91
2007     91
2017     91
2008     90
2011     90
1995     90
2006     89
2013     89
2014     89
1998     88
2009     88
2010     86
1997     86
2016     86
Name: count, dtype: int64

### Filter by decade

In [8]:
# Row count per decade; the index of this Series lists the decades present in the data.
value_counts = df_sptfy['decade'].value_counts()

# Publish an independent copy of each decade's rows as a module-level DataFrame
# named df_year_<decade> (for example df_year_1960).
grouped_data = df_sptfy.groupby('decade')
for decade_key, decade_rows in grouped_data:
    globals()[f"df_year_{decade_key}"] = decade_rows.copy()

In [21]:
# Report how many rows each per-decade DataFrame holds.
for decade_key in value_counts.index:
    frame_name = "df_year_" + str(decade_key)
    n_entries = len(globals()[frame_name])
    print("Global DataFrame '{}' has {} entries.".format(frame_name, n_entries))

Global DataFrame 'df_year_1960' has 998 entries.
Global DataFrame 'df_year_1970' has 998 entries.
Global DataFrame 'df_year_1980' has 996 entries.
Global DataFrame 'df_year_1990' has 937 entries.
Global DataFrame 'df_year_2000' has 921 entries.
Global DataFrame 'df_year_2010' has 716 entries.


# We probably need separate training and test splits for each decade.

### Create TF-IDF vectorizer for each decade

For each document in the test split, classify the document using KNN on the document tf-idf vectors with k=5 and cosine similarity. 

Note: you should only be comparing against the documents in the train split for classification, not those in the test split. Report the performance of your model. Compute micro- and macro-F1.

In [None]:

# NOTE(review): documents are the lyric texts and the target is the binary 'sentiment'
# column, the only class label this notebook creates - confirm this is the intended target.
TEXT_COL = 'lyrics'
LABEL_COL = 'sentiment'
TEST_SIZE = 0.2
SEED = 42

# Rows with missing lyrics cannot be vectorized, so drop them before splitting.
decade_df = df_year_1960.dropna(subset=[TEXT_COL, LABEL_COL])

# Hold out a reproducible test split, stratified so the class balance is the same
# in the training and test portions.
df_train, df_test = train_test_split(
    decade_df,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=decade_df[LABEL_COL],
)

# Fit the vocabulary and IDF weights on the training lyrics only, so that nothing
# from the test split leaks into the features.
vectorizer = TfidfVectorizer(min_df=5)
X_train = vectorizer.fit_transform(df_train[TEXT_COL])
y_train = df_train[LABEL_COL]

# k-nearest neighbours with k=5 and cosine distance over the TF-IDF vectors.
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn.fit(X_train, y_train)

# Test documents are projected with the vectorizer fitted on the training split.
X_test = vectorizer.transform(df_test[TEXT_COL])
y_test = df_test[LABEL_COL]

y_pred = knn.predict(X_test)
print('Precision:', precision_score(y_test, y_pred, average=None))
print('Recall:', recall_score(y_test, y_pred, average=None))
print('Micro F1: {:.3f}'.format(f1_score(y_test, y_pred, average='micro')))
print('Macro F1: {:.3f}'.format(f1_score(y_test, y_pred, average='macro')))

Report the per-class F1-scores for the test split

In [None]:
# Per-class F1 on the test split, followed by the full precision/recall/F1 report.
# Both calls score the predictions y_pred against the same ground truth, y_test.
print('F1 score per class:', f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Create confusion matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Classes that occur in either the true or the predicted labels, in sorted order,
# so the matrix axes always match the data rather than a hard-coded list.
class_labels = np.union1d(y_test, y_pred)

# Compute confusion matrix (rows = actual class, columns = predicted class)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

# Display confusion matrix
print(conf_matrix)

# Display confusion matrix with labels
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()