# Baseline approach for song genre prediction
* Always predict the majority class (the most frequent genre); this gives the floor that any trained model should beat.

In [43]:
import pandas as pd
# Allow up to 200 columns when rendering wide frames
pd.set_option("display.max_columns", 200)

# Load the songs table (the first CSV column becomes the index), then
# delete duplicates and rows with NAs in a single chain
songs = (
    pd.read_csv('data/spotify_data.csv', index_col=[0])
    .drop_duplicates()
    .dropna()
)
songs.head()

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [46]:
# Find majority class
# Number of songs per genre; groupby orders the genre labels (default sort=True)
grouped_by_genre = songs.groupby("track_genre").size()
genre_names = grouped_by_genre.index.tolist()
# idxmax returns the label of the first maximum, so ties go to the genre
# that comes first in the grouped index
majority_class = grouped_by_genre.idxmax()
max_i = genre_names.index(majority_class)
max_value = int(grouped_by_genre.iloc[max_i])
print(f"Majority class is: {majority_class}")

Majority class is: acoustic


In [45]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Baseline model: always predict the most frequent genre of the training split.
songs_data = songs.drop(columns=["track_genre"])
genres = songs["track_genre"]

# Numerically encode the labels
label_encoder = LabelEncoder()
encoded_genres = label_encoder.fit_transform(genres)

# Perform data split (stratified 70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    songs_data,
    encoded_genres,
    test_size=0.3,
    stratify=encoded_genres,
    shuffle=True,
    random_state=100,
)

# Pick the majority class from the training labels only, so that the held-out
# test split has no influence on what the baseline predicts. Counts are ordered
# by encoded label before idxmax, so ties go to the smallest label.
train_label_counts = pd.Series(y_train).value_counts().sort_index()
majority_class_label = int(train_label_counts.idxmax())

# Compute predictions (always predict majority class)
predictions = [majority_class_label] * len(y_test)

# Evaluate baseline model
base_accuracy = accuracy_score(y_test, predictions)
base_f1_weighted = f1_score(y_test, predictions, average='weighted')
print("Baseline performance predicting always the majority class")
print(f"Accuracy: {base_accuracy:.6f}")
print(f"F1-score: {base_f1_weighted:.6f}")

Baseline performance predicting always the majority class
Accuracy: 0.008807
F1-score: 0.000154
