# Importing libraries and locating data files

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score

# Reading the datasets

In [2]:
! ls music-vibes-datathon-fall23/

meta_data_1.csv       sample_submission.csv train.csv
meta_data_2.csv       test.csv


In [3]:
train = pd.read_csv("music-vibes-datathon-fall23/train.csv")
test = pd.read_csv("music-vibes-datathon-fall23/test.csv")

meta_1 = pd.read_csv("music-vibes-datathon-fall23/meta_data_1.csv")
meta_2 = pd.read_csv("music-vibes-datathon-fall23/meta_data_2.csv")

ss = pd.read_csv("music-vibes-datathon-fall23/sample_submission.csv")

In [4]:
train.shape, test.shape, ss.shape
# NOTE: Here the number of rows for the testing and sample submission files are same.

((4961, 6), (3308, 5), (3308, 2))

# EDA

In [5]:
# train.head()

In [6]:
# test.head()

In [7]:
# train

In [8]:
# meta_1.head()

In [9]:
meta_2.head()

# I wonder what this is?

Unnamed: 0,id,a1,a10,a11,a12,a13,a14,a15,a16,a17,...,a46,a47,a48,a49,a5,a50,a6,a7,a8,a9
0,VKpqYUqyes6tjxUE,1296810.0,4.74,0.1265,62.154,1.081,2082.827044,25.412184,0.4108,1296810.0,...,0.007626219,-134.17572,143.085,0.296,-190.608,0.2388,15.352,-52.668597,0.02265,2.979
1,Bd8ufO7wOFegZXVz,694362.0,3.77,0.3495,72.997,32.317,40.525956,938313.739,2.144,694362.0,...,0.005164596,-623.2314,212.685,1.188,-281.624,0.188,11.0,137.251403,0.029,0.00789
2,RbsK7ydOf0CWomVN,895390.0,0.0,0.254,64.838,20.868,147.962896,103823.0,1.2752,895390.0,...,0.0,-571.708,161.328,0.768,-214.064,0.2352,90.819,35.451403,0.0326,0.897
3,mYx3izxEyO3axauu,752010.0,6e-06,0.09395,60.003,,61.4656,103823.0,0.096,752010.0,...,9.332867e-09,-368.48,,0.1138,,0.3924,,35.451403,0.01645,0.411
4,ECLOwMMomoDb6IM6,817342.0,0.000595,0.3735,83.968,19.398,313.431616,145531.576,2.402,817342.0,...,7.086033e-07,-931.2304,169.17,1.274,-223.452,0.294,62.125,46.651403,0.01825,0.1017


In [10]:
train['target'].value_counts()

target
Metal        517
Rock         453
Indie        432
Blues        399
R&B          375
Pop          338
Soul         331
Country      305
Lofi         305
Disco        248
Jazz         244
EDM          237
Ambient      230
Funk         219
HipHop       205
Classical    123
Name: count, dtype: int64

## Cleaning

In [11]:
result_train_df = train.loc[:, ["song_id", "target"]].merge(meta_1, left_on="song_id", right_on="id", how="left").drop(["id"], axis=1)
result_train_df = result_train_df.merge(meta_2, left_on="song_id", right_on="id", how="left").drop(["id"], axis=1)
result_train_df = result_train_df.fillna(method="ffill")
result_train_df = result_train_df.fillna(0)

# do the same cleaning for the test data, only difference that it does not have the target column, which is something we are predicting
result_test_df = test.loc[:, ["song_id"]].merge(meta_1, left_on="song_id", right_on="id", how='left').drop(["id"], axis=1)
result_test_df = result_test_df.merge(meta_2, left_on="song_id", right_on="id", how='left').drop(["id"], axis=1)
result_test_df = result_test_df.fillna(method='ffill')
result_test_df = result_test_df.fillna(0)


In [12]:
result_test_df.head()

Unnamed: 0,song_id,adaptibility,danceability,duration,energy,explicit,happening,instrumentalness,loudness,mode,...,a46,a47,a48,a49,a5,a50,a6,a7,a8,a9
0,7c61FpilqRU/3Ley,8,61.3,361817.0,56.4,False,0.0585,2e-06,-9.17,6,...,2.104914e-08,-517.188,183.7245,1.076,-243.048,0.1336,46.397,54.251403,0.01255,0.402
1,EmqUjbC3coby/LZy,6,74.5,384379.0,93.2,False,0.136,0.0561,-4.984,6,...,0.0004382812,-464.5088,223.092,1.856,-294.288,0.3412,93.625,127.851403,0.02535,0.333
2,lvF5H8aYwo+TlFJe,2,69.7,279451.0,4.45,False,0.103,0.927,-40.32,6,...,0.007989451,-179.424,208.791,0.326,-278.148,0.2524,8.873,-49.648597,0.01995,2.775
3,O+oGRFmYSUbebxCK,6,61.7,337497.0,50.0,True,0.107,0.00521,-6.494,6,...,5.787217e-05,-324.7,184.779,1.032,-244.736,0.292,85.113,41.451403,0.017,1.158
4,rUR7HzUw1p41lUUn,5,67.1,383205.0,31.4,True,0.0825,0.0,-11.182,5,...,0.0,-351.1148,201.0525,0.592,-267.216,-0.646,11.711,4.251403,0.122,0.33


In [13]:
# Clean column names
result_train_df.columns = result_train_df.columns.str.title()
result_train_df.columns = result_train_df.columns.str.replace('_', ' ')
result_test_df.columns = result_test_df.columns.str.title()
result_test_df.columns = result_test_df.columns.str.replace('_', ' ')

In [14]:
# Clean the release date
result_train_df['Release Date'] = result_train_df['Release Date'].str[:4].astype(int)
result_test_df['Release Date'] = result_test_df['Release Date'].str[:4].astype(int)

In [15]:
import numpy as np
import pandas as pd

# Assume df is your DataFrame
# Replace positive infinity with a large number
result_train_df.replace(np.inf, 1e9, inplace=True)

# Replace negative infinity with a small number
result_train_df.replace(-np.inf, -1e9, inplace=True)
result_test_df.replace(np.inf, 1e9, inplace=True)

# Replace negative infinity with a small number
result_test_df.replace(-np.inf, -1e9, inplace=True)


In [16]:
result_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4961 entries, 0 to 4960
Data columns (total 69 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Song Id           4961 non-null   object 
 1   Target            4961 non-null   object 
 2   Adaptibility      4961 non-null   int64  
 3   Danceability      4961 non-null   float64
 4   Duration          4961 non-null   float64
 5   Energy            4961 non-null   float64
 6   Explicit          4961 non-null   bool   
 7   Happening         4961 non-null   float64
 8   Instrumentalness  4961 non-null   float64
 9   Loudness          4961 non-null   float64
 10  Mode              4961 non-null   int64  
 11  Naturality        4961 non-null   float64
 12  Positiveness      4961 non-null   float64
 13  Release Date      4961 non-null   int64  
 14  Reputation        4961 non-null   int64  
 15  Speechiness       4961 non-null   float64
 16  Tempo             4961 non-null   float64


Converting target values to numerical with label encoder. Alternatively try one-hot encoding it later

In [17]:
label_encoder = LabelEncoder()
result_train_df['Target'] = label_encoder.fit_transform(result_train_df['Target'])

In [18]:
result_test_df = result_test_df.drop('Song Id', axis=1)

In [19]:
result_train_df.head()

Unnamed: 0,Song Id,Target,Adaptibility,Danceability,Duration,Energy,Explicit,Happening,Instrumentalness,Loudness,...,A46,A47,A48,A49,A5,A50,A6,A7,A8,A9
0,AA5aMeYP1klLv1BA,1,5,58.8,372005.0,48.5,False,0.0609,0.0122,-15.29,...,0.0001283386,-741.565,176.2173,0.796,-233.608,0.0,97.472,38.451403,0.0,2.574
1,Wr3MLRGLm08yjrGN,13,1,62.6,368183.0,30.9,True,0.117,1.3e-05,-15.794,...,1.110734e-07,-488.0346,187.449,0.738,-248.924,0.2948,14.376,3.251403,0.02165,2.55
2,XtEbP8zIOloM6r5I,4,5,77.6,732725.0,66.9,False,0.0863,1.8e-05,-12.622,...,1.646928e-07,-844.4118,232.5411,1.83,-306.74,0.0804,88.576,75.251403,0.0331,0.0708
3,NW5kIVzyoiV0zJmJ,14,5,37.1,355317.0,91.6,True,0.134,2.5e-05,-4.108,...,1.537279e-07,-376.2928,110.898,0.638,-147.124,-0.064,64.811,124.651403,0.075,0.00417
4,1uhYBNywheqCdzrg,9,7,44.2,497383.0,33.4,False,0.253,0.000694,-15.53,...,4.065017e-06,-518.702,131.841,0.692,-175.416,0.8732,50.888,8.251403,0.01735,1.701


In [20]:
result_test_df.head()

Unnamed: 0,Adaptibility,Danceability,Duration,Energy,Explicit,Happening,Instrumentalness,Loudness,Mode,Naturality,...,A46,A47,A48,A49,A5,A50,A6,A7,A8,A9
0,8,61.3,361817.0,56.4,False,0.0585,2e-06,-9.17,6,0.134,...,2.104914e-08,-517.188,183.7245,1.076,-243.048,0.1336,46.397,54.251403,0.01255,0.402
1,6,74.5,384379.0,93.2,False,0.136,0.0561,-4.984,6,0.111,...,0.0004382812,-464.5088,223.092,1.856,-294.288,0.3412,93.625,127.851403,0.02535,0.333
2,2,69.7,279451.0,4.45,False,0.103,0.927,-40.32,6,0.925,...,0.007989451,-179.424,208.791,0.326,-278.148,0.2524,8.873,-49.648597,0.01995,2.775
3,6,61.7,337497.0,50.0,True,0.107,0.00521,-6.494,6,0.386,...,5.787217e-05,-324.7,184.779,1.032,-244.736,0.292,85.113,41.451403,0.017,1.158
4,5,67.1,383205.0,31.4,True,0.0825,0.0,-11.182,5,0.11,...,0.0,-351.1148,201.0525,0.592,-267.216,-0.646,11.711,4.251403,0.122,0.33


# Building basic Model

In [48]:
y = result_train_df['Target']
X = result_train_df.drop(['Song Id', 'Target', 'A46', 'A47', 'A48', 'A49', 'A5', 'A50', 'A6', 'A7', 'A8', 'A9'], axis=1)

In [49]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 4961 entries, 0 to 4960
Series name: Target
Non-Null Count  Dtype
--------------  -----
4961 non-null   int64
dtypes: int64(1)
memory usage: 38.9 KB


In [50]:
# X.astype(int)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Basic Log Reg

In [52]:
logistic_reg = LogisticRegression()

In [53]:
logistic_reg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
cross_val_score(estimator=logistic_reg, X=X_train, y=y_train, cv=5)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.19758065, 0.19489247, 0.22043011, 0.20967742, 0.19354839])

## Random Forest

In [61]:
rfc = RandomForestClassifier(random_state=42, ,  n_estimators=200)

In [62]:
rfc.fit(X_train, y_train)

In [63]:
cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=5, n_jobs=-1)

array([0.56317204, 0.58467742, 0.58064516, 0.54032258, 0.54032258])

In [64]:
rfc.feature_importances_

array([0.00959669, 0.01989112, 0.02128878, 0.02330385, 0.00825932,
       0.01153314, 0.01976932, 0.01265292, 0.00412644, 0.03185267,
       0.01506281, 0.03197604, 0.0574083 , 0.0184487 , 0.01288491,
       0.00324134, 0.01799247, 0.02128054, 0.02004253, 0.01430132,
       0.01260718, 0.0218097 , 0.01246105, 0.02211789, 0.01629075,
       0.02102758, 0.02416235, 0.02198794, 0.01358365, 0.01883374,
       0.01989204, 0.01738839, 0.01525009, 0.0134415 , 0.02056127,
       0.01439274, 0.0177777 , 0.0128161 , 0.03006703, 0.01373067,
       0.01155322, 0.01939584, 0.01391136, 0.01718822, 0.01513928,
       0.01507929, 0.0122447 , 0.01701439, 0.02090307, 0.01370406,
       0.02187297, 0.01328524, 0.01295381, 0.00724027, 0.0124034 ,
       0.02592615, 0.01507214])

In [65]:
rfc.predict

<bound method ForestClassifier.predict of RandomForestClassifier(max_depth=50, n_estimators=200, random_state=42)>

In [60]:
ss

Unnamed: 0,song_id,target
0,7c61FpilqRU/3Ley,Country
1,EmqUjbC3coby/LZy,Disco
2,lvF5H8aYwo+TlFJe,Lofi
3,O+oGRFmYSUbebxCK,Indie
4,rUR7HzUw1p41lUUn,HipHop
...,...,...
3303,ObfXKLfo3N9IuZGw,Funk
3304,qCxgC5trW/Xl/wC8,Indie
3305,z8dKvyoqkEVA1aKZ,Pop
3306,s2RNjtkc0Rzt5smL,EDM


# Making the submission file

In [46]:
results = rfc.predict(result_test_df)
ss['target'] = label_encoder.inverse_transform(results)
ss

Unnamed: 0,song_id,target
0,7c61FpilqRU/3Ley,Country
1,EmqUjbC3coby/LZy,Disco
2,lvF5H8aYwo+TlFJe,Lofi
3,O+oGRFmYSUbebxCK,Indie
4,rUR7HzUw1p41lUUn,HipHop
...,...,...
3303,ObfXKLfo3N9IuZGw,Funk
3304,qCxgC5trW/Xl/wC8,Indie
3305,z8dKvyoqkEVA1aKZ,Pop
3306,s2RNjtkc0Rzt5smL,EDM


# Saving the file

In [35]:
# Make sure to have index = False to prevent the additional index column slowly sweep in your output file and give you error while submitting it.
ss.to_csv("submission.csv", index=False)