In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/music-vibes-datathon-fall23/sample_submission.csv
/kaggle/input/music-vibes-datathon-fall23/meta_data_2.csv
/kaggle/input/music-vibes-datathon-fall23/train.csv
/kaggle/input/music-vibes-datathon-fall23/test.csv
/kaggle/input/music-vibes-datathon-fall23/meta_data_1.csv


In [2]:
df_meta_1 = pd.read_csv("/kaggle/input/music-vibes-datathon-fall23/meta_data_1.csv")
df_meta_2 = pd.read_csv("/kaggle/input/music-vibes-datathon-fall23/meta_data_2.csv")
df_train = pd.read_csv("/kaggle/input/music-vibes-datathon-fall23/train.csv")
df_test = pd.read_csv("/kaggle/input/music-vibes-datathon-fall23/test.csv")

**Handling missing data using linear interpolation**

In [3]:
df_meta_1.replace(np.inf, np.nan, inplace=True)
df_meta_1.interpolate(method='linear', axis=0, inplace=True)

In [4]:
data= pd.merge(df_train, df_meta_1, left_on="song_id",right_on="id",how='inner')
data.drop(columns=['song_id'],inplace=True)


data_test = pd.merge(df_test, df_meta_1, left_on="song_id",right_on="id",how='inner')
data_test.drop(columns=['song_id'],inplace=True)

Analyzing the common artists between train and test dataset, based on which we can decide to consider or drop a feature.

In [5]:
common_elements =  set(data['artist']).intersection(data_test['artist'])
len(common_elements)

976

In [6]:
common_elements=set(data['album']).intersection(data_test['album'])
len(common_elements)

452

In [7]:
features_to_drop=[
    'lyrics','track','time_signature','total_tracks','mode','explicit','duration',
]

data = data.drop(columns=features_to_drop+['id'])
data_test = data_test.drop(columns=features_to_drop)

In [8]:
Y=data['target']
data.drop(columns=['target'],inplace=True)

**Release year has an relationship in detecting genre of music, so I decided to consider this feature by little preprocessing.**

In [9]:
data['release_date'] = pd.to_datetime(data['release_date'],format='mixed')

data['release_year'] = data['release_date'].dt.year

data.drop(columns=['release_date'],inplace=True)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data, Y, test_size=0.1, stratify=Y, random_state=42)

**Catboost is capable of handling categorical features, creating a parameter that contains list of categorical variables.**


In [11]:
categorical_columns=['artist','album']

**To mitigate data imbalance issue, Synthetic Minority Over-sampling Technique (SMOTE) is used for effective data augmentation.**

In [12]:
from imblearn.over_sampling import SMOTENC
sm = SMOTENC(random_state=42, categorical_features=categorical_columns)
X_res, y_res = sm.fit_resample(X_train, y_train)

**Hyperparameter Optimization is done using Optuna**

In [13]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score


# Define the objective function for Optuna
def objective(trial):
    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_res, y_res , test_size=0.2, random_state=42)

    # Define hyperparameters to optimize
    params = {
        'iterations': trial.suggest_int('iterations', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10, log=True)
    }

    # Create the CatBoost model
    model = CatBoostClassifier(**params,  loss_function='MultiClass',eval_metric='Accuracy',od_type='Iter',cat_features=categorical_columns)

    # Train the model
    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    # Make predictions on the validation set
    y_pred = model.predict(X_val)

    # Calculate accuracy as the evaluation metric
    accuracy = accuracy_score(y_val, y_pred)

    return accuracy


In [14]:
import optuna
# Create an Optuna study
study = optuna.create_study(direction='maximize')  # Maximize accuracy

# Run the optimization for a specified number of trials
n_trials = 10  
study.optimize(objective, n_trials=n_trials)

# Get the best hyperparameters found by Optuna
best_params = study.best_params

[I 2023-11-06 07:01:18,379] A new study created in memory with name: no-name-f7ffbc6e-be3c-445f-b8dd-f677309bf298


0:	learn: 0.3800403	test: 0.3642473	best: 0.3642473 (0)	total: 270ms	remaining: 1m 42s
1:	learn: 0.4445565	test: 0.4146505	best: 0.4146505 (1)	total: 473ms	remaining: 1m 29s
2:	learn: 0.4754704	test: 0.4388441	best: 0.4388441 (2)	total: 677ms	remaining: 1m 25s
3:	learn: 0.4705981	test: 0.4489247	best: 0.4489247 (3)	total: 877ms	remaining: 1m 22s
4:	learn: 0.4934476	test: 0.4751344	best: 0.4751344 (4)	total: 1.07s	remaining: 1m 20s
5:	learn: 0.5100806	test: 0.4899194	best: 0.4899194 (5)	total: 1.26s	remaining: 1m 18s
6:	learn: 0.5092406	test: 0.4818548	best: 0.4899194 (5)	total: 1.46s	remaining: 1m 17s
7:	learn: 0.5077285	test: 0.4872312	best: 0.4899194 (5)	total: 1.66s	remaining: 1m 17s
8:	learn: 0.4968078	test: 0.4711022	best: 0.4899194 (5)	total: 1.85s	remaining: 1m 16s
9:	learn: 0.5020161	test: 0.4791667	best: 0.4899194 (5)	total: 2.05s	remaining: 1m 15s
10:	learn: 0.5030242	test: 0.4724462	best: 0.4899194 (5)	total: 2.24s	remaining: 1m 15s
11:	learn: 0.5087366	test: 0.4744624	best:

[I 2023-11-06 07:02:14,238] Trial 0 finished with value: 0.8111559139784946 and parameters: {'iterations': 380, 'learning_rate': 0.004312804397370834, 'depth': 6, 'l2_leaf_reg': 0.08188005175345094}. Best is trial 0 with value: 0.8111559139784946.


265:	learn: 0.7340390	test: 0.8111559	best: 0.8111559 (245)	total: 55.6s	remaining: 23.8s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.811155914
bestIteration = 245

Shrink model to first 246 iterations.
0:	learn: 0.3553427	test: 0.3346774	best: 0.3346774 (0)	total: 206ms	remaining: 1m 23s
1:	learn: 0.3902890	test: 0.3575269	best: 0.3575269 (1)	total: 400ms	remaining: 1m 21s
2:	learn: 0.3970094	test: 0.3709677	best: 0.3709677 (2)	total: 595ms	remaining: 1m 20s
3:	learn: 0.4356519	test: 0.3951613	best: 0.3951613 (3)	total: 789ms	remaining: 1m 19s
4:	learn: 0.4344758	test: 0.3924731	best: 0.3951613 (3)	total: 984ms	remaining: 1m 19s
5:	learn: 0.4358199	test: 0.3971774	best: 0.3971774 (5)	total: 1.18s	remaining: 1m 18s
6:	learn: 0.4427083	test: 0.4099462	best: 0.4099462 (6)	total: 1.37s	remaining: 1m 18s
7:	learn: 0.4479167	test: 0.4079301	best: 0.4099462 (6)	total: 1.56s	remaining: 1m 18s
8:	learn: 0.4627016	test: 0.4348118	best: 0.4348118 (8)	total: 1.76s	remainin

[I 2023-11-06 07:02:27,605] Trial 1 finished with value: 0.46841397849462363 and parameters: {'iterations': 407, 'learning_rate': 0.0016435646233643943, 'depth': 5, 'l2_leaf_reg': 0.01778069358598884}. Best is trial 0 with value: 0.8111559139784946.


67:	learn: 0.4936156	test: 0.4630376	best: 0.4684140 (47)	total: 13.2s	remaining: 1m 6s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.4684139785
bestIteration = 47

Shrink model to first 48 iterations.
0:	learn: 0.3818884	test: 0.3360215	best: 0.3360215 (0)	total: 125ms	remaining: 20.6s
1:	learn: 0.4057460	test: 0.3568548	best: 0.3568548 (1)	total: 253ms	remaining: 20.6s
2:	learn: 0.4650538	test: 0.4247312	best: 0.4247312 (2)	total: 374ms	remaining: 20.2s
3:	learn: 0.4729503	test: 0.4375000	best: 0.4375000 (3)	total: 521ms	remaining: 21s
4:	learn: 0.4810148	test: 0.4327957	best: 0.4375000 (3)	total: 654ms	remaining: 20.9s
5:	learn: 0.4845430	test: 0.4267473	best: 0.4375000 (3)	total: 774ms	remaining: 20.5s
6:	learn: 0.4961358	test: 0.4448925	best: 0.4448925 (6)	total: 894ms	remaining: 20.2s
7:	learn: 0.5084005	test: 0.4610215	best: 0.4610215 (7)	total: 1.01s	remaining: 19.8s
8:	learn: 0.5134409	test: 0.4623656	best: 0.4623656 (8)	total: 1.13s	remaining: 19.6s
9:	l

[I 2023-11-06 07:02:35,304] Trial 2 finished with value: 0.5060483870967742 and parameters: {'iterations': 165, 'learning_rate': 0.0019330196895249066, 'depth': 6, 'l2_leaf_reg': 0.8712095584637509}. Best is trial 0 with value: 0.8111559139784946.


58:	learn: 0.5372984	test: 0.5033602	best: 0.5060484 (38)	total: 7.58s	remaining: 13.6s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.5060483871
bestIteration = 38

Shrink model to first 39 iterations.
0:	learn: 0.4615255	test: 0.4072581	best: 0.4072581 (0)	total: 343ms	remaining: 1m 44s
1:	learn: 0.4924395	test: 0.4287634	best: 0.4287634 (1)	total: 685ms	remaining: 1m 44s
2:	learn: 0.5225134	test: 0.4663978	best: 0.4663978 (2)	total: 1.03s	remaining: 1m 44s
3:	learn: 0.5270497	test: 0.4724462	best: 0.4724462 (3)	total: 1.38s	remaining: 1m 44s
4:	learn: 0.5403226	test: 0.4905914	best: 0.4905914 (4)	total: 1.72s	remaining: 1m 43s
5:	learn: 0.5549395	test: 0.5026882	best: 0.5026882 (5)	total: 2.07s	remaining: 1m 43s
6:	learn: 0.5719086	test: 0.5154570	best: 0.5154570 (6)	total: 2.41s	remaining: 1m 43s
7:	learn: 0.5823253	test: 0.5208333	best: 0.5208333 (7)	total: 2.8s	remaining: 1m 44s
8:	learn: 0.5868616	test: 0.5208333	best: 0.5208333 (7)	total: 3.15s	remaining: 1

[I 2023-11-06 07:03:49,963] Trial 3 finished with value: 0.8682795698924731 and parameters: {'iterations': 307, 'learning_rate': 0.01902339391543707, 'depth': 8, 'l2_leaf_reg': 1.05283360024909}. Best is trial 3 with value: 0.8682795698924731.


206:	learn: 0.8232527	test: 0.8655914	best: 0.8682796 (186)	total: 1m 14s	remaining: 36s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8682795699
bestIteration = 186

Shrink model to first 187 iterations.
0:	learn: 0.5139449	test: 0.4375000	best: 0.4375000 (0)	total: 983ms	remaining: 3m 39s
1:	learn: 0.5712366	test: 0.4684140	best: 0.4684140 (1)	total: 1.95s	remaining: 3m 36s
2:	learn: 0.6236559	test: 0.5026882	best: 0.5026882 (2)	total: 2.91s	remaining: 3m 34s
3:	learn: 0.6396169	test: 0.5329301	best: 0.5329301 (3)	total: 3.86s	remaining: 3m 32s
4:	learn: 0.6565860	test: 0.5510753	best: 0.5510753 (4)	total: 4.81s	remaining: 3m 30s
5:	learn: 0.6801075	test: 0.5685484	best: 0.5685484 (5)	total: 5.75s	remaining: 3m 29s
6:	learn: 0.6777554	test: 0.5665323	best: 0.5685484 (5)	total: 6.71s	remaining: 3m 28s
7:	learn: 0.6809476	test: 0.5712366	best: 0.5712366 (7)	total: 7.66s	remaining: 3m 26s
8:	learn: 0.6922043	test: 0.5866935	best: 0.5866935 (8)	total: 8.68s	remainin

[I 2023-11-06 07:07:33,685] Trial 4 finished with value: 0.823252688172043 and parameters: {'iterations': 224, 'learning_rate': 0.002581739935582529, 'depth': 10, 'l2_leaf_reg': 0.09511471770321948}. Best is trial 3 with value: 0.8682795698924731.


223:	learn: 0.8304772	test: 0.8225806	best: 0.8232527 (220)	total: 3m 43s	remaining: 0us

bestTest = 0.8232526882
bestIteration = 220

Shrink model to first 221 iterations.
0:	learn: 0.3800403	test: 0.3642473	best: 0.3642473 (0)	total: 210ms	remaining: 1m 33s
1:	learn: 0.4464046	test: 0.4153226	best: 0.4153226 (1)	total: 409ms	remaining: 1m 30s
2:	learn: 0.4784946	test: 0.4415323	best: 0.4415323 (2)	total: 607ms	remaining: 1m 29s
3:	learn: 0.4754704	test: 0.4489247	best: 0.4489247 (3)	total: 803ms	remaining: 1m 28s
4:	learn: 0.5050403	test: 0.4818548	best: 0.4818548 (4)	total: 1s	remaining: 1m 28s
5:	learn: 0.5142809	test: 0.4986559	best: 0.4986559 (5)	total: 1.2s	remaining: 1m 27s
6:	learn: 0.5152890	test: 0.4905914	best: 0.4986559 (5)	total: 1.39s	remaining: 1m 27s
7:	learn: 0.5151210	test: 0.4899194	best: 0.4986559 (5)	total: 1.58s	remaining: 1m 26s
8:	learn: 0.5095766	test: 0.4885753	best: 0.4986559 (5)	total: 1.78s	remaining: 1m 26s
9:	learn: 0.5144489	test: 0.4858871	best: 0.4986

[I 2023-11-06 07:08:16,375] Trial 5 finished with value: 0.8602150537634409 and parameters: {'iterations': 445, 'learning_rate': 0.013870478250145873, 'depth': 6, 'l2_leaf_reg': 0.018620080016469285}. Best is trial 3 with value: 0.8682795698924731.


209:	learn: 0.7933468	test: 0.8588710	best: 0.8602151 (189)	total: 42.5s	remaining: 47.6s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8602150538
bestIteration = 189

Shrink model to first 190 iterations.
0:	learn: 0.4089382	test: 0.3790323	best: 0.3790323 (0)	total: 265ms	remaining: 2m 6s
1:	learn: 0.4702621	test: 0.4435484	best: 0.4435484 (1)	total: 517ms	remaining: 2m 3s
2:	learn: 0.5178091	test: 0.4724462	best: 0.4724462 (2)	total: 778ms	remaining: 2m 3s
3:	learn: 0.5250336	test: 0.4872312	best: 0.4872312 (3)	total: 1.03s	remaining: 2m 3s
4:	learn: 0.5346102	test: 0.5060484	best: 0.5060484 (4)	total: 1.29s	remaining: 2m 2s
5:	learn: 0.5364583	test: 0.5067204	best: 0.5067204 (5)	total: 1.55s	remaining: 2m 2s
6:	learn: 0.5635081	test: 0.5168011	best: 0.5168011 (6)	total: 1.85s	remaining: 2m 4s
7:	learn: 0.5757728	test: 0.5174731	best: 0.5174731 (7)	total: 2.1s	remaining: 2m 4s
8:	learn: 0.5777890	test: 0.5248656	best: 0.5248656 (8)	total: 2.36s	remaining: 2m 3s

[I 2023-11-06 07:09:05,550] Trial 6 finished with value: 0.8736559139784946 and parameters: {'iterations': 480, 'learning_rate': 0.02630274179272176, 'depth': 7, 'l2_leaf_reg': 0.005138790616315653}. Best is trial 6 with value: 0.8736559139784946.


178:	learn: 0.8459341	test: 0.8723118	best: 0.8736559 (158)	total: 49s	remaining: 1m 22s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.873655914
bestIteration = 158

Shrink model to first 159 iterations.
0:	learn: 0.3818884	test: 0.3360215	best: 0.3360215 (0)	total: 126ms	remaining: 17.6s
1:	learn: 0.4037298	test: 0.3561828	best: 0.3561828 (1)	total: 244ms	remaining: 17s
2:	learn: 0.4650538	test: 0.4233871	best: 0.4233871 (2)	total: 365ms	remaining: 16.8s
3:	learn: 0.4711022	test: 0.4381720	best: 0.4381720 (3)	total: 487ms	remaining: 16.7s
4:	learn: 0.4818548	test: 0.4314516	best: 0.4381720 (3)	total: 606ms	remaining: 16.5s
5:	learn: 0.4838710	test: 0.4267473	best: 0.4381720 (3)	total: 726ms	remaining: 16.3s
6:	learn: 0.4974798	test: 0.4462366	best: 0.4462366 (6)	total: 853ms	remaining: 16.3s
7:	learn: 0.5060484	test: 0.4610215	best: 0.4610215 (7)	total: 968ms	remaining: 16.1s
8:	learn: 0.5112567	test: 0.4630376	best: 0.4630376 (8)	total: 1.08s	remaining: 15.9s
9:

[I 2023-11-06 07:09:13,613] Trial 7 finished with value: 0.5047043010752689 and parameters: {'iterations': 141, 'learning_rate': 0.002297573036710349, 'depth': 6, 'l2_leaf_reg': 1.2116257154879804}. Best is trial 6 with value: 0.8736559139784946.


64:	learn: 0.5381384	test: 0.5040323	best: 0.5047043 (45)	total: 7.83s	remaining: 9.16s
65:	learn: 0.5398185	test: 0.5033602	best: 0.5047043 (45)	total: 7.95s	remaining: 9.04s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.5047043011
bestIteration = 45

Shrink model to first 46 iterations.
0:	learn: 0.4089382	test: 0.3823925	best: 0.3823925 (0)	total: 255ms	remaining: 1m 17s
1:	learn: 0.4420363	test: 0.4052419	best: 0.4052419 (1)	total: 508ms	remaining: 1m 17s
2:	learn: 0.4721102	test: 0.4536290	best: 0.4536290 (2)	total: 768ms	remaining: 1m 17s
3:	learn: 0.4865591	test: 0.4623656	best: 0.4623656 (3)	total: 1.02s	remaining: 1m 17s
4:	learn: 0.5085685	test: 0.4899194	best: 0.4899194 (4)	total: 1.27s	remaining: 1m 16s
5:	learn: 0.5026882	test: 0.4845430	best: 0.4899194 (4)	total: 1.52s	remaining: 1m 16s
6:	learn: 0.5188172	test: 0.4986559	best: 0.4986559 (6)	total: 1.78s	remaining: 1m 16s
7:	learn: 0.5236895	test: 0.5006720	best: 0.5006720 (7)	total: 2.04s	remaining:

[I 2023-11-06 07:10:11,253] Trial 8 finished with value: 0.8682795698924731 and parameters: {'iterations': 307, 'learning_rate': 0.016554259489948035, 'depth': 7, 'l2_leaf_reg': 7.404625222789819}. Best is trial 6 with value: 0.8736559139784946.


0:	learn: 0.4848790	test: 0.4254032	best: 0.4254032 (0)	total: 495ms	remaining: 2m
1:	learn: 0.5483871	test: 0.4758065	best: 0.4758065 (1)	total: 986ms	remaining: 1m 59s
2:	learn: 0.5804772	test: 0.4952957	best: 0.4952957 (2)	total: 1.48s	remaining: 1m 59s
3:	learn: 0.6081989	test: 0.5288978	best: 0.5288978 (3)	total: 1.96s	remaining: 1m 58s
4:	learn: 0.6087030	test: 0.5383065	best: 0.5383065 (4)	total: 2.45s	remaining: 1m 57s
5:	learn: 0.6196237	test: 0.5483871	best: 0.5483871 (5)	total: 2.95s	remaining: 1m 57s
6:	learn: 0.6179435	test: 0.5430108	best: 0.5483871 (5)	total: 3.45s	remaining: 1m 57s
7:	learn: 0.6174395	test: 0.5490591	best: 0.5490591 (7)	total: 3.94s	remaining: 1m 56s
8:	learn: 0.6250000	test: 0.5551075	best: 0.5551075 (8)	total: 4.42s	remaining: 1m 56s
9:	learn: 0.6248320	test: 0.5530914	best: 0.5551075 (8)	total: 4.91s	remaining: 1m 55s
10:	learn: 0.6342406	test: 0.5692204	best: 0.5692204 (10)	total: 5.4s	remaining: 1m 54s
11:	learn: 0.6337366	test: 0.5732527	best: 0.5

[I 2023-11-06 07:12:05,098] Trial 9 finished with value: 0.8615591397849462 and parameters: {'iterations': 245, 'learning_rate': 0.008475555535088565, 'depth': 9, 'l2_leaf_reg': 0.364253605182471}. Best is trial 6 with value: 0.8736559139784946.


216:	learn: 0.8277890	test: 0.8608871	best: 0.8615591 (196)	total: 1m 53s	remaining: 14.7s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.8615591398
bestIteration = 196

Shrink model to first 197 iterations.


In [15]:
# best_params = {'iterations': 193, 'learning_rate': 0.07244825805971003, 'depth': 7, 'l2_leaf_reg': 0.8716465893603699}

In [16]:
# Train the CatBoost model with the best hyperparameters on the entire dataset
best_model = CatBoostClassifier(**best_params, eval_metric='Accuracy',loss_function='MultiClass', od_type='Iter',cat_features=categorical_columns)

sm = SMOTENC(random_state=42, categorical_features=categorical_columns)
X_res, y_res = sm.fit_resample(data, Y)
best_model.fit(X_res, y_res)

0:	learn: 0.4263781	total: 334ms	remaining: 2m 40s
1:	learn: 0.4794487	total: 660ms	remaining: 2m 37s
2:	learn: 0.4903288	total: 983ms	remaining: 2m 36s
3:	learn: 0.5048356	total: 1.31s	remaining: 2m 35s
4:	learn: 0.5223646	total: 1.64s	remaining: 2m 35s
5:	learn: 0.5304642	total: 1.97s	remaining: 2m 35s
6:	learn: 0.5384429	total: 2.29s	remaining: 2m 35s
7:	learn: 0.5476306	total: 2.62s	remaining: 2m 34s
8:	learn: 0.5488395	total: 2.95s	remaining: 2m 34s
9:	learn: 0.5506528	total: 3.27s	remaining: 2m 33s
10:	learn: 0.5554884	total: 3.6s	remaining: 2m 33s
11:	learn: 0.5594778	total: 3.92s	remaining: 2m 33s
12:	learn: 0.5634671	total: 4.26s	remaining: 2m 32s
13:	learn: 0.5673356	total: 4.58s	remaining: 2m 32s
14:	learn: 0.5721712	total: 4.91s	remaining: 2m 32s
15:	learn: 0.5764023	total: 5.23s	remaining: 2m 31s
16:	learn: 0.5836557	total: 5.55s	remaining: 2m 31s
17:	learn: 0.6161750	total: 5.91s	remaining: 2m 31s
18:	learn: 0.6473646	total: 6.23s	remaining: 2m 31s
19:	learn: 0.6699710	to

<catboost.core.CatBoostClassifier at 0x7923384f4b80>

**Catboost can provide a list of important features that are covered in most trees.**

In [17]:
# feature importance 
feat_import = [t for t in zip(data.columns, best_model.get_feature_importance())]
feat_import_df = pd.DataFrame(feat_import, columns=['Feature', 'VarImp'])
feat_import_df = feat_import_df.sort_values('VarImp', ascending=False)
print(feat_import_df.head(50))

             Feature     VarImp
1             artist  19.994772
10        reputation  11.684634
0              album   9.639137
6   instrumentalness   8.769661
13      release_year   7.789347
8         naturality   7.753390
3       danceability   7.544912
11       speechiness   5.497423
4             energy   5.329124
9       positiveness   5.026067
7           loudness   3.889516
12             tempo   3.772550
5          happening   1.732617
2       adaptibility   1.576851


In [18]:
data_test['release_date'] = pd.to_datetime(data_test['release_date'],format='mixed')

data_test['release_year'] = data_test['release_date'].dt.year

data_test.drop(columns=['release_date'],inplace=True)

In [19]:
song_id=data_test['id']

x_test = data_test[data.columns]
y_pred = best_model.predict(x_test)

In [20]:
df = pd.DataFrame()
df['song_id']= song_id
df['target']= y_pred[:,0]
df.to_csv('submission.csv',index=False)

In [21]:
df

Unnamed: 0,song_id,target
0,7c61FpilqRU/3Ley,Country
1,EmqUjbC3coby/LZy,EDM
2,lvF5H8aYwo+TlFJe,Lofi
3,O+oGRFmYSUbebxCK,Indie
4,rUR7HzUw1p41lUUn,R&B
...,...,...
3303,ObfXKLfo3N9IuZGw,Blues
3304,qCxgC5trW/Xl/wC8,Indie
3305,z8dKvyoqkEVA1aKZ,HipHop
3306,s2RNjtkc0Rzt5smL,Lofi
