In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
np.random.seed(0)
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import sklearn

import category_encoders as ce
from catboost import CatBoostClassifier

In [2]:

def scaler(scaler, data, test=None):
    """Fit ``scaler`` on the training data and apply it to train and test.

    Parameters
    ----------
    scaler : object
        Transformer exposing ``fit`` and ``transform``.
    data : array-like
        Training data; this is the only data the scaler is fitted on.
    test : array-like, optional
        Held-out data. It is transformed with the parameters learned from
        ``data`` and never used for fitting.

    Returns
    -------
    tuple
        ``(train_scale, test_scale, scaler)``. ``test_scale`` is ``None``
        when no ``test`` data is supplied.
    """
    scaler.fit(data)
    train_scale = scaler.transform(data)

    # Only *transform* the test set: re-fitting on it would scale train and
    # test with different parameters and leave the returned scaler fitted on
    # the test data instead of the training data.
    test_scale = scaler.transform(test) if test is not None else None

    return train_scale, test_scale, scaler

def train_model(classifier, X_tr, y_tr, X_te, y_te):
    """Fit ``classifier`` on the training split and score it on the test split.

    Progress messages are printed along the way. Returns a tuple
    ``(classifier, score)`` where ``score`` is the value ``evaluate`` returns
    for the predictions made on ``X_te``.
    """
    print('start training...')
    classifier.fit(X_tr, y_tr)

    print('evaluation...')
    score = evaluate(y_te, classifier.predict(X_te))
    print(f'score is {score}')

    return classifier, score

def evaluate(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return macro_f1

In [3]:
# Load the tab-separated development and evaluation files.
data = pd.read_csv('data/dev.tsv', sep='\t')
test = pd.read_csv('data/eval.tsv', sep='\t')

In [4]:
# Work on a copy so the columns added to `df` later do not modify the frame
# loaded into `data`.
df = data.copy()

In [5]:
# Copy of the evaluation frame; later cells add engineered columns to it.
# NOTE(review): the name `eval` shadows Python's built-in eval() from here on.
eval = test.copy()

In [6]:
from scipy import stats
def happy_sad(x):
    """Label a valence value as 'happy' or 'sad'.

    Returns 'happy' when ``x`` is strictly greater than the mean of the
    global ``df['valence']`` column (looked up on every call), else 'sad'.
    """
    return 'happy' if x > df['valence'].mean() else 'sad'

def popularity_cat(x):
    """Bucket a value into 'high' (>= 7), 'med' (>= 4 and < 7) or 'low' (otherwise)."""
    if x >= 7:
        return 'high'
    if 4 <= x < 7:
        return 'med'
    return 'low'

# --- Feature engineering on the training frame -------------------------------
# Sum of loudness, tempo and the energy / danceability columns scaled by 100.
df['boringness'] = df['loudness'] + df['tempo'] + (df['energy']*100) + (df['danceability']*100)

# Same labels happy_sad() produces, but the valence mean is computed once
# instead of once per row.
valence_mean = df['valence'].mean()
df['valence_happy_sad'] = df['valence'].gt(valence_mean).map({True: 'happy', False: 'sad'})

df['loudness_plus_60'] = df['loudness'] + 60

# Square root of the negated loudness. Positive loudness values would give a
# negative radicand (NaN plus a RuntimeWarning, later filled with 0), so they
# are clipped to 0 up front; the final column values are the same.
df['loudness_pos'] = np.sqrt((-df['loudness']).clip(lower=0))

df['boringness_plus_60'] = df['boringness'] + 60

# stats.boxcox returns (transformed values, fitted lambda); keep the values.
df['duration_ms_box_cox_trans'] = stats.boxcox(df['duration_ms'])[0]

df['acousticness_sqrt_trans'] = np.sqrt(df['acousticness'])
df['liveness_sqrt_trans'] = np.sqrt(df['liveness'])
df['popularity_sqrt_trans'] = np.sqrt(df['popularity'])
df['popularity_sqrt_trans_cat'] = df['popularity_sqrt_trans'].apply(popularity_cat)
df['speechiness_sqrt_trans'] = np.sqrt(df['speechiness'])


# Replace any remaining missing values with 0.
df = df.fillna(value=0)
# df.describe().T

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
import seaborn as sns
import matplotlib.pyplot as plt
def dist_plot_box_cox_by_class(df,col):
    """Overlay the distribution of ``df[col]`` for rows with mode 0 and mode 1.

    The column is plotted as-is; no transformation is applied here even
    though the figure title mentions Box-Cox.
    """
    plt.figure(figsize=(16,6))
    plt.title("Distribution of "+col+" box cox transformation")
    # One histogram + KDE per class, drawn on the same axes.
    per_class_style = ((0, "green", 'mode 0'), (1, "red", 'mode 1'))
    for mode_value, colour, legend_label in per_class_style:
        class_values = df[df['mode']==mode_value][col]
        sns.distplot(class_values, color=colour, kde=True, bins=120, label=legend_label)
    plt.legend()
    plt.show()
def dist_plot_box_cox(df,col):
    """Plot the distribution of the Box-Cox transform of ``df[col]``."""
    plt.figure(figsize=(16,6))
    plt.title("Distribution of "+col+" box cox transformation")
    # stats.boxcox returns (transformed values, lambda); only the values are plotted.
    transformed_values = stats.boxcox(df[col])[0]
    plot_options = dict(color="green", kde=True, bins=120, label='mode 0')
    sns.distplot(transformed_values, **plot_options)
    plt.legend()
    plt.show()
# dist_plot_box_cox_by_class(df,'duration_ms_box_cox_trans_per_class')

In [54]:

# Features fed to the model, in the order they are passed to it.
# Candidates currently left out: valence_happy_sad, acousticness,
# duration_ms_box_cox_trans_per_class, duration_ms_box_cox_trans, id,
# liveness_sqrt_trans, loudness, loudness_plus_60, popularity,
# popularity_sqrt_trans_cat, popularity_sqrt_trans, speechiness, mode,
# boringness, boringness_plus_60.
col = [
    'valence',
    'year',
    'acousticness_sqrt_trans',
    'artists',
    'danceability',
    'duration_ms',
    'energy',
    'explicit',
    'instrumentalness',
    'key',
    'liveness',
    'loudness_pos',
    'speechiness_sqrt_trans',
    'tempo',
]

# Feature matrix and target column.
X = df[col]
y = df['mode']

# Hold out 10% of the rows as a test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=0)




In [9]:
from collections import Counter

# Frequency of each value in the target column.
counter = Counter(y)

# estimate scale_pos_weight value: count of class 0 divided by count of class 1
class0_count, class1_count = counter[0], counter[1]
estimate = class0_count/class1_count
print('Estimate: %.3f' % estimate)
print(counter,class0_count)

Estimate: 0.415
Counter({1: 96508, 0: 40014}) 40014


In [89]:
from time import time
import pprint
# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer
# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a search optimizer, print a timing / score summary and return the
    best parameters it found.

    optimizer = a sklearn or a skopt optimizer
    X = the training set
    y = our target
    title = a string label for the experiment
    callbacks = optional callbacks, forwarded to ``optimizer.fit`` as ``callback``
    """
    started_at = time()

    # Only pass `callback` through when callbacks were actually supplied.
    fit_kwargs = {'callback': callbacks} if callbacks else {}
    optimizer.fit(X, y, **fit_kwargs)

    results = pd.DataFrame(optimizer.cv_results_)
    top_score = optimizer.best_score_
    top_score_std = results.iloc[optimizer.best_index_].std_test_score
    winning_params = optimizer.best_params_

    summary = (title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
               +u"\u00B1"+" %.3f")
    n_candidates = len(optimizer.cv_results_['params'])
    print(summary % (time() - started_at, n_candidates, top_score, top_score_std))
    print('Best parameters:')
    pprint.pprint(winning_params)
    print()
    return winning_params



# Hyper-parameter ranges explored by the Bayesian search.
search_spaces = {
    'iterations': Integer(10, 1000),
    'depth': Integer(1, 8),
    'learning_rate': Real(0.01, 1.0, prior='log-uniform'),
    'random_strength': Real(1e-9, 10, prior='log-uniform'),
    'bagging_temperature': Real(0.0, 1.0),
    'border_count': Integer(1, 255),
    'l2_leaf_reg': Integer(2, 30),
}

In [90]:
# Base estimator whose hyper-parameters are tuned by the search below.
clf = CatBoostClassifier(auto_class_weights='SqrtBalanced',
                         random_state=0,
                         verbose=False,
                         task_type='GPU',
                         od_type="Iter",
                         cat_features=['artists','key'],
                         od_wait=100)

# Built-in ROC-AUC scorer. It is the same metric as
# make_scorer(roc_auc_score, needs_threshold=True), without relying on the
# `needs_threshold` argument that newer scikit-learn releases deprecate and
# then remove.
# NOTE(review): the search optimises ROC AUC while evaluate() reports macro F1.
roc_auc = sklearn.metrics.get_scorer('roc_auc')
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring=roc_auc,
                    cv=skf,
                    n_iter=100,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=42)

# DeadlineStopper(60*10) is given a 600-second budget, so the search may stop
# before all `n_iter` candidates have been evaluated.
best_params = report_perf(opt, X_train, y_train,'CatBoost',
                          callbacks=[VerboseCallback(100),
                                     DeadlineStopper(60*10)])

Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 33.0999
Function value obtained: -0.7550
Current minimum: -0.7550
Iteration No: 2 started. Searching for the next optimal point.
Iteration No: 2 ended. Search finished for the next optimal point.
Time taken: 28.2651
Function value obtained: -0.7482
Current minimum: -0.7550
Iteration No: 3 started. Searching for the next optimal point.
Iteration No: 3 ended. Search finished for the next optimal point.
Time taken: 9.9618
Function value obtained: -0.7511
Current minimum: -0.7550
Iteration No: 4 started. Searching for the next optimal point.
Iteration No: 4 ended. Search finished for the next optimal point.
Time taken: 33.3697
Function value obtained: -0.7496
Current minimum: -0.7550
Iteration No: 5 started. Searching for the next optimal point.
Iteration No: 5 ended. Search finished for the next optimal point.
Time taken: 33.0756
Function value obta

In [91]:
# Display the hyper-parameters returned by report_perf for the search.
best_params

OrderedDict([('bagging_temperature', 0.6170793585023163),
             ('border_count', 198),
             ('depth', 4),
             ('iterations', 876),
             ('l2_leaf_reg', 19),
             ('learning_rate', 0.15461088645165447),
             ('random_strength', 0.0030745042451505583)])

In [92]:
# Tuned values, matching the `best_params` output of the Bayesian search.
tuned_params = dict(
    bagging_temperature=0.6170793585023163,
    border_count=198,
    depth=4,
    iterations=876,
    l2_leaf_reg=19,
    learning_rate=0.15461088645165447,
    random_strength=0.0030745042451505583,
)

model = CatBoostClassifier(auto_class_weights='SqrtBalanced',
                           random_state=0,
                           task_type='GPU',
                           od_type="Iter",
                           od_wait=100,
                           **tuned_params)


print(f'start training... {model}')
# Carve a further 10% out of the training split; it is passed to fit() as the
# eval_set.
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_train, y_train, test_size=0.1, shuffle=True, random_state=0)

hist = model.fit(X_train_val, y_train_val,
                 eval_set=(X_val, y_val),
                 cat_features=['artists','key'])

# Score the fitted model on the held-out test split.
print('evaluation...')
y_p = model.predict(X_test)
score = evaluate(y_test, y_p)
print(f'score is {score}')



start training... <catboost.core.CatBoostClassifier object at 0x7f43cf0847c0>
0:	learn: 0.6724879	test: 0.6658439	best: 0.6658439 (0)	total: 9.83ms	remaining: 8.6s
1:	learn: 0.6578029	test: 0.6452558	best: 0.6452558 (1)	total: 19ms	remaining: 8.32s
2:	learn: 0.6475053	test: 0.6306003	best: 0.6306003 (2)	total: 27.4ms	remaining: 7.97s
3:	learn: 0.6364395	test: 0.6172259	best: 0.6172259 (3)	total: 36.3ms	remaining: 7.91s
4:	learn: 0.6310803	test: 0.6095652	best: 0.6095652 (4)	total: 45.4ms	remaining: 7.91s
5:	learn: 0.6247344	test: 0.6022698	best: 0.6022698 (5)	total: 55.7ms	remaining: 8.07s
6:	learn: 0.6202142	test: 0.5969423	best: 0.5969423 (6)	total: 65.9ms	remaining: 8.18s
7:	learn: 0.6178102	test: 0.5932077	best: 0.5932077 (7)	total: 75.3ms	remaining: 8.17s
8:	learn: 0.6146549	test: 0.5896963	best: 0.5896963 (8)	total: 83.9ms	remaining: 8.08s
9:	learn: 0.6123972	test: 0.5868698	best: 0.5868698 (9)	total: 92.8ms	remaining: 8.03s
10:	learn: 0.6113113	test: 0.5853318	best: 0.5853318 (1

In [93]:
# Report the score computed by evaluate() on the test split, next to the model object.
print(f' score {score} {model}')

 score 0.6884629299460583 <catboost.core.CatBoostClassifier object at 0x7f43cf0847c0>


In [94]:
# Save the trained model in 'cbm' format; the file name starts with the score
# formatted to three decimals.
model.save_model(fname='{:.3f}_model.cbm'.format(score),format='cbm')

In [95]:
def WriteOnFile(name, y_eval):
    """Write predictions to ``name`` as a two-column CSV.

    The file starts with the header ``Id,Predicted`` followed by one
    ``index,value`` line per element of ``y_eval``, where ``index`` is the
    element's zero-based position.

    Parameters
    ----------
    name : str or path-like
        Destination file; it is created or overwritten.
    y_eval : iterable
        Predicted values, written in iteration order.
    """
    # The context manager closes the file even if writing fails. The original
    # `f.close` lacked parentheses and therefore never closed the handle.
    with open(name, "w") as f:
        f.write("Id,Predicted\n")
        f.writelines(f"{index},{i}\n" for index, i in enumerate(y_eval))

# --- Feature engineering on the evaluation frame -----------------------------
# Mirrors the columns built on the training frame `df`.
eval['boringness'] = eval['loudness'] + eval['tempo'] + (eval['energy']*100) + (eval['danceability']*100)

# happy_sad() thresholds on the mean of the training frame's valence; compute
# that mean once instead of once per row. The resulting labels are the same.
train_valence_mean = df['valence'].mean()
eval['valence_happy_sad'] = eval['valence'].gt(train_valence_mean).map({True: 'happy', False: 'sad'})

eval['loudness_plus_60'] = eval['loudness'] + 60

# Square root of the negated loudness. Positive loudness values would give a
# negative radicand (NaN plus a RuntimeWarning, later filled with 0), so they
# are clipped to 0 up front; the final column values are the same.
eval['loudness_pos'] = np.sqrt((-eval['loudness']).clip(lower=0))

eval['boringness_plus_60'] = eval['boringness'] + 60

# Apply the Box-Cox lambda estimated on the training durations. Fitting a new
# lambda on the evaluation set would map equal durations to different feature
# values in the two frames.
_, train_duration_lambda = stats.boxcox(df['duration_ms'])
eval['duration_ms_box_cox_trans'] = stats.boxcox(eval['duration_ms'], lmbda=train_duration_lambda)

eval['acousticness_sqrt_trans'] = np.sqrt(eval['acousticness'])
eval['liveness_sqrt_trans'] = np.sqrt(eval['liveness'])
eval['popularity_sqrt_trans'] = np.sqrt(eval['popularity'])
# Added for parity with the training frame, which also carries this column.
eval['popularity_sqrt_trans_cat'] = eval['popularity_sqrt_trans'].apply(popularity_cat)
eval['speechiness_sqrt_trans'] = np.sqrt(eval['speechiness'])


# Replace any remaining missing values with 0.
eval = eval.fillna(value=0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [97]:
# Select the feature columns listed in `col` from the engineered evaluation frame.
# NOTE(review): this rebinds `test`, which until now held the raw frame read
# from data/eval.tsv.
test = eval[col]
# test = encoder.transform(test)
# test_scal = x_scaler.transform(test)
# test_svd = tsvd.transform(test_scal)
# Predict on the evaluation rows and write the predictions to submission.csv.
y_pred = model.predict(test)
WriteOnFile('submission.csv',y_pred)

In [14]:
# Display the evaluation feature frame.
# NOTE(review): the saved output lists `loudness` and no `loudness_pos`, so it
# appears to come from a different `col` list than the one defined in this
# notebook; re-run to refresh it.
test


Unnamed: 0,valence,year,artists,danceability,duration_ms,energy,explicit,instrumentalness,key,liveness,loudness,tempo,acousticness_sqrt_trans,speechiness_sqrt_trans
0,0.741,1962,['Frank Proffitt'],0.465,147173,0.389,0,0.506000,11,0.1100,-14.770,125.010,0.997497,0.198746
1,0.928,1938,"['Billie Holiday', 'Teddy Wilson']",0.578,165133,0.234,0,0.000391,0,0.0784,-12.305,172.403,0.958123,0.319374
2,0.733,1998,['Monica'],0.681,333960,0.726,0,0.000019,9,0.1190,-8.421,147.925,0.356371,0.189209
3,0.914,2001,['Los Acosta'],0.724,224427,0.731,0,0.003430,8,0.1770,-7.159,101.602,0.391152,0.180278
4,0.228,2015,['Lil Uzi Vert'],0.594,172704,0.850,1,0.000000,2,0.1100,-6.525,100.002,0.170587,0.308869
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34126,0.783,1965,['Bob Dylan'],0.770,155427,0.646,0,0.000000,9,0.1370,-10.723,118.771,0.754983,0.261534
34127,0.425,1975,['Camel'],0.373,320387,0.423,0,0.527000,0,0.1410,-12.577,120.941,0.502991,0.189209
34128,0.142,2000,['Lara Fabian'],0.156,315267,0.279,0,0.007810,0,0.1170,-10.108,72.555,0.979285,0.200000
34129,0.844,1982,['Huey Lewis & The News'],0.438,225000,0.418,0,0.000000,2,0.3300,-12.556,98.173,0.146287,0.198494


In [15]:
# cols = [
#     'valence',
#         'year',
#         # 'acousticness',
#         # 'artists',
#          'danceability',
#        # 'duration_ms',
#         'energy',
#         'explicit',
#         # 'id',
#         'instrumentalness',
#         'key',
#        'liveness',
#         # 'loudness',
#         # 'popularity',
#         # 'speechiness',
#         'tempo',
#        #  'mode',
#        # 'loudness_plus_60',
#         'loudness_pos',
#          # 'boringness',
#        #  'valence_happy_sad',
#        # 'boringness_plus_60',
#         'duration_ms_box_cox_trans',
#        'acousticness_sqrt_trans',
#        #  'liveness_sqrt_trans',
#        # 'popularity_sqrt_trans',
#         'speechiness_sqrt_trans',
#       # 'duration_ms_box_cox_trans_per_class'
#         ]
