In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [186]:
data = pd.read_csv('valence_features.csv', index_col=0)
targets = pd.read_csv('valence_targets.csv', index_col=0)

In [3]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [175]:
from sklearn.model_selection import train_test_split

In [187]:
x_train, x_test, y_train, y_test = train_test_split(data, targets, test_size=0.1)

## XGBoost

In [177]:
import xgboost

In [188]:
d_train = xgboost.DMatrix(x_train, y_train)
d_test = xgboost.DMatrix(x_test, y_test)

In [195]:
params = {
    #'tree_method' : 'hist',
    'max_depth' : 4,
    #'max_leaves' : 20,
    'objective' : 'binary:logistic',
    #'grow_policy' : 'lossguide',
    'eta' : 0.001,
    'eval_metric': 'auc',
    'subsample':0.7,
    'colsample_bytree':0.3
    }
evallist = [(d_train, 'train'), (d_test, 'test')]
num_round = 5000

In [190]:
cv_train = xgboost.DMatrix(data, targets)

In [191]:
def acc_eval(predt, dtrain):
    y = dtrain.get_label()
    predt[predt<=0.5] = 0
    predt[predt>0.5] = 1
    
    return 'accuracy', accuracy_score(y, predt)

In [118]:
def f1_eval(predt, dtrain):
    y = dtrain.get_label()
    predt[predt<=0.5] = 0
    predt[predt>0.5] = 1
    
    return 'f1_score', f1_score(y, predt)

In [192]:
xgb_cv = xgboost.cv(dtrain=cv_train, params=params, nfold=10, num_boost_round=num_round, early_stopping_rounds=15, feval=acc_eval, seed=123, verbose_eval=1, maximize=True)

[0]	train-accuracy:0.647483+0.0079602	train-auc:0.611983+0.0165142	test-accuracy:0.601562+0.057197	test-auc:0.519608+0.0524202
[1]	train-accuracy:0.662587+0.0072424	train-auc:0.654418+0.0151534	test-accuracy:0.627344+0.040902	test-auc:0.523761+0.0686334
[2]	train-accuracy:0.663021+0.0062693	train-auc:0.686896+0.0103867	test-accuracy:0.627344+0.0390703	test-auc:0.511566+0.0802787
[3]	train-accuracy:0.659549+0.00499855	train-auc:0.713773+0.00744682	test-accuracy:0.634375+0.0398053	test-auc:0.521007+0.0634779
[4]	train-accuracy:0.656684+0.00434434	train-auc:0.733769+0.00762009	test-accuracy:0.635156+0.0356389	test-auc:0.518383+0.0629294
[5]	train-accuracy:0.65599+0.0054493	train-auc:0.750667+0.00882385	test-accuracy:0.632813+0.0373043	test-auc:0.51758+0.0636192
[6]	train-accuracy:0.653733+0.00472987	train-auc:0.76579+0.010214	test-accuracy:0.63125+0.0355963	test-auc:0.522106+0.0594501
[7]	train-accuracy:0.652778+0.00562558	train-auc:0.780374+0.0106551	test-accuracy:0.630469+0.0349474	test

In [43]:
display(xgb_cv.iloc[-1])

train-accuracy-mean    0.659809
train-accuracy-std     0.007749
train-auc-mean         0.741060
train-auc-std          0.009860
test-accuracy-mean     0.627344
test-accuracy-std      0.030066
test-auc-mean          0.507790
test-auc-std           0.054201
Name: 16, dtype: float64

In [205]:
model = xgboost.train(params=params, dtrain=d_train, num_boost_round=num_round, early_stopping_rounds=15, evals=evallist, verbose_eval=1)

y_pred = model.predict(d_test, ntree_limit=model.best_ntree_limit)
#y_pred[y_pred<=0.5] = 0
#y_pred[y_pred>0.5] = 1
#f1 = f1_score(y_test, y_pred)
#aa = accuracy_score(y_test, y_pred)
#print("-----------")
#print("Final F1 score = {}".format(f1))
#print("Final accuracy = {}".format(aa))

[0]	train-auc:0.646092	test-auc:0.522604
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 15 rounds.
[1]	train-auc:0.700213	test-auc:0.588349
[2]	train-auc:0.722979	test-auc:0.544433
[3]	train-auc:0.739129	test-auc:0.539783
[4]	train-auc:0.771378	test-auc:0.564583
[5]	train-auc:0.779592	test-auc:0.571945
[6]	train-auc:0.792507	test-auc:0.601137
[7]	train-auc:0.805294	test-auc:0.576854
[8]	train-auc:0.808728	test-auc:0.596745
[9]	train-auc:0.811176	test-auc:0.591191
[10]	train-auc:0.81632	test-auc:0.610437
[11]	train-auc:0.814941	test-auc:0.592612
[12]	train-auc:0.816613	test-auc:0.591708
[13]	train-auc:0.8215	test-auc:0.579824
[14]	train-auc:0.823314	test-auc:0.598166
[15]	train-auc:0.822376	test-auc:0.588091
[16]	train-auc:0.820208	test-auc:0.582408
[17]	train-auc:0.825463	test-auc:0.579953
[18]	train-auc:0.827057	test-auc:0.583828
[19]	train-auc:0.830275	test-auc:0.591062
[20]	train-auc:0.829982	test-auc

In [206]:
display(y_pred)

array([0.5021173 , 0.50203705, 0.5020481 , 0.50173676, 0.5004623 ,
       0.5002152 , 0.5005008 , 0.5022989 , 0.49923918, 0.50091785,
       0.50029683, 0.5022898 , 0.50191635, 0.50121313, 0.5013219 ,
       0.5008583 , 0.50139207, 0.5011585 , 0.50193226, 0.5016739 ,
       0.50144374, 0.50182647, 0.5009037 , 0.5010909 , 0.5014434 ,
       0.50216484, 0.5018168 , 0.5019927 , 0.500431  , 0.50090134,
       0.50133204, 0.501643  , 0.5017298 , 0.50220865, 0.50167024,
       0.50193834, 0.5015104 , 0.5019116 , 0.5017222 , 0.5015655 ,
       0.501385  , 0.5019355 , 0.50190115, 0.50175023, 0.5016032 ,
       0.50227904, 0.50217   , 0.50143504, 0.50235295, 0.5008337 ,
       0.50186694, 0.5021171 , 0.50219154, 0.50103194, 0.5019216 ,
       0.5018757 , 0.5017222 , 0.5020079 , 0.5018129 , 0.5022775 ,
       0.50179327, 0.5007682 , 0.5022409 , 0.50170904, 0.5020004 ,
       0.5002436 , 0.50181943, 0.5014474 , 0.5017326 , 0.50095505,
       0.50218165, 0.5018512 , 0.50225705, 0.5008935 , 0.50144

# CatBoost

In [6]:
import catboost

In [164]:
cat_crossval = catboost.Pool(data=data, label=targets)

In [129]:
cat_params = {
    'loss_function':'Logloss',
    'eval_metric':'Accuracy',
    'learning_rate':0.001,
    'depth':5,
    'subsample': 0.8
}

In [165]:
cat_cv = catboost.cv(pool=cat_crossval,
                    params=cat_params,
                    num_boost_round=5000,
                    nfold=10,
                    verbose_eval=1,
                    early_stopping_rounds=15
                    )

0:	learn: 0.6460067	test: 0.6234406	best: 0.6234406 (0)	total: 435ms	remaining: 36m 13s
1:	learn: 0.6519104	test: 0.6319976	best: 0.6319976 (1)	total: 831ms	remaining: 34m 37s
2:	learn: 0.6512173	test: 0.6367220	best: 0.6367220 (2)	total: 1.23s	remaining: 34m 11s
3:	learn: 0.6488738	test: 0.6359530	best: 0.6367220 (2)	total: 1.69s	remaining: 35m 16s
4:	learn: 0.6450539	test: 0.6359530	best: 0.6367220 (2)	total: 2.17s	remaining: 36m 11s
5:	learn: 0.6443593	test: 0.6367342	best: 0.6367342 (5)	total: 2.65s	remaining: 36m 42s
6:	learn: 0.6438381	test: 0.6367342	best: 0.6367342 (5)	total: 3.03s	remaining: 36m 3s
7:	learn: 0.6426222	test: 0.6382785	best: 0.6382785 (7)	total: 3.4s	remaining: 35m 20s
8:	learn: 0.6426223	test: 0.6390598	best: 0.6390598 (8)	total: 3.77s	remaining: 34m 53s
9:	learn: 0.6418408	test: 0.6382906	best: 0.6390598 (8)	total: 4.16s	remaining: 34m 34s
10:	learn: 0.6413199	test: 0.6390658	best: 0.6390658 (10)	total: 4.56s	remaining: 34m 27s
11:	learn: 0.6405386	test: 0.639

In [64]:
from catboost import CatBoostClassifier

In [170]:
model = CatBoostClassifier( loss_function='Logloss',
                            learning_rate=0.001,
                            depth=5,
                            subsample=0.5,
                            num_boost_round=10,
                            #early_stopping_rounds=15,
                            verbose=True)

In [171]:
model.fit(x_train, y_train)

0:	learn: 0.6930366	total: 14ms	remaining: 126ms
1:	learn: 0.6929235	total: 28ms	remaining: 112ms
2:	learn: 0.6927796	total: 40.5ms	remaining: 94.5ms
3:	learn: 0.6926530	total: 49ms	remaining: 73.6ms
4:	learn: 0.6925523	total: 60.5ms	remaining: 60.5ms
5:	learn: 0.6924413	total: 70.2ms	remaining: 46.8ms
6:	learn: 0.6923345	total: 79.4ms	remaining: 34ms
7:	learn: 0.6922518	total: 88.8ms	remaining: 22.2ms
8:	learn: 0.6921485	total: 97.3ms	remaining: 10.8ms
9:	learn: 0.6920496	total: 108ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x23993474288>

In [172]:
y_pred = model.predict(x_test)

In [173]:
print("-----------")
print("Final Accuracy = {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Final F1 score = {:.3f}".format(f1_score(y_test, y_pred)))

-----------
Final Accuracy = 0.695
Final F1 score = 0.820
