In [65]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
import xgboost as xgb
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc
import pandas as pd
import numpy as np
# Shared seed; passed to train_test_split further down so the split is reproducible.
random_state = 1

In [66]:
# Read the raw dataset and discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export; verify against the CSV).
df = pd.read_csv('./data.csv').drop(columns=['Unnamed: 0'])

In [67]:
# Hold out 20% of the rows as the test set (seeded via random_state) and
# renumber both partitions from 0.
df_full_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=random_state)
)

# Pull the label out of each frame; pop() returns the column and removes it
# from the feature frame in one step.
y_full_train = df_full_train.pop('target')
y_test = df_test.pop('target')

# song_title is not used as a feature. The 'artist' column is kept:
# its removal was tried and left disabled.
df_test = df_test.drop(columns=['song_title'])
df_full_train = df_full_train.drop(columns=['song_title'])


In [68]:
# Dense output so the matrices can be handed directly to downstream code.
dv = DictVectorizer(sparse=False)

# One dict per row, keyed by column name, for both partitions.
dicts_full_train = df_full_train.to_dict(orient='records')
dicts_test = df_test.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse the fitted
# mapping for the test rows.
X_full_train = dv.fit_transform(dicts_full_train)
X_test = dv.transform(dicts_test)

In [69]:

# Column names produced by the fitted DictVectorizer, as a plain list.
features = [*dv.get_feature_names_out()]

# Wrap both partitions in DMatrix objects, carrying labels and feature names.
dfull_train = xgb.DMatrix(
    data=X_full_train,
    label=y_full_train,
    feature_names=features,
)
dtest = xgb.DMatrix(
    data=X_test,
    label=y_test,
    feature_names=features,
)

In [70]:
# Final parameter set: learning rate (eta) fixed at 0.3 and tree depth at 3.
xgb_params_final = dict(
    eta=0.3,
    max_depth=3,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,  # same value as the notebook-level random_state
    verbosity=1,
)

# Datasets scored after each boosting round, with the labels used in the log.
watchlist = [(dfull_train, 'train'), (dtest, 'test')]

# Filled in by xgb.train with the per-round metric history.
evals_result = {}


In [71]:
def parse_xgb_output(output):
    """Parse captured XGBoost evaluation log text into a DataFrame.

    Each non-blank line is expected to hold three tab-separated fields, e.g.
    ``[5]<TAB>train-auc:0.85847<TAB>test-auc:0.84496``: the bracketed round
    number followed by two ``name:value`` metric entries.

    Parameters
    ----------
    output : str
        Raw log text, one evaluation line per row. Blank lines and
        surrounding whitespace are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``num_of_iterations`` (int), ``train_auc`` and ``test_auc``
        (float), one row per parsed line. Empty (but with these columns)
        when ``output`` contains no evaluation lines.

    Raises
    ------
    ValueError
        If a non-blank line does not have exactly three tab-separated fields
        or a field cannot be converted to a number.
    """
    columns = ['num_of_iterations', 'train_auc', 'test_auc']
    results = []

    for raw_line in output.splitlines():
        line = raw_line.strip()
        # Skip blank lines so empty captures and stray newlines do not
        # blow up on the three-way unpack below.
        if not line:
            continue

        num, train, test = line.split('\t')
        it = int(num.strip('[]'))
        # The metric value is whatever follows the last ':' in the field.
        train_auc = float(train.rsplit(':', 1)[1])
        test_auc = float(test.rsplit(':', 1)[1])

        results.append((it, train_auc, test_auc))

    return pd.DataFrame(results, columns=columns)


In [72]:
%%capture output
# `%%capture` is a cell magic, so it must be the very first line of the cell.
# It captures what the code in this same cell writes to stdout into `output`.
# NOTE(review): this cell contains no code, so nothing is captured here; the
# xgb.train call that prints the evaluation log sits in a later cell. Confirm
# whether the training call was meant to live in this cell.


In [73]:
# Text captured from stdout by the preceding %%capture cell.
output_string = output.stdout


In [74]:
# Train for 100 boosting rounds, scoring every dataset in the watchlist and
# printing the metrics every 5th round; the history goes into evals_result.
xgb_model = xgb.train(
    xgb_params_final,
    dfull_train,
    num_boost_round=100,
    evals=watchlist,
    evals_result=evals_result,
    verbose_eval=5,
)

# Scores for the held-out rows, thresholded at 0.5 into a boolean array.
xgb_pred = xgb_model.predict(dtest)
xgb_liked = xgb_pred >= 0.5

[0]	train-auc:0.77736	test-auc:0.78511
[5]	train-auc:0.85847	test-auc:0.84496
[10]	train-auc:0.88737	test-auc:0.85813
[15]	train-auc:0.89924	test-auc:0.86149
[20]	train-auc:0.91111	test-auc:0.86460
[25]	train-auc:0.91997	test-auc:0.87193
[30]	train-auc:0.92400	test-auc:0.87117
[35]	train-auc:0.93383	test-auc:0.86694
[40]	train-auc:0.94110	test-auc:0.87043
[45]	train-auc:0.94608	test-auc:0.86986
[50]	train-auc:0.95138	test-auc:0.87072
[55]	train-auc:0.95607	test-auc:0.86761
[60]	train-auc:0.95961	test-auc:0.86908
[65]	train-auc:0.96420	test-auc:0.86726
[70]	train-auc:0.96789	test-auc:0.86459
[75]	train-auc:0.97175	test-auc:0.86371
[80]	train-auc:0.97383	test-auc:0.86574
[85]	train-auc:0.97663	test-auc:0.86812
[90]	train-auc:0.98060	test-auc:0.86464
[95]	train-auc:0.98458	test-auc:0.86520
[99]	train-auc:0.98602	test-auc:0.86403


In [76]:
# evals_result maps dataset label -> {metric name -> per-round values};
# take the first metric's history for each dataset.
train_aucs = [*evals_result['train'].values()][0]
test_aucs = [*evals_result['test'].values()][0]

# One row per boosting round, numbered from 1.
columns = ['iter', 'train_auc', 'test_auc']
score_rows = [
    (round_no, train_score, test_score)
    for round_no, (train_score, test_score)
    in enumerate(zip(train_aucs, test_aucs), start=1)
]
df_scores = pd.DataFrame(score_rows, columns=columns)

# plt.plot(df_scores.iter, df_scores.train_auc, label='train')
# plt.plot(df_scores.iter, df_scores.test_auc, label='test')
# plt.legend()