In [1]:
import os

# Set in the very first cell, i.e. before xgboost and the other numeric libraries
# are imported.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# when several packages bundle their own OpenMP copy — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

In [2]:
import xgboost
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plot

In [3]:
# Load the train / validation / test frames from their HDF files.
# The three reads are evaluated in order and unpacked into the names used below.

train, val, test = (
    pd.read_hdf(hdf_path)
    for hdf_path in (
        'df_data2/train_final.hdf',
        'df_data2/val_final.hdf',
        'df_data2/test_final.hdf',
    )
)

In [4]:
# Training matrix: every column except pid, tid and target is passed as a feature;
# the target column is the label.
xgtrain = xgboost.DMatrix(
    data=train.drop(columns=['pid', 'tid', 'target']),
    label=train.target,
)

In [5]:
# Validation matrix, built with the same column selection as the training matrix.
xgval = xgboost.DMatrix(
    data=val.drop(columns=['pid', 'tid', 'target']),
    label=val.target,
)

In [6]:
# Unlabelled test matrix: only pid and tid are dropped, no label is supplied.
# NOTE(review): rows are taken in `test`'s current order and every remaining column
# becomes a feature. The scoring cell later adds a 'score' column to `test` and
# re-sorts it, so this cell should not be re-run after that one without reloading.
xgtest = xgboost.DMatrix(data=test.drop(columns=['pid', 'tid']))

In [None]:
%%time

# Train XGBoost model
# Manual parameter tuning

max_auc = 0.0
best_params = None

# Tuning max-depth
for max_depth in [5, 6, 7, 8, 9]:
    print('max_depth: {}'.format(max_depth))
    
    result = {}
    
    params = {
        'objective':'binary:logistic',
        'eta':0.1,
        'booster':'gbtree',
        'max_depth': max_depth,
        'nthread':16,
        'seed':1,
        'eval_metric':'auc',
    }

    model = xgboost.train(
        params=list(params.items()),
        early_stopping_rounds=30, 
        verbose_eval=10, 
        dtrain=xgtrain,
        evals=[(xgtrain, 'train'), (xgval, 'test')],
        evals_result=result,
        num_boost_round=300,
    )
    best_score = float(model.attributes()['best_score'])
    last_train = result['train']['auc'][-1]
    last_test = result['test']['auc'][-1]
    print('Last train-AUC: {}, last test-AUC: {}, best iteration: {}, best score: {}'\
          .format(last_train, last_test, model.attributes()['best_iteration'], model.attributes()['best_score']))
    if best_score > max_auc:
        max_auc = best_score
        best_params = max_depth
        model.save_model('models2/trial2_best.model')
        
print('Best params: max_depth={}'.format(best_params))

max_depth: 5
[0]	train-auc:0.75284	test-auc:0.74941
[10]	train-auc:0.77044	test-auc:0.76676
[20]	train-auc:0.77159	test-auc:0.76779
[30]	train-auc:0.77463	test-auc:0.77051
[40]	train-auc:0.77680	test-auc:0.77205
[50]	train-auc:0.77837	test-auc:0.77011
[60]	train-auc:0.78062	test-auc:0.76657
[69]	train-auc:0.78351	test-auc:0.76354
Last train-AUC: 0.783512, last test-AUC: 0.763541, best iteration: 39, best score: 0.772056
max_depth: 6
[0]	train-auc:0.75975	test-auc:0.75579
[10]	train-auc:0.77431	test-auc:0.76990
[20]	train-auc:0.77566	test-auc:0.77111
[30]	train-auc:0.77735	test-auc:0.77228
[40]	train-auc:0.77889	test-auc:0.77204
[50]	train-auc:0.78146	test-auc:0.76980
[60]	train-auc:0.78482	test-auc:0.76625
[69]	train-auc:0.78913	test-auc:0.76355
Last train-AUC: 0.789132, last test-AUC: 0.763549, best iteration: 39, best score: 0.772466
max_depth: 7
[0]	train-auc:0.76602	test-auc:0.76144
[10]	train-auc:0.77745	test-auc:0.77225
[20]	train-auc:0.77844	test-auc:0.77293
[30]	train-auc:0.779

In [None]:
# %%time

# # Grid search over (max_depth, min_child_weight) using xgboost.train with a held-out validation set (not xgboost.cv)

# best_params = None

# grid_params = [
#     (max_depth, min_child_weight)
#     for max_depth in [7, 8, 9]
#     for min_child_weight in [0.5, 1, 1.5, 2]
# ]

# max_auc = 0.0

# for max_depth, min_child_weight in grid_params:
#     result = {}
    
#     print("CV with max_depth={}, min_child_weight={}".format(
#                              max_depth,
#                              min_child_weight))
#     # Update our parameters
#     params = {
#         'objective':'binary:logistic',
#         'eta':0.1,
#         'booster':'gbtree',
#         'max_depth': max_depth,
#         'min_child_weight': min_child_weight,
#         'nthread':16,
#         'seed':1,
#         'eval_metric':'auc',
#     }
# #     params['max_depth'] = max_depth
# #     params['min_child_weight'] = min_child_weight
#     print('Run CV...')
#     # Run CV
#     model = xgboost.train(
#         params=list(params.items()),
#         dtrain=xgtrain,
#         verbose_eval=10, 
#         num_boost_round=300,
#         evals_result=result,
#         evals=[(xgtrain, 'train'), (xgval, 'test')],
#         early_stopping_rounds=30,
#     )
#     # Update best AUC
#     best_score = float(model.attributes()['best_score'])
# #     mean_auc = cv_results['test-auc-mean'].max()
# #     boost_rounds = cv_results['test-auc-mean'].argmax()
# #     print("\tAUC {} for {} rounds".format(mean_auc, boost_rounds))
#     last_train = result['train']['auc'][-1]
#     last_test = result['test']['auc'][-1]
#     print('Last train-AUC: {}, last test-AUC: {}, best iteration: {}, best score: {}'\
#           .format(last_train, last_test, model.attributes()['best_iteration'], model.attributes()['best_score']))
#     if best_score > max_auc:
#         max_auc = best_score
#         best_params = (max_depth, min_child_weight)
#         model.save_model('models2/trial3_best.model')
        
# print("Best params: {}, {}, AUC: {}".format(best_params[0], best_params[1], max_auc))

CV with max_depth=7, min_child_weight=0.5
Run CV...
[0]	train-auc:0.76602	test-auc:0.76144
[10]	train-auc:0.77744	test-auc:0.77225
[20]	train-auc:0.77843	test-auc:0.77293
[30]	train-auc:0.77966	test-auc:0.77319
[40]	train-auc:0.78182	test-auc:0.77180
[50]	train-auc:0.78543	test-auc:0.76921
[56]	train-auc:0.78795	test-auc:0.76687
Last train-AUC: 0.787953, last test-AUC: 0.766868, best iteration: 26, best score: 0.773342
CV with max_depth=7, min_child_weight=1
Run CV...
[0]	train-auc:0.76602	test-auc:0.76144
[10]	train-auc:0.77745	test-auc:0.77225
[20]	train-auc:0.77844	test-auc:0.77293
[30]	train-auc:0.77965	test-auc:0.77314
[40]	train-auc:0.78165	test-auc:0.77163
[50]	train-auc:0.78535	test-auc:0.76915
[55]	train-auc:0.78765	test-auc:0.76716
Last train-AUC: 0.787645, last test-AUC: 0.76716, best iteration: 25, best score: 0.773287
CV with max_depth=7, min_child_weight=1.5
Run CV...
[0]	train-auc:0.76611	test-auc:0.76152
[10]	train-auc:0.77754	test-auc:0.77235
[20]	train-auc:0.77837	tes

In [9]:
# Load the booster saved by the max_depth sweep (models2/trial2_best.model);
# it is used below to generate candidates.
# NOTE(review): trial3_best.model is only written by the commented-out grid-search
# cell — confirm that trial 2 is the intended model here.

best_model_path = 'models2/trial2_best.model'
bst = xgboost.Booster(params={'nthread': 16})  # empty booster configured with 16 threads
bst.load_model(best_model_path)  # restore the trained model from disk

In [10]:
# Initialize pyspark

from pyspark.sql import SparkSession

# Get (or reuse) a session with master 'local[*]'; the parenthesised chain
# replaces the backslash line continuations.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("LightFM Features")
    .getOrCreate()
)

In [11]:
# Base location of the ORC data frames in the GCS bucket.
path_to_df = 'gs://thesis_apc_bucket/df_data2'

# Track table read as a Spark DataFrame from <path_to_df>/df_tracks.orc.
df_tracks = spark.read.orc('{}/df_tracks.orc'.format(path_to_df))

In [12]:
# Score every candidate row, then keep up to 500 best-scored tids per pid.

# xgtest was built from `test` in its original row order and predict() returns a
# plain array, so the assignment below is positional. This cell also rebinds `test`
# to a re-sorted frame; running it a second time would therefore silently attach
# scores to the wrong rows. Fail loudly instead of producing misaligned results.
if 'score' in test.columns:
    raise RuntimeError(
        "'test' already has a 'score' column; reload 'test' (and rebuild 'xgtest') "
        "before re-running this cell so predictions stay aligned with their rows."
    )

test['score'] = bst.predict(xgtest)

# Within each pid, highest score first.
test = test.sort_values(['pid', 'score'], ascending=[True, False])

# One array of up to 500 tids per pid, already in descending score order.
recs = test.groupby('pid').tid.apply(lambda x: x.values[:500])

# Pull the Spark track table into pandas to look up track URIs.
tracks_info = df_tracks.toPandas()
# NOTE(review): the submission cell indexes this Series with track_uri.loc[tids],
# i.e. it assumes the pandas index produced by toPandas() equals tid — confirm the
# row order of df_tracks guarantees that, or index by the tid column explicitly.
track_uri = tracks_info.track_uri

In [13]:
# Write the submission file: a team-info header line followed by one line per pid
# of the form "pid, uri1, uri2, ...".
# The `with` block guarantees the file is flushed and closed even if a URI lookup
# or a write raises part-way through the loop (the original left the handle open
# in that case).
with open('submissions/submission-ps2.csv', 'w') as submission:
    submission.write('team_info,main,Exin,martin.alinggajaya@gmail.com\n')

    for pid, tids in recs.items():
        # Label-based lookup of the recommended tids in the track_uri Series.
        uris = track_uri.loc[tids].values
        submission.write('{}, '.format(pid) + ', '.join(uris) + '\n')

In [14]:
# Show best features

feature_gain = bst.get_score(importance_type='gain')
sorted_feature_gain = {k: v for k, v in sorted(feature_gain.items(), key=lambda x: x[1], reverse=True)}
sorted_feature_gain

{'lightfm_rank': 582.819659129849,
 'co_occurrence_norm_mean': 418.6964036832044,
 'co_occurrence_norm_max': 186.09384870139706,
 'tracks_holdout': 97.02953851312373,
 'co_occurrence_norm_median': 87.076182221016,
 'lightfm_dot_product_text': 84.14416692737292,
 'co_occurrence_mean': 62.49528894717198,
 'lightfm_rank_text': 50.21511059995837,
 'mean_artist_in_playlist': 34.72684822015673,
 'co_occurrence_max': 27.76235828795481,
 'co_occurrence_norm_min': 25.281429473975425,
 'sim_artist_in_playlist': 23.165439228507147,
 'pid_count': 21.05384223465555,
 'co_occurrence_median': 20.3485696234379,
 'share_of_unique_album': 19.369356472521357,
 'pid_bias': 18.71556147980607,
 'lightfm_dot_product': 17.916580977885385,
 'artist_count': 17.55405916934812,
 'tid_bias': 16.479972917338866,
 'co_occurrence_min': 15.411945017539773,
 'share_of_unique_artist': 14.617043014609012,
 'mean_album_in_playlist': 14.441811934021942,
 'pid_bias_text': 12.162134601808507,
 'tid_bias_text': 11.85202970322