In [2]:
import numpy as np
import pandas as pd

from scipy import sparse, stats
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score

from tqdm import tqdm

import lightgbm as lgbm
import implicit

from dataset import ALS_BPR_Dataset
from cv import cross_validation_score_statement, cross_validate_model
from model import LGBM_model

## Get data

In [3]:
# Relative directory prefixes for the train and test CSV files.
TRAIN_PATH, TEST_PATH = 'train/', 'test/'

In [4]:
# Read the three training input tables in one pass (X1, X2, X3 in that order).
X1, X2, X3 = (
    pd.read_csv(TRAIN_PATH + csv_name)
    for csv_name in ('X1.csv', 'X2.csv', 'X3.csv')
)

# Table passed as `Y` to the cross-validation and fitting calls further down.
Y = pd.read_csv(TRAIN_PATH + 'Y.csv')

In [5]:
# Read the three test input tables in one pass (X1, X2, X3 in that order).
X1_test, X2_test, X3_test = (
    pd.read_csv(TEST_PATH + csv_name)
    for csv_name in ('X1.csv', 'X2.csv', 'X3.csv')
)

Concatenate X1 and X1_test and sort them by ids.

In [6]:
# Copies of the train / test X1 tables with the `id` column removed.
X1_no_id = X1.drop(columns='id')
X1__test_no_id = X1_test.drop(columns='id')

# Stack train and test rows and order them by `id`.  `reset_index(drop=True)`
# discards the old row labels directly; materialising them as an `index`
# column and dropping it afterwards fails when the frame already contains a
# column named `index`.
X1_all = (
    pd.concat((X1, X1_test))
    .sort_values(by='id')
    .reset_index(drop=True)
)

Count the number of X2 rows recorded for each user id and add this count as a `counter` feature to X1 and X1_test.

In [7]:
# Distinct user ids in X2 and the number of X2 rows recorded for each of them.
X2_user_ids, X2_counts = np.unique(X2['id'], return_counts=True)

# Build the lookup table column by column so every column keeps its own dtype.
# Stacking ids and counts into a single array first coerces both to a common
# dtype (e.g. the counts become strings when the ids are strings).
url_count_df = pd.DataFrame({'id': X2_user_ids, 'counter': X2_counts})
url_count = url_count_df.to_numpy()  # 2-D (id, counter) array view of the table

# NOTE(review): the inner join drops users that have no rows in X2, while
# later cells still pass the full `X1.id` — confirm this is intended.
X1_url_counter = X1.merge(url_count_df, on='id', how='inner')
X1_url_counter_no_id = X1_url_counter.drop(columns='id')

In [8]:
# Distinct user ids in the test X2 table and the number of rows for each.
X2_user_ids_test, X2_counts_test = np.unique(X2_test['id'], return_counts=True)

# Build the lookup table column by column so every column keeps its own dtype.
# Stacking ids and counts into a single array first coerces both to a common
# dtype (e.g. the counts become strings when the ids are strings).
url_count_df_test = pd.DataFrame({'id': X2_user_ids_test, 'counter': X2_counts_test})
url_count_test = url_count_df_test.to_numpy()  # 2-D (id, counter) array view of the table

# NOTE(review): the inner join drops test users that have no rows in X2_test,
# while later cells still pass the full `X1_test.id` — confirm this is intended.
X1_url_counter_test = X1_test.merge(url_count_df_test, on='id', how='inner')
X1_url_counter_no_id_test = X1_url_counter_test.drop(columns='id')

Concatenate X1_url_counter and X1_url_counter_test and sort them by ids.

In [9]:
# Stack the train and test tables (with the `counter` feature) and order the
# rows by `id`.  `reset_index(drop=True)` discards the old row labels directly;
# turning them into an `index` column and dropping it afterwards fails when the
# frame already contains a column named `index`.
X1_url_counter_all = (
    pd.concat((X1_url_counter, X1_url_counter_test))
    .sort_values(by='id')
    .reset_index(drop=True)
)

Concatenate X2 and X2_test row-wise (the rows keep their original order; no sorting is applied).

In [10]:
# Stack the train and test X2 rows into one frame; rows keep their loaded
# order and their original row labels.
X2_all = pd.concat([X2, X2_test], axis='index')

Incorporate information from X2 via implicit matrix factorization.

In [20]:
# Settings handed to ALS_BPR_Dataset for the first dataset variant.
ALS_params = dict(factors=40, iterations=120)
BPR_params = dict(factors=350, iterations=200)

# `config` and `item_user_emb` are parallel five-element lists.
config = ['als', 'cat', 'cat', 'bpr', 'bpr']
item_user_emb = [
    'item',
    {'als': 'item', 'bpr': 'user'},
    {'als': 'item', 'bpr': 'user'},
    'user',
    'user',
]

dataset_1 = ALS_BPR_Dataset(
    ALS_params=ALS_params,
    BPR_params=BPR_params,
    config=config,
    item_user_emb=item_user_emb,
)

# Fit on the combined train + test tables.
dataset_1.fit(X1_url_counter_all, X2_all)

HBox(children=(IntProgress(value=0, max=120), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [21]:
# Settings handed to ALS_BPR_Dataset for the second dataset variant.
ALS_params = dict(factors=30, iterations=60)
BPR_params = dict(factors=350, iterations=200)

# `config` and `item_user_emb` are parallel five-element lists.
config = ['cat', 'bpr', 'bpr', 'cat', 'bpr']
item_user_emb = [
    {'als': 'user', 'bpr': 'user'},
    'user',
    'user',
    {'als': 'user', 'bpr': 'user'},
    'user',
]

dataset_2 = ALS_BPR_Dataset(
    ALS_params=ALS_params,
    BPR_params=BPR_params,
    config=config,
    item_user_emb=item_user_emb,
)

# Fit on the combined train + test tables.
dataset_2.fit(X1_url_counter_all, X2_all)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [22]:
# Settings handed to ALS_BPR_Dataset for the third dataset variant.
ALS_params = dict(factors=40, iterations=60)
BPR_params = dict(factors=350, iterations=170)

# `config` and `item_user_emb` are parallel five-element lists.
config = ['cat', 'bpr', 'bpr', 'bpr', 'cat']
item_user_emb = [
    {'als': 'item', 'bpr': 'item'},
    'item',
    'item',
    'item',
    {'als': 'item', 'bpr': 'item'},
]

dataset_3 = ALS_BPR_Dataset(
    ALS_params=ALS_params,
    BPR_params=BPR_params,
    config=config,
    item_user_emb=item_user_emb,
)

# Fit on the combined train + test tables.
dataset_3.fit(X1_url_counter_all, X2_all)

HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




HBox(children=(IntProgress(value=0, max=170), HTML(value='')))




## Evaluate model's performance by cross-validation

In [23]:
# Five LightGBM parameter dicts (presumably one per target, since they are used
# with use_same_params=False): a dedicated setting first, then the same values
# four times.  The four dicts are built as independent objects, because
# `4 * [d]` repeats a reference to one dict and an in-place change to any entry
# would then silently change all four.
lgbm_params_1 = [
    {'learning_rate': 0.0017, 'n_estimators': 550, 'max_depth': 4, 'feature_fraction': 0.75}
] + [
    {'learning_rate': 0.004, 'n_estimators': 760, 'max_depth': 3, 'feature_fraction': 0.55}
    for _unused in range(4)
]

# Cross-validate on dataset_1 with the per-entry parameter list; the returned
# value is discarded.
_ = cross_validate_model(
    dataset_1,
    X1['id'],
    Y,
    lgbm_params_1,
    use_same_params=False,
)

Target 1: mean = 0.6061, std = 0.0246
Target 2: mean = 0.6315, std = 0.0087
Target 3: mean = 0.6294, std = 0.0135
Target 4: mean = 0.6233, std = 0.0124
Target 5: mean = 0.6337, std = 0.0163
All targets: mean = 0.6248, std = 0.0100


In [24]:
# Single LightGBM parameter set, used with use_same_params=True.
lgbm_params_2 = dict(
    learning_rate=0.004,
    n_estimators=760,
    max_depth=3,
    feature_fraction=0.55,
)

# Cross-validate on dataset_2; the returned value is discarded.
_ = cross_validate_model(
    dataset_2,
    X1['id'],
    Y,
    lgbm_params_2,
    use_same_params=True,
)

Target 1: mean = 0.6028, std = 0.0243
Target 2: mean = 0.6407, std = 0.0125
Target 3: mean = 0.6292, std = 0.0167
Target 4: mean = 0.6226, std = 0.0112
Target 5: mean = 0.6359, std = 0.0108
All targets: mean = 0.6262, std = 0.0132


In [25]:
# Single LightGBM parameter set, used with use_same_params=True.
lgbm_params_3 = dict(
    learning_rate=0.004,
    n_estimators=760,
    max_depth=3,
    feature_fraction=0.55,
)

# Cross-validate on dataset_3; the returned value is discarded.
_ = cross_validate_model(
    dataset_3,
    X1['id'],
    Y,
    lgbm_params_3,
    use_same_params=True,
)

Target 1: mean = 0.6077, std = 0.0270
Target 2: mean = 0.6335, std = 0.0146
Target 3: mean = 0.6253, std = 0.0139
Target 4: mean = 0.6233, std = 0.0184
Target 5: mean = 0.6255, std = 0.0153
All targets: mean = 0.6231, std = 0.0085


## Fit models with 10 random seeds and make final predictions

In [28]:
# Model for dataset_1, built from the per-entry parameter list.
lgbm_model_1 = LGBM_model(lgbm_params_1, use_same_params=False)

# Fit on the train ids / targets and predict for the test ids.
preds_1 = lgbm_model_1.fit_predict_n_random_seed(
    dataset_1,
    X1['id'],
    Y,
    X1_test['id'],
)

In [29]:
# Model for dataset_2, using one shared parameter set.
lgbm_model_2 = LGBM_model(lgbm_params_2, use_same_params=True)

# Fit on the train ids / targets and predict for the test ids.
preds_2 = lgbm_model_2.fit_predict_n_random_seed(
    dataset_2,
    X1['id'],
    Y,
    X1_test['id'],
)

In [30]:
# Model for dataset_3, using one shared parameter set.
lgbm_model_3 = LGBM_model(lgbm_params_3, use_same_params=True)

# Fit on the train ids / targets and predict for the test ids.
preds_3 = lgbm_model_3.fit_predict_n_random_seed(
    dataset_3,
    X1['id'],
    Y,
    X1_test['id'],
)

In [31]:
# Equal-weight average of the three models' predictions.
preds_total = preds_1 + preds_2
preds_total = preds_total + preds_3
final_preds = preds_total / 3

## Make a submission.

In [33]:
# Refuse to write a submission whose rows cannot be matched one-to-one with the
# test ids: an index-aligned concat would silently pad the shorter side with
# NaN instead of failing.
if len(final_preds) != len(X1_test):
    raise ValueError(
        "final_preds has {} rows but X1_test has {} ids".format(
            len(final_preds), len(X1_test)
        )
    )

# One column per target, named '1'..'5'.
answers_df = pd.DataFrame(data=final_preds, columns=['1', '2', '3', '4', '5'])

# Attach the test ids positionally as the first column.
# Assumes final_preds rows are in X1_test row order — TODO confirm against
# LGBM_model.fit_predict_n_random_seed.
answers_df.insert(0, 'id', X1_test['id'].to_numpy())

answers_df.to_csv("final_submission.csv", index=False)