In [9]:
import os
import utils
import sklearn.model_selection
import numpy as np
import pandas as pd

# --- Notebook-wide configuration -------------------------------------------
# Seed shared by every stochastic component so runs are reproducible.
RANDOM_STATE = 1337
# Value passed as `n_jobs`; -1 asks joblib-backed code to use all CPU cores.
NUM_JOBS = -1

# Project root. A notebook has no `__file__`, so the kernel's working
# directory is used instead of a hard-coded, machine-specific path.
# NOTE(review): assumes the kernel is started in the folder that contains
# this notebook (Jupyter's default) - confirm for other launchers.
DIR = os.getcwd()
DATA_DIR = os.path.join(DIR, 'data')
REPORTS_DIR = os.path.join(DIR, 'reports')

# Echo the resolved locations so a wrong working directory is obvious.
DATA_DIR, REPORTS_DIR

('C:\\Users\\yahry\\Downloads\\proj\\data',
 'C:\\Users\\yahry\\Downloads\\proj\\reports')

In [7]:
# Show the attribute dictionary of the LightFM experiment configuration.
vars(utils.LightFM_Config)

{'experiment': <Experiment.LIGHT_FM: 0>,
 'split_strategy': TimeSortSplit(num_interactions='all', first_stage_train_split=0.6, second_stage_train_split=0.2, test_split=0.2),
 'filter_strategy': [MinNumInteractionsFilter(min_user_ints=20, min_item_ints=500),
  OnlyLastInteractionsFilter(filter_column='user_id', n_last=20)],
 'concat_stages': True,
 'use_popular_penalty': False}

In [2]:
# Load the interaction data described by the LightFM experiment config;
# the loader prints the train/test sizes and periods itself.
data = utils.load_data(utils.LightFM_Config)

# Preview the first five rows of the training interactions.
data.train_interactions.head(n=5)

Data after filter:
Len of train interactions with period [2023-09-18 00:59:49 / 2023-09-30 18:55:18] - 5856731
Len of test interactions with period [2022-04-10 21:12:58 / 2023-09-29 22:01:52] - 1464183


Unnamed: 0,user_id,item_id,timestamp,weight,index
0,67741701,tt11125620,2023-09-30 18:55:18,0.906272,2928366
1,954501,tt1922777,2023-08-25 08:07:16,0.487424,2928367
2,33880201,tt13133936,2022-03-23 00:43:28,0.99,2928368
3,18471701,tt6301712,2022-08-14 17:43:34,0.749275,2928369
4,20923001,tt2381249,2023-09-16 04:20:39,0.01,2928370


In [None]:
# Build the LightFM dataset, passing `genres` as a list-valued column and
# `lifetime` as a scalar-valued column.
dataset = data.get_lightfm_dataset(
    list_values_columns=['genres'],
    scalar_values_columns=['lifetime'],
)

# Hyper-parameter grid: every combination below is fitted once.
params = dict(
    no_components=[128, 256],
    item_alpha=[0.0005, 0.0001],
    num_epochs=[5, 7, 9],
)

# Extra keyword arguments forwarded to `fit`, and the users to report on.
estimator_params = utils.build_estimator_params(data, dataset)
report_users = utils.get_users_for_test(
    data.train_interactions,
    min_n_interactions=10,
    max_n_interactions=10,
    top_n_hist=10,
)

# Fit one model per grid point and write a per-user report for it; the
# stringified parameter dict is passed as the suffix that tells reports apart.
for p in sklearn.model_selection.ParameterGrid(params):
    model = utils.SklearnEstimatorLightFM(random_state=RANDOM_STATE, **p)
    model.fit(data.train_interactions, **estimator_params)

    report_suffix = f'_{p}'
    utils.users_report(model, report_users, 10, dataset, data, report_suffix, REPORTS_DIR)

In [None]:
# Cross-validated grid search over `params`.
# The seed comes from RANDOM_STATE (config cell) rather than a repeated
# literal, so the estimator and the fold shuffling stay in sync with the rest
# of the notebook when the seed is changed.
estimator = utils.SklearnEstimatorLightFM(random_state=RANDOM_STATE)
scorer = utils.build_scorer(data, dataset)

grid_search = sklearn.model_selection.GridSearchCV(
    estimator,
    params,
    n_jobs=NUM_JOBS,
    # Only the per-split scores are needed; skip refitting a final model.
    refit=False,
    scoring=scorer,
    # 3 shuffled folds.
    # NOTE(review): shuffling ignores interaction timestamps - confirm this
    # is acceptable given the time-based split used by the config.
    cv=sklearn.model_selection.KFold(3, shuffle=True, random_state=RANDOM_STATE),
    verbose=10
)
# Trailing `;` suppresses the repr of the fitted search object.
grid_search.fit(data.train_interactions, **estimator_params);

In [None]:
# Keep only the per-split scores and the parameter dicts from the search results.
wanted_columns = {}
for key, values in grid_search.cv_results_.items():
    if key == 'params' or key.startswith('split'):
        wanted_columns[key] = values
cv_data = pd.DataFrame(wanted_columns)

# Expand each parameter dict into one column per hyper-parameter and place
# those columns in front of the score columns.
param_series = cv_data['params']
param_columns = pd.DataFrame.from_records(param_series.values, index=param_series.index)
score_columns = cv_data.drop(columns='params')
cv_data = pd.concat([param_columns, score_columns], axis=1)

# Persist the table next to the notebook.
cv_data.to_csv('grid_search.csv')

# Display every row of the (small) results table.
cv_data.head(n=len(cv_data))

Unnamed: 0,item_alpha,no_components,split0_test_MAP,split1_test_MAP,split2_test_MAP,split0_test_Recall,split1_test_Recall,split2_test_Recall,split0_test_MeanInvUserFreq,split1_test_MeanInvUserFreq,split2_test_MeanInvUserFreq
0,0.0,32,0.0128,0.013136,0.012941,0.039973,0.040136,0.039717,5.102781,5.112524,5.107895
1,0.0001,32,0.007401,0.011272,0.007078,0.021507,0.035856,0.018465,5.538709,5.243351,5.618066
