In [1]:
import os
import pickle
import warnings

import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rectools.metrics
import rectools.models
import sklearn.model_selection
from IPython.display import clear_output

import hueristics
import utils


# Seed reused by the LightFM estimator cells and `n_jobs` value handed to
# GridSearchCV further down.
RANDOM_STATE = 1337
NUM_JOBS = -1

# Project root and staging-database connection settings, published through the
# process environment (presumably consumed by `utils.load_data` — confirm).
os.environ['DIR'] = "/home/ml/softezza_ml/"
os.environ['DB_ENDPOINT'] = "apollo-api-staging-f82be878-d243-4113-8052-ef36565618e0.cpljy7lbflfq.eu-west-1.rds.amazonaws.com"
os.environ['DB_PORT'] = '3306'
os.environ['DB_USER'] = "admin"
os.environ['DB_NAME'] = "vapor"

# SECURITY: the database password is deliberately NOT stored in the notebook.
# Export DB_PASSWORD in the environment that launches Jupyter. The value that
# used to be hard-coded here has been committed and should be rotated.
if not os.environ.get('DB_PASSWORD'):
    warnings.warn(
        "DB_PASSWORD is not set; export it in the environment before loading data.",
        RuntimeWarning,
    )

# Derived directories used for input data and generated reports.
DATA_DIR = os.path.join(os.environ['DIR'], 'data')
REPORTS_DIR = os.path.join(os.environ['DIR'], 'reports')

DATA_DIR, REPORTS_DIR

('/home/ml/softezza_ml/data', '/home/ml/softezza_ml/reports')

In [2]:
# Splitting and filtering strategies are built first so the DataConfig call
# below stays readable. The positional arguments are passed through unchanged;
# their exact meaning is defined in the project's `utils` module.
split_strategy = utils.TimeSortSplit('all', 0.6, 0.2, 0.2)
interaction_filters = [
    utils.MinNumInteractionsFilter(10, 500),
    utils.OnlyLastInteractionsFilter('user_id', 20),
]

config = utils.DataConfig(
    experiment=utils.Experiment.LIGHT_FM,
    split_strategy=split_strategy,
    filter_strategy=interaction_filters,
    concat_stages=True,
    use_popular_penalty=False,
)

# `load_data` prints a summary of the filtered interactions (see cell output).
data = utils.load_data(config)
train_dataset, test_dataset = data.get_rectools_dataset()

# Preview of the training interactions.
data.train_interactions.head()

Data after filter:
Len of train interactions with period [['2023-08-24T00:28:10.000000000'] / ['2023-08-24T00:28:09.000000000']] - 6002853
Len of test interactions with period [['2023-09-14T06:04:34.000000000'] / ['2023-10-04T11:31:55.000000000']] - 509348
Num of uniq users 364765Num of uniq items 8223


Unnamed: 0,user_id,item_id,timestamp,weight,index
0,15847001,tt0317219,2023-08-24 00:28:09,0.929345,3384613
1,29835903,tt0947798,2023-08-24 00:28:07,0.952778,3384614
2,24085201,tt5761544,2023-08-24 00:28:04,0.95084,3384615
3,68446701,tt0099472,2023-08-24 00:28:04,0.99,3384616
4,19975701,tt13932162,2023-08-24 00:28:04,0.939482,3384617


### Проверка лучшей KION-модели

In [3]:
# Dedicated datasets for the baseline so later cells can rebind `train_dataset`.
bm25_train_dataset, bm25_test_dataset = data.get_rectools_dataset()

# BM25 item-KNN with 50 neighbours, K1=0.1 and B=0.75.
bm25_core = implicit.nearest_neighbours.BM25Recommender(K=50, K1=0.1, B=0.75, num_threads=12)
bm25_model = rectools.models.ImplicitItemKNNWrapperModel(bm25_core, verbose=1)

# Trailing semicolon hides the repr of the value returned by `fit`.
bm25_model.fit(bm25_train_dataset);

100%|██████████| 8223/8223 [00:00<00:00, 33944.81it/s]


In [11]:
# Top-10 recommendations from the baseline for every user in `data.all_users`,
# with the viewed-items filter on and an explicit rank column in the result.
bm25_reco_options = dict(k=10, filter_viewed=True, add_rank_col=True)
recos = bm25_model.recommend(
    users=data.all_users,
    dataset=bm25_train_dataset,
    **bm25_reco_options,
)

In [18]:
# Score the baseline recommendations against the held-out test interactions.
metrics = rectools.metrics.calc_metrics(
    {
        'MAP@10': rectools.metrics.MAP(10),
        'Recall@10': rectools.metrics.Recall(10),
        'Siren@10': rectools.metrics.Serendipity(10),
        'MIUF@10': rectools.metrics.MeanInvUserFreq(10)
    },
    reco=recos,
    interactions=data.test_interactions,
    prev_interactions=data.train_interactions,
    catalog=data.all_items
)

# `PopularIntersect` is a project metric from `utils` (the grid-search cells
# call it as `utils.PopularIntersect`). The bare name is undefined on a fresh
# kernel and raised NameError under Restart & Run All.
metrics['PopInt@10'] = utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)

# One-row frame so the metrics render as a table.
pd.DataFrame.from_records((metrics,))

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10
0,0.153368,0.063986,4.140743,0.000104,0.236565


In [13]:
# Pick users with 5 to 7 training interactions for a qualitative look at
# history versus recommendations.
test_users = utils.get_users_for_test(
    data.train_interactions,
    max_n_interactions=7,
    min_n_interactions=5,
)

bm25_users_report = utils.users_report(bm25_train_dataset, bm25_model, test_users)
bm25_users_report.head(21)

100%|██████████| 1/1 [00:00<00:00, 3795.75it/s]
100%|██████████| 1/1 [00:00<00:00, 3401.71it/s]
100%|██████████| 1/1 [00:00<00:00, 2568.47it/s]
100%|██████████| 1/1 [00:00<00:00, 5698.78it/s]
100%|██████████| 1/1 [00:00<00:00, 5769.33it/s]
100%|██████████| 1/1 [00:00<00:00, 5526.09it/s]
100%|██████████| 1/1 [00:00<00:00, 4514.86it/s]
100%|██████████| 1/1 [00:00<00:00, 5133.79it/s]
100%|██████████| 1/1 [00:00<00:00, 2790.62it/s]
100%|██████████| 1/1 [00:00<00:00, 5753.50it/s]


Unnamed: 0,user_id,hist_item_id,hist_title,hist_genres,pred_item_id,pred_title,pred_genres
0,72749402.0,tt1745960,Top Gun: Maverick,"""Action / Drama""",tt9247314,Spiral,"""Drama / Horror / Mystery / Thriller"""
1,,tt9247314,Spiral,"""Drama / Horror / Mystery / Thriller""",tt2179136,American Sniper,"""Action / Biography / Drama / War"""
2,,tt3704428,Elvis,"""Biography / Drama / Music""",tt9100018,Cobweb,"""Horror / Thriller"""
3,,tt2106476,The Hunt,"""Drama""",tt2106476,The Hunt,"""Drama"""
4,,tt12593682,Bullet Train,"""Action / Comedy / Thriller""",tt19858164,Little Bone Lodge,"""Crime / Horror / Thriller"""
5,,tt13495458,The Inspection,"""Drama""",tt10406596,You Are Not My Mother,"""Drama / Horror"""
6,,tt2179136,American Sniper,"""Action / Biography / Drama / War""",tt13405778,Insidious: The Red Door,"""Horror / Mystery / Thriller"""
7,,,,,tt7631568,The Devil to Pay,"""Thriller"""
8,,,,,tt11908982,Jakob's Wife,"""Horror"""
9,,,,,tt15671028,No Hard Feelings,"""Comedy / Romance"""


### Проверка по параметрам разных моделей

In [9]:
# Hyper-parameter grids for the three item-KNN similarity variants from
# `implicit`. Each entry holds the recommender class and its parameter grid.
grids = {
    'cosine': {
        'model': implicit.nearest_neighbours.CosineRecommender,
        'grid': {
            'K': [10, 20, 50, 100, 150, 200]
        }
    },

    'tf_idf': {
        'model': implicit.nearest_neighbours.TFIDFRecommender,
        'grid': {
            'K': [10, 20, 50, 100, 150, 200]
        }
    },

    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        'grid': {
            'K': [50, 100, 200],
            'K1': [0.1, 1.2, 5.],
            'B': [0.1, 0.5, 0.7]
        }
    }
}

# One metrics dict per (model, parameter combination).
results = []

for label, params in grids.items():

    for p in sklearn.model_selection.ParameterGrid(params['grid']):
        # Public wrapper path, same class the BM25 baseline cell uses.
        # (A stray no-op `rectools` expression statement was removed here.)
        model = rectools.models.ImplicitItemKNNWrapperModel(params['model'](num_threads=12, **p))
        model.fit(train_dataset)

        recos = model.recommend(
            k=10,
            users=data.all_users,
            dataset=train_dataset,
            filter_viewed=True,
            add_rank_col=True,
        )

        metrics = rectools.metrics.calc_metrics(
            {
                'MAP@10': rectools.metrics.MAP(10),
                'Recall@10': rectools.metrics.Recall(10),
                'Siren@10': rectools.metrics.Serendipity(10),
                'MIUF@10': rectools.metrics.MeanInvUserFreq(10)
            },
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items
        )
        # Project-specific metrics from `utils`, added next to the RecTools ones.
        metrics['PopInt@10'] = utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)
        metrics['RecallNoPop@10'] = utils.RecallNoPop(10).calc(reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions)
        metrics['model'] = label
        # Append the hyper-parameters so every row of the report is self-describing.
        metrics = {**metrics, **p}
        results.append(metrics)
        print(metrics)


# Parameters a model does not have (e.g. K1/B for cosine) become empty strings.
grid_data = pd.DataFrame.from_records(results).fillna('')
grid_data.to_csv('grid_report.csv', index=False)
# Drop the per-iteration prints once the summary table is ready.
clear_output(wait=True)

grid_data.sort_values(['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10'], ascending=[False, True, False, False]).head(40)


Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,K,B,K1
28,0.086603,0.030829,5.921746,0.000263,0.053398,0.063613,bm25,200,0.5,1.2
25,0.081184,0.029504,5.974909,0.000265,0.050316,0.062835,bm25,100,0.5,1.2
22,0.078193,0.028899,6.087458,0.000268,0.049978,0.060414,bm25,50,0.5,1.2
37,0.06488,0.023855,6.630329,0.000309,0.026052,0.058542,bm25,200,0.7,1.2
14,0.147778,0.060658,4.232687,0.000112,0.21869,0.057454,bm25,50,0.1,5.0
29,0.062165,0.022976,6.744681,0.000315,0.02321,0.05729,bm25,200,0.5,5.0
34,0.062624,0.023414,6.680587,0.000308,0.026293,0.057258,bm25,100,0.7,1.2
17,0.155154,0.063576,4.16199,0.000108,0.233245,0.057177,bm25,100,0.1,5.0
20,0.156688,0.063702,4.161702,0.000106,0.238141,0.056745,bm25,200,0.1,5.0
26,0.060268,0.022662,6.793215,0.000314,0.023293,0.056147,bm25,100,0.5,5.0


### Проверка по параметрам bm25 модели 1

In [4]:
# Fresh train/test RecTools datasets for this sweep.
train_dataset, test_dataset = data.get_rectools_dataset()

# BM25 sweep 1: neighbourhood size fixed at 50, K1 and B varied.
grids = {
    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        'grid': {
            'K': [50],
            'K1': [0.5, 1.25, 2.5, 5.],
            'B': [0.25, 0.5, 0.75, 1.],
        },
    },
}

results = []

for model_name, spec in grids.items():
    recommender_cls = spec['model']

    for hyper in sklearn.model_selection.ParameterGrid(spec['grid']):
        model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(
            recommender_cls(num_threads=12, **hyper)
        )
        model.fit(train_dataset)

        recos = model.recommend(
            users=data.all_users,
            dataset=train_dataset,
            k=10,
            filter_viewed=True,
            add_rank_col=True,
        )

        ranking_metrics = {
            'MAP@10': rectools.metrics.MAP(10),
            'Recall@10': rectools.metrics.Recall(10),
            'Siren@10': rectools.metrics.Serendipity(10),
            'MIUF@10': rectools.metrics.MeanInvUserFreq(10),
        }
        metrics = rectools.metrics.calc_metrics(
            ranking_metrics,
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items,
        )

        # Project-specific popularity-aware metrics.
        metrics['PopInt@10'] = utils.PopularIntersect(10).calc(
            reco=recos,
            prev_interactions=data.train_interactions,
        )
        metrics['RecallNoPop@10'] = utils.RecallNoPop(10).calc(
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
        )
        metrics['model'] = model_name
        # Hyper-parameters go last so the column order of the report is kept.
        metrics.update(hyper)
        results.append(metrics)
        print(metrics)


grid_data = pd.DataFrame.from_records(results)
grid_data = grid_data.fillna('')
grid_data.to_csv('grid_report_bm25.csv', index=False)
# Replace the per-iteration prints with the summary table.
clear_output(wait=True)

sort_columns = ['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10']
grid_data.sort_values(by=sort_columns, ascending=[False, True, False, False]).head(40)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,B,K,K1
1,0.119698,0.045357,4.929836,0.000179,0.131305,0.063335,bm25,0.25,50,1.25
4,0.105114,0.039211,5.2723,0.000209,0.100415,0.063,bm25,0.5,50,0.5
2,0.104354,0.038846,5.294268,0.00021,0.099507,0.062826,bm25,0.25,50,2.5
3,0.095467,0.035281,5.529936,0.000228,0.080604,0.062541,bm25,0.25,50,5.0
8,0.080963,0.029969,5.996568,0.000264,0.054755,0.060886,bm25,0.75,50,0.5
5,0.077278,0.02858,6.121813,0.000271,0.048105,0.060224,bm25,0.5,50,1.25
0,0.145543,0.059337,4.299071,0.000119,0.208165,0.058708,bm25,0.25,50,0.5
12,0.065951,0.024806,6.601037,0.000302,0.032909,0.056798,bm25,1.0,50,0.5
6,0.06566,0.02469,6.594119,0.000299,0.032672,0.056359,bm25,0.5,50,2.5
7,0.058302,0.022359,6.90803,0.000315,0.024174,0.053616,bm25,0.5,50,5.0


### Проверка по параметрам bm25 модели 2

In [None]:
# BM25 sweep 2: three neighbourhood sizes crossed with a narrower K1/B range.
grids = {
    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        'grid': {
            'K': [50, 100, 200],
            'K1': [0.5, 1.25, 2.5],
            'B': [0.25, 0.5],
        },
    },
}

# Flatten (model label, recommender class, parameter combination) up front so
# the evaluation loop below has a single level.
candidates = [
    (name, spec['model'], combo)
    for name, spec in grids.items()
    for combo in sklearn.model_selection.ParameterGrid(spec['grid'])
]

results = []

for model_name, recommender_cls, combo in candidates:
    model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(recommender_cls(num_threads=12, **combo))
    model.fit(train_dataset)

    recos = model.recommend(
        k=10,
        users=data.all_users,
        dataset=train_dataset,
        filter_viewed=True,
        add_rank_col=True,
    )

    metrics = rectools.metrics.calc_metrics(
        {
            'MAP@10': rectools.metrics.MAP(10),
            'Recall@10': rectools.metrics.Recall(10),
            'Siren@10': rectools.metrics.Serendipity(10),
            'MIUF@10': rectools.metrics.MeanInvUserFreq(10),
        },
        reco=recos,
        interactions=data.test_interactions,
        prev_interactions=data.train_interactions,
        catalog=data.all_items,
    )

    pop_intersect = utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)
    recall_no_pop = utils.RecallNoPop(10).calc(
        reco=recos,
        interactions=data.test_interactions,
        prev_interactions=data.train_interactions,
    )
    metrics['PopInt@10'] = pop_intersect
    metrics['RecallNoPop@10'] = recall_no_pop
    metrics['model'] = model_name
    # Hyper-parameters are appended after the metric columns.
    metrics = dict(metrics, **combo)
    results.append(metrics)
    print(metrics)


grid_data = pd.DataFrame.from_records(results).fillna('')
grid_data.to_csv('grid_report_bm25_2.csv', index=False)
clear_output(wait=True)

grid_data.sort_values(
    ['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10'],
    ascending=[False, True, False, False],
).head(40)

### Проверка по параметрам bm25 модели 3

In [7]:
# BM25 sweep 3: K1 and B pinned at 0.5, only the neighbourhood size varies.
bm25_grid_3 = {
    'K': [100, 150, 250, 400],
    'K1': [0.5],
    'B': [0.5],
}
grids = {
    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        'grid': bm25_grid_3,
    },
}

results = []

for model_name in grids:
    spec = grids[model_name]

    for hyper in sklearn.model_selection.ParameterGrid(spec['grid']):
        knn = spec['model'](num_threads=12, **hyper)
        model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(knn)
        model.fit(train_dataset)

        recos = model.recommend(
            dataset=train_dataset,
            users=data.all_users,
            k=10,
            add_rank_col=True,
            filter_viewed=True,
        )

        metrics = rectools.metrics.calc_metrics(
            {
                'MAP@10': rectools.metrics.MAP(10),
                'Recall@10': rectools.metrics.Recall(10),
                'Siren@10': rectools.metrics.Serendipity(10),
                'MIUF@10': rectools.metrics.MeanInvUserFreq(10),
            },
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items,
        )

        # Extra project metrics, then the model label, then the parameters.
        # Insertion order defines the column order of the report.
        metrics['PopInt@10'] = utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)
        metrics['RecallNoPop@10'] = utils.RecallNoPop(10).calc(reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions)
        metrics['model'] = model_name
        metrics = {**metrics, **hyper}

        results.append(metrics)
        print(metrics)


grid_data = pd.DataFrame.from_records(results).fillna('')
grid_data.to_csv('grid_report_bm25_3.csv', index=False)
# Clear the progress prints so only the ranked table remains.
clear_output(wait=True)

ranking_order = {
    'RecallNoPop@10': False,
    'PopInt@10': True,
    'Recall@10': False,
    'MAP@10': False,
}
grid_data.sort_values(list(ranking_order.keys()), ascending=list(ranking_order.values())).head(40)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,B,K,K1
2,0.11849,0.043307,5.103281,0.000198,0.115057,0.065422,bm25,0.5,250,0.5
1,0.115993,0.042639,5.118928,0.0002,0.111295,0.065279,bm25,0.5,150,0.5
0,0.112172,0.041432,5.158009,0.000203,0.104473,0.065268,bm25,0.5,100,0.5
3,0.117902,0.042944,5.112807,0.000198,0.115891,0.064873,bm25,0.5,400,0.5


### Проверка по параметрам bm25 модели 4

In [16]:
# BM25 sweep 4: fine-grained K1/B search around 0.5 with 100 neighbours.
grids = {
    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        'grid': {
            'K': [100],
            'K1': [0.4, 0.5, 0.6],
            'B': [0.4, 0.5, 0.6],
        },
    },
}

results = []

for model_name, spec in grids.items():
    param_grid = sklearn.model_selection.ParameterGrid(spec['grid'])

    for hyper in param_grid:
        model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(
            spec['model'](num_threads=12, **hyper)
        )
        model.fit(train_dataset)

        recos = model.recommend(
            k=10,
            users=data.all_users,
            dataset=train_dataset,
            filter_viewed=True,
            add_rank_col=True,
        )

        # Arguments shared by the RecTools metric call.
        eval_inputs = dict(
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items,
        )
        metrics = rectools.metrics.calc_metrics(
            {
                'MAP@10': rectools.metrics.MAP(10),
                'Recall@10': rectools.metrics.Recall(10),
                'Siren@10': rectools.metrics.Serendipity(10),
                'MIUF@10': rectools.metrics.MeanInvUserFreq(10),
            },
            **eval_inputs,
        )
        metrics['PopInt@10'] = utils.PopularIntersect(10).calc(
            reco=recos, prev_interactions=data.train_interactions
        )
        metrics['RecallNoPop@10'] = utils.RecallNoPop(10).calc(
            reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions
        )
        metrics['model'] = model_name
        metrics.update(hyper)
        results.append(metrics)
        print(metrics)


grid_data = pd.DataFrame.from_records(results).fillna('')
grid_data.to_csv('grid_report_bm25_4.csv', index=False)
clear_output(wait=True)

# Best non-popular recall first; ties broken by lower popularity overlap.
grid_data.sort_values(
    by=['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10'],
    ascending=[False, True, False, False],
).head(40)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,B,K,K1
6,0.10975,0.040415,5.218978,0.000208,0.09941,0.065276,bm25,0.6,100,0.4
4,0.112172,0.041432,5.158009,0.000203,0.104473,0.065268,bm25,0.5,100,0.5
5,0.104415,0.038187,5.340286,0.000218,0.089027,0.065236,bm25,0.5,100,0.6
7,0.099985,0.036302,5.464859,0.000228,0.079933,0.065169,bm25,0.6,100,0.5
2,0.119483,0.044686,4.977873,0.000186,0.121507,0.064638,bm25,0.4,100,0.6
3,0.121917,0.045793,4.927165,0.000182,0.127004,0.064529,bm25,0.5,100,0.4
8,0.092217,0.033363,5.663779,0.000243,0.067315,0.064382,bm25,0.6,100,0.6
1,0.126605,0.048094,4.811482,0.000171,0.139574,0.063851,bm25,0.4,100,0.5
0,0.135979,0.052757,4.606971,0.000151,0.165355,0.062682,bm25,0.4,100,0.4


### Добавление косинусного расстояния

In [3]:
def _genre_flags(raw):
    """Turn a raw genres string such as '"Action / Drama"' into {genre: 1}.

    Double quotes and spaces are stripped before splitting on '/'.
    """
    return {genre: 1 for genre in raw.replace('"', '').replace(' ', '').split('/')}


# Item metadata lives under the project root configured in the first cell,
# so the path is derived from it instead of repeating the absolute prefix.
item_meta_path = os.path.join(os.environ['DIR'], 'static_mappers', 'item_id2meta.csv')
item_genres = pd.read_csv(item_meta_path, usecols=['item_id', 'genres'])

# One indicator column per genre; genres an item lacks become 0.
items_features = pd.DataFrame.from_records(item_genres['genres'].apply(_genre_flags)).fillna(0)
items_features['item_id'] = item_genres['item_id']

items_features

Unnamed: 0,Drama,History,War,Horror,Mystery,Thriller,Fantasy,Romance,Crime,Documentary,...,Sci-Fi,Musical,Biography,Film-Noir,Short,Reality-TV,Talk-Show,News,Game-Show,item_id
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0004972
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0006864
2,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0010323
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0011237
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0011841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15803,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9894470
15804,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9900092
15805,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9902160
15806,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9907782


In [4]:
# Genre-based cosine heuristic fitted on the training interactions.
cosine = hueristics.Cosine()
cosine.fit(data.train_interactions, items_features=items_features)

# Sets the name-mangled "fitted" flag of HueristicsWrapper directly on the
# heuristic. NOTE(review): presumably works around a fitted-check in the
# wrapper — confirm there is no public way to mark it as fitted.
setattr(cosine, '_HueristicsWrapper__fitted', True)

# The keyword names the heuristic and the tuple pairs it with 0.5.
# NOTE(review): `enres_cosine_05` looks like a truncated `genres_cosine_05`,
# and 0.5 is presumably a weight — confirm both.
hueristic = hueristics.HueristicsWrapper(enres_cosine_05=(cosine, 0.5))

In [5]:
# Single BM25 configuration whose candidates are re-ranked by the genre-cosine
# heuristic built in the previous cell.
grids = {
    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        'grid': {
            'K': [200],
            'K1': [1.25],
            'B': [0.25]
        }
    }
}

results = []

for label, params in grids.items():

    for p in sklearn.model_selection.ParameterGrid(params['grid']):
        model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(params['model'](num_threads=12, **p))
        model.fit(train_dataset)

        # Ask for 100 candidates per user; the heuristic cuts them to 10 below.
        recos = model.recommend(
            k=100,
            users=data.all_users,
            dataset=train_dataset,
            filter_viewed=True,
            add_rank_col=False,
        )

        recos = hueristic.rerank(
            k=10,
            reco=recos
        )

        metrics = rectools.metrics.calc_metrics(
            {
                'MAP@10': rectools.metrics.MAP(10),
                'Recall@10': rectools.metrics.Recall(10),
                'Siren@10': rectools.metrics.Serendipity(10),
                'MIUF@10': rectools.metrics.MeanInvUserFreq(10)
            },
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items
        )
        metrics['PopInt@10'] = utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)
        metrics['RecallNoPop@10'] = utils.RecallNoPop(10).calc(reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions)
        metrics['model'] = label
        metrics = {**metrics, **p}
        results.append(metrics)
        print(metrics)


grid_data = pd.DataFrame.from_records(results).fillna('')
# Own report file: this cell used to write 'grid_report_bm25_4.csv' and thereby
# overwrote the results of BM25 sweep 4.
grid_data.to_csv('grid_report_bm25_cosine_rerank.csv', index=False)
clear_output(wait=True)

grid_data.sort_values(['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10'], ascending=[False, True, False, False]).head(40)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,B,K,K1
0,0.077123,0.028209,5.234674,0.000145,0.05715,0.05613,bm25,0.25,200,1.25


In [6]:
# Persist the BM25 model left in `model` by the re-ranking cell
# (K=200, K1=1.25, B=0.25). The location is derived from the project root
# configured in the first cell; `pickle` is imported in the top import cell.
rerank_model_path = os.path.join(os.environ['DIR'], 'models', 'knn', "bm25_K200_K11.25_B0.25.pickle")

# Mode 'x' creates the file exclusively: re-running the cell raises
# FileExistsError instead of overwriting an already saved model.
with open(rerank_model_path, mode='+xb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

### Сохранение лучших моделей

In [13]:
# Hand-picked BM25 configurations from the sweeps above, one (K, B, K1) tuple each.
best_param_rows = [
    (50, 0.25, 2.50),
    (50, 0.50, 0.50),
    (100, 0.25, 2.50),
    (100, 0.50, 0.50),
    (200, 0.25, 2.50),
    (200, 0.50, 0.50),
    (50, 0.25, 1.25),
    (100, 0.25, 1.25),
    (200, 0.25, 1.25),
]

# Largest K first, then B, then K1, with a clean 0..n-1 index.
best_params = pd.DataFrame.from_records(best_param_rows, columns=['K', 'B', 'K1'])
best_params = best_params.sort_values(by=['K', 'B', 'K1'], ascending=False)
best_params = best_params.reset_index(drop=True)

best_params.head(20)

Unnamed: 0,K,B,K1
0,200,0.5,0.5
1,200,0.25,2.5
2,200,0.25,1.25
3,100,0.5,0.5
4,100,0.25,2.5
5,100,0.25,1.25
6,50,0.5,0.5
7,50,0.25,2.5
8,50,0.25,1.25


In [20]:
# Refit, evaluate and pickle every configuration listed in `best_params`.
# `pickle` is imported in the top import cell.
results = []

# Output directory derived from the project root configured in the first cell.
models_dir = os.path.join(os.environ['DIR'], 'models', 'knn')

for _, row in best_params.iterrows():
    # `iterrows` upcasts the whole row to float64 (K shows up as 50.0 in the
    # table below), so K is cast back to int before it reaches the recommender.
    knn = implicit.nearest_neighbours.BM25Recommender(
        K=int(row['K']),
        K1=row['K1'],
        B=row['B'],
        num_threads=12,
    )
    model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(knn)
    model.fit(train_dataset)

    recos = model.recommend(
        k=10,
        users=data.all_users,
        dataset=train_dataset,
        filter_viewed=True,
        add_rank_col=True,
    )

    metrics = rectools.metrics.calc_metrics(
        {
            'MAP@10': rectools.metrics.MAP(10),
            'Recall@10': rectools.metrics.Recall(10),
            'Siren@10': rectools.metrics.Serendipity(10),
            'MIUF@10': rectools.metrics.MeanInvUserFreq(10)
        },
        reco=recos,
        interactions=data.test_interactions,
        prev_interactions=data.train_interactions,
        catalog=data.all_items
    )
    metrics['PopInt@10'] = utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)
    metrics['RecallNoPop@10'] = utils.RecallNoPop(10).calc(reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions)
    metrics['model'] = 'bm25'
    metrics = {**metrics, **row}
    results.append(metrics)

    # The file name is still built from the float row values (e.g. 'K200.0'),
    # so artifacts keep the names they had before. Mode 'x' refuses to
    # overwrite an existing pickle (FileExistsError on re-run).
    model_path = os.path.join(models_dir, f"bm25_K{row['K']}_K1{row['K1']}_B{row['B']}.pickle")
    with open(model_path, mode='+xb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)


# Least popularity-biased configurations first.
grid_data = pd.DataFrame.from_records(results).sort_values(['PopInt@10', 'Recall@10', 'MAP@10'], ascending=[True, False, False])
grid_data.to_csv(os.path.join(models_dir, 'meta.csv'))

grid_data.head(40)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,model,K,B,K1
7,0.104393,0.038852,5.294475,0.00021,0.099451,bm25,50.0,0.25,2.5
6,0.105175,0.039246,5.27361,0.000209,0.100029,bm25,50.0,0.5,0.5
4,0.110683,0.040791,5.185683,0.000203,0.101525,bm25,100.0,0.25,2.5
3,0.112202,0.041445,5.158078,0.000202,0.104386,bm25,100.0,0.5,0.5
1,0.116439,0.042601,5.134638,0.000199,0.110904,bm25,200.0,0.25,2.5
0,0.117926,0.043239,5.106162,0.000198,0.113707,bm25,200.0,0.5,0.5
8,0.119831,0.045402,4.93102,0.000179,0.131334,bm25,50.0,0.25,1.25
5,0.125847,0.047755,4.824942,0.000171,0.1381,bm25,100.0,0.25,1.25
2,0.131121,0.049233,4.790733,0.000167,0.147111,bm25,200.0,0.25,1.25


### Добавление косинусного расстояния между жанрами

### SkLearn Cross-Validate

In [None]:
# LightFM dataset built with `genres` as a list-valued column and `lifetime`
# as a scalar-valued column.
dataset = data.get_lightfm_dataset(
    scalar_values_columns=['lifetime'],
    list_values_columns=['genres'],
)

# Grid shared with the GridSearchCV cell below.
params = {
    'no_components': [128, 256],
    'item_alpha': [0.0005, 0.0001],
    'num_epochs': [5, 7, 9],
}

estimator_params = utils.build_estimator_params(data, dataset)
report_users = utils.get_users_for_test(
    data.train_interactions,
    min_n_interactions=10,
    max_n_interactions=10,
    top_n_hist=10,
)

# Fit one estimator per parameter combination and produce a per-user report.
# NOTE(review): `users_report` is called here with a different argument list
# than in the BM25 baseline cell (dataset, model, users) — confirm which
# signature the current `utils` module implements.
for combo in sklearn.model_selection.ParameterGrid(params):
    model = utils.SklearnEstimatorLightFM(random_state=RANDOM_STATE, **combo)
    model.fit(data.train_interactions, **estimator_params)

    report_suffix = f"_{combo}"
    utils.users_report(model, report_users, 10, dataset, data, report_suffix, REPORTS_DIR)

In [None]:
# Cross-validated grid search over the LightFM parameter grid defined in the
# previous cell. The seed comes from the shared RANDOM_STATE constant instead
# of a repeated literal, so the estimator and the CV splitter stay in sync
# with the rest of the notebook.
estimator = utils.SklearnEstimatorLightFM(random_state=RANDOM_STATE)
scorer = utils.build_scorer(data, dataset)

grid_search = sklearn.model_selection.GridSearchCV(
    estimator,
    params,
    n_jobs=NUM_JOBS,
    # Only the CV scores are needed; no final model is refitted.
    refit=False,
    scoring=scorer,
    cv=sklearn.model_selection.KFold(3, shuffle=True, random_state=RANDOM_STATE),
    verbose=10
)
# Trailing semicolon hides the repr of the fitted search object.
grid_search.fit(data.train_interactions, **estimator_params);

In [None]:
# Keep only the per-split scores and the parameter dicts from the CV results.
split_scores = {
    key: values
    for key, values in grid_search.cv_results_.items()
    if key == 'params' or key.startswith('split')
}
raw_cv = pd.DataFrame(split_scores)

# Expand the parameter dicts into their own columns, placed before the scores.
param_columns = pd.DataFrame.from_records(raw_cv['params'].values, index=raw_cv['params'].index)
score_columns = raw_cv.drop(columns='params')
cv_data = pd.concat([param_columns, score_columns], axis=1)
cv_data.to_csv('grid_search.csv')

# Show every row of the table.
cv_data.head(len(cv_data))

Unnamed: 0,item_alpha,no_components,split0_test_MAP,split1_test_MAP,split2_test_MAP,split0_test_Recall,split1_test_Recall,split2_test_Recall,split0_test_MeanInvUserFreq,split1_test_MeanInvUserFreq,split2_test_MeanInvUserFreq
0,0.0,32,0.0128,0.013136,0.012941,0.039973,0.040136,0.039717,5.102781,5.112524,5.107895
1,0.0001,32,0.007401,0.011272,0.007078,0.021507,0.035856,0.018465,5.538709,5.243351,5.618066
