# Поиск подходящих параметров для KNN-Based моделей

In [1]:
import os
import ml_utils
import implicit

import sklearn.model_selection
import rectools.models
import rectools.metrics
import pandas as pd

from pandarallel import pandarallel
pandarallel.initialize()


from IPython.display import clear_output, clear_output, HTML


# Global experiment settings shared by the cells below.
RANDOM_STATE = 1337
NUM_JOBS = -1

# Project root and non-secret database settings.
os.environ['DIR'] = "/home/ml/softezza_ml/"
os.environ['DB_PORT'] = '3306'
os.environ['DB_NAME'] = "vapor"

# SECURITY: the database endpoint, user and password must never be committed
# inside the notebook. They are expected to be present in the environment of
# the Jupyter process (shell export, untracked .env file, secrets manager).
# Credentials that were hard-coded here earlier should be treated as leaked
# and rotated.
import warnings

_missing_db_settings = [
    name
    for name in ('DB_ENDPOINT', 'DB_USER', 'DB_PASSWORD')
    if not os.environ.get(name)
]
if _missing_db_settings:
    # Only warn: cells that restore data from disk do not obviously need the
    # database, so a missing setting should not block the whole notebook.
    warnings.warn(
        "Database settings are not configured: "
        + ", ".join(_missing_db_settings)
        + ". Export them before starting Jupyter if data has to be loaded "
        "from the database.",
        stacklevel=2,
    )

# Locations derived from the project root.
DATA_DIR = os.path.join(os.environ['DIR'], 'data')
REPORTS_DIR = os.path.join(os.environ['DIR'], 'reports')

DATA_DIR, REPORTS_DIR

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


('/home/ml/softezza_ml/data', '/home/ml/softezza_ml/reports')

In [None]:
# Components of the data-preparation config, built in the same order in which
# they are passed to DataConfig.
split_strategy = ml_utils.TimeSortSplit('all', splits=(0.8, 0.2))
filter_strategy = [
    ml_utils.MinNumInteractionsFilter(min_user_ints=10, min_item_ints=500),
    ml_utils.OnlyLastInteractionsFilter('user_id', n_last=20),
]
weight_strategy = ml_utils.ViewRatioBasedWeight()
features_config = ml_utils.FeaturesConfig(use_labels=False)

config = ml_utils.DataConfig(
    split_strategy=split_strategy,
    filter_strategy=filter_strategy,
    weight_strategy=weight_strategy,
    features_config=features_config,
)

# Load the interactions according to the config and build the dataset used
# for fitting the models.
data = ml_utils.load_data(config)
train_dataset = data.get_rectools_dataset()

# Preview of the training interactions.
data.train_interactions.head()

In [5]:
# Restore an already prepared Data object from its on-disk snapshot.
fast_load_dir = '/home/ml/softezza_ml/fast_load'
data = ml_utils.Data.fast_load(fast_load_dir)
train_dataset = data.get_rectools_dataset()

# Preview of the training interactions.
data.train_interactions.head()

Unnamed: 0,user_id,item_id,timestamp,weight,index
0,3518601,tt8201852,2023-09-14 06:04:32,0.947491,1692307
1,80783501,tt0455944,2023-09-14 06:04:30,0.261237,1692308
2,17678705,tt10366206,2023-09-14 06:04:28,0.908876,1692309
3,45173701,tt14308636,2023-09-14 06:04:27,0.00919,1692310
4,52970501,tt0468569,2023-09-14 06:04:25,0.170943,1692311


## Проверка лучшей KION-модели

In [3]:
# NOTE(review): this cell unpacks get_rectools_dataset() into two values,
# while other cells bind its result to a single `train_dataset`. Confirm which
# return shape the current ml_utils version provides.
bm25_train_dataset, bm25_test_dataset = data.get_rectools_dataset()

# BM25 item-KNN with explicitly named hyper-parameters.
bm25_recommender = implicit.nearest_neighbours.BM25Recommender(
    K=50,
    K1=0.1,
    B=0.75,
    num_threads=12,
)
bm25_model = rectools.models.ImplicitItemKNNWrapperModel(
    bm25_recommender,
    verbose=1,
)

# Trailing semicolon keeps the notebook from echoing the fitted model.
bm25_model.fit(bm25_train_dataset);

100%|██████████| 8223/8223 [00:00<00:00, 33944.81it/s]


In [11]:
# Request the top-10 items, with a rank column, for every known user.
recommend_options = {
    'k': 10,
    'users': data.all_users,
    'dataset': bm25_train_dataset,
    'filter_viewed': True,
    'add_rank_col': True,
}
recos = bm25_model.recommend(**recommend_options)

In [18]:
# Standard rectools metrics evaluated at cut-off 10.
quality_metrics = {
    'MAP@10': rectools.metrics.MAP(10),
    'Recall@10': rectools.metrics.Recall(10),
    'Siren@10': rectools.metrics.Serendipity(10),
    'MIUF@10': rectools.metrics.MeanInvUserFreq(10),
}
metrics = rectools.metrics.calc_metrics(
    quality_metrics,
    reco=recos,
    interactions=data.test_interactions,
    prev_interactions=data.train_interactions,
    catalog=data.all_items,
)

# Project-specific metric computed from the recommendations and the training
# interactions.
popular_intersect = ml_utils.PopularIntersect(10)
metrics['PopInt@10'] = popular_intersect.calc(
    reco=recos,
    prev_interactions=data.train_interactions,
)

# Single-row table with all metric values.
pd.DataFrame.from_records([metrics])

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10
0,0.153368,0.063986,4.140743,0.000104,0.236565


## Проверка по параметрам Cosine и TF-IDF моделей

In [22]:
# Grid search over the neighbourhood size K for the Cosine and TF-IDF
# item-KNN models. Every parameter combination is fitted on `train_dataset`,
# scored on the test interactions and appended to `results`.
grids = {
    'cosine': {
        'model': implicit.nearest_neighbours.CosineRecommender,
        'grid': {
            'K': [25, 50, 100, 200]
        }
    },

    'tf_idf': {
        'model': implicit.nearest_neighbours.TFIDFRecommender,
        'grid': {
            'K': [25, 50, 100, 200]
        }
    },
}

# The rectools metric objects do not depend on the model being evaluated,
# so they are created once instead of on every iteration.
top_k_metrics = {
    'MAP@10': rectools.metrics.MAP(10),
    'Recall@10': rectools.metrics.Recall(10),
    'Siren@10': rectools.metrics.Serendipity(10),
    'MIUF@10': rectools.metrics.MeanInvUserFreq(10)
}

results = []

for label, params in grids.items():

    for p in sklearn.model_selection.ParameterGrid(params['grid']):

        model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(params['model'](num_threads=12, **p))
        model.fit(train_dataset)

        recos = model.recommend(
            k=10,
            users=train_dataset.user_id_map.external_ids,
            dataset=train_dataset,
            filter_viewed=True,
            add_rank_col=True,
        )

        metrics = rectools.metrics.calc_metrics(
            top_k_metrics,
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items
        )
        metrics['PopInt@10'] = ml_utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)
        metrics['RecallNoPop@10'] = ml_utils.RecallNoPop(10).calc(reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions)
        metrics['model'] = label
        metrics = {**metrics, **p}
        results.append(metrics)

        # Live progress table: redraw the results collected so far.
        clear_output(wait=True)
        display(HTML(pd.DataFrame.from_records(results).fillna('').head(100).to_html()))


grid_data = pd.DataFrame.from_records(results).fillna('')

# Write the report into the project's reports directory rather than a path
# relative to the current working directory, and make sure the directory
# exists so the finished grid search is not lost on a failed write.
os.makedirs(REPORTS_DIR, exist_ok=True)
grid_data.to_csv(os.path.join(REPORTS_DIR, 'grid_report_cos_tfidf.csv'), index=False)
clear_output(wait=True)

# Final ranking: RecallNoPop descending, then PopInt ascending, then Recall
# and MAP descending.
(
    grid_data.sort_values(['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10'], ascending=[False, True, False, False])
        .head(50)
        .style.text_gradient(
            axis=0,
            cmap='PiYG',
            subset=['Recall@10', 'MAP@10', 'MIUF@10', 'Siren@10', 'PopInt@10', 'RecallNoPop@10']
        )
)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,K
3,0.145178,0.057882,3.860775,0.000243,0.46617,0.055253,cosine,200
7,0.147115,0.05932,3.7767,0.000232,0.478662,0.054842,tf_idf,200
0,0.136355,0.054607,4.129464,0.000268,0.425952,0.054529,cosine,25
2,0.143821,0.057655,3.897128,0.000246,0.461257,0.054477,cosine,100
4,0.138057,0.055943,4.046993,0.00026,0.435716,0.054269,tf_idf,25
6,0.146124,0.059193,3.808549,0.000235,0.474254,0.054196,tf_idf,100
5,0.142855,0.058151,3.886501,0.000243,0.458963,0.053737,tf_idf,50
1,0.140159,0.056512,3.975839,0.000252,0.446775,0.053649,cosine,50


## BM25 модель

### Проверка по параметрам bm25 модели 1

In [13]:
# First BM25 grid search: fixed K=50, coarse sweep over K1 and B.
# Every parameter combination is fitted on `train_dataset`, scored on the
# test interactions and appended to `results`.
grids = {
    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        'grid': {
            'K': [50],
            'K1': [0.5, 1.25, 2.5, 5.],
            'B': [0.25, 0.5, 0.75, 1.]
        }
    }
}

# The rectools metric objects do not depend on the model being evaluated,
# so they are created once instead of on every iteration.
top_k_metrics = {
    'MAP@10': rectools.metrics.MAP(10),
    'Recall@10': rectools.metrics.Recall(10),
    'Siren@10': rectools.metrics.Serendipity(10),
    'MIUF@10': rectools.metrics.MeanInvUserFreq(10)
}

results = []

for label, params in grids.items():

    for p in sklearn.model_selection.ParameterGrid(params['grid']):
        model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(params['model'](num_threads=12, **p))
        model.fit(train_dataset)

        recos = model.recommend(
            k=10,
            users=train_dataset.user_id_map.external_ids,
            dataset=train_dataset,
            filter_viewed=True,
            add_rank_col=True,
        )

        metrics = rectools.metrics.calc_metrics(
            top_k_metrics,
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items
        )
        metrics['PopInt@10'] = ml_utils.PopularIntersect(10).calc(reco=recos, prev_interactions=data.train_interactions)
        metrics['RecallNoPop@10'] = ml_utils.RecallNoPop(10).calc(reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions)
        metrics['model'] = label
        metrics = {**metrics, **p}
        results.append(metrics)
        print(metrics)


grid_data = pd.DataFrame.from_records(results).fillna('')

# Write the report into the project's reports directory rather than a path
# relative to the current working directory, and make sure the directory
# exists so the finished grid search is not lost on a failed write.
os.makedirs(REPORTS_DIR, exist_ok=True)
grid_data.to_csv(os.path.join(REPORTS_DIR, 'grid_report_bm25_1.csv'), index=False)
clear_output(wait=True)

# Final ranking: RecallNoPop descending, then PopInt ascending, then Recall
# and MAP descending.
(
    grid_data.sort_values(['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10'], ascending=[False, True, False, False])
        .head(50)
        .style.text_gradient(
            axis=0,
            cmap='PiYG',
            subset=['Recall@10', 'MAP@10', 'MIUF@10', 'Siren@10', 'PopInt@10', 'RecallNoPop@10']
        )
)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,B,K,K1
0,0.103565,0.041708,4.398477,0.000229,0.206668,0.059539,bm25,0.25,50,0.5
1,0.086081,0.032324,5.032083,0.000319,0.12597,0.059126,bm25,0.25,50,1.25
4,0.076759,0.028361,5.362356,0.000361,0.096616,0.056612,bm25,0.5,50,0.5
2,0.075569,0.027859,5.389676,0.000363,0.093541,0.056456,bm25,0.25,50,2.5
3,0.069802,0.025539,5.624867,0.000389,0.074537,0.054866,bm25,0.25,50,5.0
8,0.059109,0.021555,6.073166,0.000439,0.048733,0.050537,bm25,0.75,50,0.5
5,0.056508,0.020652,6.194898,0.000449,0.043889,0.04929,bm25,0.5,50,1.25
12,0.047525,0.0177,6.656818,0.000493,0.028417,0.044462,bm25,1.0,50,0.5
6,0.047211,0.017573,6.655717,0.000489,0.027637,0.044385,bm25,0.5,50,2.5
7,0.042328,0.016096,6.955905,0.000512,0.021062,0.040987,bm25,0.5,50,5.0


### Проверка по параметрам bm25 модели 2

In [None]:
# Second BM25 grid search: fixed K=200, fine sweep of K1 and B around
# K1=1.25 / B=0.5, evaluated at cut-off 100.
grids = {
    'bm25': {
        'model': implicit.nearest_neighbours.BM25Recommender,
        # Recorded in the report as a label only. The loop below reads this
        # key, and without it the first iteration fails with KeyError.
        # 10 matches the value shown in the saved output of this experiment.
        # TODO confirm it reflects the dataset actually loaded.
        'min_user_inters': 10,
        'grid': {
            'K': [200],
            'K1': [1.20, 1.25, 1.30],
            'B': [0.45, 0.50, 0.55]
        },
    }
}

# The rectools metric objects do not depend on the model being evaluated,
# so they are created once instead of on every iteration.
top_k_metrics = {
    'MAP@100': rectools.metrics.MAP(100),
    'Recall@100': rectools.metrics.Recall(100),
    'Siren@100': rectools.metrics.Serendipity(100),
    'MIUF@100': rectools.metrics.MeanInvUserFreq(100)
}

results = []

for label, params in grids.items():
    grid = sklearn.model_selection.ParameterGrid(params['grid'])

    for train_index, p in enumerate(grid):
        print(f"Train {train_index+1}/{len(grid)}")

        model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(params['model'](num_threads=12, **p))
        model.fit(train_dataset)

        recos = model.recommend(
            k=100,
            users=train_dataset.user_id_map.external_ids,
            dataset=train_dataset,
            filter_viewed=True,
            add_rank_col=True,
        )

        metrics = rectools.metrics.calc_metrics(
            top_k_metrics,
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items
        )
        metrics['PopInt@100'] = ml_utils.PopularIntersect(100).calc(reco=recos, prev_interactions=data.train_interactions)
        metrics['RecallNoPop@100'] = ml_utils.RecallNoPop(100).calc(reco=recos, interactions=data.test_interactions, prev_interactions=data.train_interactions)
        metrics['model'] = label
        metrics['min_user_inters'] = params['min_user_inters']
        metrics = {**metrics, **p}
        results.append(metrics)

        # Live progress table: redraw the results collected so far.
        clear_output(wait=True)
        display(HTML(pd.DataFrame.from_records(results).fillna('').head(100).to_html()))


grid_data = pd.DataFrame.from_records(results).fillna('')

# Write the report into the project's reports directory rather than a path
# relative to the current working directory, and make sure the directory
# exists so the finished grid search is not lost on a failed write.
os.makedirs(REPORTS_DIR, exist_ok=True)
grid_data.to_csv(os.path.join(REPORTS_DIR, 'grid_report_bm25_2.csv'), index=False)
clear_output(wait=True);

In [8]:
# Sort keys in priority order; only PopInt@100 is sorted ascending.
ranking_columns = ['RecallNoPop@100', 'PopInt@100', 'Recall@100', 'MAP@100']
ranking_ascending = [False, True, False, False]

# Columns that receive the colour gradient.
gradient_columns = ['Recall@100', 'MAP@100', 'MIUF@100', 'Siren@100', 'PopInt@100', 'RecallNoPop@100']

top_rows = grid_data.sort_values(ranking_columns, ascending=ranking_ascending).head(50)

# Last expression of the cell: the styled table is what the notebook renders.
top_rows.style.text_gradient(axis=0, cmap='PiYG', subset=gradient_columns)

Unnamed: 0,Recall@100,MAP@100,MIUF@100,Siren@100,PopInt@100,RecallNoPop@100,model,min_user_inters,B,K,K1
8,0.271601,0.030685,6.150778,0.000136,0.340985,0.12456,bm25,10,0.55,200,1.3
7,0.273029,0.030968,6.127759,0.000135,0.345632,0.124183,bm25,10,0.55,200,1.25
6,0.274888,0.0313,6.103304,0.000134,0.350662,0.123811,bm25,10,0.55,200,1.2
5,0.281725,0.032622,6.016528,0.000129,0.36881,0.122629,bm25,10,0.5,200,1.3
4,0.28302,0.032922,5.995488,0.000128,0.3731,0.122201,bm25,10,0.5,200,1.25
3,0.284615,0.033294,5.972497,0.000127,0.378193,0.121644,bm25,10,0.5,200,1.2
2,0.29196,0.034852,5.880068,0.000122,0.398621,0.120287,bm25,10,0.45,200,1.3
1,0.293385,0.035181,5.860202,0.000121,0.403212,0.119816,bm25,10,0.45,200,1.25
0,0.29469,0.035532,5.840091,0.00012,0.407689,0.119461,bm25,10,0.45,200,1.2


### Добавление косинусного расстояния между жанрами

In [18]:
# Read the item feature table from the project's data folder.
item_features_path = '/home/ml/softezza_ml/data/item_features_bin.csv'
item_features = pd.read_csv(item_features_path)

# Preview of the first rows.
item_features.head()

Unnamed: 0,item_id,-1980,2000-2010,2010-2020,1980-2000,+2020,6.0-8.0,8.0+,-6.0,Drama,...,Adventure,Sci-Fi,long,normal,short,N,TV-14,TV-MA,PG-13,R
0,tt0004972,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0006864,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,tt0010323,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0011237,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,tt0011841,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Two-stage experiment: a BM25 baseline produces a candidate list per user,
# then a reranker combines it with a genre-based cosine heuristic. The grid
# sweeps the weight given to that heuristic.
grids = {
    'bm25_100_condidates + genres_cosine': {
        'n_condidates': 100,
        'baseline': rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(
            implicit.nearest_neighbours.BM25Recommender(num_threads=12, K=100, K1=1.25, B=0.25),
            verbose=1,
        ),
        'heuristic': ml_utils.Cosine(
            features_columns=['Drama', 'Mystery', 'Thriller', 'Fantasy', 'Romance', 'Crime', 'Comedy', 'Action', 'Adventure', 'Sci-Fi'],
        ),
        'grid': {
            'weight': [0.1, 0.25, 0.5, 1],
        },
    },
}

results = []

for label, params in grids.items():
    baseline_model = params['baseline']
    cosine_heuristic = params['heuristic']

    # Stage 1: fit the baseline once and collect its candidates.
    baseline_model.fit(train_dataset)
    condidates = baseline_model.recommend(
        k=params['n_condidates'],
        users=train_dataset.user_id_map.external_ids,
        dataset=train_dataset,
        filter_viewed=True,
        add_rank_col=True,
    )

    # The heuristic is fitted once and reused for every weight.
    cosine_heuristic.fit(data.train_interactions, data=data)

    # Stage 2: rerank the same candidates with each heuristic weight.
    for p in sklearn.model_selection.ParameterGrid(params['grid']):
        heuristic = ml_utils.Reranker(cosine=(cosine_heuristic, p['weight']))
        recos = heuristic.rerank(reco=condidates, data=data, k=10)

        metrics = rectools.metrics.calc_metrics(
            {
                'MAP@10': rectools.metrics.MAP(10),
                'Recall@10': rectools.metrics.Recall(10),
                'Siren@10': rectools.metrics.Serendipity(10),
                'MIUF@10': rectools.metrics.MeanInvUserFreq(10),
            },
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
            catalog=data.all_items,
        )
        metrics['PopInt@10'] = ml_utils.PopularIntersect(10).calc(
            reco=recos,
            prev_interactions=data.train_interactions,
        )
        metrics['RecallNoPop@10'] = ml_utils.RecallNoPop(10).calc(
            reco=recos,
            interactions=data.test_interactions,
            prev_interactions=data.train_interactions,
        )
        metrics['model'] = label
        metrics = {**metrics, **p}
        results.append(metrics)

        # Live progress table: redraw the results collected so far.
        progress_table = pd.DataFrame.from_records(results).fillna('').head(100)
        clear_output(wait=True)
        display(HTML(progress_table.to_html()))

In [21]:
# Collect the reranking results into one table.
grid_data = pd.DataFrame.from_records(results).fillna('')

# Write the report into the project's reports directory rather than a path
# relative to the current working directory, and make sure the directory
# exists so the finished experiment is not lost on a failed write.
os.makedirs(REPORTS_DIR, exist_ok=True)
grid_data.to_csv(os.path.join(REPORTS_DIR, 'grid_report_bm25_3.csv'), index=False)
clear_output(wait=True)

# Final ranking: RecallNoPop descending, then PopInt ascending, then Recall
# and MAP descending.
(
    grid_data.sort_values(['RecallNoPop@10', 'PopInt@10', 'Recall@10', 'MAP@10'], ascending=[False, True, False, False])
        .head(50)
        .style.text_gradient(
            axis=0,
            cmap='PiYG',
            subset=['Recall@10', 'MAP@10', 'MIUF@10', 'Siren@10', 'PopInt@10', 'RecallNoPop@10']
        )
)

Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,weight
0,0.070994,0.027148,5.097765,0.000285,0.097274,0.053085,bm25_100_condidates + genres_cosine,0.1
1,0.057928,0.022074,5.24268,0.000259,0.071191,0.046495,bm25_100_condidates + genres_cosine,0.25
2,0.04868,0.017914,5.347035,0.000235,0.054816,0.041037,bm25_100_condidates + genres_cosine,0.5
3,0.042353,0.01483,5.414499,0.000215,0.044889,0.03697,bm25_100_condidates + genres_cosine,1.0
