# Визуальный анализ нескольких моделей

In [1]:
import os
import ml_utils
import rectools.models
import implicit
import pandas as pd

from pandarallel import pandarallel
# Initialise pandarallel so that parallel pandas operations are available in this session.
pandarallel.initialize()


# Publish the project root directory through the DIR environment variable.
# NOTE(review): presumably consumed by ml_utils / visualisation to locate project
# files - confirm. It is assigned after `import ml_utils`, so anything that module
# reads from the environment at import time would not see this value.
os.environ['DIR'] = "/home/ml/softezza_ml/"

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the project data bundle from its fast-load directory and build the
# RecTools dataset from it (no explicit feature frames at this point).
data = ml_utils.Data.fast_load('/home/ml/softezza_ml/fast_load')
train_dataset = data.get_rectools_dataset()

# Pick the users to evaluate on from the train interactions.
# NOTE(review): the bounds presumably keep users with 5 to 7 train
# interactions - confirm against ml_utils.get_users_for_test.
train_interactions = data.train_interactions
test_users = ml_utils.get_users_for_test(
    train_interactions,
    min_n_interactions=5,
    max_n_interactions=7,
)

# Preview the first rows of the train interactions.
train_interactions.head()

Unnamed: 0,user_id,item_id,timestamp,weight,index
0,3518601,tt8201852,2023-09-14 06:04:32,0.947491,1692307
1,80783501,tt0455944,2023-09-14 06:04:30,0.261237,1692308
2,17678705,tt10366206,2023-09-14 06:04:28,0.908876,1692309
3,45173701,tt14308636,2023-09-14 06:04:27,0.00919,1692310
4,52970501,tt0468569,2023-09-14 06:04:25,0.170943,1692311


In [3]:
# Row counts of the train split, the test split, and both together.
n_train = len(data.train_interactions)
n_test = len(data.test_interactions)
n_train, n_test, n_train + n_test

(6769224, 1692307, 8461531)

### Без реранжирования

In [13]:
# Baseline: BM25 item-kNN recommendations without any reranking.

# Build the RecTools dataset exactly once, directly with the feature frames.
# A feature-less build at this point would be overwritten immediately and
# only cost an extra pass over the interactions.
user_features = data.user_features[['user_id', 'lifetime']]
item_features = data.item_features[['item_id']]
train_dataset = data.get_rectools_dataset(item_features, user_features)

# Maps "<model name><hyper-parameter tuple>" -> recommendations DataFrame
# (columns: user_id, item_id, score, rank).
model2recos = {}

# All configurations in this cell share one model family, so the name is
# kept outside the hyper-parameter dicts instead of being popped from them.
model_name = 'bm25'

for p in [
    {'B': 0.4, 'K': 200, 'K1': 1.00},
    {'B': 0.5, 'K': 200, 'K1': 1.25},
]:
    # RecTools wrapper around implicit's BM25 nearest-neighbours recommender.
    model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(
        implicit.nearest_neighbours.BM25Recommender(num_threads=2, **p)
    )
    model.fit(train_dataset)

    # Top-10 per test user; filter_viewed=True asks RecTools to leave out
    # items the user already interacted with in the dataset.
    recos = model.recommend(
        k=10,
        users=test_users.users_idx,
        dataset=train_dataset,
        filter_viewed=True,
        add_rank_col=True,
    )

    # Key format, e.g. "bm25(('B', 0.4), ('K', 200), ('K1', 1.0))".
    model2recos[model_name + str(tuple(p.items()))] = recos

model2recos

{"bm25(('B', 0.4), ('K', 200), ('K1', 1.0))":      user_id     item_id        score  rank
 0   79783905   tt2953050   288.271503     1
 1   79783905   tt6467266   233.980015     2
 2   79783905   tt8115900   225.207784     3
 3   79783905   tt2463208   219.762652     4
 4   79783905   tt6705162   217.703069     5
 ..       ...         ...          ...   ...
 95  38757001   tt5294518  1207.910809     6
 96  38757001   tt1001520  1188.943892     7
 97  38757001  tt11209212  1181.416383     8
 98  38757001   tt6535880  1018.833129     9
 99  38757001   tt4157728   977.911264    10
 
 [100 rows x 4 columns],
 "bm25(('B', 0.5), ('K', 200), ('K1', 1.25))":      user_id     item_id       score  rank
 0   79783905   tt2953050  173.334743     1
 1   79783905   tt2463208  149.619653     2
 2   79783905   tt6705162  148.805863     3
 3   79783905   tt6467266  141.492867     4
 4   79783905   tt8115900  141.123199     5
 ..       ...         ...         ...   ...
 95  38757001   tt9100018  914.249

### С реранжированием

In [5]:
# Replace item_features with the table stored in item_features_bin.csv
# (per the preview below: one row per item_id with 0.0/1.0 indicator columns).
item_features_path = '/home/ml/softezza_ml/data/item_features_bin.csv'
item_features = pd.read_csv(item_features_path)

# Preview the first rows.
item_features.head()

Unnamed: 0,item_id,-1980,2000-2010,2010-2020,1980-2000,+2020,6.0-8.0,8.0+,-6.0,Drama,...,Adventure,Sci-Fi,long,normal,short,N,TV-14,TV-MA,PG-13,R
0,tt0004972,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0006864,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,tt0010323,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0011237,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,tt0011841,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
# BM25 item-kNN candidates followed by a genre-cosine reranking step.
# Results are added to the existing model2recos mapping.

# Rebuild the dataset with the item feature table currently bound to
# item_features and the previously selected user features.
train_dataset = data.get_rectools_dataset(item_features, user_features)

# Genre indicator columns handed to the cosine component.
genre_columns = [
    'Drama', 'Mystery', 'Thriller', 'Fantasy', 'Romance',
    'Crime', 'Comedy', 'Action', 'Adventure', 'Sci-Fi',
]

# Fit the cosine component only on train interactions of the test users.
test_user_ids = set(test_users.users_idx)
interactions = data.train_interactions
is_test_user = interactions['user_id'].isin(test_user_ids)

genres_cosine = ml_utils.Cosine(features_columns=genre_columns)
genres_cosine.fit(interactions[is_test_user], data=data)

# NOTE(review): 0.1 is presumably the weight of the genre-cosine term inside
# the reranker - confirm against ml_utils.Reranker.
heuristic = ml_utils.Reranker(genres_cosine_01=(genres_cosine, 0.1))

for p in (
    dict(model='bm25_cosine', B=0.4, K=200, K1=1.00),
    dict(model='bm25_cosine', B=0.5, K=200, K1=1.25),
):
    # Split the label from the BM25 hyper-parameters.
    model_name = p.pop('model')

    knn = implicit.nearest_neighbours.BM25Recommender(num_threads=2, **p)
    model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(knn)
    model.fit(train_dataset)

    # Request a wider candidate list (100 per user) for the reranker.
    recos = model.recommend(
        k=100,
        users=test_users.users_idx,
        dataset=train_dataset,
        filter_viewed=True,
        add_rank_col=True,
    )

    # Rerank the candidates, asking for 10 per user.
    recos = heuristic.rerank(k=10, reco=recos, data=data)

    key = model_name + str(tuple(p.items()))
    model2recos[key] = recos

model2recos

  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
  return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


{"bm25(('B', 0.4), ('K', 200), ('K1', 1.0))":      user_id     item_id        score  rank
 0   79783905   tt2953050   288.271503     1
 1   79783905   tt6467266   233.980015     2
 2   79783905   tt8115900   225.207784     3
 3   79783905   tt2463208   219.762652     4
 4   79783905   tt6705162   217.703069     5
 ..       ...         ...          ...   ...
 95  38757001   tt5294518  1207.910809     6
 96  38757001   tt1001520  1188.943892     7
 97  38757001  tt11209212  1181.416383     8
 98  38757001   tt6535880  1018.833129     9
 99  38757001   tt4157728   977.911264    10
 
 [100 rows x 4 columns],
 "bm25(('B', 0.5), ('K', 200), ('K1', 1.25))":      user_id     item_id       score  rank
 0   79783905   tt2953050  173.334743     1
 1   79783905   tt2463208  149.619653     2
 2   79783905   tt6705162  148.805863     3
 3   79783905   tt6467266  141.492867     4
 4   79783905   tt8115900  141.123199     5
 ..       ...         ...         ...   ...
 95  38757001   tt9100018  914.249

### Визуализация

In [12]:
import visualisation

# Collect the inputs for the visual comparison of every entry in model2recos,
# then pass them all as keyword arguments.
visualise_kwargs = {
    'models_recos': model2recos,
    'train_interactions': data.train_interactions,
    'test_interactions': data.test_interactions,
    'test_users': test_users,
    'test_items': data.all_items,
    'mappers_dir': '/home/ml/softezza_ml/static_mappers',
}

visualisation.visualise_models(**visualise_kwargs)

