### Подготовка

In [1]:
import os
import pickle

import implicit.nearest_neighbours
import lightfm
import pandas as pd
import rectools.models
from pandarallel import pandarallel

# Project-local modules.
import hueristics
import utils
import visualisation
# Initialise pandarallel once for the whole notebook.
pandarallel.initialize()


# Single definition of the project root: every path below is derived from it,
# so the notebook does not depend on the kernel's current working directory.
PROJECT_DIR = "/home/ml/softezza_ml/"

# Exported through the environment; presumably read by the project modules
# (utils / visualisation) -- TODO confirm.
os.environ['DIR'] = PROJECT_DIR

models_dir = os.path.join(PROJECT_DIR, 'models', 'knn')

# meta.csv sits next to the pickled models; the saved pandas index column
# ("Unnamed: 0") is dropped because it carries no information.
models_meta = pd.read_csv(os.path.join(models_dir, 'meta.csv')).drop(columns='Unnamed: 0')

models_meta

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,model,K,B,K1
0,0.104393,0.038852,5.294475,0.00021,0.099451,bm25,50.0,0.25,2.5
1,0.105175,0.039246,5.27361,0.000209,0.100029,bm25,50.0,0.5,0.5
2,0.110683,0.040791,5.185683,0.000203,0.101525,bm25,100.0,0.25,2.5
3,0.112202,0.041445,5.158078,0.000202,0.104386,bm25,100.0,0.5,0.5
4,0.116439,0.042601,5.134638,0.000199,0.110904,bm25,200.0,0.25,2.5
5,0.117926,0.043239,5.106162,0.000198,0.113707,bm25,200.0,0.5,0.5
6,0.119831,0.045402,4.93102,0.000179,0.131334,bm25,50.0,0.25,1.25
7,0.125847,0.047755,4.824942,0.000171,0.1381,bm25,100.0,0.25,1.25
8,0.131121,0.049233,4.790733,0.000167,0.147111,bm25,200.0,0.25,1.25


In [2]:
# Build the pieces of the data configuration one by one, then assemble them.
split_strategy = utils.TimeSortSplit(num_interactions='all', splits=(.8, .2))

interaction_filters = [
    utils.MinNumInteractionsFilter(20, 500),
    utils.OnlyLastInteractionsFilter('user_id', 20),
]

features_config = utils.FeaturesConfig(use_labels=False)

config = utils.DataConfig(
    split_strategy=split_strategy,
    filter_strategy=interaction_filters,
    features_config=features_config,
)

data = utils.load_data(config)

data.train_interactions.head()

: 

In [3]:
def _genres_to_flags(raw):
    """Turn a raw genres string into a ``{genre: 1}`` mapping.

    Double quotes and spaces are removed first, then the string is split on '/'.
    """
    cleaned = raw.replace('"', '').replace(' ', '')
    return dict.fromkeys(cleaned.split('/'), 1)


item_genres = pd.read_csv(
    '/home/ml/softezza_ml/static_mappers/item_id2meta.csv',
    usecols=['item_id', 'genres'],
)

# One row per item and one column per genre; genres an item lacks are filled with 0.
genre_flags = item_genres['genres'].apply(_genres_to_flags)
items_features = pd.DataFrame.from_records(genre_flags).fillna(0)

# Added after fillna so the id column is never touched by the fill.
items_features['item_id'] = item_genres['item_id']

items_features

Unnamed: 0,Drama,History,War,Horror,Mystery,Thriller,Fantasy,Romance,Crime,Documentary,...,Sci-Fi,Musical,Biography,Film-Noir,Short,Reality-TV,Talk-Show,News,Game-Show,item_id
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0004972
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0006864
2,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0010323
3,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0011237
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt0011841
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15803,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9894470
15804,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9900092
15805,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9902160
15806,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tt9907782


In [4]:
test_users = utils.get_users_for_test(data.train_interactions, min_n_interactions=3, max_n_interactions=5)

train_dataset, test_dataset = data.get_rectools_dataset()
params2model_data = {}

# One pickled model per row of the metadata table, keyed by (model, K, K1, B).
hyperparams = models_meta[['model', 'K', 'K1', 'B']]

for params in hyperparams.itertuples(index=False, name=None):
    model_name, k, k1, b = params
    pickle_path = os.path.join(models_dir, f"{model_name}_K{k}_K1{k1}_B{b}.pickle")

    # NOTE: pickle.load can execute arbitrary code -- only load files you trust.
    with open(pickle_path, mode='rb') as fh:
        entry = {'model': pickle.load(fh)}
        params2model_data[params] = entry

    # Top-10 recommendations for the test users, excluding items already seen.
    entry['reco'] = entry['model'].recommend(
        k=10,
        users=test_users.users_idx,
        dataset=train_dataset,
        filter_viewed=True,
        add_rank_col=True,
    )

# Show the first (params, model data) pair as a sanity check.
list(params2model_data.items())[0]

(('bm25', 50.0, 2.5, 0.25),
 {'model': <rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x7fe7b09f9700>,
  'reco':      user_id     item_id       score  rank
  0   73535002   tt3797512  575.016027     1
  1   73535002   tt5433138  402.317152     2
  2   73535002   tt0283111  387.834316     3
  3   73535002   tt0115736  366.590653     4
  4   73535002  tt14109724  354.085681     5
  ..       ...         ...         ...   ...
  95  72518404  tt10648342  315.146717     6
  96  72518404   tt0317919  304.753188     7
  97  72518404  tt13276352  295.196295     8
  98  72518404   tt3110958  286.654690     9
  99  72518404   tt0102492  282.038739    10
  
  [100 rows x 4 columns]})

In [5]:
train_dataset, test_dataset = data.get_rectools_dataset()

# Genre-based cosine heuristic, fitted on the training interactions and the
# per-item genre flags.
cosine = hueristics.Cosine()
cosine.fit(data.train_interactions, items_features=items_features)

# Hyper-parameters are declared once and reused for both the model and the
# dictionary key below, so the label shown in the showcase cannot drift away
# from the model that was actually trained.
# NOTE(review): the key used to say K=200 while the model was built with K=100;
# the trained value (100) is kept here -- change it if 200 was intended.
bm25_params = {'K': 100, 'K1': 1.25, 'B': 0.25}

model = rectools.models.implicit_knn.ImplicitItemKNNWrapperModel(
    implicit.nearest_neighbours.BM25Recommender(num_threads=12, **bm25_params),
    verbose=1,
)
model.fit(train_dataset)

# NOTE(review): the keyword is named "genres_cosine_05" but the value passed
# alongside the heuristic is 0.1 -- confirm which one is intended.
hueristic = hueristics.HueristicsWrapper(genres_cosine_05=(cosine, 0.1))

# Take 200 candidates per user from the KNN model ...
recos = model.recommend(
    k=200,
    users=data.train_interactions['user_id'].unique(),
    dataset=train_dataset,
    filter_viewed=True,
    add_rank_col=False,
)

# ... and let the heuristic wrapper re-rank them down to 10 per user.
recos = hueristic.rerank(
    k=10,
    reco=recos
)

# NOTE: this rebinds params2model_data, replacing whatever it held before.
# Key layout is (model, K, K1, B).
params2model_data = {
    ('bm25', bm25_params['K'], bm25_params['K1'], bm25_params['B']): {
        'model': model,
        'reco': recos
    }
}

recos.head()

  0%|          | 0/8223 [00:00<?, ?it/s]

  0%|          | 0/364765 [00:00<?, ?it/s]

  0%|          | 0/72936971 [00:00<?, ?it/s]

Unnamed: 0,user_id,item_id,score,rank
48161499,37527505,tt0373889,96207.8404,1
10802188,73793202,tt0417741,93598.889585,1
42974316,25690201,tt0373889,95387.615601,1
12324474,20354601,tt0373889,93531.847783,1
53232079,35810103,tt0373889,93904.918656,1


In [6]:
# Load the two static item tables, then index both by item_id for label lookups.
meta_table = pd.read_csv('/home/ml/softezza_ml/static_mappers/item_id2meta.csv')
title_table = pd.read_csv('/home/ml/softezza_ml/static_mappers/item_id2title.csv')

item_id2meta = meta_table.set_index('item_id')
# Only the title column is kept, giving an item_id -> title Series.
item_id2title = title_table.set_index('item_id')['title']

item_id2meta

Unnamed: 0_level_0,type,year,rank,runtime,genres,company,director,writer,cast,mppa,num_views
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0004972,1,1915,6.2,195,"""Drama / History / War""","""David W. Griffith Corp., Epoch Producing Corp...","""D.W. Griffith""","""Thomas Dixon Jr., Thomas Dixon Jr., Thomas Di...","""Lillian Gish, Mae Marsh, Henry B. Walthall""","""TV-PG""",2580
tt0006864,1,1916,7.7,163,"""Drama / History""","""D.W. Griffith Productions""","""D.W. Griffith""","""Hettie Grey Baker, Tod Browning, D.W. Griffith""","""Lillian Gish, Mae Marsh, Robert Harron""","""N""",70
tt0010323,1,1920,8.0,76,"""Horror / Mystery / Thriller""","""Decla-Bioscop AG""","""Robert Wiene""","""Carl Mayer, Hans Janowitz""","""Werner Krauss, Conrad Veidt, Friedrich Feher""","""TV-PG""",551
tt0011237,1,1920,7.2,76,"""Fantasy / Horror""","""Projektions-AG Union (PAGU)""","""Paul Wegener, Carl Boese""","""Paul Wegener, Henrik Galeen""","""Paul Wegener, Albert Steinrück, Lyda Salmonova""","""N""",57
tt0011841,1,1920,7.3,145,"""Drama / Romance""","""D.W. Griffith Productions""","""D.W. Griffith""","""Lottie Blair Parker, William A. Brady, Joseph...","""Lillian Gish, Richard Barthelmess, Mrs. David...","""N""",40
...,...,...,...,...,...,...,...,...,...,...,...
tt9894470,1,2019,6.1,92,"""Action / Crime / Horror""","""Fangoria, Channel 83 Films, Media Finance Cap...","""Joe Begos""","""Max Brallier, Matthew McArdle""","""Stephen Lang, William Sadler, Fred Williamson""","""N""",2109
tt9900092,2,2020,7.3,42,"""Drama / Fantasy / Sci-Fi / Thriller""","""""","""""","""Eliot Laurence, Eliot Laurence, Eliot Laurence""","""Taylor Hickson, Amalia Holm, Demetria McKinney""","""TV-14""",0
tt9902160,1,2020,7.0,97,"""Drama""","""BBC Films, British Film Institute (BFI), Elem...","""Phyllida Lloyd""","""Clare Dunne, Malcolm Campbell, Clare Dunne""","""Molly McCann, Clare Dunne, Ruby Rose O'Hara""","""R""",5112
tt9907782,1,2021,6.2,111,"""Fantasy / Horror / Mystery""","""LD Entertainment, Piste Rouge""","""Sean Ellis""","""Sean Ellis""","""Boyd Holbrook, Kelly Reilly, Alistair Petrie""","""R""",13096


In [19]:
def image_html(item_id: str) -> str:
    """Return an ``<img>`` tag pointing at the poster of ``item_id``.

    Args:
        item_id: Item identifier interpolated into the poster URL
            (e.g. ``'tt0004972'``).

    Returns:
        HTML snippet with the poster limited to a height of 150px.
    """
    # The style value is quoted: an unquoted attribute value must not be
    # followed directly by "/>" without whitespace.
    return f"<img src='https://media.tv4.live/{item_id}.movie.poster.jpg' style='max-height:150px;'/>"


def bold_html_rounded(score: float) -> str:
    """Return ``score`` rounded to two decimals inside a coloured ``<p>`` tag.

    Args:
        score: Numeric recommendation score.

    Returns:
        HTML paragraph (colour ``#3B9C9C``) containing the rounded score.
    """
    rounded = round(score, 2)
    return f"<p style='color:#3B9C9C;'>{rounded}</p>"

### Визуализация

In [20]:
# Tag each model's recommendations with a human-readable label.
# .assign returns a new frame, so the frames stored in params2model_data
# (and already displayed by earlier cells) are left untouched.
named_recos = [
    model_data['reco'].assign(
        model_name=f"{params[0]} [K: {params[1]}, K1: {params[2]}, B: {params[3]}]"
    )
    for params, model_data in params2model_data.items()
]


# One small frame per test user: the items of that user's history plus the user id.
test_interactions = [
    pd.DataFrame({'item_id': history}).assign(user_id=user_id)
    for user_id, history in test_users.users_histories.items()
]


vis_interactions = pd.concat(test_interactions)

# Label every test user ("user_1", "user_2", ...). Enumerating the ids avoids
# the previous hard-coded limit of ten labels, which silently dropped any
# further users.
vis_dict = {
    f"user_{position}": user_idx
    for position, user_idx in enumerate(test_users.users_idx, start=1)
}

# ignore_index gives the combined frame a unique 0..n-1 index; the per-model
# frames otherwise bring their own (repeating or arbitrary) indices along.
vis_recos = pd.concat(named_recos, ignore_index=True)

# Watch information from the test interactions, renamed to the column names
# used for visualisation. Keys are de-duplicated (first occurrence kept) so the
# left join below cannot add rows.
test_watches = (
    data.test_interactions[['user_id', 'item_id', 'timestamp', 'weight']]
    .drop_duplicates(subset=['user_id', 'item_id'])
    .rename(columns={'timestamp': 'watch_date', 'weight': 'watch_ratio'})
)

# Join directly on (user_id, item_id). Assigning a column taken from a separate
# merge result would align on the index instead and attach the values to the
# wrong rows.
vis_recos = vis_recos.merge(test_watches, on=['user_id', 'item_id'], how='left')

vis_items_data = pd.DataFrame({'item_id': data.all_items})

# Look every item up with one vectorised .loc call per table instead of one
# Python-level lookup per row. .loc still raises KeyError for unknown item ids.
item_ids = vis_items_data['item_id']
item_meta = item_id2meta.loc[item_ids, ['num_views', 'year', 'genres']]

# .to_numpy() drops the item_id index so values are assigned by position.
vis_items_data['title'] = item_id2title.loc[item_ids].to_numpy()
vis_items_data['watched_in_all_time'] = item_meta['num_views'].to_numpy()
vis_items_data['release_year'] = item_meta['year'].to_numpy()
vis_items_data['genres'] = item_meta['genres'].to_numpy()

# Item columns passed to the showcase options, in this order.
showcase_item_columns = [
    "item_id",
    "title",
    "genres",
    "release_year",
    "watched_in_all_time",
]

# Formatters keyed by column name: "img" is the renamed item_id column,
# "score" is the recommendation score.
showcase_formatters = {"img": image_html, "score": bold_html_rounded}

visualisation.PROJECT_OPTIONS = visualisation.ShowcaseOptions(
    item_df_columns=showcase_item_columns,
    item_df_renaming={"watched_in_all_time": "watches", "item_id": "img"},
    formatters=showcase_formatters,
)

# Item ids are strings, so they are not converted to int.
showcase = visualisation.Showcase(
    interactions=vis_interactions,
    full_recos=vis_recos,
    users_dict=vis_dict,
    item_data=vis_items_data,
    convert_ids_to_int=False,
)

