### Подготовка

In [9]:
import os
import utils
import pickle
import implicit
import hueristics
import visualisation
import rectools.models

import pandas as pd

from pandarallel import pandarallel
pandarallel.initialize()


os.environ['DIR'] = "/home/ml/softezza_ml/"

models_meta = pd.read_csv('/home/ml/softezza_ml/models/lightfm/meta.csv').drop(columns='Unnamed: 0')

models_meta

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


Unnamed: 0,Recall@10,MAP@10,MIUF@10,Siren@10,PopInt@10,RecallNoPop@10,model,loss,max_sampled,no_components,epochs,with_feature
0,0.131547,0.058467,4.277105,0.000167,0.300746,0.062004,lightfm,warp,27,125,6,lifetime
1,0.129646,0.05746,4.3022,0.000171,0.292967,0.06192,lightfm,warp,25,125,6,lifetime
2,0.126615,0.05526,4.521738,0.000209,0.281923,0.061771,lightfm,warp,25,200,6,lifetime
3,0.127824,0.056147,4.477814,0.000201,0.2803,0.061683,lightfm,warp,27,175,6,lifetime
4,0.125952,0.054545,4.551806,0.000209,0.2737,0.06163,lightfm,warp,27,200,6,lifetime
5,0.122545,0.053266,4.703446,0.00023,0.262324,0.059554,lightfm,warp,27,200,7,lifetime


In [4]:
config = utils.DataConfig(
    split_strategy=utils.TimeSortSplit(num_interactions='all', splits=(.8, .2)),
    filter_strategy=[
        utils.MinNumInteractionsFilter(10, 500),
        utils.OnlyLastInteractionsFilter('user_id', 20)
    ],
    features_config=utils.FeaturesConfig(use_labels=False)
)

data = utils.load_data(config)

data.train_interactions.head()

Data after filter:
Len of train interactions with period [['2019-10-11T03:09:32.000000000'] / ['2023-09-14T06:04:32.000000000']] - 6769224
Len of test interactions with period [['2023-09-14T06:04:32.000000000'] / ['2023-10-23T12:10:54.000000000']] - 1692307
Num of uniq users 423917Num of uniq items 8260


Unnamed: 0,user_id,item_id,timestamp,weight,index
0,3518601,tt8201852,2023-09-14 06:04:32,0.947491,1692307
1,80783501,tt0455944,2023-09-14 06:04:30,0.261237,1692308
2,17678705,tt10366206,2023-09-14 06:04:28,0.908876,1692309
3,45173701,tt14308636,2023-09-14 06:04:27,0.01,1692310
4,52970501,tt0468569,2023-09-14 06:04:25,0.170943,1692311


In [10]:
def feature2columns(fstr: str) -> list:
    if fstr == 'device':
        return ['unknown', 'android', 'ios']
    
    if fstr == 'account_type':
        return ['account_type_-1.0', 'account_type_6.0', 'account_type_3.0', 'account_type_1.0', 'account_type_12.0', 'account_type_0.0']
    
    if fstr == 'year':
        return ['-1980', '2000-2010', '2010-2020', '1980-2000', '+2020']
    
    if fstr == 'rating':
        return [ '6.0-8.0', '8.0+', '-6.0']

    if fstr == 'genres':
        return ['Sci-Fi', 'Adventure', 'Action', 'Comedy', 'Crime', 'Romance', 'Fantasy', 'Thriller', 'Mystery', 'Drama']

    if fstr == 'time':
        return ['short', 'normal', 'long']

    if fstr == 'MPPA':
        return ['R', 'PG-13', 'TV-MA', 'TV-14', 'N']

    return [fstr]


def filter_cols(fcols: list, features: pd.DataFrame):
    if 'user_id' in features.columns:
        return list(set(features.columns).intersection(set(fcols))) + ['user_id',]
    else:
        return list(set(features.columns).intersection(set(fcols))) + ['item_id',]

In [12]:
test_users = utils.get_users_for_test(data.train_interactions, min_n_interactions=3, max_n_interactions=5)

train_dataset = data.get_rectools_dataset()
params2model_data = {}

feature_cols = feature2columns('lifetime')
user_features = data.user_features[filter_cols(feature_cols, data.user_features)]
item_features = data.item_features[filter_cols(feature_cols, data.item_features)]
train_dataset = data.get_rectools_dataset(item_features, user_features)

for _, p in models_meta[['model', 'loss', 'max_sampled', 'no_components', 'epochs', 'with_feature']].iterrows():

    with open(os.path.join('/home/ml/softezza_ml/models/lightfm', "{model}_loss:_{loss}_max_sampled:_{max_sampled}_no_components:_{no_components}_epochs:_{epochs}_with_feature:_{with_feature}.pickle").format(**p.to_dict()), mode='rb') as f:
        model = pickle.load(f)

        params2model_data[tuple(p)] = {
            'model': model
        }

    recos = model.recommend(
        k=10,
        users=train_dataset.user_id_map.external_ids,
        dataset=train_dataset,
        filter_viewed=True,
        add_rank_col=True,
    )

    params2model_data[tuple(p)] = {
        'reco': recos
    }

list(params2model_data.items())[0]

(('lightfm', 'warp', 27, 125, 6, 'lifetime'),
 {'reco':          user_id     item_id       score  rank
  0        3518601   tt0093493 -278.048950     1
  1        3518601  tt17663992 -279.137970     2
  2        3518601   tt0120131 -279.164032     3
  3        3518601   tt9100018 -279.345428     4
  4        3518601   tt1001520 -279.555603     5
  ...          ...         ...         ...   ...
  4239165   188101  tt15671028 -279.497589     6
  4239166   188101   tt6718170 -279.533051     7
  4239167   188101   tt7975244 -279.605652     8
  4239168   188101   tt3704428 -279.755157     9
  4239169   188101   tt0275847 -279.762177    10
  
  [4239170 rows x 4 columns]})

In [13]:
item_id2meta = pd.read_csv('/home/ml/softezza_ml/static_mappers/item_id2meta.csv').set_index('item_id', drop=True)
item_id2title = pd.read_csv('/home/ml/softezza_ml/static_mappers/item_id2title.csv').set_index('item_id', drop=True)['title']

item_id2meta

Unnamed: 0_level_0,type,year,rank,runtime,genres,company,director,writer,cast,mppa,num_views
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0004972,1,1915,6.2,195,"""Drama / History / War""","""David W. Griffith Corp., Epoch Producing Corp...","""D.W. Griffith""","""Thomas Dixon Jr., Thomas Dixon Jr., Thomas Di...","""Lillian Gish, Mae Marsh, Henry B. Walthall""","""TV-PG""",2580
tt0006864,1,1916,7.7,163,"""Drama / History""","""D.W. Griffith Productions""","""D.W. Griffith""","""Hettie Grey Baker, Tod Browning, D.W. Griffith""","""Lillian Gish, Mae Marsh, Robert Harron""","""N""",70
tt0010323,1,1920,8.0,76,"""Horror / Mystery / Thriller""","""Decla-Bioscop AG""","""Robert Wiene""","""Carl Mayer, Hans Janowitz""","""Werner Krauss, Conrad Veidt, Friedrich Feher""","""TV-PG""",551
tt0011237,1,1920,7.2,76,"""Fantasy / Horror""","""Projektions-AG Union (PAGU)""","""Paul Wegener, Carl Boese""","""Paul Wegener, Henrik Galeen""","""Paul Wegener, Albert Steinrück, Lyda Salmonova""","""N""",57
tt0011841,1,1920,7.3,145,"""Drama / Romance""","""D.W. Griffith Productions""","""D.W. Griffith""","""Lottie Blair Parker, William A. Brady, Joseph...","""Lillian Gish, Richard Barthelmess, Mrs. David...","""N""",40
...,...,...,...,...,...,...,...,...,...,...,...
tt9894470,1,2019,6.1,92,"""Action / Crime / Horror""","""Fangoria, Channel 83 Films, Media Finance Cap...","""Joe Begos""","""Max Brallier, Matthew McArdle""","""Stephen Lang, William Sadler, Fred Williamson""","""N""",2109
tt9900092,2,2020,7.3,42,"""Drama / Fantasy / Sci-Fi / Thriller""","""""","""""","""Eliot Laurence, Eliot Laurence, Eliot Laurence""","""Taylor Hickson, Amalia Holm, Demetria McKinney""","""TV-14""",0
tt9902160,1,2020,7.0,97,"""Drama""","""BBC Films, British Film Institute (BFI), Elem...","""Phyllida Lloyd""","""Clare Dunne, Malcolm Campbell, Clare Dunne""","""Molly McCann, Clare Dunne, Ruby Rose O'Hara""","""R""",5112
tt9907782,1,2021,6.2,111,"""Fantasy / Horror / Mystery""","""LD Entertainment, Piste Rouge""","""Sean Ellis""","""Sean Ellis""","""Boyd Holbrook, Kelly Reilly, Alistair Petrie""","""R""",13096


In [14]:
def image_html(item_id: int) -> str:
    return f"<img src='https://media.tv4.live/{item_id}.movie.poster.jpg' style=max-height:150px;/>"


def bold_html_rounded(score: int) -> str:
    return f"<p style='color:#3B9C9C;'>{round(score, 2)}</p>"

### Визуализация

In [15]:
named_recos = []

for params, model_data in params2model_data.items():
    r = model_data['reco']
    r['model_name'] = f"{params[0]} [K: {params[1]}, K1: {params[2]}, B: {params[3]}]"
    named_recos.append(r)


test_interactions = []

for user_id, history in test_users.users_histories.items():
    hist_data = pd.DataFrame({'item_id': history})
    hist_data['user_id'] = user_id
    test_interactions.append(hist_data)


vis_interactions = pd.concat(test_interactions)
vis_dict = dict(zip([f"user_{i}" for i in range(1, 11)], test_users.users_idx))

vis_recos = pd.concat(named_recos)
vis_recos['watch_date'] = pd.merge(vis_recos[['user_id', 'item_id']], data.test_interactions[['user_id', 'item_id', 'timestamp']], on=['user_id', 'item_id'], how='left')['timestamp']
vis_recos['watch_ratio'] = pd.merge(vis_recos[['user_id', 'item_id']], data.test_interactions[['user_id', 'item_id', 'weight']], on=['user_id', 'item_id'], how='left')['weight']

vis_items_data = pd.DataFrame({'item_id': data.all_items})
vis_items_data['title'] = vis_items_data['item_id'].apply(lambda iid: item_id2title.loc[iid])
vis_items_data['watched_in_all_time'] = vis_items_data['item_id'].apply(lambda iid: item_id2meta.loc[iid, 'num_views'])
vis_items_data['release_year'] = vis_items_data['item_id'].apply(lambda iid: item_id2meta.loc[iid, 'year'])
vis_items_data['genres'] = vis_items_data['item_id'].apply(lambda iid: item_id2meta.loc[iid, 'genres'])

visualisation.PROJECT_OPTIONS = visualisation.ShowcaseOptions(
    item_df_columns=[
        "item_id",
        "title",
        "genres",
        #"countries",
        "release_year",
        "watched_in_all_time",
    ],
    item_df_renaming={"watched_in_all_time": "watches", "item_id": "img"},
    formatters=dict(img=image_html, score=bold_html_rounded),
)

showcase = visualisation.Showcase(
    interactions=vis_interactions,
    full_recos=vis_recos,
    users_dict=vis_dict,
    item_data=vis_items_data,
    convert_ids_to_int=False
);

