In [86]:
import copy
import json
import pickle
import warnings
from collections import Counter
from pprint import pprint
from typing import Dict, Optional

import numpy as np
import pandas as pd
import requests
import scipy as sp
import scipy.sparse
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    ItemItemRecommender,
    TFIDFRecommender,
)
from scipy.stats import mode
from tqdm.auto import tqdm

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.models import PopularModel

# Silence all warnings so that cell outputs stay readable.
warnings.filterwarnings("ignore")
# Display every column of a frame and up to 200 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [43]:
class UserKnn():
    """Fit/predict wrapper that turns an item-item model from
    ``implicit.nearest_neighbours`` into a user-based KNN recommender.

    The wrapped model is fitted on an (items x users) interaction matrix and
    its ``similar_items`` lookup is queried with *user* indices to obtain the
    nearest neighbours of a user. Items watched by those neighbours become
    candidates and are scored as ``similarity * idf(item)``.

    NOTE(review): this assumes the installed ``implicit`` version compares the
    columns of the matrix passed to ``fit`` in ``similar_items`` - confirm
    against the pinned version.
    """

    def __init__(self, model: "ItemItemRecommender", N_users: int = 50):
        """
        Parameters
        ----------
        model : implicit item-item recommender used to find similar users.
        N_users : number of neighbours requested from ``model.similar_items``.
        """
        self.N_users = N_users
        self.model = model
        self.is_fitted = False

    def get_mappings(self, train):
        """Build external id <-> matrix index mappings for users and items.

        Indices follow the order of first appearance in ``train``.
        """
        self.users_inv_mapping = dict(enumerate(train['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

        self.items_inv_mapping = dict(enumerate(train['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_matrix(self, df: pd.DataFrame,
                   user_col: str = 'user_id',
                   item_col: str = 'item_id',
                   weight_col: Optional[str] = None,
                   users_mapping: Optional[Dict[int, int]] = None,
                   items_mapping: Optional[Dict[int, int]] = None):
        """Build the sparse (items x users) interaction matrix for ``df``.

        Also stores, in ``self.watched``, the list of items of every user
        (user column renamed to ``sim_user_id``) for candidate generation.

        Parameters
        ----------
        df : interactions frame.
        user_col, item_col : names of the id columns in ``df``.
        weight_col : optional column with interaction weights; when omitted
            every interaction has weight 1.
        users_mapping, items_mapping : external id -> matrix index. When not
            given, the mappings built by ``get_mappings`` are used.

        Returns
        -------
        scipy.sparse.coo_matrix with items as rows and users as columns.
        """
        # Honour explicitly passed mappings; fall back to the fitted ones.
        if users_mapping is None:
            users_mapping = self.users_mapping
        if items_mapping is None:
            items_mapping = self.items_mapping

        if weight_col:
            weights = df[weight_col].astype(np.float32).to_numpy()
        else:
            weights = np.ones(len(df), dtype=np.float32)

        item_idx = df[item_col].map(items_mapping.get).to_numpy()
        user_idx = df[user_col].map(users_mapping.get).to_numpy()
        self.interaction_matrix = sp.sparse.coo_matrix(
            (weights, (item_idx, user_idx))
        )

        self.watched = df\
            .groupby(user_col, as_index=False)\
            .agg({item_col: list})\
            .rename(columns={user_col: 'sim_user_id'})

        return self.interaction_matrix

    def idf(self, n: int, x: float):
        """Smoothed inverse frequency: ``log((1 + n) / (1 + x) + 1)``."""
        return np.log((1 + n) / (1 + x) + 1)

    def _count_item_idf(self, df: pd.DataFrame):
        """Store per-item interaction counts and their IDF in ``self.item_idf``.

        ``self.n`` (set in ``fit`` to the number of training rows) is the
        numerator term of the IDF.
        """
        item_cnt = Counter(df['item_id'].values)
        item_idf = pd.DataFrame.from_dict(item_cnt, orient='index',
                                          columns=['doc_freq']).reset_index()
        # np.log broadcasts over the column, no per-row apply needed.
        item_idf['idf'] = self.idf(self.n, item_idf['doc_freq'])
        self.item_idf = item_idf

    def fit(self, train: pd.DataFrame):
        """Build mappings, the interaction matrix and item IDF, then fit the model."""
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(train,
                                              users_mapping=self.users_mapping,
                                              items_mapping=self.items_mapping)

        self.n = train.shape[0]
        self._count_item_idf(train)

        self.user_knn.fit(self.weights_matrix)
        self.is_fitted = True

    def _generate_recs_mapper(self, model: "ItemItemRecommender", user_mapping: Dict[int, int],
                              user_inv_mapping: Dict[int, int], N: int):
        """Return a function mapping an external user id to
        ``(list of similar external user ids, similarity scores)``.

        The mappings passed as arguments are the ones actually used.
        """
        def _recs_mapper(user):
            user_idx = user_mapping[user]
            # If the model returns the query user among its own neighbours,
            # that row is dropped in ``predict``, leaving at most N - 1 others.
            neighbour_idx, sim = model.similar_items(user_idx, N=N)
            return [user_inv_mapping[idx] for idx in neighbour_idx], sim
        return _recs_mapper

    def predict(self, test: pd.DataFrame, N_recs: int = 10, filter_viewed: bool = False):
        """Recommend up to ``N_recs`` items for every known user in ``test``.

        Parameters
        ----------
        test : frame with a ``user_id`` column.
        N_recs : maximum number of recommendations per user.
        filter_viewed : when True, items the user interacted with in the
            training data are removed from the candidates. Defaults to False,
            which keeps the previous behaviour.

        Returns
        -------
        DataFrame with columns ``user_id, item_id, score, rank``. Users that
        were not present during ``fit`` get no rows.

        Raises
        ------
        ValueError if called before ``fit``.
        """
        if not self.is_fitted:
            raise ValueError("Please call fit before predict")

        out_cols = ['user_id', 'item_id', 'score', 'rank']

        mapper = self._generate_recs_mapper(
            model=self.user_knn,
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users
        )

        # Cold users have no column in the fitted matrix: skip them instead of
        # failing with a KeyError inside the mapper.
        known_users = [u for u in test['user_id'].unique() if u in self.users_mapping]
        if not known_users:
            return pd.DataFrame(columns=out_cols)

        # Build the long (user, neighbour, similarity) table directly so the
        # columns keep numeric dtypes.
        user_ids, sim_user_ids, sims = [], [], []
        for user in known_users:
            neighbours, scores = mapper(user)
            user_ids.extend([user] * len(neighbours))
            sim_user_ids.extend(neighbours)
            sims.extend(scores)
        recs = pd.DataFrame({
            'user_id': user_ids,
            'sim_user_id': sim_user_ids,
            'sim': sims,
        })

        # Drop the user itself, attach every neighbour's items and keep, for
        # each (user, item), the row coming from the most similar neighbour.
        recs = recs[recs['user_id'] != recs['sim_user_id']]\
            .merge(self.watched, on=['sim_user_id'], how='left')\
            .explode('item_id')\
            .sort_values(['user_id', 'sim'], ascending=False)\
            .drop_duplicates(['user_id', 'item_id'], keep='first')

        if filter_viewed:
            # Anti-join against the user's own training items.
            seen = self.watched[self.watched['sim_user_id'].isin(known_users)]\
                .rename(columns={'sim_user_id': 'user_id'})\
                .explode('item_id')\
                .drop_duplicates(['user_id', 'item_id'])
            recs = recs.merge(seen, on=['user_id', 'item_id'], how='left', indicator=True)
            recs = recs[recs['_merge'] == 'left_only'].drop(columns='_merge')

        recs = recs.merge(self.item_idf, left_on='item_id', right_on='index', how='left')

        recs['score'] = recs['sim'] * recs['idf']
        recs = recs.sort_values(['user_id', 'score'], ascending=False)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        return recs[recs['rank'] <= N_recs][out_cols]
    

# Загрузка датасета

In [6]:
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

# Stream the archive to disk in 1 MiB chunks. The response and the progress
# bar are context-managed so both are closed even if the download fails.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail early on an HTTP error instead of saving an error page as a zip.
    req.raise_for_status()
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))

    with open('kion_train.zip', "wb") as fd, \
            tqdm(desc='Downloading the kion dataset...',
                 total=total_size_in_bytes or None,  # unknown size -> open-ended bar
                 unit='iB', unit_scale=True) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            progress_bar.update(len(chunk))
            fd.write(chunk)

Downloading the kion dataset...:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [7]:
!unzip kion_train.zip -x '__MACOSX/*'

Archive:  kion_train.zip
replace data_original/interactions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [65]:
# Read the three tables shipped in the extracted archive.
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

# Rename the raw columns to the names rectools expects for datetime and weight.
rectools_column_names = {
    'last_watch_dt': Columns.Datetime,
    'total_dur': Columns.Weight,
}
interactions_df = pd.read_csv('data_original/interactions.csv').rename(columns=rectools_column_names)

# Interactions casts the column types and keeps its own frame in `interactions.df`.
interactions = Interactions(interactions_df)

interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


## Задаем фолды для кросс-валидации

In [35]:
# Number of cross-validation folds and the length of each test window
# ('7D' = seven days, see the fold borders printed below).
N_SPLITS = 4
TEST_SIZE = '7D'

In [36]:
# Time-based fold generator: N_SPLITS consecutive test windows of TEST_SIZE each.
# NOTE(review): the filter_* flags presumably remove already-seen pairs and
# cold users/items from every test fold - confirm in the TimeRangeSplitter docs.
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [37]:
# Show the (start, end) timestamps of every test fold.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-07-26 00:00:00', freq='7D'),
  Timestamp('2021-08-02 00:00:00', freq='7D')),
 (Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

## Задаем метрики и модели, по которым будем делать CV

In [44]:
# Metrics computed on every fold: MAP@10 as the ranking metric and
# MeanInvUserFreq@10, reported under the name 'novelty'.
metrics = {
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10),
}

# Base models from implicit.nearest_neighbours to compare; each one is wrapped
# into UserKnn inside the cross-validation loop.
models = {
    'cosine_userknn': CosineRecommender(),
    'tfidf_userknn': TFIDFRecommender(),
    'bm25': BM25Recommender()
    
}

In [45]:
%%time

results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    catalog = df_train[Columns.Item].unique()
    
    for model_name, model in models.items():
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)
    
        recos = userknn_model.predict(df_test)
    
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)
        


{'end': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-07-26 00:00:00', freq='7D'),
 'test': 254565,
 'test_items': 6650,
 'test_users': 94910,
 'train': 3892558,
 'train_items': 15085,
 'train_users': 742256}


  0%|          | 0/742256 [00:00<?, ?it/s]

  0%|          | 0/742256 [00:00<?, ?it/s]

  0%|          | 0/742256 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}


  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}


  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 3,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

CPU times: user 2h 46min 23s, sys: 51.8 s, total: 2h 47min 15s
Wall time: 11h 33min 27s


# Метрики качества по фолдам 

In [47]:
# One row per (fold, model) with the metric values collected in the CV loop.
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,map@10,novelty
0,0,cosine_userknn,0.004339,7.421055
1,0,tfidf_userknn,0.006852,7.539561
2,0,bm25,0.002793,9.157852
3,1,cosine_userknn,0.004246,7.476357
4,1,tfidf_userknn,0.006773,7.573736
5,1,bm25,0.002784,9.199335
6,2,cosine_userknn,0.003785,7.531878
7,2,tfidf_userknn,0.006279,7.631293
8,2,bm25,0.002679,9.28558
9,3,cosine_userknn,0.003685,7.614235


## Metrics mean 

Best model (without hyperparams tuning) based on map@10 metric is tfidf userknn

In [48]:
# Average every metric over the folds, one row per model.
metric_columns = list(metrics)
df_metrics.groupby('model')[metric_columns].mean()

Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
bm25,0.002722,9.251839
cosine_userknn,0.004014,7.510881
tfidf_userknn,0.006463,7.613839


# Учим TFIDFRecommender на всех данных и сохраняем в pickle и json для инференса

In [50]:
# Base model for the final UserKnn (best map@10 in the table above).
model = TFIDFRecommender()

In [51]:
# Same neighbourhood size as in the cross-validation loop.
userknn_model = UserKnn(model=model, N_users=50)

In [52]:
# Fit on all interactions (no hold-out) for inference.
userknn_model.fit(interactions.df)

  0%|          | 0/962179 [00:00<?, ?it/s]

In [54]:
# Persist the whole fitted UserKnn wrapper.
with open('user_knn.pickle', 'wb') as f:
    pickle.dump(userknn_model, f)

In [55]:
# Round-trip check: load the file written in the previous cell.
# pickle.load executes arbitrary code - only use it on files you produced.
# NOTE: this rebinds `model`, which held the bare TFIDFRecommender, to the
# restored UserKnn wrapper.
with open('user_knn.pickle', 'rb') as f:
    model = pickle.load(f)

In [57]:
# First five interactions, used as a small smoke-test input.
part = interactions.df.iloc[:5]

In [59]:
%%time

# Time the prediction of the restored model on the small sample.
recos = model.predict(part)

CPU times: user 158 ms, sys: 46.5 ms, total: 204 ms
Wall time: 213 ms


In [71]:
# Peek at the first ten recommendation rows.
recos.iloc[:10]

Unnamed: 0,user_id,item_id,score,rank
258,699317,8886,2.48107,8
4,864613,10440,1.689577,10
1,864613,7638,4.652534,1
3,864613,12192,2.519292,7
8,864613,13668,3.856327,4


In [72]:
# Recommendations for every user present in the interactions.
whole_recos = model.predict(interactions.df)

In [74]:
# Save the UserKnn recommendations (not the model itself) for offline
# validation as a JSON object: user_id -> list of recommended item_ids.
import json

with open("userknn_model.json", "w") as f:
    json.dump(whole_recos.groupby("user_id")["item_id"].agg(list).to_dict(), f)

# Считаем метрики на cross validation для PopularModel

In [88]:
# Fold generator for the PopularModel evaluation.
# NOTE: same arguments as the splitter created for the UserKnn cross-validation;
# this cell rebinds `cv` to a new, identically configured instance.
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [89]:
class RecoService:
    """Cross-validate a set of rectools models on the folds of a splitter.

    Every model is deep-copied and fitted on each fold, top-``k``
    recommendations are produced for the test users of the fold, and the
    configured metrics are averaged over the folds.
    """

    def __init__(self,
                 interactions: Interactions,
                 models: dict,
                 metrics: dict,
                 splitter: TimeRangeSplitter,
                 k: int,
                 n_splits: int = N_SPLITS
                ):
        """
        Parameters
        ----------
        interactions : rectools ``Interactions``; its ``.df`` frame is sliced
            by the positional indices produced by the splitter.
        models : model name -> unfitted model exposing ``fit`` / ``recommend``.
        metrics : metric name -> metric object, passed to ``calc_metrics``.
        splitter : fold generator.
        k : number of recommendations per user.
        n_splits : only used as the total of the progress bar.
        """
        self.interactions = interactions

        self.models = models
        self.metrics = metrics

        self.splitter = splitter
        self.n_splits = n_splits

        self.k = k

    def train(self):
        """Run the cross-validation.

        Returns
        -------
        dict with
            'results' : DataFrame of metrics averaged over the folds,
                        indexed by model name;
            'models'  : model name -> model fitted on the last fold.
        """
        results, last_models = [], {}

        # Named `fold_iterator` to avoid shadowing the notebook-level `cv`.
        fold_iterator = self.splitter.split(self.interactions)

        for train_ids, test_ids, fold_info in tqdm(fold_iterator, total=self.n_splits):
            print(f"\n==================== Fold {fold_info['i_split']}")
            print(fold_info)

            df_train = self.interactions.df.iloc[train_ids]
            dataset = Dataset.construct(df_train)

            df_test = self.interactions.df.iloc[test_ids][Columns.UserItem]
            test_users = np.unique(df_test[Columns.User])

            # Catalog is the set of items that can be recommended;
            # here it is every item seen in the training part of the fold.
            catalog = df_train[Columns.Item].unique()

            for model_name, model in self.models.items():
                # Work on a copy so the models passed by the caller stay unfitted.
                fold_model = copy.deepcopy(model)
                fold_model.fit(dataset)

                recos = fold_model.recommend(
                    users=test_users,
                    dataset=dataset,
                    k=self.k,
                    filter_viewed=True,
                )

                metric_values = calc_metrics(
                    self.metrics,
                    reco=recos,
                    interactions=df_test,
                    prev_interactions=df_train,
                    catalog=catalog,
                )
                res = {"fold": fold_info["i_split"], "model": model_name}
                res.update(metric_values)
                results.append(res)
                # Overwritten on every fold, so the last fold's model is kept.
                last_models[model_name] = fold_model

        pivot_results = pd.DataFrame(results).drop(columns="fold").groupby(["model"], sort=False).agg("mean")

        return {
                'results': pivot_results,
                'models': last_models
        }
    

In [90]:
# Baseline to compare against the UserKnn results.
pop_models = {'Popular': PopularModel()}

In [92]:
# Reuse the same interactions, metrics and splitter settings as for UserKnn.
rec_service = RecoService(
        interactions = interactions,
        models = pop_models,
        metrics = metrics,
        splitter = cv,
        k=10
)

In [93]:
# NOTE: rebinds `results`, which held the per-fold list of the UserKnn CV loop,
# to the dict returned by RecoService.train().
results = rec_service.train()

  0%|          | 0/4 [00:00<?, ?it/s]


{'i_split': 0, 'start': Timestamp('2021-07-26 00:00:00', freq='7D'), 'end': Timestamp('2021-08-02 00:00:00', freq='7D')}

{'i_split': 1, 'start': Timestamp('2021-08-02 00:00:00', freq='7D'), 'end': Timestamp('2021-08-09 00:00:00', freq='7D')}

{'i_split': 2, 'start': Timestamp('2021-08-09 00:00:00', freq='7D'), 'end': Timestamp('2021-08-16 00:00:00', freq='7D')}

{'i_split': 3, 'start': Timestamp('2021-08-16 00:00:00', freq='7D'), 'end': Timestamp('2021-08-23 00:00:00', freq='7D')}


In [94]:
# Metrics of the baseline averaged over the folds.
results['results']

Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
Popular,0.085682,3.715195


# Учим Popular Model на всех данных и сохраняем в json для инференса

In [91]:
# Dataset over all interactions. Built from `interactions_df` (the raw frame
# with renamed columns), not from the type-cast `interactions.df`.
dataset = Dataset.construct(interactions_df)

In [96]:
# Final baseline model for inference.
popular_model = PopularModel()

In [97]:
# Fit on the full dataset (no hold-out).
popular_model.fit(dataset)

<rectools.models.popular.PopularModel at 0x346316370>

In [101]:
# Every user that has at least one interaction.
user_ids = interactions.df.user_id.unique()

In [102]:
# Top-10 popular items per user.
# NOTE: rebinds `whole_recos`, which held the UserKnn recommendations.
whole_recos = popular_model.recommend(
                    users=user_ids,
                    dataset=dataset,
                    k=10,
                    filter_viewed=True,
                )

In [105]:
# Save the PopularModel recommendations (not the model itself) for offline
# validation as a JSON object: user_id -> list of recommended item_ids.
import json

with open("popular_model.json", "w") as f:
    json.dump(whole_recos.groupby("user_id")["item_id"].agg(list).to_dict(), f)