## Import libraries

In [2]:
import numpy as np
import pandas as pd

import implicit
from tqdm.auto import trange
from functools import lru_cache
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import TFIDFRecommender



In [3]:
def headtail(df):
    return pd.concat([df.head(), df.tail()])

## Import data

In [4]:
with open('../input/recsys-yandex-cup-2022/train') as f:
    train = f.read().split('\n')[:-1]
    print(len(train))

1160084


In [6]:
with open('../input/recsys-yandex-cup-2022/test') as f:
    tests = f.read().split('\n')[:-1]
    print(len(tests))

289914


In [8]:
df = pd.read_csv('../input/recsys-yandex-cup-2022/dataframe.csv/dataframe.csv')

# set new name for user_id
max_train_user_id = df[df.is_train == 1].user_id.max()
df.loc[df.is_train == 0, "user_id"] += max_train_user_id

#  Moderation of attribute composition
df = df[['user_id', 'track_id', 'reversed_rank']]
df = df.rename(columns={"user_id": "user", "track_id": "item", "reversed_rank": "order"})
headtail(df)

Unnamed: 0,user,item,order
0,0,333396,53
1,0,267089,52
2,0,155959,51
3,0,353335,50
4,0,414000,49
117450829,1449996,448288,4
117450830,1449996,1343,3
117450831,1449996,86420,2
117450832,1449996,186436,1
117450833,1449996,8474,0


## UsersKFoldLeavePOut

### conditions:
- There must be no shared users between `train` and `test`
- The number of fouls is specified using the `n_folds` class parameter
- There must be no more than `p` last tracks in `test` (class parameter p)

In [9]:
class UsersKFoldPOut():
    def __init__(self, n_folds, p, random_seed=23):
        self.n_folds = n_folds
        self.p = p
        self.random_seed = random_seed
    
    def split(self, df):
        users = df.user.unique()
        users_count = len(users)
        np.random.seed(self.random_seed)
        np.random.shuffle(users)
        
        fold_sizes = np.full(self.n_folds, users_count // self.n_folds, dtype=int)
        fold_sizes[: users_count % self.n_folds] += 1
        
        curent = 0
        for f in fold_sizes:
            start, stop = curent, curent + f
            test_folds_users = users[start:stop]
            
            # For the test, the first p interactions of users caught in the test portion of the fold are taken
            test_mask = (df.user.isin(test_folds_users)) & (df.order < self.p)
            
            # All user interactions that are not in the test portion of the fold are taken for the train
            train_mask = ~df.user.isin(test_folds_users)
            
            yield train_mask, test_mask

In [14]:
kfold_user = UsersKFoldPOut(n_folds=5, p=1)

for i, (train_mask, test_mask) in enumerate(kfold_user.split(df)):
    train = df[train_mask]
    test = df[test_mask]
    print(f'Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}')

Fold#0 | Train: 93987938, Test: 290000
Fold#1 | Train: 93987938, Test: 290000
Fold#2 | Train: 93988194, Test: 289999
Fold#3 | Train: 93988194, Test: 289999
Fold#4 | Train: 93988194, Test: 289999


## Data processing: