In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as td

import tensorboard
import pytorch_lightning as pl
from pytorch_lightning import callbacks

from tqdm.autonotebook import tqdm
import json

import typing as tp
# import faiss
import shutil
import random
import glob
from collections import Counter

np.random.seed(31337)

# Подготовка фичей

In [2]:
users_df = pd.read_csv("data/data_kion/users_processed.csv")
items_df = pd.read_csv("data/data_kion/items_processed.csv")
interactions_df = pd.read_csv("data/data_kion/interactions_processed.csv")

Remap IDs to be consequent from 0 to number of unique IDs

In [3]:
users_df.head()

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,M,True
1,962099,age_18_24,income_20_40,M,False
2,1047345,age_45_54,income_40_60,F,False
3,721985,age_45_54,income_20_40,F,False
4,704055,age_35_44,income_60_90,F,False


In [4]:
items_df.head()

Unnamed: 0,item_id,content_type,title,title_orig,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords,release_year_cat
0,10711,film,поговори с ней,Hable con ella,"['драмы', 'детективы', 'мелодрамы']",испания,False,16,unknown,педро альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ...",2000_2010
1,2508,film,голые перцы,Search Party,"['приключения', 'комедии']",сша,False,16,unknown,скот армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео...",2010_2020
2,10716,film,тактическая сила,Tactical Force,"['криминал', 'триллеры', 'боевики', 'комедии']",канада,False,16,unknown,адам п. калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг...",2010_2020
3,7868,film,45 лет,45 Years,"['драмы', 'мелодрамы']",великобритания,False,16,unknown,эндрю хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю...",2010_2020
4,16268,film,все решает мгновение,,"['драмы', 'спорт', 'мелодрамы']",ссср,False,12,ленфильм,виктор садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж...",1970_1980


In [5]:
items_df.columns

Index(['item_id', 'content_type', 'title', 'title_orig', 'genres', 'countries',
       'for_kids', 'age_rating', 'studios', 'directors', 'actors',
       'description', 'keywords', 'release_year_cat'],
      dtype='object')

In [6]:
interactions_df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72
1,699317,1659,2021-05-29,8317,100
2,656683,7107,2021-05-09,10,0
3,864613,7638,2021-07-05,14483,100
4,964868,9506,2021-04-30,6725,100


In [7]:
user_cat_feats = ["age", "income", "sex", "kids_flg"]

users_ohe_df = users_df.user_id
for feat in user_cat_feats:
  ohe_feat_df = pd.get_dummies(users_df[feat], prefix=feat)
  users_ohe_df = pd.concat([users_ohe_df, ohe_feat_df], axis=1)

users_ohe_df.head()


Unnamed: 0,user_id,age_age_18_24,age_age_25_34,age_age_35_44,age_age_45_54,age_age_55_64,age_age_65_inf,age_age_unknown,income_income_0_20,income_income_150_inf,income_income_20_40,income_income_40_60,income_income_60_90,income_income_90_150,income_income_unknown,sex_F,sex_M,sex_sex_unknown,kids_flg_False,kids_flg_True
0,973171,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,True
1,962099,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,True,False
2,1047345,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,True,False
3,721985,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,True,False
4,704055,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False,True,False


In [8]:
item_cat_feats = ['content_type', 'release_year_cat',
                  'for_kids', 'age_rating', 'countries']

items_ohe_df = items_df.item_id

for feat in item_cat_feats:
  ohe_feat_df = pd.get_dummies(items_df[feat], prefix=feat)
  items_ohe_df = pd.concat([items_ohe_df, ohe_feat_df], axis=1) 

genres_dummies = (
    items_df['genres']
    .apply(lambda x: ','.join(x) if isinstance(x, list) else '')  # преобразуем списки в строки
    .str.get_dummies(sep=',')
    .add_prefix('genres_')
)

items_ohe_df = pd.concat([items_ohe_df, genres_dummies], axis=1).astype(np.int32)
items_ohe_df = items_ohe_df.set_index('item_id')
items_ohe_df.head()

Unnamed: 0_level_0,content_type_film,content_type_series,release_year_cat_1920_1930,release_year_cat_1930_1940,release_year_cat_1940_1950,release_year_cat_1950_1960,release_year_cat_1960_1970,release_year_cat_1970_1980,release_year_cat_1980_1990,release_year_cat_1990_2000,...,"countries_франция, япония",countries_хорватия,countries_чехия,countries_чили,countries_швейцария,countries_швеция,countries_эстония,countries_юар,countries_югославия,countries_япония
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10711,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2508,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10716,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7868,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16268,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Матрица взаимодействий

In [9]:
interactions_df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72
1,699317,1659,2021-05-29,8317,100
2,656683,7107,2021-05-09,10,0
3,864613,7638,2021-07-05,14483,100
4,964868,9506,2021-04-30,6725,100


Фильтруем малоактивных юзеров и непопулярные фильмы.

In [10]:
interactions_df.item_id.value_counts()

item_id
10440    202457
15297    193123
9728     132865
13865    122119
4151      91167
          ...  
2435          1
7978          1
10642         1
13008         1
9286          1
Name: count, Length: 15706, dtype: int64

In [11]:
interactions_df.user_id.value_counts()

user_id
416206     1341
1010539     764
555233      685
11526       676
409259      625
           ... 
690921        1
255412        1
264195        1
150067        1
337469        1
Name: count, Length: 962179, dtype: int64

In [12]:
print(f"N users before: {interactions_df.user_id.nunique()}")
print(f"N items before: {interactions_df.item_id.nunique()}\n")

interactions_df = interactions_df[interactions_df.watched_pct > 10]

valid_users = []

c = Counter(interactions_df.user_id)
for user_id, entries in c.most_common():
  if entries >= 5:
    valid_users.append(user_id)

valid_items = []

c = Counter(interactions_df.item_id)
for item_id, entries in c.most_common():
  if entries >= 5:
    valid_items.append(item_id)

interactions_df = interactions_df[interactions_df.user_id.isin(valid_users)]
interactions_df = interactions_df[interactions_df.item_id.isin(valid_items)]

print(f"N users after: {interactions_df.user_id.nunique()}")
print(f"N items after: {interactions_df.item_id.nunique()}")

N users before: 962179
N items before: 15706

N users after: 207255
N items after: 8823


Соберем взаимодействия в матрицу user*item так, чтобы в строках этой матрицы были user_id, в столбцах - item_id, а на пересечениях строк и столбцов - единица, если пользователь взаимодействовал с айтемом и ноль, если нет.

Такую матрицу удобно собирать в numpy array, однако нужно помнить, что numpy array индексируется порядковыми индексами, а нам же удобнее использовать item_id и user_id.

Создадим некие внутренние индексы для user_id и item_id - uid и iid. Для этого просто соберем все user_id и item_id и пронумеруем их по порядку.

In [13]:
sorted(list(users_df.user_id.unique())) == sorted(list(range(len(users_df.user_id.unique())))), \
sorted(list(items_ohe_df.index.unique())) == sorted(list(range(len(items_ohe_df.index.unique()))))

(False, False)

In [14]:
# Step 1: Get unique IDs and create mappings
unique_user_ids = np.unique(np.concat([interactions_df["user_id"].unique(), users_df['user_id'].unique()]))
unique_item_ids = np.unique(np.concat([interactions_df["item_id"].unique(), items_df['item_id'].unique()]))

# Create mappings (old ID → new consecutive ID starting from 0)
user_id_map = {old_id: np.int32(new_id) for new_id, old_id in enumerate(unique_user_ids)}
user_id_map_to_orig = {np.int32(new_id): old_id for new_id, old_id in enumerate(unique_user_ids)}
item_id_map = {old_id: np.int32(new_id) for new_id, old_id in enumerate(unique_item_ids)}
item_id_map_to_orig = {np.int32(new_id): old_id for new_id, old_id in enumerate(unique_item_ids)}

In [15]:
interactions_df["uid"] = interactions_df['user_id'].map(user_id_map)
interactions_df["iid"] = interactions_df["item_id"].map(item_id_map)

print(sorted(interactions_df.iid.unique())[:5])
print(sorted(interactions_df.uid.unique())[:5])
interactions_df.head()

[np.int32(0), np.int32(1), np.int32(2), np.int32(3), np.int32(4)]
[np.int32(2), np.int32(3), np.int32(5), np.int32(9), np.int32(11)]


Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,uid,iid
0,176549,9506,2021-05-11,4250,72,141413,9178
1,699317,1659,2021-05-29,8317,100,560236,1604
3,864613,7638,2021-07-05,14483,100,692633,7376
5,1032142,6686,2021-05-13,11286,100,826986,6454
6,1016458,354,2021-08-14,1672,25,814480,343


# Interactions matrix

In [16]:
positives = interactions_df[interactions_df["watched_pct"] > 80].copy()
# del data
film_counts = positives.groupby("item_id").size()
films = set(film_counts[film_counts >= 5].index.values)

interactions_filt = positives[positives["item_id"].isin(films)]
del positives
len(interactions_filt), len(films)

(1491848, 6801)

In [17]:
interactions_filt

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,uid,iid
1,699317,1659,2021-05-29,8317,100,560236,1604
3,864613,7638,2021-07-05,14483,100,692633,7376
5,1032142,6686,2021-05-13,11286,100,826986,6454
11,988709,7571,2021-07-07,6558,100,792263,7309
14,5324,8437,2021-04-18,6598,92,4271,8140
...,...,...,...,...,...,...,...
5476240,802291,512,2021-08-08,6990,100,642768,496
5476241,1073802,9927,2021-08-07,6425,97,860383,9585
5476242,268216,3071,2021-04-21,5752,98,215082,2958
5476244,438585,7829,2021-08-02,6804,100,351522,7560


In [18]:
# interactions_matrix = pd.pivot_table(interactions_filt, values="watched_pct", index="user_id", columns="item_id").fillna(0)

# print("Interactions matrix: \nshape=" + str(interactions_matrix.shape))
# print(f"Density: {(interactions_matrix != 0).values.sum() / interactions_matrix.size}")
# from scipy.sparse import csr_matrix

# user_index = interactions_filt['user_id'].map(user_id_map)
# item_index = interactions_filt['item_id'].map(item_id_map)

# interactions_sparse = csr_matrix(
#     (interactions_filt['watched_pct'].values, (user_index, item_index)),
#     shape=(user_ids.cat.categories.size, item_ids.cat.categories.size),
#     dtype=np.float32
# )

# print("Sparse Interactions matrix:")
# print(f"Shape: {interactions_sparse.shape}")
# density = interactions_sparse.count_nonzero() / (interactions_sparse.shape[0] * interactions_sparse.shape[1])
# print(f"Density: {density:.6f}")

In [19]:
triplets = interactions_filt[["user_id", "item_id"]]
del interactions_filt

In [20]:
NUM_NEGATIVE_SAMPLES = 10
triplets = pd.concat([triplets] * NUM_NEGATIVE_SAMPLES).sort_index().reset_index(drop=True)
triplets["film_neg"] = np.random.choice(items_ohe_df.index.unique(), len(triplets))

In [21]:
triplets

Unnamed: 0,user_id,item_id,film_neg
0,699317,1659,3296
1,699317,1659,11111
2,699317,1659,14649
3,699317,1659,13216
4,699317,1659,4173
...,...,...,...
14918475,384202,16197,8394
14918476,384202,16197,12556
14918477,384202,16197,395
14918478,384202,16197,7101


In [22]:
triplets = triplets.rename(columns={ "item_id": "film_pos"})
triplets['film_pos'] = triplets['film_pos'].map(item_id_map)
triplets['film_neg'] = triplets['film_neg'].map(item_id_map)
triplets 

Unnamed: 0,user_id,film_pos,film_neg
0,699317,1604,3177
1,699317,1604,10720
2,699317,1604,14144
3,699317,1604,12754
4,699317,1604,4022
...,...,...,...
14918475,384202,15647,8101
14918476,384202,15647,12111
14918477,384202,15647,384
14918478,384202,15647,6854


In [23]:
rdm = np.random.random(len(triplets))
rdm2 = np.random.random(len(triplets))           # use only 20% of data
train_data = triplets[(rdm < 0.8) &              (rdm2 < 0.2)]
val_data = triplets[(rdm >= 0.8) & (rdm < 0.9) & (rdm2 < 0.2)]
test_data = triplets[(rdm >= 0.9) &              (rdm2 < 0.2)]

len(train_data), len(val_data), len(test_data)

(2388602, 298174, 299276)

In [24]:
items_df.shape, interactions_df.shape

((15963, 14), (2646592, 7))

In [25]:
NO_MOVIE = len(item_id_map) # arbitrary number bigger than largest item id

In [26]:
def pad_with_specific_value(lst, size, val):
    lst = list(set(lst))
    random.shuffle(lst)
    lst = lst[:size]
    return np.pad(lst, (0, size - len(lst)), 'constant', constant_values=(val))

padded_users = triplets.groupby("user_id").apply(lambda x: (
    pad_with_specific_value(x['film_pos'].tolist(), 30, NO_MOVIE).tolist()
))

padded_users = padded_users.reindex(range(2646592), fill_value=[NO_MOVIE] * 30)
padded_users = np.stack(padded_users.values)

  padded_users = triplets.groupby("user_id").apply(lambda x: (


In [27]:
padded_users

array([[15963, 15963, 15963, ..., 15963, 15963, 15963],
       [15963, 15963, 15963, ..., 15963, 15963, 15963],
       [ 7646,  8847,  6591, ..., 12509, 14671, 11129],
       ...,
       [15963, 15963, 15963, ..., 15963, 15963, 15963],
       [15963, 15963, 15963, ..., 15963, 15963, 15963],
       [15963, 15963, 15963, ..., 15963, 15963, 15963]],
      shape=(2646592, 30))

# Создаем датасет для обучения

In [6]:
class DSSMData(pl.LightningDataModule):
    def __init__(self, train_triplets, val_triplets, test_triplets, item_features, padded_users):
        super().__init__()
        self.train_triplets = train_triplets
        self.val_triplets = val_triplets
        self.test_triplets = test_triplets
        self.item_features = item_features
        self.padded_users = padded_users

    def _collect_data(self, triplets: pd.DataFrame):
        users = triplets["user_id"].values
        positives = triplets["film_pos"].values
        negatives = triplets["film_neg"].values
        print(f'Users: {users.shape}\nPositives: {positives.shape}\nNegatives: {negatives.shape}')
        # Wipe out positive interacted tracks from user listen history
        watched_movies = self.padded_users[users]
        watched_movies[np.isin(watched_movies, positives)] = NO_MOVIE
        watched_movies[np.isin(watched_movies, negatives)] = NO_MOVIE

        a = torch.from_numpy(watched_movies).to(dtype=torch.int32)
        # TODO: id которых нет в маппинге мапятся на 0, что плохо
        b = torch.from_numpy(items_ohe_df.loc[list(map(lambda id: item_id_map_to_orig.__getitem__(id), positives))].values).to(dtype=torch.float32)
        c = torch.from_numpy(items_ohe_df.loc[list(map(lambda id: item_id_map_to_orig.__getitem__(id), negatives))].values).to(dtype=torch.float32)
        print(a.shape, b.shape, c.shape)
        return td.TensorDataset(a, b, c)

    def prepare_data(self, stage=None):
        if stage == "fit" or stage is None:
            print('Get train data')
            self.train_dataset = self._collect_data(self.train_triplets)
            self.val_dataset = self._collect_data(self.val_triplets)
        elif stage == "test" or stage is None:
            self.test_dataset = self._collect_data(self.test_triplets)

    def train_dataloader(self):
        return td.DataLoader(self.train_dataset, batch_size=4096, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return td.DataLoader(self.val_dataset, batch_size=4096, num_workers=4)

    def test_dataloader(self):
        return td.DataLoader(self.test_dataset, batch_size=4096, shuffle=False, num_workers=4)

In [7]:
class ItemNet(nn.Module):
    def __init__(self,
                 dim_embedding: int,
                 dim_input: int,
                 dim_hidden: int = 64, # 32
                 activation: tp.Callable[[torch.Tensor], torch.Tensor] = nn.ReLU()
                 ) -> None:
        super().__init__()
        self.embedding_layer = nn.Linear(dim_input - 1, dim_hidden, bias=False)
        # self.dense_layer = nn.Linear(dim_hidden + 1, dim_embedding, bias=False)
        self.dense_block = nn.Sequential(
            nn.Linear(dim_hidden + 1, int(dim_embedding // 2)),
            nn.ReLU(),
            # nn.Linear(int(dim_embedding // 2), int(dim_embedding // 2)),
            # nn.ReLU(),
            nn.Linear(int(dim_embedding // 2), dim_embedding),
            activation
        )
        self.output_layer = nn.Linear(dim_embedding + dim_hidden, dim_embedding, bias=False)
        self.norm = nn.LayerNorm(dim_embedding)

    def forward(self, item_features: torch.Tensor) -> torch.Tensor:
        popularity = item_features[:, 0].view(-1, 1)
        genre_emb = self.embedding_layer(item_features[:, 1:])

        pop_genre = torch.concat([popularity, genre_emb], axis=1)
        features = self.dense_block(pop_genre)

        genre_features = torch.concat([genre_emb, features], axis=1)
        output = self.output_layer(genre_features)
        return self.norm(output)


class UserNet(nn.Module):
    def __init__(self,
                 dim_embedding: int,
                 num_items: int,
                 activation: tp.Callable[[torch.Tensor], torch.Tensor] = nn.ReLU()
                 ) -> None:                              # | +1 for the NO_MOVIE element
        super().__init__()                               # V
        self.track_embeddings = nn.EmbeddingBag(num_items + 1, dim_embedding, padding_idx=num_items)
        # self.dense_layer = nn.Linear(dim_embedding, dim_embedding, bias=False)
        self.dense_layer = nn.Sequential(
            nn.Linear(dim_embedding, int(dim_embedding // 2)),
            nn.ReLU(),
            # nn.Linear(int(dim_embedding // 2), int(dim_embedding // 2)),
            # nn.ReLU(),
            nn.Linear(int(dim_embedding // 2), dim_embedding),
            activation
        )
        self.output_layer = nn.Linear(dim_embedding + dim_embedding, dim_embedding, bias=False)
        self.norm = nn.LayerNorm(dim_embedding)
        self.num_items = num_items

    def forward(self, user_ids: torch.Tensor) -> torch.Tensor:
        # print(f'EMBEDDING BAG MAX INPUT: {user_ids.max()} while was ready for {self.num_items}')
        interactions_emb = self.track_embeddings(user_ids)
        features = self.dense_layer(interactions_emb)
        x = torch.concat([interactions_emb, features], axis=1)
        output = self.output_layer(x)
        return self.norm(output)

In [8]:
class DSSM(pl.LightningModule):
    def __init__(self,
                 dim_item_features: int,
                 num_items: int,
                 embedding_dim: int = 100,
                 lr: float = 1e-3,
                 triplet_loss_margin: float = 0.4,
                 weight_decay: float = 1e-3,
                 log_to_prog_bar: bool = True,
                 ) -> None:
        super().__init__()
        self.lr = lr
        self.triplet_loss_margin = triplet_loss_margin
        self.weight_decay = weight_decay
        self.log_to_prog_bar = log_to_prog_bar
        self.item_net = ItemNet(embedding_dim, dim_item_features)
        self.user_net = UserNet(embedding_dim, num_items)

    def forward(self,
                user_ids: torch.Tensor,
                item_features_pos: torch.Tensor,
                item_features_neg: torch.Tensor,
                ) -> tp.Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        anchor = self.user_net(user_ids)
        pos = self.item_net(item_features_pos)
        neg = self.item_net(item_features_neg)

        return anchor, pos, neg

    def _step(self, batch, batch_idx, metric, prog_bar=False):
        user_ids, pos, neg = batch
        anchor, positive, negative = self(user_ids, pos, neg)
        loss = F.triplet_margin_loss(anchor, positive, negative, margin=self.triplet_loss_margin)
        self.log(metric, loss, prog_bar=prog_bar)
        return loss

    def training_step(self, batch: tp.Sequence[torch.Tensor], batch_idx: int) -> torch.Tensor:
        return self._step(batch, batch_idx, "train_loss")

    def validation_step(self, batch: tp.Sequence[torch.Tensor], batch_idx: int) -> torch.Tensor:
        return self._step(batch, batch_idx, "val_loss", self.log_to_prog_bar)

    def test_step(self, batch, batch_idx, prog_bar=False):
        return self._step(batch, batch_idx, "test_loss", self.log_to_prog_bar)

    def inference(self, dataloader: td.DataLoader[tp.Any], mode: str = "item") -> np.ndarray:
        if mode == "user":
          model = self.user_net
        elif mode == "item":
          model = self.item_net
        else:
          raise ValueError(f"Unsupported mode {mode}!")

        self.eval()

        batches = []
        user_ids = []
        for ids, features in dataloader:
            with torch.no_grad():
                batch_embeddings = model(features.to(self.device))
            batches.append(batch_embeddings)
            user_ids.append(ids)
        vectors = torch.cat(batches, dim=0).cpu().numpy()
        vectors_ids = torch.cat(user_ids, dim=0).cpu().numpy()
        return vectors_ids, vectors

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        # lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold=2.5e-2, threshold_mode='rel')
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold=2e-3, threshold_mode='abs')
        scheduler = {
            'scheduler': lr_scheduler,
            'monitor': 'val_loss'
        }
        return [optimizer], [scheduler]

In [9]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

class RelativeEarlyStopping(EarlyStopping):
    def __init__(
        self,
        monitor: str = "val_loss",
        patience: int = 5,
        relative_threshold: float = 1e-4,  # 0.01%
        verbose: bool = True,
    ):
        super().__init__(
            monitor=monitor,
            patience=patience,
            min_delta=0,  # We handle delta ourselves
            mode="min",
            verbose=verbose,
        )
        self.relative_threshold = relative_threshold

    def _evaluate_stopping_criteria(self, current: torch.Tensor):
        if self.best_score is None:  # First evaluation
            return False, None
            
        # Calculate relative change
        relative_change = abs((current - self.best_score) / self.best_score)
        
        if relative_change < self.relative_threshold:
            self.wait_count += 1
            if self.verbose:
                print(f"Relative change {relative_change:.4%} < threshold {self.relative_threshold:.4%}")
        else:
            if self.verbose:
                print(f"Relative change {relative_change:.4%} >= threshold {self.relative_threshold:.4%}")
            self.wait_count = 0
            
        should_stop = self.wait_count >= self.patience
        reason = (f"Relative change {relative_change:.4%} < threshold {self.relative_threshold:.4%}" 
                 if should_stop else None)
        return should_stop, reason

# Обучение


In [10]:
EMBEDDING_DIM = 96
EXPERIMENT_NAME = 'dssm_layernorm_params_arch_5'

data_module = DSSMData(train_data, val_data, test_data, items_ohe_df, padded_users)
net = DSSM(dim_item_features=items_ohe_df.shape[1],
           num_items=len(item_id_map),
           embedding_dim=EMBEDDING_DIM
           ).to(dtype=torch.float32)

trainer = pl.Trainer(
    max_epochs=25,
    accelerator='cuda',
    devices=1,
    callbacks=[
        # RelativeEarlyStopping(monitor="val_loss", patience=5, relative_threshold=1.5e-2, verbose=True),
        callbacks.EarlyStopping(monitor="val_loss", patience=5, min_delta=2e-3, verbose=True),
        callbacks.LearningRateMonitor(logging_interval="step"),
        (checkpoint_callback := callbacks.ModelCheckpoint(monitor="val_loss")),
    ]
)

NameError: name 'train_data' is not defined

In [33]:
trainer.fit(
    net,
    data_module, 
    # ckpt_path='/home/serg_fedchn/Homework/6_semester/RecSys/recsys-course-spring-2025/jupyter/lightning_logs/version_13/checkpoints/epoch=32-step=63393.ckpt'
)

Get train data
Users: (2388602,)
Positives: (2388602,)
Negatives: (2388602,)
torch.Size([2388602, 30]) torch.Size([2388602, 579]) torch.Size([2388602, 579])
Users: (298174,)
Positives: (298174,)
Negatives: (298174,)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type    | Params | Mode 
---------------------------------------------
0 | item_net | ItemNet | 60.4 K | train
1 | user_net | UserNet | 1.6 M  | train
---------------------------------------------
1.6 M     Trainable params
0         Non-trainable params
1.6 M     Total params
6.484     Total estimated model params size (MB)
18        Modules in train mode
0         Modules in eval mode


torch.Size([298174, 30]) torch.Size([298174, 579]) torch.Size([298174, 579])


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.150


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.002 >= min_delta = 0.002. New best score: 0.148


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.004 >= min_delta = 0.002. New best score: 0.144


Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.144. Signaling Trainer to stop.


In [34]:
checkpoint_callback.best_model_path

'/home/serg_fedchn/Homework/6_semester/DL/reelrecs/lightning_logs/version_11/checkpoints/epoch=11-step=14004.ckpt'

In [35]:
shutil.move(checkpoint_callback.best_model_path, f"/home/serg_fedchn/Homework/6_semester/DL/reelrecs/model_weights/{EXPERIMENT_NAME}.ckpt")

'/home/serg_fedchn/Homework/6_semester/DL/reelrecs/model_weights/dssm_layernorm_params_arch_5.ckpt'

In [11]:
best_dssm = DSSM.load_from_checkpoint(f"/home/serg_fedchn/Homework/6_semester/DL/reelrecs/model_weights/{EXPERIMENT_NAME}.ckpt",
                                      dim_item_features=items_ohe_df.shape[1],
                                      num_items=len(item_id_map),
                                      embedding_dim=EMBEDDING_DIM)

NameError: name 'items_ohe_df' is not defined