In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import wget
from scipy.sparse import load_npz
from torch import nn
from tqdm.notebook import tqdm as tqdm_notebook

os.sys.path.append('/hse_mlds_recsys_project')
from app.models.nn_models import NNModel, MatrixFactorization
from app.utils.metrics import precision_at_k

In [2]:
# Absolute location of the project checkout (the same directory that the
# imports cell appends to sys.path); data files below are resolved against it.
root_dir = Path('/hse_mlds_recsys_project/')

In [12]:
class MatrixFactorization(nn.Module):
    """Matrix-factorization scorer: score(user, item) = dot(user_vec, item_vec).

    NOTE(review): this definition shadows the ``MatrixFactorization`` name
    imported from ``app.models.nn_models`` in the imports cell; every later
    cell of the notebook uses this class.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: width of both embedding tables.
    """

    def __init__(self, n_users: int, n_items: int, n_factors=150):
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        # sparse=True makes both tables produce sparse gradients, so they must
        # be trained with an optimizer that accepts them (e.g. SparseAdam).
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        """Return the dot product of user and item embeddings, reduced over dim 1.

        Args:
            user: integer tensor of user indices.
            item: integer tensor of item indices; the looked-up embeddings are
                multiplied element-wise, so the two index tensors must be
                broadcast-compatible (e.g. one user against many items).
        """
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

    def predict(self, user_index) -> np.ndarray:
        """Rank every item for one user.

        Args:
            user_index: row of the user in the user embedding table.

        Returns:
            Array of all ``n_items`` item indices ordered from the highest
            predicted score to the lowest.
        """
        # Build the index tensors on the same device as the parameters;
        # otherwise a model moved to CUDA fails with a device mismatch.
        device = self.user_factors.weight.device

        with torch.inference_mode():
            scores = self.forward(
                torch.tensor([user_index], device=device),
                torch.arange(self.n_items, device=device),
            ).cpu().numpy()  # .cpu() is a no-op on CPU and required for CUDA tensors

        # argsort is ascending, so flip it to get best-first ordering
        return np.flip(scores.argsort())

In [4]:
# Prefer the first CUDA device when one is visible to PyTorch, else fall back to CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [7]:
# Hyper-parameters of this training run, named so they can be tuned in one place.
SEED = 42
N_FACTORS = 150
LEARNING_RATE = 1e-2
N_EPOCHS = 100

# Seed before the embedding tables are initialised so the run is repeatable.
torch.manual_seed(SEED)

# Force CSR so the per-user row slicing in the loop below is supported and cheap
# regardless of the format the matrix was saved in.
user_item_matrix = (
    load_npz(root_dir / 'app' / 'data' / 'train_user_item_matrix.npz')
    .astype('float')
    .tocsr()
)
n_users, n_items = user_item_matrix.shape

criterion = nn.MSELoss()

net = MatrixFactorization(
    n_users=n_users,
    n_items=n_items,
    n_factors=N_FACTORS
).to(device)
# The embedding tables are created with sparse=True, hence SparseAdam.
optimizer = torch.optim.SparseAdam(net.parameters(), lr=LEARNING_RATE)

# Per-epoch mean / std of the per-user losses.
mean_loss_log = []
std_loss_log = []

for epoch in tqdm_notebook(range(N_EPOCHS), desc='epochs'):
    losses_log = []
    # One optimisation step per user, fitted on that user's stored ratings only.
    for row in tqdm_notebook(range(n_users), desc='users', leave=False):
        # A single COO view of the row yields column ids and values that are
        # aligned by construction (one sparse lookup instead of two).
        user_row = user_item_matrix[row].tocoo()
        observed = user_row.data != 0
        if not observed.any():
            # No ratings for this user: MSE over an empty tensor is NaN and
            # would poison the epoch statistics, so skip the step.
            continue

        rated_items = torch.tensor(user_row.col[observed], dtype=torch.long, device=device)
        ratings = torch.tensor(user_row.data[observed], dtype=torch.float32, device=device)
        user = torch.tensor([row], device=device)

        optimizer.zero_grad()
        pred = net(user, rated_items)
        loss = criterion(pred, ratings)
        loss.backward()
        optimizer.step()

        losses_log.append(loss.item())

    mean_loss_log.append(np.mean(losses_log))
    std_loss_log.append(np.std(losses_log))

In [None]:
# Persist only the learned weights; the architecture is rebuilt from code on load.
checkpoint = {'model_state': net.state_dict()}
torch.save(checkpoint, 'matrix_factorization_lr_1e2.pt')

#### Обучал на GPU в Google Colab. Сохранил модель в Google Drive.

In [8]:
# Hold-out interactions, read straight from the zipped CSV.
test = pd.read_csv('../datasets/test.csv.zip', compression='zip')

# One row per user; 'actual' holds the distinct songs of that user in the test set.
result = (
    test
    .groupby('user_id')['song']
    .unique()
    .reset_index()
    .rename(columns={'song': 'actual'})
)

In [13]:
# Google Drive file id of the trained checkpoint.
model_id = '1-02guFWbJHBVCHBKR_IgfYlwDoEmGcnm'

# The value returned by wget.download is used below as the local path of the file.
model_params_path = wget.download(f'https://docs.google.com/uc?export=download&id={model_id}')
try:
    # n_users / n_items come from the cell that loads train_user_item_matrix.npz.
    net_model = NNModel(MatrixFactorization, net_params_path=model_params_path, n_users=n_users, n_items=n_items)
finally:
    # Always remove the downloaded file, even when building the model fails,
    # so a failed load does not leave a stray checkpoint in the working directory.
    Path(model_params_path).unlink()

In [14]:
# Bare expression: the notebook renders the repr of the loaded model wrapper.
net_model

NNModel(
  (net): MatrixFactorization(
    (user_factors): Embedding(13607, 150, sparse=True)
    (item_factors): Embedding(10000, 150, sparse=True)
  )
)

In [15]:
%%time
def _recommend_top_20(user_id):
    """Ask the loaded model for 20 recommendations for a single user id."""
    return net_model.get_recommendations(user_id, n=20)


# One recommendation list per test user, stored next to the ground truth.
result['mf_150_lr_1e_2'] = result['user_id'].apply(_recommend_top_20)
result.head()

CPU times: user 1min 35s, sys: 32.1 ms, total: 1min 35s
Wall time: 24.1 s


Unnamed: 0,user_id,actual,mf_150_lr_1e_2
0,00055176fea33f6e027cd3302289378b,"[Demi Lovato__Give Your Heart A Break, Ed Shee...","[Bridgit Mendler__Ready or Not, Maná__Clavado ..."
1,0007f3dd09c91198371454c608d47f22,"[Ben Howard__Keep Your Head Up, Ed Sheeran__Sm...","[Sara Bareilles__Winter Song, The Used__The Ta..."
2,000b0f32b5739f052b9d40fcc5c41079,[Lars Winnerbäck__Om du lämnade mig nu],"[Beck__The Golden Age, Agnes Obel__Dorian, Bif..."
3,000c11a16c89aa4b14b328080f5954ee,[Arctic Monkeys__Why'd You Only Call Me When Y...,"[Jason Mraz__Love Someone, Golden Earring__Rad..."
4,00123e0f544dee3ab006aa7f1e5725a7,[Foster The People__Don't Stop (Color on the W...,"[Queen__You're My Best Friend, Martin Tungevaa..."


In [16]:
# precision@k per user (ground truth vs. recommendations), then averaged over users.
per_user_precision = result.apply(
    lambda user_row: precision_at_k(user_row['actual'], user_row['mf_150_lr_1e_2']),
    axis=1,
)
per_user_precision.mean()

0.001396340119056368