In [None]:

'''
Нужно реализовать (или использовать сделанный из предыдущих домашних работ)
1. SVD-разложение
2. Архитектуру Neural matrix factorization model (NeuMF) (см. лекцию и сем по нейронным сетям + https://arxiv.org/pdf/1708.05031.pdf +  https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation), на вход которой подаются:
• эмбединги длинны n по users 
• эмбединги длинны n по items  
3. Гибридную архитектуру нейронную сеть, которая на вход получает:
• эмбединги длинны n по users 
• эмбединги длинны n по items 
• признаки, которые можно извлечь из объекта (см.  baseline.ipynb, который есть в датасэт REKKO.)
• признаки по пользователю (подумайте какие например можно извлечь из bookmarks.csv)


Для гибридной архитектуры можно использовать NeuMF, а рядом подать дополнительные входы. Их можно связать полносвязными слоями с основной частью. Это можно делать разными способами: ближе к началу сети или ближе к концу. Попробуйте разные архитектуры, оцените их качество и скорость обучения. Обоснуйте свой финальный выбор.
Сравнить:
• SVD-разложения (построенного на только rating.csv) на  8, 10, 12 компонент
• Архитектуру NeuMF, где на вход подаются на SVD-разложения 8, 10, 12 компонент
• Гибридную архитектуру, где на вход подаются на SVD-разложения 8, 10, 12 компонент
Дополнительно можно поисследовать какие-то параметры архитектур. 
Для оценки качества используйте кросс-валидацию на 3 фолда.
'''

In [None]:
import pandas as pd
import numpy as np

# Load the raw ratings and drop the timestamp column in one expression, so
# re-running the cell never mutates an already-loaded frame in place.
rekko_data = pd.read_csv('/kaggle/input/rekko-rating/ratings.csv').drop(columns='ts')
# A notebook only renders the LAST expression of a cell, so the preview has to
# come after the drop (previously its result was silently discarded).
rekko_data.head()

In [None]:
# Re-index raw user ids to contiguous integers 0..n_users-1, numbered in order
# of first appearance; these integers are later used as embedding row indices.
unique_users = rekko_data['user_uid'].unique()
user_to_index = dict(zip(unique_users, range(len(unique_users))))
new_users = rekko_data['user_uid'].map(user_to_index)

# Same re-indexing for the rated elements.
unique_movies = rekko_data['element_uid'].unique()
movie_to_index = dict(zip(unique_movies, range(len(unique_movies))))
new_movies = rekko_data['element_uid'].map(movie_to_index)

# Sizes of the two id spaces.
n_users = len(unique_users)
n_elements = len(unique_movies)

In [None]:
# Model inputs: one row per rating holding the re-indexed user / element ids.
X = pd.concat([new_users.rename('user_id'), new_movies.rename('elem_id')], axis=1)
# Regression target as float32.
y = rekko_data.rating.astype('float32')


print(f'Embeddings: {n_users} users, {n_elements} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
class ReviewsIterator:
    """Single-pass iterator over ``(X, y)`` in consecutive mini-batches.

    Parameters
    ----------
    X, y : array-like
        Features and targets with the same first dimension; both are converted
        with ``np.asarray``.
    batch_size : int
        Maximum number of rows per batch (must be >= 1).
    shuffle : bool
        If True, rows of ``X`` and ``y`` are permuted together once, at
        construction time, using NumPy's global random state.

    Yields
    ------
    (X_batch, y_batch) slices of at most ``batch_size`` rows. The final batch
    may be shorter; no rows are dropped.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """

    def __init__(self, X, y, batch_size=32, shuffle=True):
        if batch_size < 1:
            raise ValueError(f'batch_size must be >= 1, got {batch_size}')

        X, y = np.asarray(X), np.asarray(y)

        if shuffle:
            index = np.random.permutation(X.shape[0])
            X, y = X[index], y[index]

        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Ceiling division in pure integer arithmetic. The previous
        # `ceil(n // batch_size)` floored first and therefore skipped the last
        # partial batch (and all data when n < batch_size).
        self.n_batches = (X.shape[0] + batch_size - 1) // batch_size
        self._current = 0

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        """Return the next ``(X_batch, y_batch)`` pair or raise StopIteration."""
        if self._current >= self.n_batches:
            raise StopIteration()
        k = self._current
        self._current += 1
        bs = self.batch_size
        return self.X[k*bs:(k + 1)*bs], self.y[k*bs:(k + 1)*bs]

In [None]:
def batches(X, y, bs=32, shuffle=True):
    """Yield ``(features, targets, row_labels)`` mini-batches from a DataFrame.

    The iterator walks over ``X.index`` (not the values), so each batch also
    carries the index labels of its rows; the rows are then fetched with
    ``X.loc``. ``features`` is a LongTensor, ``targets`` a FloatTensor reshaped
    to a single column.

    NOTE(review): this notebook defines ``batches`` more than once with
    different return arity; whichever cell ran last is the one in effect.
    """
    for row_labels, targets in ReviewsIterator(X.index, y, bs, shuffle):
        features = torch.LongTensor(X.loc[row_labels].values)
        yield features, torch.FloatTensor(targets).view(-1, 1), row_labels


In [None]:
import math
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F 
from torch.optim.lr_scheduler import _LRScheduler
from itertools import zip_longest


# Smoke test: pull a single mini-batch of four rows and show its three parts.
first_batch = next(batches(X, y, bs=4), None)
if first_batch is not None:
    x_batch, y_batch, indices = first_batch
    print(x_batch)
    print(y_batch)
    print(indices)

**Смотрим на работу SVD**

In [None]:
#класс SVD для коллаборативной фильтрации на основе SGD 
import numpy as np
import pandas as pd

class SVDWithSGD:
    """Latent-factor model ``rating ~ U[user] . V[item]`` trained with SGD.

    There are no bias terms; every observed rating triggers one regularised
    gradient step on the corresponding user and item vectors.

    Parameters
    ----------
    num_factors : int
        Dimensionality of the user / item vectors.
    learning_rate : float
        SGD step size.
    reg_param : float
        L2 regularisation strength applied to both vectors.
    num_epochs : int
        Number of full passes over the ratings.
    """

    def __init__(self, num_factors=10, learning_rate=0.01, reg_param=0.02, num_epochs=20):
        self.num_factors = num_factors
        self.learning_rate = learning_rate
        self.reg_param = reg_param
        self.num_epochs = num_epochs

    def fit(self, ratings_df):
        """Learn ``U`` and ``V`` from a frame with user_uid / element_uid / rating columns.

        Factors are initialised uniformly in [0, 1) from NumPy's global random
        state. Rows are visited in frame order on every epoch.
        """
        self.user_col = 'user_uid'
        self.item_col = 'element_uid'
        self.rating_col = 'rating'

        self.users = ratings_df[self.user_col].unique()
        self.items = ratings_df[self.item_col].unique()

        self.num_users = len(self.users)
        self.num_items = len(self.items)

        self.user_to_idx = {user: idx for idx, user in enumerate(self.users)}
        self.item_to_idx = {item: idx for idx, item in enumerate(self.items)}

        self.U = np.random.rand(self.num_users, self.num_factors)
        self.V = np.random.rand(self.num_items, self.num_factors)

        # Resolve raw ids to matrix rows once, up front. The previous
        # `iterrows()` loop rebuilt a Series and repeated both dict lookups for
        # every rating on every epoch.
        user_indices = ratings_df[self.user_col].map(self.user_to_idx).to_numpy()
        item_indices = ratings_df[self.item_col].map(self.item_to_idx).to_numpy()
        ratings = ratings_df[self.rating_col].to_numpy()

        lr, reg = self.learning_rate, self.reg_param
        for epoch in range(self.num_epochs):
            for user_idx, item_idx, rating in zip(user_indices, item_indices, ratings):
                user_vec = self.U[user_idx]
                item_vec = self.V[item_idx]
                error = rating - np.dot(user_vec, item_vec)

                # Both steps are computed from the pre-update vectors and only
                # then applied, so the user step does not leak into the item step.
                u_update = lr * (error * item_vec - reg * user_vec)
                v_update = lr * (error * user_vec - reg * item_vec)

                self.U[user_idx] += u_update
                self.V[item_idx] += v_update
            print('epoch', epoch, 'happened')

    def predict(self, users, items):
        """Predict a rating for every ``(user, item)`` pair.

        Returns a list of ints: the dot product rounded to the nearest integer
        for pairs seen during ``fit``, and 0 for any pair whose user or item
        was not in the training data.

        NOTE(review): rounding and the constant 0 fallback both affect RMSE;
        kept as-is because callers may rely on integer outputs.
        """
        predictions = []
        for user, item in zip(users, items):
            if user in self.user_to_idx and item in self.item_to_idx:
                user_idx = self.user_to_idx[user]
                item_idx = self.item_to_idx[item]
                prediction = np.dot(self.U[user_idx], self.V[item_idx])
                predictions.append(int(round(prediction)))
            else:
                # Cold-start pair: no learned vector for the user and/or item.
                predictions.append(0)
        return predictions


In [None]:
#делаем 5 случайных разбиений train/test (повторный hold-out с разными random_state) для оценки качества
#обучим модельку 
#протестируем 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error


# Repeated random hold-out evaluation of the SVD model: one train/test split
# per seed (train_test_split's default test size), RMSE averaged over seeds.
random_states = [40, 41, 42, 43, 44]
final_score = []

# The feature/target separation does not depend on the seed, so build it once.
X = rekko_data.drop('rating', axis=1)
y = rekko_data['rating']

for i in random_states:
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=i)
    print('Random state:', i)
    df_for_class = pd.concat([X_train, y_train], axis=1)

    svd = SVDWithSGD(num_factors=10, learning_rate=0.01, reg_param=0.02, num_epochs=10)
    svd.fit(df_for_class)

    preds = svd.predict(X_test['user_uid'], X_test['element_uid'])
    # Take the root explicitly instead of passing `squared=False`, which newer
    # scikit-learn releases no longer accept; compute it once and reuse it.
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    final_score.append(rmse)
    print('RMSE for SVD model is:', rmse)

print('RMSE in cross-validation of 5 folds:', np.mean(final_score))

**Теперь достанем новые данные из Rekko датасета помимо user-element пар**

In [None]:
#поработаем с bookmarks 
#что можно сделать - сделать бинарную переменную оставил ли пользователь фильм bookmark 

import pandas as pd
import numpy as np

# Load the bookmark events and drop the timestamp column in one expression.
bm_rekko_data = pd.read_csv('/kaggle/input/rekko-challenge/bookmarks.csv').drop(columns='ts')
# The preview must be the last expression of the cell to be rendered
# (previously it ran before the drop and its result was discarded).
bm_rekko_data.head()

In [None]:
# One row per user holding the set of elements that user bookmarked.
bookmarked_items = bm_rekko_data.groupby('user_uid')['element_uid'].apply(set).reset_index()

# user_uid -> set(element_uid). Built once so that each row lookup is a hash
# probe; the previous version scanned every user id and filtered the whole
# frame with a boolean mask for every single rating row.
_bookmarks_by_user = dict(zip(bookmarked_items['user_uid'], bookmarked_items['element_uid']))
_NO_BOOKMARKS = frozenset()

def is_bookmarked(row):
    """Return 1 if ``row['user_uid']`` bookmarked ``row['element_uid']``, else 0.

    Users that have no bookmarks at all yield 0.
    """
    return int(row['element_uid'] in _bookmarks_by_user.get(row['user_uid'], _NO_BOOKMARKS))

# Binary feature: did the user bookmark the rated element?
rekko_data['bookmarked'] = rekko_data.apply(is_bookmarked, axis=1)

In [None]:
#подгружаем данные 

import os
import json

DATA_PATH = '/kaggle/input/rekko-challenge/'

# Read the element catalogue (a JSON object keyed by element id).
catalogue_path = os.path.join(DATA_PATH, 'catalogue.json')
with open(catalogue_path, 'r') as f:
    catalogue = json.load(f)

# JSON object keys are strings; convert them to int ids.
catalogue = dict((int(element_id), attrs) for element_id, attrs in catalogue.items())

In [None]:
#выписываем колонки для различных характеристик элементов 
# Catalogue attributes that are copied onto every rating row.
future_cols = [
    'type',
    'availability',
    'duration',
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
    'feature_5',
    'attributes',
]

In [None]:
#делаем преобразования с данными 
# Keep only the selected attributes of every catalogue element.
element_attributes_subset = {}
for element_id, attributes in catalogue.items():
    element_attributes_subset[element_id] = {col: attributes[col] for col in future_cols}

# Transpose so that elements become rows, then left-join onto the ratings by
# element id.
# NOTE(review): re-running this cell merges the same columns a second time.
element_features = pd.DataFrame(element_attributes_subset).transpose()
rekko_data = rekko_data.merge(element_features, how='left', left_on='element_uid', right_index=True)

In [None]:
#преобразуем одну из переменных 
# Replace the per-element attribute collection with its length.
rekko_data['attributes_count'] = rekko_data['attributes'].map(len)
rekko_data.drop(columns='attributes', inplace=True)

In [None]:
#и ещё одну 
# Collect every distinct value that occurs in any row's `availability` collection.
unique_values = set()
for sublist in rekko_data['availability']:
    unique_values.update(sublist)

# One 0/1 indicator column per availability value. Iterate in sorted order:
# plain set iteration order depends on hashing, so the column order (and hence
# the feature order fed to the network) could differ between kernel sessions.
for value in sorted(unique_values, key=str):
    rekko_data[value] = rekko_data['availability'].apply(lambda x: 1 if value in x else 0)

In [None]:
# The raw collection column is no longer needed once the indicators exist.
rekko_data.drop(columns='availability', inplace=True)

In [None]:
#делаем класс для Neural Collaborative Filtering с использованием Matrix Factorization 

class GMF(nn.Module):
    """Generalised matrix-factorisation branch.

    Holds one embedding table for users and one for elements (both with
    ``n_factors`` columns, Xavier-uniform initialised) and returns their
    element-wise product, i.e. a ``(batch, n_factors)`` tensor rather than a
    scalar score.
    """

    def __init__(self, n_users, n_elements, n_factors):
        super().__init__()
        self.u_gmf = nn.Embedding(n_users, n_factors)
        self.e_gmf = nn.Embedding(n_elements, n_factors)
        self._init()

    def _init(self):
        """Xavier-uniform initialisation of both embedding tables."""
        for table in (self.u_gmf, self.e_gmf):
            nn.init.xavier_uniform_(table.weight)

    def forward(self, users, movies):
        """Return the element-wise product of the looked-up user and element vectors."""
        user_vecs = self.u_gmf(users)
        element_vecs = self.e_gmf(movies)
        return torch.mul(user_vecs, element_vecs)
    
    
class MyNet(nn.Module):
    """MLP branch: concatenated user/element embeddings -> hidden layers -> score.

    Parameters
    ----------
    n_users, n_elements : int
        Sizes of the two embedding tables.
    n_factors : int
        Embedding width; the first hidden layer receives ``2 * n_factors`` inputs.
    activation : callable
        Zero-argument factory (e.g. ``nn.ReLU``) instantiated after every
        hidden linear layer.
    hidden_units : int or iterable of int
        Width(s) of the hidden layers. An empty iterable connects the
        embeddings directly to the output layer.
    """

    def __init__(self, n_users, n_elements, n_factors, activation,
                 hidden_units=10):
        super().__init__()
        hidden = self.get_list(hidden_units)

        self.u = nn.Embedding(n_users, n_factors)
        self.e = nn.Embedding(n_elements, n_factors)

        # Linear -> activation pairs; `n_in` tracks the running width so the
        # output layer also works when `hidden` is empty.
        layers = []
        n_in = n_factors * 2
        for n_out in hidden:
            layers.append(nn.Linear(n_in, n_out))
            layers.append(activation())
            n_in = n_out

        self.hidden = nn.Sequential(*layers)
        self.fc = nn.Linear(n_in, 1)
        # Must be *called*: the bare attribute access `self._init` was a no-op,
        # so the custom initialisation below never ran.
        self._init()

    def get_list(self, n):
        """Normalise ``hidden_units`` to a list (a scalar becomes a one-item list).

        Raises
        ------
        TypeError
            If ``n`` is neither a number nor an iterable.
        """
        if isinstance(n, (int, float)):
            return [n]
        elif hasattr(n, '__iter__'):
            return list(n)
        raise TypeError(f'hidden_units must be a number or an iterable, got {type(n).__name__}')

    def _init(self):
        """
        Setup embeddings and hidden layers with reasonable initial values:
        embeddings uniform in [-0.05, 0.05], linear weights Xavier-uniform,
        linear biases 0.01.
        """
        def init(m):
            if type(m) == nn.Linear:
                torch.nn.init.xavier_uniform_(m.weight)
                m.bias.data.fill_(0.01)

        self.u.weight.data.uniform_(-0.05, 0.05)
        # The element table is `self.e`; the former `self.m` does not exist.
        self.e.weight.data.uniform_(-0.05, 0.05)
        self.hidden.apply(init)
        init(self.fc)

    def forward(self, users, movies, minmax=None):
        """Return a ``(batch, 1)`` score.

        The raw output is squashed with a sigmoid. When ``minmax=(lo, hi)`` is
        given it is then stretched to the interval ``(lo - 0.5, hi + 0.5)``.
        """
        features = torch.cat([self.u(users), self.e(movies)], dim=1)
        x = self.hidden(features)
        out = torch.sigmoid(self.fc(x))
        if minmax is not None:
            min_rating, max_rating = minmax
            out = out*(max_rating - min_rating + 1) + min_rating - 0.5
        return out
        


class NCFWithGMF(nn.Module):
    """Fusion of the MLP branch (``MyNet``) and the GMF branch (``GMF``).

    The MLP branch contributes its ``(batch, 1)`` sigmoid score and the GMF
    branch its ``(batch, n_factors)`` element-wise product; both are
    concatenated and mapped to one score by ``final_fc``.

    NOTE(review): the MLP branch is fused through its final scalar output, not
    through its last hidden layer — confirm this is the intended architecture.
    """

    def __init__(self, n_users, n_elements, n_factors, activation, hidden_units=10):
        super().__init__()
        self.mlp = MyNet(n_users, n_elements, n_factors, activation, hidden_units)
        self.gmf = GMF(n_users, n_elements, n_factors)
        # Input width = 1 (MLP score) + n_factors (GMF product). This used to
        # be the literal 151, which only matched n_factors == 150.
        self.final_fc = nn.Linear(n_factors + 1, 1)
        self._init()

    def _init(self):
        """Xavier-uniform weights and a 0.01 bias for the fusion layer."""
        torch.nn.init.xavier_uniform_(self.final_fc.weight)
        self.final_fc.bias.data.fill_(0.01)

    def forward(self, users, movies, minmax=None):
        """Return a ``(batch, 1)`` score.

        The fused output goes through a sigmoid and, when ``minmax=(lo, hi)``
        is given, is stretched to ``(lo - 0.5, hi + 0.5)``. The MLP branch is
        always called without ``minmax``, so its contribution stays in (0, 1).
        """
        mlp_output = self.mlp(users, movies)
        gmf_output = self.gmf(users, movies)
        combined_features = torch.cat([mlp_output, gmf_output], dim=1)
        final_output = torch.sigmoid(self.final_fc(combined_features))
        if minmax is not None:
            min_rating, max_rating = minmax
            final_output = final_output * (max_rating - min_rating + 1) + min_rating - 0.5
        return final_output


In [None]:
# Rebuild the id-based inputs and the float32 target.
X = pd.DataFrame(dict(user_id=new_users, elem_id=new_movies))
y = rekko_data['rating'].astype('float32')


print(f'Embeddings: {n_users} users, {n_elements} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
#настроим кросс валидацию 
# fold name -> {'train': (X, y), 'val': (X, y)} and the matching row counts.
dfs = {}
dfs_sizes = {}

# Number of contiguous, unshuffled validation slices.
num_folds = 5

# Every fold holds this many rows; the final fold also absorbs the remainder.
fold_size = len(X) // num_folds

for fold in range(num_folds):
    start_index = fold_size * fold
    is_last_fold = fold == num_folds - 1
    end_index = len(X) if is_last_fold else start_index + fold_size

    # Validation part: the slice itself (keeps its pandas type).
    X_test = X[start_index:end_index]
    y_test = y[start_index:end_index]
    # Training part: everything before and after the slice; np.concatenate
    # returns plain ndarrays here.
    X_train = np.concatenate((X[:start_index], X[end_index:]), axis=0)
    y_train = np.concatenate((y[:start_index], y[end_index:]), axis=0)

    fold_name = f'fold_{fold + 1}'
    dfs[fold_name] = {'train': (X_train, y_train), 'val': (X_test, y_test)}
    dfs_sizes[fold_name] = {'train': len(X_train), 'val': len(X_test)}

In [None]:
#здесь будет обучение 
from sklearn.model_selection import train_test_split 

# Single 80/20 hold-out split, packaged in the {'train': ..., 'val': ...}
# layout that the training helpers read.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
df = dict(train=(X_train, y_train), val=(X_test, y_test))
df_sizes = dict(train=len(X_train), val=len(X_test))

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error

def train_model(model, df, df_sizes, batches, epochs=10, lr=0.001, minmax=None, batch_size=2000):
    """Train a collaborative-filtering model with Adam + MSE and report validation RMSE.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(users, movies, minmax)``.
    df : dict
        ``{'train': (X, y), 'val': (X, y)}``; column 0 of X is the user index
        and column 1 the element index.
    df_sizes : dict
        Kept for interface compatibility. Averages are computed over the
        samples actually yielded by ``batches``, so this is not read.
    batches : callable
        ``batches(X, y, bs=..., shuffle=...)`` yielding ``(xb, yb)`` tensor pairs.
    epochs : int
        Number of passes over the training data.
    lr : float
        Adam learning rate.
    minmax : tuple or None
        Forwarded unchanged to the model.
    batch_size : int
        Mini-batch size for both training and validation (previously a
        hard-coded 2000).

    Returns
    -------
    float
        Validation RMSE after the last epoch (NaN if no validation sample was
        processed, e.g. ``epochs == 0``).
    """
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_data, train_labels = df['train']
    val_data, val_labels = df['val']

    val_rmse = float('nan')

    for epoch in range(epochs):
        model.train()
        train_sq_err, train_seen = 0.0, 0

        # Training loop for collaborative filtering network
        for xb, yb in batches(train_data, train_labels, bs=batch_size, shuffle=True):
            users = xb[:, 0]
            movies = xb[:, 1]
            # reshape(-1) keeps a 1-D tensor even for a batch of one row,
            # unlike squeeze(), which collapses it to a scalar.
            ratings = yb.reshape(-1).float()

            optimizer.zero_grad()
            outputs = model(users, movies, minmax).reshape(-1)
            loss = criterion(outputs, ratings)
            loss.backward()
            optimizer.step()

            # `loss` is a per-batch mean; weight it by the batch size so the
            # epoch figure is a true per-sample MSE.
            train_sq_err += loss.item() * ratings.numel()
            train_seen += ratings.numel()

        running_loss = train_sq_err / max(train_seen, 1)

        # Validation
        model.eval()
        val_sq_err, val_seen = 0.0, 0
        with torch.no_grad():
            for xb, yb in batches(val_data, val_labels, bs=batch_size, shuffle=False):
                users = xb[:, 0]
                movies = xb[:, 1]
                ratings = yb.reshape(-1).float()

                outputs = model(users, movies, minmax).reshape(-1)
                val_sq_err += criterion(outputs, ratings).item() * ratings.numel()
                val_seen += ratings.numel()

        val_loss = val_sq_err / max(val_seen, 1)
        # RMSE over all validation samples is the square root of their MSE.
        val_rmse = val_loss ** 0.5 if val_seen else float('nan')

        print(f"Epoch {epoch + 1}/{epochs}: "
              f"Train Loss: {running_loss:.4f}, "
              f"Val Loss: {val_loss:.4f}, "
              f"Val RMSE: {val_rmse:.4f}")

    print('Training complete')
    return val_rmse


In [None]:
# (lowest, highest) observed rating; passed to the models as `minmax`.
ratings_range = (rekko_data['rating'].min(), rekko_data['rating'].max())
ratings_range

In [None]:
def batches(X, y, bs=32, shuffle=True):
    """Yield ``(features, targets)`` mini-batches as tensors.

    ``features`` is a LongTensor built from the rows of ``X``; ``targets`` is
    a FloatTensor reshaped to a single column.

    NOTE(review): this notebook defines ``batches`` more than once with
    different return arity; whichever cell ran last is the one in effect.
    """
    iterator = ReviewsIterator(X, y, bs, shuffle)
    for features, targets in iterator:
        yield torch.LongTensor(features), torch.FloatTensor(targets).view(-1, 1)

In [None]:
# Train a fresh NeuMF model on every pre-built fold and average the
# validation RMSE returned after the last epoch.
# `batches` must be the two-value (features, targets) variant here, because
# train_model unpacks exactly two items per batch.
res_list = []
for fold_name, fold_data in dfs.items():
    model = NCFWithGMF(n_users, n_elements, n_factors=150, activation=nn.ReLU, hidden_units=[100, 100, 100])
    res = train_model(model, fold_data, dfs_sizes[fold_name], batches, epochs=10, lr=0.001, minmax=ratings_range)
    res_list.append(res)

print('Mean RMSE on validation:', np.mean(res_list))

**Теперь попробуем сделать сеть, которая сможет учитывать табличные данные**

In [None]:
class TabularNetwork(nn.Module):
    """Feed-forward network over tabular features.

    Architecture: ``input_size -> hidden_layers[0] -> ... -> hidden_layers[-1]
    -> output_size`` with a ReLU after every hidden linear layer and no
    activation on the output layer. ``hidden_layers`` must be non-empty.
    """

    def __init__(self, input_size, hidden_layers, output_size):
        super().__init__()

        # Consecutive (in, out) widths of the hidden stack.
        widths = [input_size] + list(hidden_layers)
        stack = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(n_in, n_out), nn.ReLU()]

        # Output layer
        stack.append(nn.Linear(hidden_layers[-1], output_size))

        self.tabular_layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the whole stack."""
        return self.tabular_layers(x)


class HybridRecommendation(nn.Module):
    """Late-fusion hybrid: blends a collaborative score with a tabular score.

    ``ncf_gmf`` scores the (user, element) pair, ``tabular_network`` scores the
    side features, and a single learnable scalar ``weight`` (initialised to
    0.5) mixes them as ``weight * cf + (1 - weight) * tabular``. The weight is
    not constrained to [0, 1].
    """

    def __init__(self, n_users, n_elements, n_factors, activation, input_size, output_size, hidden_layers, hidden_units=10):
        super().__init__()
        # Collaborative part (embeddings only).
        self.ncf_gmf = NCFWithGMF(n_users, n_elements, n_factors, activation, hidden_units)
        # Side-information part (tabular features only).
        self.tabular_network = TabularNetwork(input_size, hidden_layers, output_size)

        # Learnable mixing coefficient between the two predictions.
        self.weight = nn.Parameter(torch.tensor(0.5))

    def forward(self, users, movies, tabular_data, minmax=None):
        """Return the blended prediction.

        ``minmax`` is forwarded to the collaborative part only; the tabular
        output is used as produced.
        """
        cf_score = self.ncf_gmf(users, movies, minmax)
        side_score = self.tabular_network(tabular_data)
        return self.weight * cf_score + (1 - self.weight) * side_score


In [None]:
# Target for the tabular branch.
item_rating_data = rekko_data['rating']
# Tabular features: every column except the first one (presumably `user_uid`
# — TODO confirm the column order) and the target. Built with a non-in-place
# drop so that `rekko_data` is never mutated through a slice; the previous
# in-place drop on an `.iloc` slice can trigger SettingWithCopyWarning.
item_tab_data = rekko_data.iloc[:, 1:].drop(columns='rating')

In [None]:
# The raw element id is an identifier, not a feature.
item_tab_data.drop(columns='element_uid', inplace=True)

In [None]:
# One indicator column per distinct value of `type`.
one_hot_encoded = pd.get_dummies(item_tab_data['type'])

# Append the indicator columns to the feature frame.
item_tab_data = pd.concat((item_tab_data, one_hot_encoded), axis='columns')


In [None]:
# The categorical source column is replaced by its indicator columns.
item_tab_data.drop(columns='type', inplace=True)

In [None]:
# Numeric catalogue columns that are cast to float below.
cols = [
    'duration',
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
    'feature_5',
]

In [None]:
# Cast the numeric catalogue columns to float64.
item_tab_data[cols] = item_tab_data[cols].astype('float64')

In [None]:
# Sanity check: print a summary (columns, dtypes, non-null counts) of the tabular feature frame.
item_tab_data.info()

In [None]:
# Id-based inputs and float32 target for the hybrid experiments.
X = pd.DataFrame({'user_id': new_users, 'elem_id': new_movies}, columns=['user_id', 'elem_id'])
y = rekko_data.rating.astype(np.float32)


print(f'Embeddings: {n_users} users, {n_elements} movies')
print(f'Dataset shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
def train_hybrid_model(model, df, tabular_data_train, tabular_data_test, tabular_y_train, df_sizes, num_epochs=10, lr=0.001, batch_size=10000, minmax=None):
    """Train the hybrid model with Adam + MSE and return the validation RMSE.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(users, movies, tabular_batch, minmax=...)``.
    df : dict
        ``{'train': (X, y), 'val': (X, y)}``; X has ``user_id`` / ``elem_id``
        columns (user index in column 0, element index in column 1).
    tabular_data_train, tabular_data_test : DataFrame
        Side features. Training rows are looked up by the index labels that
        ``batches`` yields, so ``tabular_data_train`` must share its index with
        the training X; ``tabular_data_test`` must be row-aligned with the
        validation X.
    tabular_y_train
        Unused; kept for interface compatibility.
    df_sizes : dict
        Kept for interface compatibility. Averages are computed over the
        samples actually processed, so this is not read.
    num_epochs, lr, batch_size
        Usual optimisation settings.
    minmax : tuple or None
        Rating range forwarded to the model. When None it is derived from the
        training labels (previously a hard-coded ``(1, 5)``).

    Returns
    -------
    float
        Validation RMSE after the last epoch (NaN when ``num_epochs == 0``).
        The previous version returned ``MSE / n_val``, which is not an RMSE
        although the caller reports it as one.

    Notes
    -----
    Relies on a module-level ``batches`` that yields
    ``(features, targets, row_labels)`` triples.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()  # Using Mean Squared Error loss for regression

    train_data, train_labels = df['train']
    val_data, val_labels = df['val']

    if minmax is None:
        label_values = np.asarray(train_labels)
        minmax = (float(label_values.min()), float(label_values.max()))

    # Validation tensors do not change between epochs: build them once.
    val_users = torch.LongTensor(val_data['user_id'].values)
    val_movies = torch.LongTensor(val_data['elem_id'].values)
    val_ratings = torch.FloatTensor(val_labels.values)
    td_test = torch.tensor(tabular_data_test.values.astype(np.float64)).to(torch.float32)

    val_rmse = float('nan')

    for epoch in range(num_epochs):
        model.train()
        train_sq_err, train_seen = 0.0, 0

        # Training loop for collaborative filtering network
        for xb, yb, indices in batches(train_data, train_labels, bs=batch_size, shuffle=True):
            users = xb[:, 0]
            movies = xb[:, 1]
            # reshape(-1) keeps a 1-D tensor even for a batch of one row.
            ratings = yb.reshape(-1)

            optimizer.zero_grad()

            # Fetch the side features of this batch with a single label-based
            # lookup instead of one `.loc` call per row.
            tabular_batch_train = tabular_data_train.loc[indices]
            tabular_batch_train = torch.tensor(tabular_batch_train.values.astype(np.float64))
            tabular_batch_train = tabular_batch_train.to(torch.float32)

            # Forward pass for training
            outputs = model(users, movies, tabular_batch_train, minmax=minmax)

            # Calculate loss
            loss = criterion(outputs.reshape(-1), ratings)
            loss.backward()
            optimizer.step()

            # `loss` is a per-batch mean; weight it by the batch size so the
            # epoch figure is a true per-sample MSE.
            train_sq_err += loss.item() * ratings.numel()
            train_seen += ratings.numel()

        epoch_loss = train_sq_err / max(train_seen, 1)
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}")

        # Validation (one full-batch forward pass)
        model.eval()
        with torch.no_grad():
            outputs = model(val_users, val_movies, td_test, minmax=minmax)
            val_loss = criterion(outputs.reshape(-1), val_ratings).item()

        val_rmse = val_loss ** 0.5
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}")

    print('Training finished.')
    return val_rmse


In [None]:
def batches(X, y, bs=32, shuffle=True):
    """Yield ``(features, targets, row_labels)`` mini-batches from a DataFrame.

    Iterates over ``X.index`` so every batch also carries the index labels of
    its rows (used by the hybrid training loop to look up the matching tabular
    features); the rows themselves are fetched with ``X.loc``.

    NOTE(review): this notebook defines ``batches`` more than once with
    different return arity; whichever cell ran last is the one in effect.
    """
    for row_labels, targets in ReviewsIterator(X.index, y, bs, shuffle):
        features = torch.LongTensor(X.loc[row_labels].values)
        yield features, torch.FloatTensor(targets).view(-1, 1), row_labels


In [None]:
import warnings
# Put a catch-all "default" action at the front of the warning filters.
warnings.simplefilter("default")

In [None]:
# Repeated random hold-out evaluation of the hybrid model, one split per seed.
random_state = [40, 41, 42, 43, 44]
val_losses = []

for i in random_state:
    print('Random state:', i)
    split_kwargs = dict(test_size=0.2, random_state=i)

    # Id-based inputs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
    df = dict(train=(X_train, y_train), val=(X_test, y_test))
    df_sizes = dict(train=len(X_train), val=len(X_test))

    # Tabular side features, split with the same seed and test size as the
    # id-based inputs above.
    tabular_data_train, tabular_data_test, tabular_y_train, tabular_y_test = train_test_split(
        item_tab_data, item_rating_data, **split_kwargs
    )
    td_dim = tabular_data_train.shape[1]

    model = HybridRecommendation(
        n_users,
        n_elements,
        n_factors=150,
        activation=nn.ReLU,
        hidden_units=[100],
        input_size=td_dim,
        output_size=1,
        hidden_layers=[16],
    )
    loss = train_hybrid_model(
        model,
        df,
        tabular_data_train,
        tabular_data_test,
        tabular_y_train,
        df_sizes,
        num_epochs=10,
        lr=0.001,
        batch_size=10000,
    )
    val_losses.append(loss)

print('RMSE on validation in cross validation with 5 folds:', np.mean(val_losses))

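Итоги: в моей реализации лучше всего сработал гибридный алгоритм, который учитывал много дополнительной информации, однако он оказался несколько нестабильным. Оговорка: сравнение моделей корректно только при условии, что все они оценены одной и той же метрикой (RMSE) на одинаковых разбиениях данных.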