# NIRS (model)

In [None]:
# import utils as utils
import torch
import numpy as np
import pandas as pd
# from icecream import ic
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import random


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and set pandas display options.

    Args:
        seed: Integer seed applied to every random number generator.

    Note:
        scikit-learn keeps no global RNG of its own: estimators given
        ``random_state=None`` draw from NumPy's global generator, which is
        seeded here. The former ``sklearn.utils.check_random_state(seed)``
        call only created and discarded a ``RandomState`` object, so it had
        no effect and was removed.
    """
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # Seed every CUDA device, not only the current one (ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False

    # Display DataFrames without truncating or wrapping rows and columns.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', None)
    pd.set_option('display.expand_frame_repr', False)

# Seed the RNGs before any other cell draws random numbers.
seed_everything(42)

# Render matplotlib figures inline and use seaborn's dark grid theme.
%matplotlib inline
sns.set_style('darkgrid')

# Set device: use the GPU when CUDA is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Shell escape: install icecream and torchsummary into the notebook environment.
# NOTE(review): the icecream import in the first cell is commented out and
# torchsummary is not imported in the visible cells -- confirm both are needed.
!pip install icecream torchsummary

## Init and reading data

In [None]:
# Load the pre-processed product and review samples from the Kaggle input dir.
products_df = pd.read_csv('/kaggle/input/dataaa/products_sampled_processed.csv')
reviews_df = pd.read_csv('/kaggle/input/dataaa/reviews_sampled_processed.csv')

# (rows, columns) of each frame as loaded.
print(products_df.shape, reviews_df.shape)

In [None]:
# Drop reviews that repeat the same reviewer, product, text and summary.
# NOTE: the index is not reset here, so row labels may no longer be contiguous
# (labels and positional row numbers can differ from this point on).
reviews_df.drop_duplicates(subset=['reviewerID', 'asin', 'reviewText', 'summary'], inplace=True)

In [None]:
# Inner join: keep only reviews whose asin also exists in products_df.
df = reviews_df.merge(products_df, on='asin', how='inner')

# Products whose asin never appears in reviews_df.
unreviewed_products_df = products_df[~products_df['asin'].isin(reviews_df['asin'])]

In [None]:
# Sanity check: report the (rows, columns) shape of every frame built so far.
print(f'Shape of df: {df.shape}')
print(f'Shape of unreviewed_products_df: {unreviewed_products_df.shape}')
print(f'Shape of products_df: {products_df.shape}')
print(f'Shape of reviews_df: {reviews_df.shape}')

## Other data preparation for the model

### User and product id mapping

In [None]:
user_ids = df['reviewerID'].values
product_ids = products_df['asin'].values

# Build the id -> index tables from pd.unique (order of first appearance)
# rather than set(): string hashing is randomised per interpreter session, so
# the iteration order of a set of ids -- and with it every embedding row
# index -- could change between runs even though the RNG seeds are fixed.
user_id_map = {uid: idx for idx, uid in enumerate(pd.unique(user_ids))}
product_id_map = {pid: idx for idx, pid in enumerate(pd.unique(product_ids))}

# Map user and item IDs to indices
df['user_index'] = df['reviewerID'].map(user_id_map)
df['item_index'] = df['asin'].map(product_id_map)
products_df['item_index'] = products_df['asin'].map(product_id_map)

### Text embeddings (Word2Vec)

In [None]:
from gensim.models import Word2Vec

def create_word2vec_embeddings(texts, embedding_dim):
    """Train a Word2Vec model on *texts* and return one mean vector per text.

    Each text is tokenised with ``str.split()``; a text's embedding is the
    mean of its word vectors.

    Args:
        texts: Iterable of strings.
        embedding_dim: Size of the word vectors (passed as ``vector_size``).

    Returns:
        A 2-D ``np.ndarray`` with one row of length ``embedding_dim`` per text.
        Texts without any token (e.g. empty or whitespace-only strings) get an
        all-zero row; previously ``np.mean`` of an empty list produced a NaN
        scalar, which made the stacked result ragged.
    """
    tokenized_texts = [text.split() for text in texts]

    model = Word2Vec(tokenized_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)

    # Fallback row for token-less texts, matching the dtype of the word vectors.
    zero_vector = np.zeros(embedding_dim, dtype=model.wv.vectors.dtype)

    embeddings = []
    for tokens in tokenized_texts:
        vectors = [model.wv[word] for word in tokens if word in model.wv]
        embeddings.append(np.mean(vectors, axis=0) if vectors else zero_vector)

    return np.array(embeddings)

In [None]:
# Products that occur in the merged review frame, one row per asin.
df_reviewed_products = products_df[products_df['asin'].isin(df['asin'].unique())].drop_duplicates(subset=['asin'])
# Cast the text columns to str.
df_reviewed_products['brand'] = df_reviewed_products['brand'].astype(str)
df_reviewed_products['title'] = df_reviewed_products['title'].astype(str)
df_reviewed_products['description'] = df_reviewed_products['description'].astype(str)

# Replace the filtered index with a fresh 0..n-1 range.
df_reviewed_products.reset_index(drop=True, inplace=True)

In [None]:
# NOTE(review): `%time` is a line magic that times the statement written on
# its own line; standing alone it does not appear to time the embedding code
# below -- `%%time` as the first line of the cell may have been intended.
%time

text_embedding_dims = 200
# Each call trains a separate Word2Vec model on one text column and returns
# one mean vector per row; review columns come from reviews_df, product
# columns from products_df.
reviewTexts_embs = torch.tensor(create_word2vec_embeddings(reviews_df['reviewText'].astype(str), embedding_dim=text_embedding_dims))
summary_embs = torch.tensor(create_word2vec_embeddings(reviews_df['summary'].astype(str), embedding_dim=text_embedding_dims))
title_embs = torch.tensor(create_word2vec_embeddings(products_df['title'].astype(str), embedding_dim=text_embedding_dims))
description_embs = torch.tensor(create_word2vec_embeddings(products_df['description'].astype(str), embedding_dim=text_embedding_dims))
feature_embs = torch.tensor(create_word2vec_embeddings(products_df['feature'].astype(str), embedding_dim=text_embedding_dims))
brand_embs = torch.tensor(create_word2vec_embeddings(products_df['brand'].astype(str), embedding_dim=text_embedding_dims))

# Alternative: load previously saved embeddings instead of recomputing them.
# reviewTexts_embs = torch.load('data/embeds/review_embeddings.pt')
# summary_embs = torch.load('data/embeds/summary_embeddings.pt')
# description_embs = torch.load('data/embeds/description_embeddings.pt')
# title_embs = torch.load('data/embeds/title_embeddings.pt')
# feature_embs = torch.load('data/embeds/feature_embeddings.pt')
# brand_embs = torch.load('data/embeds/brand_embeddings.pt')

print(reviewTexts_embs.shape, summary_embs.shape, title_embs.shape, description_embs.shape, feature_embs.shape, brand_embs.shape)

In [None]:
# Element-wise mean of the per-column embeddings: one vector per review row
# (review text + summary) and one per product row (title, description,
# feature, brand).
reviews_text_embs = torch.mean(torch.stack([reviewTexts_embs, summary_embs]), dim=0)
products_text_embs = torch.mean(torch.stack([title_embs, description_embs, feature_embs, brand_embs]), dim=0)

In [None]:
# Collect, per reviewer, the text embeddings of all of their reviews.
# reviews_text_embs holds one row per reviews_df row in positional order, so
# rows are paired by position (enumerate). The index labels yielded by
# iterrows() are not safe for this: duplicates were dropped from reviews_df
# without resetting the index, so labels and positions can disagree.
user_embeddings_map = {}
for position, user_id in enumerate(reviews_df['reviewerID']):
    user_embeddings_map.setdefault(user_id, []).append(reviews_text_embs[position])

# Collapse each reviewer's list into a single mean vector.
user_embeddings_map = {
    user_id: torch.mean(torch.stack(user_tensors), dim=0)
    for user_id, user_tensors in user_embeddings_map.items()
}

df['user_embs'] = df['reviewerID'].map(user_embeddings_map)

In [None]:
# asin -> text embedding for every product. products_text_embs has one row per
# products_df row in positional order, so the two are paired by position rather
# than through iterrows() index labels (which only match positions while the
# frame keeps its default 0..n-1 index). A repeated asin keeps its last row,
# as before.
all_product_embeddings_map = dict(zip(products_df['asin'], products_text_embs))
products_df['product_embs'] = products_df['asin'].map(all_product_embeddings_map)

# Same lookup restricted to products that have at least one review.
reviewed_products_embeddings_map = {
    product_id: all_product_embeddings_map[product_id]
    for product_id in df_reviewed_products['asin']
}

df['product_embs'] = df['asin'].map(reviewed_products_embeddings_map)

## Model

### DataLoaders for the Pytorch-based model

In [None]:
def build_train_test_data(data):
    """Leave-one-out split: hold out each user's latest interaction.

    Rows are grouped by ``reviewerID`` and sorted by ``reviewTime`` in
    descending order; the first row of each group goes to the test set and the
    remaining rows to the training set.

    Args:
        data: DataFrame with at least ``reviewerID`` and ``reviewTime`` columns.
            NOTE(review): the sort is only chronological if ``reviewTime`` is
            stored in a sortable form (timestamp / ISO date) -- confirm.

    Returns:
        ``(train_set, test_set)``, both DataFrames with the columns of *data*.
        Users with a single interaction appear only in ``test_set``. Empty
        input yields two empty frames.
    """
    train_parts = []
    test_parts = []

    for _, group in data.groupby('reviewerID'):
        sorted_group = group.sort_values('reviewTime', ascending=False)
        # iloc[:1] (not iloc[0]) keeps the held-out row as a one-row DataFrame;
        # concatenating bare Series would produce one long Series instead of a
        # frame with one row per user.
        test_parts.append(sorted_group.iloc[:1])
        if len(sorted_group) > 1:
            train_parts.append(sorted_group.iloc[1:])

    empty = data.iloc[0:0]
    train_set = pd.concat(train_parts) if train_parts else empty.copy()
    test_set = pd.concat(test_parts) if test_parts else empty.copy()

    return train_set, test_set

# def sample_negatives(df, user_id, num_negatives, all_item_ids):
#     interacted_items = set(df[df['user_index'] == user_id]['item_index'])
    
#     non_interacted_items = list(all_item_ids - interacted_items)
    
#     sampled_negatives = random.sample(non_interacted_items, min(num_negatives, len(non_interacted_items)))
    
#     return sampled_negatives

In [None]:
from torch.utils.data import Dataset, DataLoader 
import numpy as np

# class for the dataset
class AmazonReviewDataset(Dataset):
    """Index-aligned view over five parallel sequences describing reviews.

    Sample ``i`` is the tuple
    ``(user_ids[i], product_ids[i], ratings[i], users_text_data[i],
    products_text_data[i])``; the dataset length is ``len(user_ids)``.
    """

    def __init__(self, user_ids, product_ids, ratings, users_text_data, products_text_data):
        # Keep references to the parallel sequences; nothing is copied.
        self.user_ids, self.product_ids, self.ratings = user_ids, product_ids, ratings
        self.users_text_data, self.products_text_data = users_text_data, products_text_data

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        return (
            self.user_ids[index],
            self.product_ids[index],
            self.ratings[index],
            self.users_text_data[index],
            self.products_text_data[index],
        )

    
# Split the merged frame; the second part is named val_data here and is
# wrapped as `test_dataset` below.
train_data, val_data = build_train_test_data(df)


# Training samples: indices, ratings and the per-row text embeddings.
train_dataset = AmazonReviewDataset(
    train_data['user_index'].values,
    train_data['item_index'].values,
    train_data['overall'].values,
    train_data['user_embs'].values,
    train_data['product_embs'].values

)

# Held-out samples, built from the same columns.
test_dataset = AmazonReviewDataset(
    val_data['user_index'].values,
    val_data['item_index'].values,
    val_data['overall'].values,
    val_data['user_embs'].values,
    val_data['product_embs'].values
)

# neg_dataset = AmazonReviewDataset(
#     neg_data['user_index'].values,
#     neg_data['item_index'].values,
#     neg_data['overall'].values,
#     neg_data['user_embs'].values,
#     neg_data['product_embs'].values
# )

### Quick model test with SVD from scikit-surprise

Quick performance check of SVD by using scikit-surprise:

In [None]:
# surprise's Dataset is imported under an alias: a plain
# `from surprise import Dataset` would rebind the notebook-global `Dataset`
# that was imported from torch.utils.data in an earlier cell.
from surprise import BaselineOnly, SVD, Reader, accuracy, Trainset
from surprise import Dataset as SurpriseDataset
from surprise.model_selection import cross_validate, train_test_split, KFold

reader = Reader(rating_scale=(1, 5))

data_test = SurpriseDataset.load_from_df(df[["reviewerID", "asin", "overall"]], reader)

# NOTE: this hold-out split is overwritten by the K-fold loop variables below.
trainset, testset = train_test_split(data_test, test_size=0.30)

kf = KFold(n_splits=5)

algo = SVD()

# Fit on each fold's training part and print the RMSE on its test part.
for trainset, testset in kf.split(data_test):

    algo.fit(trainset)
    predictions = algo.test(testset)

    accuracy.rmse(predictions, verbose=True)
    
# Single prediction with the model left over from the last fold.
user_id = 'A1HBTW5M7ZZ9PT'
test_user = reviews_df[reviews_df['reviewerID'] == user_id]
item_id = 'B00006IEI7'

pred = algo.predict(user_id, item_id, r_ui=5, verbose=True)

### Neural Collaborative Filtering

In [None]:
from torch import nn

In [None]:
class NCF(nn.Module):
    """Neural collaborative filtering (GMF + MLP) with text side features.

    Two branches are combined by a final linear layer:

    * GMF: element-wise product of user and item id embeddings (``emb_dim``).
    * MLP: concatenation of user/item id embeddings and linearly projected
      user/item text vectors (four blocks of 256 features), passed through a
      stack of fully connected layers down to 64 features.

    Args:
        n_users: Number of distinct user indices.
        n_items: Number of distinct item indices.
        emb_dim: Width of the GMF embeddings.
        text_user_dim: Feature size of the user text vectors.
        text_item_dim: Feature size of the item text vectors.
        dropout_rate: Dropout probability used between the MLP layers.
    """

    def __init__(self, n_users, n_items, emb_dim, text_user_dim, text_item_dim, dropout_rate=0.2) -> None:
        super().__init__()
        self.min_rating = 1
        self.max_rating = 5
        self.n_users = n_users
        self.n_items = n_items
        self.emb_dim = emb_dim

        # Width of each of the four blocks concatenated into the MLP input.
        mlp_branch_dim = 512 // 2

        # GMF embeddings
        self.user_embedding_gmf = nn.Embedding(n_users, emb_dim)
        self.item_embedding_gmf = nn.Embedding(n_items, emb_dim)

        # MLP embeddings
        self.user_embedding_mlp = nn.Embedding(n_users, mlp_branch_dim)
        self.item_embedding_mlp = nn.Embedding(n_items, mlp_branch_dim)

        # text embeddings
        self.user_text_layer = nn.Linear(text_user_dim, mlp_branch_dim)
        self.item_text_layer = nn.Linear(text_item_dim, mlp_branch_dim)

        # MLP layers. The first layer must accept exactly what forward()
        # concatenates (4 * 256 = 1024 features), and every layer's input size
        # must equal the previous layer's output size; the former 2448 and
        # 512 * 2 input sizes raised shape errors on the first forward pass.
        mlp_input_dim = 4 * mlp_branch_dim
        self.fc_layers = nn.Sequential(
            nn.Linear(mlp_input_dim, 1024),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.BatchNorm1d(1024),

            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.BatchNorm1d(512),

            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.BatchNorm1d(256),

            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate),
            nn.BatchNorm1d(128),

            nn.Linear(128, 64),
            nn.ReLU()
        )

        self.output_layer = nn.Linear(emb_dim + 64, 1)

        # Initialize weights
        self._init_embs_weight_()

    def _init_embs_weight_(self):
        """Initialise all four id-embedding tables from N(0, 0.01^2)."""
        nn.init.normal_(self.user_embedding_gmf.weight, std=0.01)
        nn.init.normal_(self.item_embedding_gmf.weight, std=0.01)
        nn.init.normal_(self.user_embedding_mlp.weight, std=0.01)
        nn.init.normal_(self.item_embedding_mlp.weight, std=0.01)

    def forward(self, user_ids, item_ids, user_texts, item_texts):
        """Predict a rating for each (user, item) pair in the batch.

        Args:
            user_ids: Integer tensor of user indices, shape ``(batch,)``.
            item_ids: Integer tensor of item indices, shape ``(batch,)``.
            user_texts: Float tensor, shape ``(batch, text_user_dim)``.
            item_texts: Float tensor, shape ``(batch, text_item_dim)``.

        Returns:
            Tensor of shape ``(batch, 1)`` with values between ``min_rating``
            and ``max_rating``.
        """
        # GMF part
        user_gmf = self.user_embedding_gmf(user_ids)
        item_gmf = self.item_embedding_gmf(item_ids)
        gmf_output = user_gmf * item_gmf

        # MLP part
        user_mlp = self.user_embedding_mlp(user_ids)
        item_mlp = self.item_embedding_mlp(item_ids)
        user_text_mlp = self.user_text_layer(user_texts)
        item_text_mlp = self.item_text_layer(item_texts)
        mlp_input = torch.cat([user_mlp, item_mlp, user_text_mlp, item_text_mlp], dim=1)
        mlp_output = self.fc_layers(mlp_input)

        # Concatenate GMF and MLP output
        logits = self.output_layer(torch.cat([gmf_output, mlp_output], dim=1))
        # Squash to (0, 1) before rescaling; without the sigmoid the affine
        # rescale below would not confine predictions to the rating range.
        output = torch.sigmoid(logits) * (self.max_rating - self.min_rating) + self.min_rating
        return output

#### Training and evaluation functions

In [None]:
# Train the NCF model
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm
from torch import nn
from torch.optim.lr_scheduler import ReduceLROnPlateau

class RMSELoss(nn.Module):
    """Root-mean-squared-error criterion: ``sqrt(MSE(yhat, y) + eps)``.

    ``eps`` is added under the square root before it is taken.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.mse = nn.MSELoss()
        self.eps = eps

    def forward(self, yhat, y):
        mean_squared_error_value = self.mse(yhat, y)
        return (mean_squared_error_value + self.eps).sqrt()

def train_model(model: NCF, train_data: DataLoader, val_data: DataLoader, loss_func, optimizer, device, num_epochs):
    """Train *model* for ``num_epochs`` epochs and track the loss per epoch.

    After every epoch the model is evaluated on ``val_data`` and the
    validation loss drives a ``ReduceLROnPlateau`` scheduler (factor 0.1,
    patience 4).

    Args:
        model: Network called as ``model(user_ids, item_ids, user_text, item_text)``.
        train_data: Iterable of batches
            ``(user_ids, item_ids, rating, user_text, item_text)`` of tensors;
            must support ``len()`` (number of batches).
        val_data: Batches in the same format, used for validation.
        loss_func: Criterion called as ``loss_func(prediction, target)``.
        optimizer: Optimizer over the model parameters.
        device: Device the model and batches are moved to.
        num_epochs: Number of passes over ``train_data``.

    Returns:
        ``(train_loss_history, valid_loss_history)``: two lists with one entry
        per epoch. The training entry is the mean of the per-batch
        ``loss_func`` values; the validation entry is the first value returned
        by ``predict_and_evaluate``.
    """
    train_loss_history = []
    valid_loss_history = []

    model.to(device)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=4)
    for epoch in range(num_epochs):
        model.train()
        train_loss_sum = 0.0

        # For each batch
        for user_ids, item_ids, rating, u_text_embeds, p_text_embeds in tqdm(train_data, desc='Training...'):
            user_ids, item_ids, rating = user_ids.to(device), item_ids.to(device), rating.to(device)
            u_text_embeds, p_text_embeds = u_text_embeds.to(device), p_text_embeds.to(device)

            optimizer.zero_grad()
            outputs = model(user_ids, item_ids, u_text_embeds, p_text_embeds)
            # Drop only the trailing feature axis so a batch of one keeps its
            # batch dimension and still lines up with `rating`.
            loss = loss_func(outputs.squeeze(-1), rating.float())
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()

        model.eval()
        val_loss, _, _ = predict_and_evaluate(model, val_data, loss_func, device)
        scheduler.step(val_loss)

        # The epoch's training loss is the plain average of the batch losses.
        # No extra square root: the criterion already defines the scale (the
        # notebook passes an RMSE criterion, so sqrt here would report
        # sqrt(RMSE)). max(..., 1) avoids dividing by zero on an empty loader.
        train_loss = train_loss_sum / max(len(train_data), 1)
        train_loss_history.append(train_loss)
        valid_loss_history.append(val_loss)

        print(f'Epoch {epoch+1}: train loss = {train_loss:.4f}, val loss = {val_loss:.4f}')

    return train_loss_history, valid_loss_history

# def evaluate(model, val_data, item_count, device):
#     model.eval()
#     hit_ratio_sum = 0.0
#     ndcg_sum = 0.0
    
#     with torch.no_grad():
#         for user_id, item_id, rating, u_text_emb, p_text_emb in test_loader:
#             user_id, item_id, rating = user_id.to(device), item_id.to(device), rating.to(device)
#             u_text_emb, p_text_emb = u_text_emb.to(device), p_text_emb.to(device) 
            
#             # Randomly sample 100 items not interacted by the user
#             non_interacted_items = torch.randint(0, item_count, (100,)).to(device)
#             non_interacted_items = non_interacted_items[non_interacted_items != item_id]
            
#             # Rank the test item among the sampled items
#             test_item_score = model(user_id, item_id, u_text_emb, p_text_emb)
#             non_interacted_scores = model(user_id.expand(non_interacted_items.size(0)), non_interacted_items)
#             scores = torch.cat((test_item_score, non_interacted_scores))
            
#             # Calculate HR and NDCG
#             _, indices = torch.topk(scores, k=10)
#             hit_ratio = (indices == 0).float().mean().item()
#             ndcg = 1.0 / torch.log2(torch.where(indices == 0)[0][0].float() + 2).item()
            
#             hit_ratio_sum += hit_ratio
#             ndcg_sum += ndcg
    
#     hit_ratio_avg = hit_ratio_sum / len(test_loader)
#     ndcg_avg = ndcg_sum / len(test_loader)
    
#     return hit_ratio_avg, ndcg_avg
        
# Evaluate the NCF model
def predict_and_evaluate(model, data, loss_func, device):
    """Run *model* over *data* without gradients and compute error metrics.

    The model's train/eval mode is not changed here; callers should switch to
    ``model.eval()`` beforehand.

    Args:
        model: Network called as ``model(user_ids, item_ids, user_text, item_text)``.
        data: Iterable of batches
            ``(user_ids, item_ids, rating, user_text, item_text)`` of tensors.
        loss_func: Unused; kept so existing call sites keep working. The
            metrics are derived from the pooled predictions, which makes them
            independent of the training criterion and of the batch size.
        device: Device the batches are moved to.

    Returns:
        ``(rmse, mse, mae)`` over all samples in *data*. Previously ``rmse``
        was the square root of the mean per-batch loss, which is not the RMSE
        when the criterion itself is already an RMSE.
    """
    predictions = []
    true_ratings = []

    with torch.no_grad():
        for user_ids, item_ids, rating, u_text_embeds, p_text_embeds in tqdm(data, desc='Evaluating...'):
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            u_text_embeds = u_text_embeds.to(device)
            p_text_embeds = p_text_embeds.to(device)

            outputs = model(user_ids, item_ids, u_text_embeds, p_text_embeds)
            # Flatten to one scalar per sample before collecting.
            predictions.extend(outputs.reshape(-1).cpu().numpy())
            true_ratings.extend(rating.reshape(-1).cpu().numpy())

    mse = mean_squared_error(true_ratings, predictions)
    mae = mean_absolute_error(true_ratings, predictions)
    rmse = float(np.sqrt(mse))
    return rmse, mse, mae

#### Model training

In [None]:
import torch.optim as optim
import torch.nn as nn
torch.cuda.empty_cache()

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

# Create the NCF model
num_users = len(user_id_map)
num_items = len(product_id_map)
emb_dim = 300
droput_rate_fc = 0.2
# [user text width, item text width], read from the embedding tensors.
text_embedding_dims = [emb.shape[1] for emb in [reviews_text_embs, products_text_embs]]

# Pass the measured text widths instead of hard-coded 200s so the model keeps
# matching the embeddings if their dimension changes.
model = NCF(
  num_users, num_items, emb_dim, *text_embedding_dims, droput_rate_fc).to(device)

# Define loss function and optimizer
loss_func = RMSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)


# Train the model
num_epochs = 10
print('Loss criterion: RMSE\n')
train_loss_history, val_loss_history = train_model(model, train_loader, test_loader, loss_func, optimizer, device, num_epochs)

In [None]:
def plot_accuracy(train_acc_history, val_acc_history, loss_name='RMSE'):
    """Plot the per-epoch training and validation loss curves in one figure.

    Args:
        train_acc_history: Sequence of training loss values, one per epoch.
        val_acc_history: Sequence of validation loss values, one per epoch.
        loss_name: Name of the loss, used in the legend, y-label and title.
    """
    epochs = range(1, len(train_acc_history) + 1)
    history_df = pd.DataFrame({'Epoch': epochs,
                               'Train Accuracy': train_acc_history,
                               'Validation Accuracy': val_acc_history})

    plt.figure(figsize=(10, 6))
    # (column holding the values, legend label) for each curve, drawn in order.
    curves = (
        ('Train Accuracy', f'Train loss ({loss_name})'),
        ('Validation Accuracy', f'Validation loss ({loss_name})'),
    )
    for column, legend_label in curves:
        sns.lineplot(data=history_df, x='Epoch', y=column, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel(loss_name)
    plt.title(f'Training and Validation loss {loss_name}')
    plt.legend()
    plt.show()

plot_accuracy(train_loss_history, val_loss_history, 'RMSE')

#### Model test with single predictions

In [None]:
def predict_score(user_id, item_id, model, user_mapping, item_mapping, device):
    """Predict the rating of a single (user, item) pair.

    The text features are looked up in the notebook-level
    ``user_embeddings_map`` and ``all_product_embeddings_map`` dictionaries.

    Args:
        user_id: Raw user id (key of ``user_mapping``).
        item_id: Raw item id (key of ``item_mapping``).
        model: Trained model called as
            ``model(user_ids, item_ids, user_text, item_text)``.
        user_mapping: Dict from raw user id to embedding index.
        item_mapping: Dict from raw item id to embedding index.
        device: Device the inputs are moved to.

    Returns:
        The predicted score as a Python float.

    Raises:
        KeyError: If ``user_id`` or ``item_id`` is missing from a mapping or
            from the text-embedding dictionaries.
    """
    # Convert user ID and item ID to their corresponding indices
    user_index = user_mapping[user_id]
    item_index = item_mapping[item_id]

    # Convert indices to tensors
    user_tensor = torch.tensor([user_index], dtype=torch.long).to(device)
    item_tensor = torch.tensor([item_index], dtype=torch.long).to(device)

    # The stored text embeddings are single 1-D vectors; add a leading batch
    # axis so they line up with the (1,)-shaped id tensors when the model
    # concatenates features along dim=1.
    user_text = user_embeddings_map[user_id].unsqueeze(0).to(device)
    item_text = all_product_embeddings_map[item_id].unsqueeze(0).to(device)

    # Get the predicted score from the model
    model.eval()
    with torch.no_grad():
        score = model(user_tensor, item_tensor, user_text, item_text).item()

    return score

item_id = 'B002H9Y8X2'
user_id = 'A2TUPLNRVDE0P6'
print(f'Predicted rating for user {user_id} and item {item_id}: {predict_score(user_id, item_id, model, user_id_map, product_id_map, device)}')