# NIRS (model)

In [27]:
import utils as utils
import torch
import numpy as np
import pandas as pd
import icecream as ic

utils.seed_everything(42)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Init and reading data

In [28]:
df = pd.read_csv('data/merged_data_processed.csv')
unreviewed_products_df = pd.read_csv('data/unreviewed_products_processed.csv')
products_df = pd.read_csv('data/products_sampled_processed.csv')
reviews_df = pd.read_csv('data/reviews_sampled_processed.csv')

In [29]:
print(f'Shape of df: {df.shape}')
print(f'Shape of unreviewed_products_df: {unreviewed_products_df.shape}')
print(f'Shape of products_df: {products_df.shape}')
print(f'Shape of reviews_df: {reviews_df.shape}')

Shape of df: (47238, 48)
Shape of unreviewed_products_df: (22903, 41)
Shape of products_df: (35246, 41)
Shape of reviews_df: (45538, 8)


In [30]:
print(f'Number of unique products: {products_df["asin"].nunique()}')
print(f'Number of unique users: {reviews_df["reviewerID"].nunique()}')
print(f'Number of unique reviewed products: {df["asin"].nunique()}')
print(f'Number of unique unreviewed products: {unreviewed_products_df["asin"].nunique()}')

Number of unique products: 34211
Number of unique users: 1000
Number of unique reviewed products: 11369
Number of unique unreviewed products: 22842


## Other data preparation for the model

### User and product id mapping

In [31]:
# Create user and item mappings
user_mapping = {user_id: index for index, user_id in enumerate(df['reviewerID'].unique())}
item_mapping = {item_id: index for index, item_id in enumerate(products_df['asin'].unique())}

reviewed_item_mapping = {item_id: index for index, item_id in enumerate(df['asin'].unique())}

# Map user and item IDs to indices
df['user_index'] = df['reviewerID'].map(user_mapping)
df['item_index'] = df['asin'].map(reviewed_item_mapping)

### User-item matrix

In [32]:
user_product_matrix = pd.pivot_table(df, values='overall', index='reviewerID', columns='asin', fill_value=0)

### Embeddings (with Word2Vec or whatever)

In [33]:
from gensim.models import Word2Vec

# Create embeddings using Word2Vec
def create_word2vec_embeddings(texts, embedding_dim):
  # Tokenize the texts
  tokenized_texts = [text.split() for text in texts]
  
  # Train Word2Vec model
  model = Word2Vec(tokenized_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)
  
  # Get the embeddings for each text
  embeddings = []
  for text in tokenized_texts:
      embedding = np.mean([model.wv[word] for word in text if word in model.wv], axis=0)
      embeddings.append(embedding)
  
  return np.array(embeddings)

In [34]:
# Create embeddings for textual data
review_embeddings = create_word2vec_embeddings(df['reviewText'], embedding_dim=100)
title_embeddings = create_word2vec_embeddings(df['title'], embedding_dim=100)
description_embeddings = create_word2vec_embeddings(df['description'], embedding_dim=100)
summary_embeddings = create_word2vec_embeddings(df['summary'], embedding_dim=100)
feature_embeddings = create_word2vec_embeddings(df['feature'], embedding_dim=100)
brand_embeddings = create_word2vec_embeddings(df['brand'], embedding_dim=100)


#load the embeddings
# review_embeddings = torch.load('data/embeds/review_embeddings.pt')
# summary_embeddings = torch.load('data/embeds/summary_embeddings.pt')
# description_embeddings = torch.load('data/embeds/description_embeddings.pt')
# title_embeddings = torch.load('data/embeds/title_embeddings.pt')
# feature_embeddings = torch.load('data/embeds/feature_embeddings.pt')

### Embed (SVD) from user-matrix pivot table

In [35]:
def SVD_embeddings(user_item_matrix: pd.DataFrame, embedding_length):
    """
    Apply SVD to the user-item matrix to obtain user and item embeddings.
    U = m x r orthogonal left singular matrix, which represents the relationship between users and latent factors
    S = r x r diagonal matrix, which describes the strength of each latent factor
    V = r x n diagonal right singular matrix, which indicates the similarity between items and latent factors
    
    :param user_item_matrix: user-item matrix
    :param embedding_length: number of latent factors, which is the dimensionality of the reduced space
    """
    from scipy.sparse.linalg import svds
    
    # transpose the matrix to obtain the item-user matrix
    # (this due to the large number of items compared to users)
    matrix = user_item_matrix.values

    U, Sigma, VT = svds(matrix, k=embedding_length)
    user_embed_df = pd.DataFrame(U, index = user_item_matrix.index)
    VT_T = np.transpose(VT)
    item_embed_df = pd.DataFrame(VT_T, index=user_item_matrix.columns)
    
    return user_embed_df, item_embed_df

In [36]:
user_embed_df, item_embed_df = SVD_embeddings(user_product_matrix, 800)

## Model

### DataLoaders for the Pytorch-based model

In [37]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader 

# class for the dataset
class AmazonReviewDataset(Dataset):
    def __init__(self, user_ids, item_ids, ratings, review_texts, product_titles, product_descriptions, review_summary, product_feature):
        self.user_ids = user_ids
        self.item_ids = item_ids
        self.ratings = ratings
        self.review_texts = review_texts
        self.product_titles = product_titles
        self.product_descriptions = product_descriptions
        self.review_summary = review_summary
        self.product_feature = product_feature

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        item_id = self.item_ids[index]
        rating = self.ratings[index]
        review_text = self.review_texts[index]
        product_title = self.product_titles[index]
        product_description = self.product_descriptions[index]
        review_summary = self.review_summary[index]
        product_feature = self.product_feature[index]
        return user_id, item_id, rating, review_text, product_title, product_description, review_summary, product_feature

# Split the data into train and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Create train and test datasets
train_dataset = AmazonReviewDataset(
    train_data['user_index'].values,
    train_data['item_index'].values,
    train_data['overall'].values,
    review_embeddings[train_data.index],
    title_embeddings[train_data.index],
    description_embeddings[train_data.index],
    summary_embeddings[train_data.index],
    feature_embeddings[train_data.index]
)

test_dataset = AmazonReviewDataset(
    test_data['user_index'].values,
    test_data['item_index'].values,
    test_data['overall'].values,
    review_embeddings[test_data.index],
    title_embeddings[test_data.index],
    description_embeddings[test_data.index],
    summary_embeddings[test_data.index],
    feature_embeddings[test_data.index]
)

### Vanilla SVD

In [38]:
# predictions with the the obrained SVD embeddings
def predict_ratings(user_embed_df: pd.DataFrame, item_embed_df: pd.DataFrame, user_id: str, item_id: str):
    """
    Predict the rating for a given user and item.
    
    :param user_embed_df: user embeddings
    :param item_embed_df: item embeddings
    :param user_id: user id
    :param item_id: item id
    """
    user_embedding = user_embed_df.loc[user_id]
    item_embedding = item_embed_df.loc[item_id]
    
    
    rating = np.dot(user_embedding, item_embedding)
    return rating

# example of prediction with a an item rated by a user
user_id = user_product_matrix.index[0]

# take the first product id rated by "user_id"
# (thus an element that is not 0 in the user-product matrix)
item_id = user_product_matrix.columns[user_product_matrix.loc[user_id] != 0].values[0]

print(f'Predicted rating for user {user_id} and item {item_id}: {predict_ratings(user_embed_df, item_embed_df, user_id, item_id)}')

Predicted rating for user A100WO06OQR8BQ and item B000034DLQ: 0.03378242774955097


In [39]:
# see if the predictions are consistent with the actual ratings
actual_rating = user_product_matrix.loc[user_id, item_id]

actual_rating

1.0

In [40]:
np.dot(user_embed_df, item_embed_df.T)

array([[-4.95599866e-04,  2.59605406e-03, -2.15158369e-03, ...,
         6.22025910e-03,  1.38991908e-03,  1.72065874e-04],
       [-1.25936316e-04, -1.79447050e-03, -6.46746476e-04, ...,
        -1.97749373e-03, -4.30993638e-04,  2.06622559e-04],
       [-1.01736420e-03,  1.07401753e-03,  9.87496629e-04, ...,
        -1.70572371e-03, -2.46175998e-03,  3.14422151e-04],
       ...,
       [-7.58173883e-05, -6.56917060e-04, -1.00105554e-03, ...,
         5.08348811e-04,  2.41673356e-03,  5.65461859e-04],
       [-7.90054976e-03,  9.44410429e-04,  1.71045655e-03, ...,
        -4.22877202e-03, -1.23563660e-04, -3.78508327e-04],
       [-2.05917440e-03,  1.79413998e-03,  7.98745975e-04, ...,
         1.14776802e-04, -4.45360286e-03, -3.12577453e-05]])

### Quick model test with SVD from scikit-surprise

Quick performance check of SVD by using scikit-surprise:

In [41]:
from surprise import BaselineOnly, Dataset, SVD, Reader, accuracy, Trainset
from surprise.model_selection import cross_validate, train_test_split, KFold

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data_test = Dataset.load_from_df(df[["reviewerID", "asin", "overall"]], reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data_test, test_size=0.25)

# define a cross-validation iterator
kf = KFold(n_splits=5)

algo = SVD()

for trainset, testset in kf.split(data_test):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.7016
RMSE: 0.7237
RMSE: 0.7014
RMSE: 0.7050
RMSE: 0.7201


In [42]:
user_id = 'A1HBTW5M7ZZ9PT'
test_user = reviews_df[reviews_df['reviewerID'] == user_id]
item_id = 'B00006IEI7'

In [43]:
# get a prediction for specific users and items.
pred = algo.predict(user_id, item_id, r_ui=5, verbose=True)

user: A1HBTW5M7ZZ9PT item: B00006IEI7 r_ui = 5.00   est = 4.70   {'was_impossible': False}


### Neural Collaborative Filtering

In [44]:
from torch import nn

class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, hidden_dims, output_dim, text_embedding_dims):
        super(NCF, self).__init__()
        
        # embedding layers for the users and items ids
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Additional input layers for each type of text embedding
        # (the text embeddings will be passed through an additional 
        # linear layer (text_embedding_layer) to project them into 
        # the same embedding space as the user and item embeddings)
        self.text_embedding_layers = nn.ModuleList()
        for text_embedding_dim in text_embedding_dims:
            self.text_embedding_layers.append(nn.Linear(text_embedding_dim, embedding_dim))
        
        self.fc_layers = nn.ModuleList()
        input_dim = embedding_dim * 2
        for hidden_dim in hidden_dims:
            self.fc_layers.append(nn.Linear(input_dim, hidden_dim))
            self.fc_layers.append(nn.ReLU())
            input_dim = hidden_dim
        self.output_layer = nn.Linear(input_dim, output_dim)

    def forward(self, user_ids, item_ids, text_embeddings):
        user_embeddings = self.user_embedding(user_ids)
        item_embeddings = self.item_embedding(item_ids)
        
        # Project each type of text embedding to the same embedding space
        text_embs = []
        for i, text_embedding in enumerate(text_embeddings):
            text_emb = self.text_embedding_layers[i](text_embedding)
            text_embs.append(text_emb)
        
        x = torch.cat([user_embeddings, item_embeddings, *text_embs], dim=1)
        for layer in self.fc_layers:
            x = layer(x)
        output = self.output_layer(x)
        return output

#### Training and evaluating functions

In [45]:
# Train the NCF model
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm


def train_model(model, dataloader, criterion, optimizer, device, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        for user_ids, item_ids, ratings, _, _, _, _, _ in progress_bar:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()
            outputs = model(user_ids, item_ids)
            loss = criterion(outputs.squeeze(), ratings.float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            progress_bar.set_postfix(loss=running_loss / len(progress_bar))
            
        
            
# Evaluate the NCF model
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions = []
    true_ratings = []
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating", unit="batch")
        for user_ids, item_ids, ratings, _, _, _, _, _ in progress_bar:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            outputs = model(user_ids, item_ids)
            predictions.extend(outputs.squeeze().tolist())
            true_ratings.extend(ratings.tolist())

    mse = mean_squared_error(true_ratings, predictions)
    mae = mean_absolute_error(true_ratings, predictions)
    return mse, mae

#### Model training

In [46]:
import torch.optim as optim
import torch.nn as nn

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Create the NCF model
num_users = len(user_mapping)
num_items = len(reviewed_item_mapping)
embedding_dim = 100
hidden_dims = [64, 32]
output_dim = 1

model = NCF(num_users, num_items, embedding_dim, hidden_dims, output_dim).to(device)

# Define loss function (RMSE) and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
train_model(model, train_loader, criterion, optimizer, device, num_epochs)

# Evaluate the model
print('\n')
mse, mae = evaluate_model(model, test_loader, device)
rmse = torch.sqrt(torch.tensor(mse))
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

Epoch 1/10: 100%|██████████| 591/591 [00:03<00:00, 172.85batch/s, loss=1.78]
Epoch 2/10: 100%|██████████| 591/591 [00:03<00:00, 182.18batch/s, loss=0.574]
Epoch 3/10: 100%|██████████| 591/591 [00:03<00:00, 181.84batch/s, loss=0.48]  
Epoch 4/10: 100%|██████████| 591/591 [00:03<00:00, 175.80batch/s, loss=0.421] 
Epoch 5/10: 100%|██████████| 591/591 [00:03<00:00, 170.41batch/s, loss=0.371] 
Epoch 6/10: 100%|██████████| 591/591 [00:03<00:00, 171.00batch/s, loss=0.325] 
Epoch 7/10: 100%|██████████| 591/591 [00:03<00:00, 173.27batch/s, loss=0.28]  
Epoch 8/10: 100%|██████████| 591/591 [00:03<00:00, 173.00batch/s, loss=0.239] 
Epoch 9/10: 100%|██████████| 591/591 [00:03<00:00, 169.81batch/s, loss=0.201] 
Epoch 10/10: 100%|██████████| 591/591 [00:03<00:00, 171.08batch/s, loss=0.167] 
Evaluating: 100%|██████████| 148/148 [00:00<00:00, 1374.91batch/s]



Test RMSE: 0.8271
Test MAE: 0.5718


#### Model test with single predictions

In [59]:
def predict_score(user_id, item_id, model, user_mapping, item_mapping, device):
    # Convert user ID and item ID to their corresponding indices
    user_index = user_mapping[user_id]
    item_index = item_mapping[item_id]

    # Convert indices to tensors
    user_tensor = torch.tensor([user_index], dtype=torch.long).to(device)
    item_tensor = torch.tensor([item_index], dtype=torch.long).to(device)

    # Get the predicted score from the model
    with torch.no_grad():
        model.eval()
        score = model(user_tensor, item_tensor).item()

    return score

print(f'Predicted rating for user {user_id} and item {item_id}: {predict_score(user_id, 'B0006HXE1E', model, user_mapping, item_mapping, device)}')
# print(f'Actual rating: {user_product_matrix.loc[user_id, item_id]}')

IndexError: index out of range in self

In [57]:
test = df[df['reviewerID'] == 'A1HBTW5M7ZZ9PT']
test = test[test['overall'] == 3]

test

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,dayDifferenceReview,description,title,brand,feature,rank,productPublishedDate,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Home,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct,user_index,item_index
4581,3.0,"November 06, 2016",A1HBTW5M7ZZ9PT,B00006IE7Y,FTLOE,still search perfect pen,okay,0.933793,lic stic ball pen retractable affordable bic c...,bic csm11blu clic stic retractable ball pen me...,bic,retractable affordable bic classic comfortable...,29,"November 27, 2004",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.596844,812,514
4582,3.0,"November 06, 2016",A1HBTW5M7ZZ9PT,B00006IE7Y,FTLOE,still search perfect pen,okay,0.933793,lic stic ball pen retractable affordable bic c...,bic csm11blu clic stic retractable ball pen me...,bic,retractable affordable bic classic comfortable...,29,"November 27, 2004",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.596844,812,514
12641,3.0,"April 14, 2018",A1HBTW5M7ZZ9PT,B0006HXE1E,FTLOE,liked product bit surprised thin folder used m...,used make privacy folder student,0.984029,endaflex file folder perfect everyday filing n...,pendaflex file folder letter size manila 13 cu...,pendaflex,standard manila folder suit filing system 13cu...,27,"September 09, 2008",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.685299,812,1329
15802,3.0,"August 05, 2015",A1HBTW5M7ZZ9PT,B000MK4RAM,FTLOE,go wrong easily find alternative dollar tree,good buy find much cheaper dollar tree,0.891901,se flag mark important point textbook document...,postit flag value pack assorted color stick se...,postit,mark tab highlight colorcode flag removable re...,1,"October 17, 2017",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.955615,812,2228
31476,3.0,"October 01, 2015",A1HBTW5M7ZZ9PT,B00BUI5QWS,FTLOE,update february 2016 second laminator started ...,great buy,0.896999,mazon bran,amazonbasics thermal laminator,amazonbasics,laminate document 9 inch wide compatible lette...,56,"June 01, 2013",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.814403,812,6571


In [48]:
unreviewed_products_df.head(2)

Unnamed: 0,description,title,brand,feature,rank,productPublishedDate,asin,main_cat_All Beauty,main_cat_All Electronics,main_cat_Amazon Home,"main_cat_Arts, Crafts & Sewing","main_cat_Arts, Crafts &amp; Sewing",main_cat_Automotive,main_cat_Baby,main_cat_Books,main_cat_Camera & Photo,main_cat_Camera &amp; Photo,main_cat_Car Electronics,main_cat_Cell Phones & Accessories,main_cat_Cell Phones &amp; Accessories,main_cat_Computers,main_cat_Gift Cards,main_cat_Grocery,main_cat_Health & Personal Care,main_cat_Home Audio & Theater,main_cat_Home Audio &amp; Theater,main_cat_Industrial & Scientific,main_cat_Industrial &amp; Scientific,main_cat_Musical Instruments,main_cat_Office Products,main_cat_Pet Supplies,main_cat_Portable Audio & Accessories,main_cat_Software,main_cat_Sports & Outdoors,main_cat_Sports &amp; Outdoors,main_cat_Tools & Home Improvement,main_cat_Tools &amp; Home Improvement,main_cat_Toys & Games,main_cat_Toys &amp; Games,main_cat_Video Games,dayDifferenceProduct
0,xclusive design classi,best abstract fiery floral design mouse pad cu...,luxladymousepad,material made best plastic manufacturing also ...,-1,"September 21, 1677",B00KH94VSG,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.059833
1,itten piano key mouse pad 8 x 8 x 25 made heav...,3drose llc 8 x 8 x 025 inch kitten piano key m...,3drose,dimension inch 8 w x 8 h x 025 matte finish so...,1,"July 14, 2014",B00CX71JNU,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.848318
