# NIRS (model)

In [1]:
import utils as utils
import torch
import numpy as np
import pandas as pd
import icecream as ic

utils.seed_everything(42)

## Init

In [2]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
reviews_df = pd.read_csv('data/reviews_sampled_processed.csv')
products_df = pd.read_csv('data/products_sampled_processed.csv')

In [4]:
# remove duplicates
products_df = products_df.drop_duplicates(subset='asin', keep='first')

In [5]:
utils.print_shapes(reviews_df, products_df)

Reviews df shape: (45538, 8)
Products df shape: (34211, 41)


In [6]:
print(f'Number of unique products: {products_df["asin"].nunique()}')
print(f'Number of unique users: {reviews_df["reviewerID"].nunique()}')

Number of unique products: 34211
Number of unique users: 1000


In [53]:
reviews_df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
0,5.0,"April 09, 2016",A2GIQGI2UXOZ4M,439893577,Gene Sechrest,job big enough purpose fold flat fold stand gr...,ultimate kid magnetic board
1,5.0,"October 27, 2017",A2M13JN7YVG29U,528960911,Stacie Baugh,love,five star
2,5.0,"December 31, 2014",A2ZVLGM9E6X2HY,528960911,Michael Isgro,great money 15 look nice colorful,five star


In [54]:
products_df.head(3)

Unnamed: 0,description,title,brand,feature,rank,main_cat,productPublishedDate,asin
0,rotect rfid card skimsafe card holder made rig...,Black RFID Blocking ID Badge Holder (Holds 2 C...,Specialist ID,rfid blocking 2 card holder fips 201 approved ...,43,Office Products,"October 14, 2011",B005VSY1VK
1,itten piano key mouse pad 8 x 8 x 25 made heav...,3dRose LLC 8 x 8 x 0.25 Inches Kitten on Piano...,3dRose,dimension inch 8 w x 8 h x 025 matte finish so...,1,Office Products,"July 14, 2014",B00CX71JNU
2,ivo next favorite pen ultra gel stick vibrant ...,"Vivo Ultra Gel Stick Pens, 0.7mm Fine Tip, Bla...",VIVO,ultra smooth gel ink vivid black amp color ful...,1,Office Products,"April 30, 2009",B002CO43BO


In [55]:
# merge the datasets
df = pd.merge(reviews_df, products_df, on='asin', how='left')

# Fill NaN values with appropriate values
df['overall'] = df['overall'].fillna(0)  # Fill missing ratings with 0
df['reviewerID'] = df['reviewerID'].fillna('Unknown')  # Fill missing user IDs with 'Unknown'
df['asin'] = df['asin'].fillna('Unknown')  # Fill missing product IDs with 'Unknown'

df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,description,title,brand,feature,rank,main_cat,productPublishedDate
0,5.0,"April 09, 2016",A2GIQGI2UXOZ4M,439893577,Gene Sechrest,job big enough purpose fold flat fold stand gr...,ultimate kid magnetic board,agnetic tabletop learning easel one simplestye...,Little Red Tool Box: Magnetic Tabletop Learnin...,Scholastic,fold flat easy storage open reveal giant 12 x ...,21,Office Products,"November 25, 2006"
1,5.0,"October 27, 2017",A2M13JN7YVG29U,528960911,Stacie Baugh,love,five star,yecatching 50 x 32 reference piece home classr...,Rand McNally M-Series Full-Color Laminated Uni...,Rand McNally,eyecatching 50 x 32 reference piece home class...,5,Office Products,"April 18, 2006"
2,5.0,"December 31, 2014",A2ZVLGM9E6X2HY,528960911,Michael Isgro,great money 15 look nice colorful,five star,yecatching 50 x 32 reference piece home classr...,Rand McNally M-Series Full-Color Laminated Uni...,Rand McNally,eyecatching 50 x 32 reference piece home class...,5,Office Products,"April 18, 2006"


## Other data preparation for the model

### User and product id mapping

In [57]:
# Create user and item mappings
user_mapping = {user_id: index for index, user_id in enumerate(df['reviewerID'].unique())}
item_mapping = {item_id: index for index, item_id in enumerate(df['asin'].unique())}

# Map user and item IDs to indices
df['user_index'] = df['reviewerID'].map(user_mapping)
df['item_index'] = df['asin'].map(item_mapping)

### Embeddings (with Word2Vec or whatever)

In [58]:
from gensim.models import Word2Vec

# Create embeddings using Word2Vec
def create_word2vec_embeddings(texts, embedding_dim):
  # Tokenize the texts
  tokenized_texts = [text.split() for text in texts]
  
  # Train Word2Vec model
  model = Word2Vec(tokenized_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)
  
  # Get the embeddings for each text
  embeddings = []
  for text in tokenized_texts:
      embedding = np.mean([model.wv[word] for word in text if word in model.wv], axis=0)
      embeddings.append(embedding)
  
  return np.array(embeddings)

In [67]:
# Create embeddings for textual data
# review_embeddings = create_word2vec_embeddings(df['reviewText'], embedding_dim=100)
# title_embeddings = create_word2vec_embeddings(df['title'], embedding_dim=100)
# description_embeddings = create_word2vec_embeddings(df['description'], embedding_dim=100)
# summary_embeddings = create_word2vec_embeddings(df['summary'], embedding_dim=100)
# feature_embeddings = create_word2vec_embeddings(df['feature'], embedding_dim=100)

#load the embeddings
review_embeddings = torch.load('data/embeds/review_embeddings.pt')
summary_embeddings = torch.load('data/embeds/summary_embeddings.pt')
description_embeddings = torch.load('data/embeds/description_embeddings.pt')
title_embeddings = torch.load('data/embeds/title_embeddings.pt')
feature_embeddings = torch.load('data/embeds/feature_embeddings.pt')

### Embed (SVD) from user-matrix pivot table

In [15]:
# create the pivot table with the user-item interactions (reviews)
user_product_matrix = pd.pivot_table(df, values='overall', index='reviewerID', columns='asin', fill_value=0)

In [16]:
def SVD_embeddings(user_item_matrix: pd.DataFrame, embedding_length):
    """
    Apply SVD to the user-item matrix to obtain user and item embeddings.
    U = m x r orthogonal left singular matrix, which represents the relationship between users and latent factors
    S = r x r diagonal matrix, which describes the strength of each latent factor
    V = r x n diagonal right singular matrix, which indicates the similarity between items and latent factors
    
    :param user_item_matrix: user-item matrix
    :param embedding_length: number of latent factors, which is the dimensionality of the reduced space
    """
    from scipy.sparse.linalg import svds
    
    # transpose the matrix to obtain the item-user matrix
    # (this due to the large number of items compared to users)
    matrix = user_item_matrix.values

    U, Sigma, VT = svds(matrix, k=embedding_length)
    user_embed_df = pd.DataFrame(U, index = user_item_matrix.index)
    VT_T = np.transpose(VT)
    item_embed_df = pd.DataFrame(VT_T, index=user_item_matrix.columns)
    
    return user_embed_df, item_embed_df

In [17]:
user_embed_df, item_embed_df = SVD_embeddings(user_product_matrix, 800)

In [18]:
utils.print_shapes(user_embed_df, item_embed_df)

Reviews df shape: (999, 800)
Products df shape: (8949, 800)


## Model

### Data part

In [61]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader 

# class for the dataset
class AmazonReviewDataset(Dataset):
    def __init__(self, user_ids, item_ids, ratings, review_texts, product_titles, product_descriptions, review_summary, product_feature):
        self.user_ids = user_ids
        self.item_ids = item_ids
        self.ratings = ratings
        self.review_texts = review_texts
        self.product_titles = product_titles
        self.product_descriptions = product_descriptions
        self.review_summary = review_summary
        self.product_feature = product_feature

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        user_id = self.user_ids[index]
        item_id = self.item_ids[index]
        rating = self.ratings[index]
        review_text = self.review_texts[index]
        product_title = self.product_titles[index]
        product_description = self.product_descriptions[index]
        review_summary = self.review_summary[index]
        product_feature = self.product_feature[index]
        return user_id, item_id, rating, review_text, product_title, product_description, review_summary, product_feature

# Split the data into train and test sets
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Create train and test datasets
train_dataset = AmazonReviewDataset(
    train_data['user_index'].values,
    train_data['item_index'].values,
    train_data['overall'].values,
    review_embeddings[train_data.index],
    title_embeddings[train_data.index],
    description_embeddings[train_data.index],
    summary_embeddings[train_data.index],
    feature_embeddings[train_data.index]
)

test_dataset = AmazonReviewDataset(
    test_data['user_index'].values,
    test_data['item_index'].values,
    test_data['overall'].values,
    review_embeddings[test_data.index],
    title_embeddings[test_data.index],
    description_embeddings[test_data.index],
    summary_embeddings[test_data.index],
    feature_embeddings[test_data.index]
)

### Vanilla SVD

In [20]:
# predictions with the the obrained SVD embeddings
def predict_ratings(user_embed_df: pd.DataFrame, item_embed_df: pd.DataFrame, user_id: str, item_id: str):
    """
    Predict the rating for a given user and item.
    
    :param user_embed_df: user embeddings
    :param item_embed_df: item embeddings
    :param user_id: user id
    :param item_id: item id
    """
    user_embedding = user_embed_df.loc[user_id]
    item_embedding = item_embed_df.loc[item_id]
    
    
    rating = np.dot(user_embedding, item_embedding)
    return rating

# example of prediction with a an item rated by a user
user_id = user_product_matrix.index[0]

# take the first product id rated by "user_id"
# (thus an element that is not 0 in the user-product matrix)
item_id = user_product_matrix.columns[user_product_matrix.loc[user_id] != 0].values[0]

print(f'Predicted rating for user {user_id} and item {item_id}: {predict_ratings(user_embed_df, item_embed_df, user_id, item_id)}')

Predicted rating for user A100WO06OQR8BQ and item B000034DLQ: 0.04106693642973573


In [21]:
# see if the predictions are consistent with the actual ratings
actual_rating = user_product_matrix.loc[user_id, item_id]

actual_rating

1.0

In [22]:
np.dot(user_embed_df, item_embed_df.T)

array([[ 1.06259022e-03,  3.36041414e-03,  2.10828465e-03, ...,
        -5.92800395e-03, -4.63836032e-03,  1.60806135e-02],
       [-8.41605637e-04, -3.90086343e-04, -1.16589769e-03, ...,
        -2.46381296e-03,  2.20313593e-04, -1.21812485e-03],
       [ 1.01916979e-03,  7.30603672e-04,  8.85298798e-04, ...,
         3.49561297e-03,  2.78967836e-03, -1.13763659e-03],
       ...,
       [-1.54826132e-03, -2.46259024e-03,  6.40134527e-04, ...,
        -2.05614460e-03,  1.64324585e-03,  1.51996888e-03],
       [-5.38758957e-03,  3.14151719e-03, -5.97236288e-04, ...,
        -7.48162225e-03,  4.57788905e-03,  3.94381031e-04],
       [-1.47747455e-03,  5.56588897e-04,  2.46124572e-03, ...,
        -1.68997541e-03, -1.07781086e-04,  6.34457891e-05]])

### Quick model test with SVD from scikit-surprise

Quick performance check of SVD by using scikit-surprise:

In [23]:
from surprise import BaselineOnly, Dataset, SVD, Reader, accuracy, Trainset
from surprise.model_selection import cross_validate, train_test_split, KFold

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data_test = Dataset.load_from_df(df[["reviewerID", "asin", "overall"]], reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data_test, test_size=0.25)

# define a cross-validation iterator
kf = KFold(n_splits=5)

algo = SVD()

for trainset, testset in kf.split(data_test):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.7310
RMSE: 0.7168
RMSE: 0.7304
RMSE: 0.7056
RMSE: 0.7210


In [24]:
user_id = 'A1HBTW5M7ZZ9PT'
test_user = reviews_df[reviews_df['reviewerID'] == user_id]
item_id = 'B00006IEI7'

test_user.head(5)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
2113,5.0,"June 17, 2016",A1HBTW5M7ZZ9PT,B00006IDQS,FTLOE,absolutely love card board used create literac...,must every teacher
2184,3.0,"November 06, 2016",A1HBTW5M7ZZ9PT,B00006IE7Y,FTLOE,still search perfect pen,okay
2616,5.0,"October 18, 2015",A1HBTW5M7ZZ9PT,B00006IEI7,FTLOE,much moaning groaning student decided buy elec...,typical pencil sharpener
3391,5.0,"July 03, 2015",A1HBTW5M7ZZ9PT,B00006IFEU,FTLOE,obsession sharpy use everything run crazy fast...,great buy sharpie lover
3858,5.0,"June 17, 2016",A1HBTW5M7ZZ9PT,B00006IFH6,FTLOE,bought chart marker could get planning done su...,solid buy teacher soak chart paper


In [25]:
# get a prediction for specific users and items.
pred = algo.predict(user_id, item_id, r_ui=5, verbose=True)

user: A1HBTW5M7ZZ9PT item: B00006IEI7 r_ui = 5.00   est = 4.71   {'was_impossible': False}


### Neural Collaborative Filtering setup

In [62]:
from torch import nn

# NCF model class
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, hidden_dims, output_dim):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.fc_layers = nn.ModuleList()
        input_dim = embedding_dim * 2
        for hidden_dim in hidden_dims:
            self.fc_layers.append(nn.Linear(input_dim, hidden_dim))
            self.fc_layers.append(nn.ReLU())
            input_dim = hidden_dim
        self.output_layer = nn.Linear(input_dim, output_dim)

    def forward(self, user_ids, item_ids):
        user_embeddings = self.user_embedding(user_ids)
        item_embeddings = self.item_embedding(item_ids)
        x = torch.cat([user_embeddings, item_embeddings], dim=1)
        for layer in self.fc_layers:
            x = layer(x)
        output = self.output_layer(x)
        return output

#### Training and evaluating functions

In [63]:
# Train the NCF model
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm


def train_model(model, dataloader, criterion, optimizer, device, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
        for user_ids, item_ids, ratings, _, _, _, _, _ in progress_bar:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            ratings = ratings.to(device)

            optimizer.zero_grad()
            outputs = model(user_ids, item_ids)
            loss = criterion(outputs.squeeze(), ratings.float())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            progress_bar.set_postfix(loss=running_loss / len(progress_bar))
            
        
            
# Evaluate the NCF model
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions = []
    true_ratings = []
    with torch.no_grad():
        progress_bar = tqdm(dataloader, desc="Evaluating", unit="batch")
        for user_ids, item_ids, ratings, _, _, _, _, _ in progress_bar:
            user_ids = user_ids.to(device)
            item_ids = item_ids.to(device)
            outputs = model(user_ids, item_ids)
            predictions.extend(outputs.squeeze().tolist())
            true_ratings.extend(ratings.tolist())

    mse = mean_squared_error(true_ratings, predictions)
    mae = mean_absolute_error(true_ratings, predictions)
    return mse, mae

#### Model training

In [64]:
import torch.optim as optim
import torch.nn as nn

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Create the NCF model
num_users = len(user_mapping)
num_items = len(item_mapping)
embedding_dim = 100
hidden_dims = [64, 32]
output_dim = 1

model = NCF(num_users, num_items, embedding_dim, hidden_dims, output_dim).to(device)

# Define loss function (RMSE) and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
train_model(model, train_loader, criterion, optimizer, device, num_epochs)

# Evaluate the model
mse, mae = evaluate_model(model, test_loader, device)
rmse = torch.sqrt(torch.tensor(mse))
print(f"\nTest RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

Epoch 1/10: 100%|██████████| 436/436 [00:02<00:00, 152.57batch/s, loss=2.51]
Epoch 2/10: 100%|██████████| 436/436 [00:02<00:00, 153.56batch/s, loss=0.593]
Epoch 3/10: 100%|██████████| 436/436 [00:03<00:00, 132.28batch/s, loss=0.492]
Epoch 4/10: 100%|██████████| 436/436 [00:02<00:00, 148.68batch/s, loss=0.428]
Epoch 5/10: 100%|██████████| 436/436 [00:02<00:00, 151.28batch/s, loss=0.376] 
Epoch 6/10: 100%|██████████| 436/436 [00:03<00:00, 144.01batch/s, loss=0.336] 
Epoch 7/10: 100%|██████████| 436/436 [00:02<00:00, 150.04batch/s, loss=0.298] 
Epoch 8/10: 100%|██████████| 436/436 [00:02<00:00, 154.47batch/s, loss=0.262] 
Epoch 9/10: 100%|██████████| 436/436 [00:03<00:00, 144.45batch/s, loss=0.227] 
Epoch 10/10: 100%|██████████| 436/436 [00:03<00:00, 144.12batch/s, loss=0.196] 
Evaluating: 100%|██████████| 109/109 [00:00<00:00, 1000.97batch/s]



Test RMSE: 0.8393
Test MAE: 0.6063


#### Model test with single predictions

In [46]:
def predict_score(user_id, item_id, model, user_mapping, item_mapping, device):
    # Convert user ID and item ID to their corresponding indices
    user_index = user_mapping[user_id]
    item_index = item_mapping[item_id]

    # Convert indices to tensors
    user_tensor = torch.tensor([user_index], dtype=torch.long).to(device)
    item_tensor = torch.tensor([item_index], dtype=torch.long).to(device)

    # Get the predicted score from the model
    with torch.no_grad():
        model.eval()
        score = model(user_tensor, item_tensor).item()

    return score

print(f'Predicted rating for user {user_id} and item {item_id}: {predict_score(user_id, item_id, model, user_mapping, item_mapping, device)}')
print(f'Actual rating: {user_product_matrix.loc[user_id, item_id]}')

Predicted rating for user A1HBTW5M7ZZ9PT and item B00006IEI7: 4.702612400054932
Actual rating: 5.0
Product info:                                              description                                 title   brand                                            feature  rank         main_cat productPublishedDate        asin
18236  tylish powerful xacto xlr electric pencil shar...  X-ACTO XLR Electric Pencil Sharpener  X-Acto  stylish electric pencil sharpener ideal home o...     5  Office Products       April 18, 2006  B00006IEI7
