# NIRS (model)

In [35]:
import utils as utils
import torch
import numpy as np
import pandas as pd
import icecream as ic

utils.seed_everything(42)

## Init

In [36]:
reviews_df = pd.read_csv('data/reviews_sampled_processed.csv')
products_df = pd.read_csv('data/products_sampled_processed.csv')

In [37]:
# remove duplicates
products_df = products_df.drop_duplicates(subset='asin', keep='first')

In [38]:
utils.print_shapes(reviews_df, products_df)

Reviews df shape: (45562, 7)
Products df shape: (17766, 8)


In [39]:
print(f'Number of unique products: {products_df["asin"].nunique()}')
print(f'Number of unique users: {reviews_df["reviewerID"].nunique()}')

Number of unique products: 17766
Number of unique users: 1000


In [40]:
reviews_df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
0,5.0,"June 02, 2015",A1HBTW5M7ZZ9PT,310818621,FTLOE,absolutely love organizer never one figured wa...,super good deal
1,5.0,"March 21, 2014",A2F0F4NB6BLGVX,310823706,Lee,good bible carrier large print bible afraid wo...,leatherlook bible carrier
2,5.0,"June 02, 2017",A23BRQWL8LNB37,439499887,David,kid love peppa reading say,five star


In [41]:
products_df.head(3)

Unnamed: 0,description,title,brand,feature,rank,main_cat,productPublishedDate,asin
0,rotect rfid card skimsafe card holder made rig...,Black RFID Blocking ID Badge Holder (Holds 2 C...,Specialist ID,rfid blocking 2 card holder fips 201 approved ...,43,Office Products,"October 14, 2011",B005VSY1VK
1,itten piano key mouse pad 8 x 8 x 25 made heav...,3dRose LLC 8 x 8 x 0.25 Inches Kitten on Piano...,3dRose,dimension inch 8 w x 8 h x 025 matte finish so...,1,Office Products,"July 14, 2014",B00CX71JNU
2,ivo next favorite pen ultra gel stick vibrant ...,"Vivo Ultra Gel Stick Pens, 0.7mm Fine Tip, Bla...",VIVO,ultra smooth gel ink vivid black amp color ful...,1,Office Products,"April 30, 2009",B002CO43BO


In [42]:
# merge the datasets
df = pd.merge(reviews_df, products_df, on='asin', how='outer')

# Fill NaN values with appropriate values
df['overall'] = df['overall'].fillna(0)  # Fill missing ratings with 0
df['reviewerID'] = df['reviewerID'].fillna('Unknown')  # Fill missing user IDs with 'Unknown'
df['asin'] = df['asin'].fillna('Unknown')  # Fill missing product IDs with 'Unknown'

df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,description,title,brand,feature,rank,main_cat,productPublishedDate
0,0.0,,Unknown,136039847,,,,earson myhistorylab online access code america...,Pearson MyHistoryLab Online Access Code for Am...,Pearson MyHistoryLab,pearson myhistorylab online access code americ...,1.0,Office Products,"June 21, 2012"
1,5.0,"June 02, 2015",A1HBTW5M7ZZ9PT,310818621,FTLOE,absolutely love organizer never one figured wa...,super good deal,,,,,,,
2,5.0,"March 21, 2014",A2F0F4NB6BLGVX,310823706,Lee,good bible carrier large print bible afraid wo...,leatherlook bible carrier,,,,,,,


## Other data preparation for the model

### Embeddings

### Embed (SVD) from user-matrix pivot table

In [43]:
# create the pivot table with the user-item interactions (reviews)
user_product_matrix = pd.pivot_table(df, values='overall', index='reviewerID', columns='asin', fill_value=0)

In [44]:
# number of users and items
user_product_matrix.shape 

(1001, 29694)

In [58]:
def SVD_embeddings(user_item_matrix: pd.DataFrame, embedding_length):
    """
    Apply SVD to the user-item matrix to obtain user and item embeddings.
    U = m x r orthogonal left singular matrix, which represents the relationship between users and latent factors
    S = r x r diagonal matrix, which describes the strength of each latent factor
    V = r x n diagonal right singular matrix, which indicates the similarity between items and latent factors
    
    :param user_item_matrix: user-item matrix
    :param embedding_length: number of latent factors, which is the dimensionality of the reduced space
    """
    from scipy.sparse.linalg import svds
    
    # transpose the matrix to obtain the item-user matrix
    # (this due to the large number of items compared to users)
    matrix = user_item_matrix.values

    U, Sigma, VT = svds(matrix, k=embedding_length)
    user_embed_df = pd.DataFrame(U, index = user_item_matrix.index)
    VT_T = np.transpose(VT)
    item_embed_df = pd.DataFrame(VT_T, index=user_item_matrix.columns)
    
    return user_embed_df, item_embed_df

In [59]:
user_embed_df, item_embed_df = SVD_embeddings(user_product_matrix, 800)

In [60]:
utils.print_shapes(user_embed_df, item_embed_df)

Reviews df shape: (1001, 800)
Products df shape: (29694, 800)


## Model

### Vanilla SVD

In [61]:
# predictions with the the obrained SVD embeddings
def predict_ratings(user_embed_df: pd.DataFrame, item_embed_df: pd.DataFrame, user_id: str, item_id: str):
    """
    Predict the rating for a given user and item.
    
    :param user_embed_df: user embeddings
    :param item_embed_df: item embeddings
    :param user_id: user id
    :param item_id: item id
    """
    user_embedding = user_embed_df.loc[user_id]
    item_embedding = item_embed_df.loc[item_id]
    
    
    rating = np.dot(user_embedding, item_embedding)
    return rating

# example of prediction with a an item rated by a user
user_id = user_product_matrix.index[0]

# take the first product id rated by "user_id"
# (thus an element that is not 0 in the user-product matrix)
item_id = user_product_matrix.columns[user_product_matrix.loc[user_id] != 0].values[0]

print(f'Predicted rating for user {user_id} and item {item_id}: {predict_ratings(user_embed_df, item_embed_df, user_id, item_id)}')

Predicted rating for user A100WO06OQR8BQ and item B000034DLQ: 0.033576172950954386


In [62]:
# see if the predictions are consistent with the actual ratings
actual_rating = user_product_matrix.loc[user_id, item_id]

actual_rating

1.0

In [63]:
np.dot(user_embed_df, item_embed_df.T)

array([[ 5.15312314e-18,  1.56299148e-03,  1.91088821e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-8.95960646e-18,  8.71139059e-04,  8.91887448e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-2.30808540e-18,  8.34657369e-04, -3.89877065e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [-5.98872306e-18,  1.02680846e-04,  1.42300246e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.29047853e-17, -1.63814272e-03,  3.18799017e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 9.56213196e-32,  7.09309961e-17,  4.34223589e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

### Quick model test with SVD from scikit-surprise

Quick performance check of SVD by using scikit-surprise:

In [64]:
from surprise import BaselineOnly, Dataset, SVD, Reader, accuracy, Trainset
from surprise.model_selection import cross_validate, train_test_split, KFold

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data_test = Dataset.load_from_df(df[["reviewerID", "asin", "overall"]], reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data_test, test_size=0.25)

# define a cross-validation iterator
kf = KFold(n_splits=5)

algo = SVD()

for trainset, testset in kf.split(data_test):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8372
RMSE: 0.8370
RMSE: 0.8243
RMSE: 0.8329
RMSE: 0.8385


In [65]:
user_id = 'A1HBTW5M7ZZ9PT'
test_user = reviews_df[reviews_df['reviewerID'] == user_id]
item_id = 'B00006IEI7'

test_user.head(5)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
0,5.0,"June 02, 2015",A1HBTW5M7ZZ9PT,0310818621,FTLOE,absolutely love organizer never one figured wa...,super good deal
2748,5.0,"June 17, 2016",A1HBTW5M7ZZ9PT,B00006IDQS,FTLOE,absolutely love card board used create literac...,must every teacher
2826,3.0,"November 06, 2016",A1HBTW5M7ZZ9PT,B00006IE7Y,FTLOE,still search perfect pen,okay
3245,5.0,"August 05, 2015",A1HBTW5M7ZZ9PT,B00006IEJC,FTLOE,really go wrong highlighter highly recommend,go wrong
3326,5.0,"October 18, 2015",A1HBTW5M7ZZ9PT,B00006IEI7,FTLOE,much moaning groaning student decided buy elec...,typical pencil sharpener


In [66]:
# get a prediction for specific users and items.
pred = algo.predict(user_id, item_id, r_ui=5, verbose=True)

user: A1HBTW5M7ZZ9PT item: B00006IEI7 r_ui = 5.00   est = 4.93   {'was_impossible': False}


### Neural Collaborative Filtering setup

In [67]:
import torch
import torch.nn as nn

# a first sketch of the NCF model
class NCF(nn.Module):
    def __init__(self, total_embed_length):
        super(NCF, self).__init__()
        self.total_embed_length = total_embed_length
        
        self.d1 = nn.Sequential(
            nn.Linear(self.total_embed_length, 2048),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(2048)
        )
        
        self.d2 = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(1024)
        )
        
        self.d3 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(512)
        )
        
        self.d4 = nn.Linear(512, 256)
        self.d5 = nn.Linear(256, 64)
        self.d6 = nn.Linear(64, 16)
        self.out = nn.Linear(16, 1)
        
    def forward(self, x):
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        x = nn.ReLU()(self.d4(x))
        x = nn.ReLU()(self.d5(x))
        x = nn.ReLU()(self.d6(x))
        out = nn.ReLU()(self.out(x))
        return out

In [68]:
from tqdm import tqdm

# model training
def train(model, dataset, criterion, optimizer, num_epochs, batch_size):
    model.train()  # Set the model to training mode
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = len(dataset) // batch_size
        
        for batch_idx in tqdm(range(num_batches)):
            start_idx = batch_idx * batch_size
            end_idx = start_idx + batch_size
            
            inputs = dataset[start_idx:end_idx, :-1].astype(np.float32)
            labels = dataset[start_idx:end_idx, -1].astype(np.float32)
            
            inputs = torch.from_numpy(inputs)  # Convert inputs to PyTorch tensor
            labels = torch.from_numpy(labels)  # Convert labels to PyTorch tensor
            
            optimizer.zero_grad()  # Clear the gradients
            
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute the loss
            
            loss.backward()  # Backward pass
            optimizer.step()  # Update the weights
            
            running_loss += loss.item()
        
        epoch_loss = running_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
    
    print("Training complete!")

In [69]:
data = pd.concat([user_embed_df, item_embed_df]).values
data.shape

(30695, 800)

In [77]:
data = pd.concat([user_embed_df, item_embed_df]).values
batch_size = 32
model = NCF(799)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

train(model, data, criterion, optimizer, num_epochs=10, batch_size=batch_size)

  return F.mse_loss(input, target, reduction=self.reduction)
100%|██████████| 959/959 [00:18<00:00, 52.99it/s]


Epoch 1/10, Loss: 0.0001


100%|██████████| 959/959 [00:18<00:00, 51.60it/s]


Epoch 2/10, Loss: 0.0001


 26%|██▋       | 252/959 [00:05<00:14, 48.04it/s]


KeyboardInterrupt: 