# NIRS (model)

In [65]:
import utils as utils
import torch
import numpy as np
import pandas as pd

utils.seed_everything(42)

In [66]:
reviews_df = pd.read_csv('data/reviews_sampled_processed.csv')
products_df = pd.read_csv('data/products_sampled_processed.csv')

In [67]:
utils.print_shapes(reviews_df, products_df)

Reviews df shape: (45538, 7)
Products df shape: (17931, 8)


In [68]:
reviews_df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
0,5.0,"June 02, 2015",A1HBTW5M7ZZ9PT,310818621,FTLOE,absolutely love organizer never one figured wa...,super good deal
1,5.0,"March 21, 2014",A2F0F4NB6BLGVX,310823706,Lee,good bible carrier large print bible afraid wo...,leatherlook bible carrier
2,5.0,"June 02, 2017",A23BRQWL8LNB37,439499887,David,kid love peppa reading say,five star


In [69]:
products_df.head(3)

Unnamed: 0,description,title,brand,feature,rank,main_cat,productPublishedDate,asin
0,rotect rfid card skimsafe card holder made rig...,Black RFID Blocking ID Badge Holder (Holds 2 C...,Specialist ID,rfid blocking 2 card holder fips 201 approved ...,43,Office Products,"October 14, 2011",B005VSY1VK
1,itten piano key mouse pad 8 x 8 x 25 made heav...,3dRose LLC 8 x 8 x 0.25 Inches Kitten on Piano...,3dRose,dimension inch 8 w x 8 h x 025 matte finish so...,1,Office Products,"July 14, 2014",B00CX71JNU
2,ivo next favorite pen ultra gel stick vibrant ...,"Vivo Ultra Gel Stick Pens, 0.7mm Fine Tip, Bla...",VIVO,ultra smooth gel ink vivid black amp color ful...,1,Office Products,"April 30, 2009",B002CO43BO


In [70]:
# merge the datasets
df = pd.merge(reviews_df, products_df, on='asin', how='inner')
df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,description,title,brand,feature,rank,main_cat,productPublishedDate
0,5.0,"October 27, 2017",A2M13JN7YVG29U,528960911,Stacie Baugh,love,five star,yecatching 50 x 32 reference piece home classr...,Rand McNally M-Series Full-Color Laminated Uni...,Rand McNally,eyecatching 50 x 32 reference piece home class...,5,Office Products,"April 18, 2006"
1,5.0,"December 31, 2014",A2ZVLGM9E6X2HY,528960911,Michael Isgro,great money 15 look nice colorful,five star,yecatching 50 x 32 reference piece home classr...,Rand McNally M-Series Full-Color Laminated Uni...,Rand McNally,eyecatching 50 x 32 reference piece home class...,5,Office Products,"April 18, 2006"
2,5.0,"June 30, 2014",A2WAF7IDNRUGEU,1604189541,Jane Iverson,cute,five star,erfect reminder calendar note homework note na...,Carson Dellosa Panda Notepad (151035),Carson-Dellosa,50 sheet per pad acid free lignin free perfect...,137,Office Products,"December 31, 2009"


## Other data preparation for the model

### Embeddings

### Embed from user-matrix pivot table

In [71]:
# create the pivot table with the user-item interactions (reviews)
user_product_matrix = pd.pivot_table(df, values='overall', index='reviewerID', columns='asin', fill_value=0)

In [74]:
def SVD_embeddings(user_item_matrix: pd.DataFrame, embedding_length):
    """
    Apply SVD to the user-item matrix to obtain user and item embeddings.
    U = m x r orthogonal left singular matrix, which represents the relationship between users and latent factors
    S = r x r diagonal matrix, which describes the strength of each latent factor
    V = r x n diagonal right singular matrix, which indicates the similarity between items and latent factors
    
    :param user_item_matrix: user-item matrix
    :param embedding_length: number of latent factors, which is the dimensionality of the reduced space
    """
    from scipy.sparse.linalg import svds

    U, Sigma, VT = svds(user_item_matrix.to_numpy(), k=embedding_length)
    user_embed_df = pd.DataFrame(U, index = user_item_matrix.index)
    VT_T = np.transpose(VT)
    item_embed_df = pd.DataFrame(VT_T, index=user_item_matrix.columns)
    
    return user_embed_df, item_embed_df

In [75]:
user_embed_df, item_embed_df = SVD_embeddings(user_product_matrix, 50)

## Model tests

### Quick model test with SVD from scikit-surprise

Quick performance check of SVD by using scikit-surprise:

In [105]:
from surprise import BaselineOnly, Dataset, SVD, Reader, accuracy
from surprise.model_selection import cross_validate, train_test_split, KFold

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[["reviewerID", "asin", "overall"]], reader)

# sample random trainset and testset
# test set is made of 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25)

# define a cross-validation iterator
kf = KFold(n_splits=5)

algo = SVD()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.7579
RMSE: 0.7631
RMSE: 0.7904
RMSE: 0.7391
RMSE: 0.7608


In [130]:
user_id = 'A1HBTW5M7ZZ9PT'
test_user = reviews_df[reviews_df['reviewerID'] == user_id]
item_id = 'B00006IDQS'

test_user.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
0,5.0,"June 02, 2015",A1HBTW5M7ZZ9PT,0310818621,FTLOE,absolutely love organizer never one figured wa...,super good deal
2745,5.0,"June 17, 2016",A1HBTW5M7ZZ9PT,B00006IDQS,FTLOE,absolutely love card board used create literac...,must every teacher
2822,3.0,"November 06, 2016",A1HBTW5M7ZZ9PT,B00006IE7Y,FTLOE,still search perfect pen,okay


In [131]:
# get a prediction for specific users and items.
pred = algo.predict(user_id, item_id, r_ui=5, verbose=True)

user: A1HBTW5M7ZZ9PT item: B00006IDQS r_ui = 5.00   est = 4.71   {'was_impossible': False}


### Neural Collaborative Filtering setup

In [143]:
import torch
import torch.nn as nn

# a first sketch of the NCF model
class NCF(nn.Module):
    def __init__(self, total_embed_shape):
        super(NCF, self).__init__()
        self.total_embed_shape = total_embed_shape
        
        self.d1 = nn.Sequential(
            nn.Linear(*total_embed_shape),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(2048)
        )
        
        self.d2 = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(1024)
        )
        
        self.d3 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(512)
        )
        
        self.d4 = nn.Linear(512, 256)
        self.d5 = nn.Linear(256, 64)
        self.d6 = nn.Linear(64, 16)
        self.out = nn.Linear(16, 1)
        
    def forward(self, x):
        x = self.d1(x)
        x = self.d2(x)
        x = self.d3(x)
        x = nn.ReLU()(self.d4(x))
        x = nn.ReLU()(self.d5(x))
        x = nn.ReLU()(self.d6(x))
        out = nn.ReLU()(self.out(x))
        return out

In [144]:
from tqdm import tqdm

# model training
def train(model, dataset, criterion, optimizer, num_epochs, batch_size):
    model.train()  # Set the model to training mode
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = len(dataset) // batch_size
        
        for batch_idx in tqdm(range(num_batches)):
            start_idx = batch_idx * batch_size
            end_idx = start_idx + batch_size
            
            inputs = dataset[start_idx:end_idx, :-1].astype(np.float32)
            labels = dataset[start_idx:end_idx, -1].astype(np.float32)
            
            inputs = torch.from_numpy(inputs)  # Convert inputs to PyTorch tensor
            labels = torch.from_numpy(labels)  # Convert labels to PyTorch tensor
            
            optimizer.zero_grad()  # Clear the gradients
            
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute the loss
            
            loss.backward()  # Backward pass
            optimizer.step()  # Update the weights
            
            running_loss += loss.item()
        
        epoch_loss = running_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")
    
    print("Training complete!")

In [147]:
data.shape

(1911, 50)

In [146]:
data = pd.concat([user_embed_df, item_embed_df]).values
model = NCF(data.shape) # 64 for user and 64 for item embeddings
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())
train(model, data, criterion, optimizer, num_epochs=10, batch_size=32)

  0%|          | 0/59 [00:00<?, ?it/s]




RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x49 and 1911x50)