# NIRS (model)

In [33]:
import utils as utils
import torch
import numpy as np
import pandas as pd
import icecream as ic

utils.seed_everything(42)

## Init

In [34]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [35]:
# Load the sampled, pre-processed review and product tables (paths are relative to the working directory).
reviews_df = pd.read_csv('data/reviews_sampled_processed.csv')
products_df = pd.read_csv('data/products_sampled_processed.csv')

In [36]:
# Keep only the first row for each ASIN so every product id appears exactly once in products_df.
products_df = products_df.drop_duplicates(subset='asin', keep='first')

In [37]:
utils.print_shapes(reviews_df, products_df)

Reviews df shape: (34839, 7)
Products df shape: (25775, 8)


In [38]:
# Report how many distinct products (by ASIN) and distinct reviewers the sample contains.
print(f'Number of unique products: {products_df["asin"].nunique()}')
print(f'Number of unique users: {reviews_df["reviewerID"].nunique()}')

Number of unique products: 25775
Number of unique users: 999


In [39]:
reviews_df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary
0,5.0,"April 09, 2016",A2GIQGI2UXOZ4M,439893577,Gene Sechrest,job big enough purpose fold flat fold stand gr...,ultimate kid magnetic board
1,5.0,"October 27, 2017",A2M13JN7YVG29U,528960911,Stacie Baugh,love,five star
2,5.0,"December 31, 2014",A2ZVLGM9E6X2HY,528960911,Michael Isgro,great money 15 look nice colorful,five star


In [40]:
products_df.head(3)

Unnamed: 0,description,title,brand,feature,rank,main_cat,productPublishedDate,asin
0,rotect rfid card skimsafe card holder made rig...,Black RFID Blocking ID Badge Holder (Holds 2 C...,Specialist ID,rfid blocking 2 card holder fips 201 approved ...,43,Office Products,"October 14, 2011",B005VSY1VK
1,itten piano key mouse pad 8 x 8 x 25 made heav...,3dRose LLC 8 x 8 x 0.25 Inches Kitten on Piano...,3dRose,dimension inch 8 w x 8 h x 025 matte finish so...,1,Office Products,"July 14, 2014",B00CX71JNU
2,ivo next favorite pen ultra gel stick vibrant ...,"Vivo Ultra Gel Stick Pens, 0.7mm Fine Tip, Bla...",VIVO,ultra smooth gel ink vivid black amp color ful...,1,Office Products,"April 30, 2009",B002CO43BO


In [41]:
# Attach the product metadata to every review (left join keeps all review rows),
# then replace missing ratings with 0 and missing user / product ids with 'Unknown'.
nan_replacements = {'overall': 0, 'reviewerID': 'Unknown', 'asin': 'Unknown'}
df = reviews_df.merge(products_df, how='left', on='asin').fillna(value=nan_replacements)

df.head(3)

Unnamed: 0,overall,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,description,title,brand,feature,rank,main_cat,productPublishedDate
0,5.0,"April 09, 2016",A2GIQGI2UXOZ4M,439893577,Gene Sechrest,job big enough purpose fold flat fold stand gr...,ultimate kid magnetic board,agnetic tabletop learning easel one simplestye...,Little Red Tool Box: Magnetic Tabletop Learnin...,Scholastic,fold flat easy storage open reveal giant 12 x ...,21,Office Products,"November 25, 2006"
1,5.0,"October 27, 2017",A2M13JN7YVG29U,528960911,Stacie Baugh,love,five star,yecatching 50 x 32 reference piece home classr...,Rand McNally M-Series Full-Color Laminated Uni...,Rand McNally,eyecatching 50 x 32 reference piece home class...,5,Office Products,"April 18, 2006"
2,5.0,"December 31, 2014",A2ZVLGM9E6X2HY,528960911,Michael Isgro,great money 15 look nice colorful,five star,yecatching 50 x 32 reference piece home classr...,Rand McNally M-Series Full-Color Laminated Uni...,Rand McNally,eyecatching 50 x 32 reference piece home class...,5,Office Products,"April 18, 2006"


In [42]:
# Every reviewed ASIN should also be present in the product table;
# collect the ones that are not and report how many there are.
reviewed_products = set(pd.unique(reviews_df['asin']))
product_dataset_products = set(pd.unique(products_df['asin']))

missing_products = reviewed_products.difference(product_dataset_products)
num_missing_products = len(missing_products)

print(f"Number of missing products: {num_missing_products}")


Number of missing products: 0


In [11]:
from torch.utils.data import Dataset, DataLoader 

# map-style dataset serving one review record per index
class AmazonReviewDataset(Dataset):
    """
    Index-aligned view over six parallel sequences describing reviews.

    Item ``i`` of the dataset is the tuple
    ``(user_id, item_id, rating, review_text, product_title, product_description)``
    built from position ``i`` of each sequence. The length of the dataset is the
    length of ``user_ids``.
    """

    def __init__(self, user_ids, item_ids, ratings, review_texts, product_titles, product_descriptions):
        # The sequences are stored as given (no copy, no validation).
        self.user_ids = user_ids
        self.item_ids = item_ids
        self.ratings = ratings
        self.review_texts = review_texts
        self.product_titles = product_titles
        self.product_descriptions = product_descriptions

    def __len__(self):
        """Number of records, taken from ``user_ids``."""
        return len(self.user_ids)

    def __getitem__(self, index):
        """Return the record at ``index`` as a 6-tuple, fields in constructor order."""
        fields = (
            self.user_ids,
            self.item_ids,
            self.ratings,
            self.review_texts,
            self.product_titles,
            self.product_descriptions,
        )
        return tuple(field[index] for field in fields)

## Other data preparation for the model

### User and product id mapping

In [12]:
# Create user and item mappings: every distinct id gets a consecutive integer index
# (in order of first appearance) that can later be used to address embedding rows.
user_mapping = {user_id: index for index, user_id in enumerate(reviews_df['reviewerID'].unique())}
item_mapping = {item_id: index for index, item_id in enumerate(products_df['asin'].unique())}

# Map user and item IDs to indices on the merged review/product frame.
# (`data` is not defined at this point of the notebook, so the columns are added to `df`.)
# Ids that are absent from the mappings (e.g. the 'Unknown' placeholder) map to NaN.
df['user_index'] = df['reviewerID'].map(user_mapping)
df['item_index'] = df['asin'].map(item_mapping)

### Embeddings

### SVD embeddings from the user-item pivot table

In [None]:
# create the pivot table with the user-item interactions (reviews):
# one row per reviewerID, one column per ASIN, cell = rating ('overall').
# pivot_table's default aggfunc ('mean') collapses repeated (user, item) reviews,
# and (user, item) pairs without a review are filled with 0.
user_product_matrix = pd.pivot_table(df, values='overall', index='reviewerID', columns='asin', fill_value=0)

In [None]:
# number of users and items
user_product_matrix.shape 

In [None]:
def SVD_embeddings(user_item_matrix: pd.DataFrame, embedding_length):
    """
    Apply a truncated SVD to the user-item matrix to obtain user and item embeddings.

    The m x n matrix M is approximated as U @ diag(S) @ V^T with r = embedding_length:
    U   = m x r matrix with orthonormal columns (relationship between users and latent factors)
    S   = r singular values (strength of each latent factor)
    V^T = r x n matrix with orthonormal rows (relationship between items and latent factors)

    Both factor matrices are scaled by sqrt(S), so that the dot product of a user
    embedding and an item embedding approximates the corresponding entry of M.

    :param user_item_matrix: user-item matrix (rows = users, columns = items)
    :param embedding_length: number of latent factors, which is the dimensionality of the reduced space
    :return: (user_embed_df, item_embed_df); the first is indexed like the rows of
             ``user_item_matrix``, the second like its columns, both with ``embedding_length`` columns
    """
    from scipy.sparse.linalg import svds

    # The matrix is factorised as-is (users x items, no transpose).
    # svds expects floating-point input, so cast explicitly in case the ratings are integers.
    matrix = user_item_matrix.to_numpy(dtype=np.float64)

    U, Sigma, VT = svds(matrix, k=embedding_length)

    # Split the singular values evenly between both sides: (U * sqrt(S)) @ (V * sqrt(S))^T == U S V^T.
    # np.maximum guards against tiny negative values caused by round-off.
    sqrt_sigma = np.sqrt(np.maximum(Sigma, 0.0))

    user_embed_df = pd.DataFrame(U * sqrt_sigma, index=user_item_matrix.index)
    item_embed_df = pd.DataFrame(np.transpose(VT) * sqrt_sigma, index=user_item_matrix.columns)

    return user_embed_df, item_embed_df

In [None]:
# 800 latent factors per embedding.
# NOTE(review): svds limits k by the smaller matrix dimension, so this has to stay below the number of users — confirm if the sample changes.
user_embed_df, item_embed_df = SVD_embeddings(user_product_matrix, 800)

In [None]:
utils.print_shapes(user_embed_df, item_embed_df)

## Model

### Vanilla SVD

In [None]:
# predictions with the obtained SVD embeddings
def predict_ratings(user_embed_df: pd.DataFrame, item_embed_df: pd.DataFrame, user_id: str, item_id: str):
    """
    Score a (user, item) pair as the dot product of their embeddings.

    :param user_embed_df: user embeddings, indexed by user id
    :param item_embed_df: item embeddings, indexed by item id
    :param user_id: label of the user row to look up
    :param item_id: label of the item row to look up
    :return: dot product of the two embedding vectors
    """
    user_vector = user_embed_df.loc[user_id].to_numpy()
    item_vector = item_embed_df.loc[item_id].to_numpy()
    return np.dot(user_vector, item_vector)

# example of a prediction for an item that the user has actually rated
user_id = user_product_matrix.index[0]

# take the first product id rated by "user_id"
# (i.e. the first column whose entry in that user's row of the user-product matrix is not 0)
item_id = user_product_matrix.columns[user_product_matrix.loc[user_id] != 0].values[0]

print(f'Predicted rating for user {user_id} and item {item_id}: {predict_ratings(user_embed_df, item_embed_df, user_id, item_id)}')

In [None]:
# check whether the prediction is consistent with the actual rating:
# look up the rating stored in the user-product matrix for the same (user, item) pair
actual_rating = user_product_matrix.loc[user_id, item_id]

actual_rating

In [None]:
# dot-product score for every (user, item) pair: one row per user, one column per item
np.dot(user_embed_df, item_embed_df.T)

### Quick model test with SVD from scikit-surprise

Quick performance check of SVD by using scikit-surprise:

In [None]:
# surprise's Dataset is imported under an alias so that it does not shadow
# torch.utils.data.Dataset, which the notebook imports earlier under the name `Dataset`.
from surprise import BaselineOnly, SVD, Reader, accuracy, Trainset
from surprise import Dataset as SurpriseDataset
from surprise.model_selection import cross_validate, train_test_split, KFold

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data_test = SurpriseDataset.load_from_df(df[["reviewerID", "asin", "overall"]], reader)

# define a cross-validation iterator
# (no separate train_test_split: the folds below provide the train/test sets)
kf = KFold(n_splits=5)

algo = SVD()

# RMSE of each fold, so the folds can be summarised after the loop
fold_rmse = []

for trainset, testset in kf.split(data_test):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    fold_rmse.append(accuracy.rmse(predictions, verbose=True))

print(f"Mean RMSE over {len(fold_rmse)} folds: {np.mean(fold_rmse):.4f}")

In [None]:
# pick a concrete (user, item) pair to inspect and show that user's first reviews
user_id = 'A1HBTW5M7ZZ9PT'
test_user = reviews_df[reviews_df['reviewerID'] == user_id]
item_id = 'B00006IEI7'

test_user.head(5)

In [None]:
# get a prediction for a specific user and item.
# r_ui=5 is passed as the true rating — NOTE(review): confirm this user actually gave the item 5 stars
pred = algo.predict(user_id, item_id, r_ui=5, verbose=True)

### Neural Collaborative Filtering setup

In [None]:
import torch
import torch.nn as nn

# first draft of the NCF network
class NCF(nn.Module):
    """
    Fully connected network mapping a feature vector to a single non-negative score.

    Three stages of Linear -> ReLU -> Dropout(0.2) -> BatchNorm1d (2048, 1024 and 512 units)
    are followed by Linear layers with 256, 64, 16 and 1 units, each passed through a ReLU.

    :param total_embed_length: size of the input feature vector
    """

    def __init__(self, total_embed_length):
        super().__init__()
        self.total_embed_length = total_embed_length

        def dense_stage(n_in, n_out):
            # Linear -> ReLU -> Dropout -> BatchNorm, the pattern shared by d1..d3
            return nn.Sequential(
                nn.Linear(n_in, n_out),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.BatchNorm1d(n_out),
            )

        # layers are created in the order d1 ... out
        self.d1 = dense_stage(self.total_embed_length, 2048)
        self.d2 = dense_stage(2048, 1024)
        self.d3 = dense_stage(1024, 512)

        self.d4 = nn.Linear(512, 256)
        self.d5 = nn.Linear(256, 64)
        self.d6 = nn.Linear(64, 16)
        self.out = nn.Linear(16, 1)

    def forward(self, x):
        """Run ``x`` of shape (batch, total_embed_length) through the network; returns (batch, 1)."""
        for stage in (self.d1, self.d2, self.d3):
            x = stage(x)
        # the remaining layers, including the output layer, are each followed by a ReLU
        for layer in (self.d4, self.d5, self.d6, self.out):
            x = torch.relu(layer(x))
        return x

In [None]:
from tqdm import tqdm

# model training
def train(model, dataset, criterion, optimizer, num_epochs, batch_size):
    """
    Train ``model`` on a 2-D NumPy array whose last column holds the target value.

    The rows are consumed in order, in consecutive batches of ``batch_size``; trailing
    rows that do not fill a complete batch are skipped. The mean batch loss is printed
    after every epoch.

    :param model: torch module to train; it is switched to training mode
    :param dataset: NumPy array of shape (n_rows, n_features + 1), target in the last column
    :param criterion: loss function called as ``criterion(outputs, labels)``
    :param optimizer: optimizer built on the model's parameters
    :param num_epochs: number of passes over the dataset
    :param batch_size: number of rows per batch
    :raises ValueError: if the dataset holds fewer rows than one batch
    """
    num_batches = len(dataset) // batch_size
    if num_batches == 0:
        raise ValueError(
            f"dataset has {len(dataset)} rows, fewer than one batch of {batch_size}"
        )

    # Feed the batches to whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()  # Set the model to training mode

    for epoch in range(num_epochs):
        running_loss = 0.0

        for batch_idx in tqdm(range(num_batches)):
            start_idx = batch_idx * batch_size
            end_idx = start_idx + batch_size
            batch = dataset[start_idx:end_idx]

            # Convert features and targets to float32 tensors
            inputs = torch.from_numpy(batch[:, :-1].astype(np.float32)).to(model_device)
            labels = torch.from_numpy(batch[:, -1].astype(np.float32)).to(model_device)

            optimizer.zero_grad()  # Clear the gradients

            outputs = model(inputs)  # Forward pass
            # A (batch, 1) output against (batch,) labels would broadcast to
            # (batch, batch) inside the loss, so align the shapes first.
            if outputs.shape != labels.shape and outputs.numel() == labels.numel():
                outputs = outputs.reshape(labels.shape)
            loss = criterion(outputs, labels)  # Compute the loss

            loss.backward()  # Backward pass
            optimizer.step()  # Update the weights

            running_loss += loss.item()

        epoch_loss = running_loss / num_batches
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

    print("Training complete!")

In [None]:
# Build one training row per review: [user embedding | item embedding | rating].
# train() treats the last column as the target, so the rating has to be that column.
user_rows = user_embed_df.index.get_indexer(df['reviewerID'])
item_rows = item_embed_df.index.get_indexer(df['asin'])
# get_indexer returns -1 for unknown labels, which would silently select the last row
assert (user_rows >= 0).all() and (item_rows >= 0).all(), "review references an id without an embedding"

data = np.hstack([
    user_embed_df.to_numpy(dtype=np.float32)[user_rows],
    item_embed_df.to_numpy(dtype=np.float32)[item_rows],
    df[['overall']].to_numpy(dtype=np.float32),
])

batch_size = 32
model = NCF(data.shape[1] - 1)  # every column except the trailing rating is an input feature
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

train(model, data, criterion, optimizer, num_epochs=10, batch_size=batch_size)

### Overall setup