In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import json
import os
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import seaborn as sns
import matplotlib.pyplot as plt
import swifter
import multiprocessing
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf
from collections import defaultdict
from dataclasses import dataclass
from torch import nn
import torch.nn.functional as F
import torch
import random
import copy
from tqdm import tqdm

sns.set_style("darkgrid")

# Loading Processed Reviews

In [3]:
# Where the pre-processed reviews (with sentiment columns) are stored.
PROCESSED_REVIEWS_FILE = 'processed_reviews_with_sentiment.csv'
PROCESSED_FOLDER = './data/processed/'

In [4]:
# Load the processed reviews CSV into a DataFrame.
processed_reviews_path = os.path.join(PROCESSED_FOLDER, PROCESSED_REVIEWS_FILE)
reviews = pd.read_csv(processed_reviews_path)

In [5]:
# Preview the first rows to check the loaded columns.
reviews.head()

Unnamed: 0,review_id,user_id,item_id,text,rating,tokenized_text,neg,neg_sent_avg,pos_sent_avg,compound_sent_avg
0,255938,0,1,"First things first. My ""reviews"" system is exp...",8,"[['First', 'things', 'first', '.'], ['My', '``...",0.094,0.073333,0.168937,0.208675
1,259117,1,2,Let me start off by saying that Made in Abyss ...,10,"[['Let', 'me', 'start', 'off', 'by', 'saying',...",0.081,0.062741,0.177963,0.302804
2,253664,2,3,"Art 9/10: It is great, especially the actions ...",7,"[['Art', '9/10', ':', 'It', 'is', 'great', ','...",0.051,0.047278,0.159833,0.220767
3,247454,3,4,As someone who loves Studio Ghibli and its mov...,6,"[['As', 'someone', 'who', 'loves', 'Studio', '...",0.083,0.055577,0.135269,0.187896
4,23791,4,5,code geass is one of those series that everybo...,10,"[['code', 'geass', 'is', 'one', 'of', 'those',...",0.016,0.028857,0.248143,0.534129


# Converting Data for Modeling

In [6]:
# Shift item ids so that the smallest id becomes 0: they are used directly as
# column indices of the rating matrix further down. Subtracting the actual
# minimum (rather than a hard-coded 1) gives zero-based ids whatever the
# starting offset is, and leaves already zero-based ids untouched.
item_id_offset = reviews['item_id'].min()
if item_id_offset != 0:
    reviews['item_id'] = reviews['item_id'] - item_id_offset
    print("Done")

Done


In [7]:
@dataclass
class Review:
    """One review written by a user for an item, with its sentiment scores.

    Instances are built positionally from rows of the reviews DataFrame, so
    the field order below must not change.
    """
    user_id: int  # reviewer id; used as a row index into the rating matrices
    item_id: int  # reviewed item id; used as a column index into the rating matrices
    rating: int  # score the user gave the item
    text: str  # raw review text
    pos_sent: float  # filled from the 'pos_sent_avg' column
    neg_sent: float  # filled from the 'neg_sent_avg' column
    compound_sent: float  # filled from the 'compound_sent_avg' column

In [8]:
# Column names of the reviews DataFrame that identify a (user, item, rating) triple.
USER_KEY, ITEM_KEY, RATING_KEY = 'user_id', 'item_id', 'rating'

In [9]:
# Display the frame again to confirm that item_id now starts at 0.
reviews.head()

Unnamed: 0,review_id,user_id,item_id,text,rating,tokenized_text,neg,neg_sent_avg,pos_sent_avg,compound_sent_avg
0,255938,0,0,"First things first. My ""reviews"" system is exp...",8,"[['First', 'things', 'first', '.'], ['My', '``...",0.094,0.073333,0.168937,0.208675
1,259117,1,1,Let me start off by saying that Made in Abyss ...,10,"[['Let', 'me', 'start', 'off', 'by', 'saying',...",0.081,0.062741,0.177963,0.302804
2,253664,2,2,"Art 9/10: It is great, especially the actions ...",7,"[['Art', '9/10', ':', 'It', 'is', 'great', ','...",0.051,0.047278,0.159833,0.220767
3,247454,3,3,As someone who loves Studio Ghibli and its mov...,6,"[['As', 'someone', 'who', 'loves', 'Studio', '...",0.083,0.055577,0.135269,0.187896
4,23791,4,4,code geass is one of those series that everybo...,10,"[['code', 'geass', 'is', 'one', 'of', 'those',...",0.016,0.028857,0.248143,0.534129


In [10]:
# Group every review under the user who wrote it.
# The columns are zipped together instead of using DataFrame.iterrows(), which
# builds a full Series object per row and is much slower on large frames; the
# values handed to Review are the same.
user_to_reviews = defaultdict(list)
review_columns = (
    reviews[USER_KEY],
    reviews[ITEM_KEY],
    reviews[RATING_KEY],
    reviews['text'],
    reviews['pos_sent_avg'],
    reviews['neg_sent_avg'],
    reviews['compound_sent_avg'],
)
for user_id, item_id, rating, text, pos_sent, neg_sent, compound_sent in zip(*review_columns):
    user_to_reviews[user_id].append(
        Review(user_id, item_id, rating, text, pos_sent, neg_sent, compound_sent)
    )

## Creating the score matrix

In [11]:
# Rating matrix: one row per user, one column per item, 0 meaning "no rating".
# The shape comes from the largest id (+1) rather than nunique(): the ids are
# used directly as indices, so a gap in the id range would otherwise produce
# an out-of-bounds index when the matrix is filled. For contiguous zero-based
# ids both ways of sizing give the same shape.
num_users = int(reviews['user_id'].max()) + 1
num_items = int(reviews['item_id'].max()) + 1
X = np.zeros(shape=(num_users, num_items))

In [12]:
# Write all ratings into the matrix with a single vectorised assignment
# instead of a Python-level loop over iterrows(). When the same (user, item)
# pair occurs more than once, NumPy keeps the last value assigned, which
# matches what the row-by-row loop did.
X[reviews[USER_KEY].to_numpy(), reviews[ITEM_KEY].to_numpy()] = reviews[RATING_KEY].to_numpy()

## Train/Test Split

In [13]:
# Fixed seed so the leave-one-out split is the same after every kernel restart.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# train_X starts as a copy of all ratings; held-out entries are zeroed below.
train_X = X.copy()
valid_X = np.zeros_like(X)
test_X = np.zeros_like(X)

# The loop variable is deliberately NOT called `reviews`: that name belongs to
# the reviews DataFrame and rebinding it here would break re-running any
# earlier cell that reads the frame.
for user_id, user_reviews in user_to_reviews.items():
    if len(user_reviews) < 2:
        # Holding out one review for validation and one for test needs at
        # least two reviews; otherwise everything stays in the training set.
        continue

    # Shuffle a copy so user_to_reviews itself is left untouched and
    # re-running this cell reproduces exactly the same split.
    shuffled_reviews = list(user_reviews)
    random.shuffle(shuffled_reviews)
    valid_review, test_review = shuffled_reviews[0], shuffled_reviews[1]

    # Leave one out for validation
    train_X[valid_review.user_id, valid_review.item_id] = 0
    valid_X[valid_review.user_id, valid_review.item_id] = valid_review.rating

    # Leave one out for test
    train_X[test_review.user_id, test_review.item_id] = 0
    test_X[test_review.user_id, test_review.item_id] = test_review.rating

    # Every remaining review of this user stays in train_X.

## Creating bias terms for users / items from the training data

In [21]:
# Per-user lists of sentiment scores (training reviews only)
user_to_pos_sent = defaultdict(list)
user_to_neg_sent = defaultdict(list)
user_to_compound_sent = defaultdict(list)

# Per-item lists of sentiment scores (training reviews only)
item_to_pos_sent = defaultdict(list)
item_to_neg_sent = defaultdict(list)
item_to_compound_sent = defaultdict(list)

# Loading. The loop variable is named `user_reviews` so that it does not
# overwrite the `reviews` DataFrame.
for user_id, user_reviews in user_to_reviews.items():
    for r in user_reviews:
        # A zero in train_X marks an interaction that is not part of the
        # training set (it was held out), so its sentiment must not leak in.
        if train_X[user_id, r.item_id] == 0:
            continue
        user_to_pos_sent[user_id].append(r.pos_sent)
        user_to_neg_sent[user_id].append(r.neg_sent)
        user_to_compound_sent[user_id].append(r.compound_sent)
        item_to_pos_sent[r.item_id].append(r.pos_sent)
        item_to_neg_sent[r.item_id].append(r.neg_sent)
        item_to_compound_sent[r.item_id].append(r.compound_sent)

In [75]:
def list_mapping_to_float_mapping(hm: dict):
    """Collapse each id's list of sentiment scores into its mean.

    Args:
        hm: mapping from an id to a list of numeric scores.

    Returns:
        A ``defaultdict(float)`` mapping each id to ``np.mean`` of its list.
        Looking up an id that is not in ``hm`` yields 0.0.
    """
    return defaultdict(float, {key: np.mean(scores) for key, scores in hm.items()})

# Mean (pos, neg, compound) training sentiment per user ...
user_to_pos_sent_term, user_to_neg_sent_term, user_to_compound_sent_term = map(
    list_mapping_to_float_mapping,
    (user_to_pos_sent, user_to_neg_sent, user_to_compound_sent),
)
# ... and per item.
item_to_pos_sent_term, item_to_neg_sent_term, item_to_compound_sent_term = map(
    list_mapping_to_float_mapping,
    (item_to_pos_sent, item_to_neg_sent, item_to_compound_sent),
)

## MF with sentiment as a bias (vectorized)

In [150]:
def l2_regularization(values):
    """Return the sum of the squared entries of ``values`` as a scalar tensor."""
    return values.square().sum()

class SentimentMF(nn.Module):
    """Matrix factorisation with learned weights on fixed sentiment averages.

    For a (user u, item i) pair the raw score is

        <p_u, q_i> + <a_u, s_u> + <b_i, s_i>

    where ``p_u`` / ``q_i`` are the latent factors, ``s_u`` / ``s_i`` are the
    fixed (pos, neg, compound) sentiment averages read from the module-level
    lookups ``user_to_*_sent_term`` / ``item_to_*_sent_term`` (these must be
    defined before ``forward`` is called), and ``a_u`` / ``b_i`` are learned
    3-dimensional weights. The score is squashed to the rating range with
    ``1 + 9 * sigmoid(score)``.

    Args:
        num_users: number of rows of the user embedding tables.
        num_items: number of rows of the item embedding tables.
        embedding_dim: size of the latent factors.
        regularization_constant: L2 weight on the latent factors.
        sentiment_regularization_constant: L2 weight on the sentiment weights.
        eps: small constant added to the MSE (inside the square root for RMSE).
    """

    def __init__(self, num_users, num_items, embedding_dim=20, regularization_constant=1e-6, sentiment_regularization_constant=1e-6, eps=1e-8):
        super().__init__()
        self.user_factors = nn.Embedding(num_users, embedding_dim)
        self.item_factors = nn.Embedding(num_items, embedding_dim)
        self.regularization_constant = regularization_constant
        self.eps = eps

        # Learned weights applied to the three sentiment scores of a user / item.
        self.user_sentiment_emb = nn.Embedding(num_users, 3)
        self.item_sentiment_emb = nn.Embedding(num_items, 3)
        self.sentiment_regularization_constant = sentiment_regularization_constant

    def _sentiment_features(self, ids, pos_terms, neg_terms, compound_terms):
        """Build a (batch, 3) float tensor of [pos, neg, compound] scores for ``ids``."""
        # .get(..., 0.0) gives the same 0.0 default as the defaultdict lookups
        # but does not insert new keys into the shared mappings.
        rows = [
            [pos_terms.get(idx, 0.0), neg_terms.get(idx, 0.0), compound_terms.get(idx, 0.0)]
            for idx in ids.tolist()
        ]
        features = torch.tensor(rows, dtype=torch.float32, device=self.user_factors.weight.device)
        # reshape keeps the (0, 3) shape for an empty batch
        return features.reshape(-1, 3)

    def forward(self, user: torch.LongTensor, item: torch.LongTensor):
        """Predict one rating per (user[k], item[k]) pair.

        Args:
            user: integer tensor of user ids, one per interaction.
            item: integer tensor of item ids, same length as ``user``.

        Returns:
            1-D float tensor with one prediction in [1, 10] per pair.
        """
        user = user.reshape(-1)
        item = item.reshape(-1)

        # Row-wise dot products. This equals the diagonal of
        # ``user_factors @ item_factors.T`` but never materialises the
        # batch x batch matrix, so memory and time stay linear in the batch
        # size (which matters when a whole evaluation set is passed at once).
        user_latent_factors = self.user_factors(user)
        item_latent_factors = self.item_factors(item)
        pred_rating = (user_latent_factors * item_latent_factors).sum(dim=1)

        user_sent = self._sentiment_features(
            user, user_to_pos_sent_term, user_to_neg_sent_term, user_to_compound_sent_term
        )
        item_sent = self._sentiment_features(
            item, item_to_pos_sent_term, item_to_neg_sent_term, item_to_compound_sent_term
        )

        # Sentiment bias: learned 3-d weights dotted with the fixed sentiment scores.
        user_sentiment_bias = (self.user_sentiment_emb(user) * user_sent).sum(dim=1)
        item_sentiment_bias = (self.item_sentiment_emb(item) * item_sent).sum(dim=1)
        pred_rating = pred_rating + user_sentiment_bias + item_sentiment_bias

        # Squash into the rating range
        return 1 + 9 * torch.sigmoid(pred_rating)

    def loss(self, pred_rating: torch.Tensor, rating: torch.Tensor, rmse=False):
        """Return MSE (or RMSE when ``rmse`` is true) plus the L2 penalties.

        Both penalties are computed over the full embedding tables and scaled
        by ``1 / len(rating)``.
        """
        mse = F.mse_loss(pred_rating, rating)
        if rmse:
            # eps inside the root keeps the gradient finite when the MSE is 0
            loss = torch.sqrt(mse + self.eps)
        else:
            loss = mse + self.eps

        # L2 regularization on the latent factors
        sum_of_squared_values = l2_regularization(self.user_factors.weight) + l2_regularization(self.item_factors.weight)
        l2_penalty = (1/len(rating)) * self.regularization_constant * sum_of_squared_values

        # L2 regularization on the sentiment weights (separate constant)
        sentiment_l2 = (1/len(rating)) * self.sentiment_regularization_constant * (l2_regularization(self.user_sentiment_emb.weight) + l2_regularization(self.item_sentiment_emb.weight))
        l2_penalty += sentiment_l2

        # Total loss
        total_loss = loss + l2_penalty
        return total_loss

    def predict_single_interaction(self, user_id: int, item_id: int):
        """Predict the rating of one (user_id, item_id) pair as a 1-element tensor."""
        user = torch.LongTensor([user_id])
        item = torch.LongTensor([item_id])
        return self.forward(user, item)

In [122]:
def eval_MSE_loss(eval_X, model, round_digits=3):
    """Mean squared error of ``model`` on the non-zero entries of ``eval_X``.

    Args:
        eval_X: 2-D NumPy rating matrix (users x items); non-zero cells are
            the ground-truth ratings to evaluate on.
        model: object whose ``forward(users, items)`` returns one prediction
            per pair.
        round_digits: number of decimals the result is rounded to.

    Returns:
        The MSE (mean reduction) as a rounded Python float.
    """
    user_ids, item_ids = eval_X.nonzero()
    # Vectorised lookup of the ground-truth ratings
    gt_ratings = torch.tensor(eval_X[user_ids, item_ids], dtype=torch.float32)
    curr_users_tensor = torch.as_tensor(user_ids, dtype=torch.long)
    curr_items_tensor = torch.as_tensor(item_ids, dtype=torch.long)
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        pred_ratings = model.forward(curr_users_tensor, curr_items_tensor)

    return round(F.mse_loss(pred_ratings, gt_ratings).item(), round_digits)

def eval_RMSE_loss(eval_X, model, round_digits=3):
    """Root mean squared error of ``model`` on the non-zero entries of ``eval_X``.

    Args:
        eval_X: 2-D NumPy rating matrix (users x items); non-zero cells are
            the ground-truth ratings to evaluate on.
        model: object whose ``forward(users, items)`` returns one prediction
            per pair.
        round_digits: number of decimals the result is rounded to.

    Returns:
        The RMSE (mean reduction) as a rounded Python float.
    """
    user_ids, item_ids = eval_X.nonzero()
    # Vectorised lookup of the ground-truth ratings
    gt_ratings = torch.tensor(eval_X[user_ids, item_ids], dtype=torch.float32)
    curr_users_tensor = torch.as_tensor(user_ids, dtype=torch.long)
    curr_items_tensor = torch.as_tensor(item_ids, dtype=torch.long)
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        pred_ratings = model.forward(curr_users_tensor, curr_items_tensor)

    return round(torch.sqrt(F.mse_loss(pred_ratings, gt_ratings)).item(), round_digits)

def eval_MAE_loss(eval_X, model, round_digits=3):
    """Mean absolute error of ``model`` on the non-zero entries of ``eval_X``.

    Args:
        eval_X: 2-D NumPy rating matrix (users x items); non-zero cells are
            the ground-truth ratings to evaluate on.
        model: object whose ``forward(users, items)`` returns one prediction
            per pair.
        round_digits: number of decimals the result is rounded to.

    Returns:
        The MAE (mean reduction) as a rounded Python float.
    """
    user_ids, item_ids = eval_X.nonzero()
    # Vectorised lookup of the ground-truth ratings
    gt_ratings = torch.tensor(eval_X[user_ids, item_ids], dtype=torch.float32)
    curr_users_tensor = torch.as_tensor(user_ids, dtype=torch.long)
    curr_items_tensor = torch.as_tensor(item_ids, dtype=torch.long)
    # Evaluation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        pred_ratings = model.forward(curr_users_tensor, curr_items_tensor)

    return round(F.l1_loss(pred_ratings, gt_ratings).item(), round_digits)


def train_v2(train_X, valid_X, model, optimizer, n_epochs=10, batch_size=5, rmse=False):
    """Train ``model`` on the non-zero entries of ``train_X`` with mini-batches.

    After every epoch the mean (unregularised) training MSE and the
    validation MSE / RMSE / MAE are printed.

    Args:
        train_X: 2-D NumPy rating matrix; non-zero cells are training ratings.
        valid_X: 2-D NumPy rating matrix with the validation ratings.
        model: module providing ``forward(users, items)`` and
            ``loss(pred, rating, rmse=...)``.
        optimizer: optimizer over ``model``'s parameters.
        n_epochs: number of passes over the training interactions.
        batch_size: interactions per gradient step. Only full batches are
            used; the remainder of each epoch's permutation is skipped.
        rmse: forwarded to ``model.loss`` to select the RMSE objective.

    Raises:
        ValueError: if there are fewer training interactions than
            ``batch_size``, i.e. not a single full batch can be formed.
    """
    # The set of training interactions does not change between epochs.
    users, items = train_X.nonzero()
    num_examples = len(users)
    num_batches = num_examples // batch_size
    if num_batches == 0:
        raise ValueError(
            f"batch_size={batch_size} is larger than the number of training "
            f"interactions ({num_examples}); no full batch can be formed"
        )

    for epoch in range(1, n_epochs + 1):
        # Fresh random order every epoch
        permuted_indices = np.random.permutation(num_examples)
        epoch_users, epoch_items = users[permuted_indices], items[permuted_indices]

        # Accumulated as a plain float so no autograd graph is kept alive.
        total_train_loss = 0.0

        for i in tqdm(range(num_batches)):
            batch = slice(i * batch_size, (i + 1) * batch_size)
            user_ids_list = epoch_users[batch]
            item_ids_list = epoch_items[batch]

            # Set gradients to zero
            optimizer.zero_grad()

            # Turn data into tensors
            rating = torch.tensor(train_X[user_ids_list, item_ids_list], dtype=torch.float32)
            curr_users_tensor = torch.as_tensor(user_ids_list, dtype=torch.long)
            curr_items_tensor = torch.as_tensor(item_ids_list, dtype=torch.long)

            # Predict and calculate loss
            pred_rating = model.forward(curr_users_tensor, curr_items_tensor)
            assert pred_rating.shape == rating.shape
            loss = model.loss(pred_rating, rating, rmse=rmse)

            # Backpropagate
            loss.backward()

            # Update the parameters
            optimizer.step()

            # MSE without regularization (just for status updates)
            total_train_loss += F.mse_loss(pred_rating.detach(), rating, reduction='sum').item()

        # Computing validation loss for display
        total_valid_loss = eval_MSE_loss(valid_X, model)
        total_valid_RMSE_loss = eval_RMSE_loss(valid_X, model)
        total_valid_MAE_loss = eval_MAE_loss(valid_X, model)

        mean_train_loss = round(total_train_loss / (batch_size * num_batches), 3)
        print(f"Epoch {epoch} MSE Loss: {mean_train_loss}, valid MSE Loss: {total_valid_loss}, valid RMSE Loss: {total_valid_RMSE_loss}, valid MAE Loss: {total_valid_MAE_loss}")

## Training

In [157]:
# These parameter settings are pretty good; once the MSE is low enough (~3.6),
# just adjust the learning rate.
#   batch_size=64
#   Adam
#   weight decay is applied through the regularization constant
embedding_dim=200
lr=1e-2
regularization_constant=1e-2
sentiment_regularization_constant=0

# Seed the generators used for embedding initialisation (torch) and for the
# per-epoch shuffling in train_v2 (np.random.permutation) so that a
# Restart & Run All reproduces the same training run.
MODEL_SEED = 0
torch.manual_seed(MODEL_SEED)
np.random.seed(MODEL_SEED)

model = SentimentMF(num_users=X.shape[0], num_items=X.shape[1],
                    embedding_dim=embedding_dim,
                    regularization_constant=regularization_constant,
                    sentiment_regularization_constant=sentiment_regularization_constant
                   )
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [193]:
# Re-create the model and optimizer from scratch with the same hyperparameters
# as the previous cell; any weights trained so far are discarded.
embedding_dim, lr = 200, 1e-2
regularization_constant, sentiment_regularization_constant = 1e-2, 0

model = SentimentMF(
    num_users=X.shape[0],
    num_items=X.shape[1],
    embedding_dim=embedding_dim,
    regularization_constant=regularization_constant,
    sentiment_regularization_constant=sentiment_regularization_constant,
)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## Testing Above

In [174]:
# # On-the-fly modifications
# lr = 1e-3
# optimizer = torch.optim.SGD(model.parameters(), lr=lr)

In [None]:
# Train with the RMSE objective; per-epoch metrics are printed by train_v2.
train_v2(
    train_X,
    valid_X,
    model,
    optimizer,
    n_epochs=25,
    batch_size=128,
    rmse=True,
)

100%|████████████████████████████████████| 254/254 [00:01<00:00, 179.32it/s]


Epoch 1 MSE Loss: 21.11, valid MSE Loss: 16.254, valid RMSE Loss: 4.032, valid MAE Loss: 3.219


100%|████████████████████████████████████| 254/254 [00:01<00:00, 179.89it/s]


Epoch 2 MSE Loss: 10.41, valid MSE Loss: 9.952, valid RMSE Loss: 3.155, valid MAE Loss: 2.49


100%|████████████████████████████████████| 254/254 [00:01<00:00, 146.92it/s]


Epoch 3 MSE Loss: 4.725, valid MSE Loss: 6.357, valid RMSE Loss: 2.521, valid MAE Loss: 1.999


100%|████████████████████████████████████| 254/254 [00:01<00:00, 170.30it/s]


Epoch 4 MSE Loss: 2.772, valid MSE Loss: 4.857, valid RMSE Loss: 2.204, valid MAE Loss: 1.759


100%|████████████████████████████████████| 254/254 [00:01<00:00, 180.77it/s]


Epoch 5 MSE Loss: 1.816, valid MSE Loss: 4.055, valid RMSE Loss: 2.014, valid MAE Loss: 1.593


100%|████████████████████████████████████| 254/254 [00:01<00:00, 182.81it/s]


Epoch 6 MSE Loss: 1.161, valid MSE Loss: 3.717, valid RMSE Loss: 1.928, valid MAE Loss: 1.515


100%|████████████████████████████████████| 254/254 [00:01<00:00, 188.54it/s]


Epoch 7 MSE Loss: 0.879, valid MSE Loss: 3.592, valid RMSE Loss: 1.895, valid MAE Loss: 1.479


100%|████████████████████████████████████| 254/254 [00:01<00:00, 172.01it/s]


Epoch 8 MSE Loss: 0.843, valid MSE Loss: 3.405, valid RMSE Loss: 1.845, valid MAE Loss: 1.427


100%|████████████████████████████████████| 254/254 [00:01<00:00, 162.83it/s]


Epoch 9 MSE Loss: 0.864, valid MSE Loss: 3.347, valid RMSE Loss: 1.83, valid MAE Loss: 1.397


100%|████████████████████████████████████| 254/254 [00:01<00:00, 187.04it/s]


Epoch 10 MSE Loss: 0.844, valid MSE Loss: 3.309, valid RMSE Loss: 1.819, valid MAE Loss: 1.389


100%|████████████████████████████████████| 254/254 [00:01<00:00, 168.35it/s]


Epoch 11 MSE Loss: 0.854, valid MSE Loss: 3.27, valid RMSE Loss: 1.808, valid MAE Loss: 1.379


100%|████████████████████████████████████| 254/254 [00:01<00:00, 162.63it/s]


Epoch 12 MSE Loss: 0.849, valid MSE Loss: 3.325, valid RMSE Loss: 1.823, valid MAE Loss: 1.379


 87%|███████████████████████████████▎    | 221/254 [00:01<00:00, 131.39it/s]

## Evaluation

In [192]:
# Final metrics on the held-out test interactions (MSE, then RMSE, then MAE).
total_test_loss, total_test_RMSE_loss, total_test_MAE_loss = (
    metric(test_X, model) for metric in (eval_MSE_loss, eval_RMSE_loss, eval_MAE_loss)
)
print(f"test MSE Loss: {total_test_loss}, test RMSE Loss: {total_test_RMSE_loss}, test MAE Loss: {total_test_MAE_loss}")

test MSE Loss: 3.143, test RMSE Loss: 1.773, test MAE Loss: 1.348
