In [1]:
%load_ext autoreload
%autoreload 2

In [258]:
import pandas as pd
import numpy as np
import json
import os
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import re
import seaborn as sns
import matplotlib.pyplot as plt
import swifter
import multiprocessing
import time
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.formula.api as smf
from collections import defaultdict
from dataclasses import dataclass
from torch import nn
import torch.nn.functional as F
import torch
import random
import copy
from tqdm import tqdm

sns.set_style("darkgrid")

# Loading Processed Reviews

In [3]:
# Paths
PROCESSED_FOLDER = './data/processed/'
PROCESSED_REVIEWS_FILE = 'processed_reviews.csv'

In [4]:
reviews = pd.read_csv(os.path.join(PROCESSED_FOLDER, PROCESSED_REVIEWS_FILE))

In [5]:
reviews.head()

Unnamed: 0,review_id,user_id,item_id,text,rating,tokenized_text
0,255938,0,1,"First things first. My ""reviews"" system is exp...",8,"[['First', 'things', 'first', '.'], ['My', '``..."
1,259117,1,2,Let me start off by saying that Made in Abyss ...,10,"[['Let', 'me', 'start', 'off', 'by', 'saying',..."
2,253664,2,3,"Art 9/10: It is great, especially the actions ...",7,"[['Art', '9/10', ':', 'It', 'is', 'great', ','..."
3,247454,3,4,As someone who loves Studio Ghibli and its mov...,6,"[['As', 'someone', 'who', 'loves', 'Studio', '..."
4,23791,4,5,code geass is one of those series that everybo...,10,"[['code', 'geass', 'is', 'one', 'of', 'those',..."


# Converting Data for Modeling

In [6]:
# Convert item_id to 0 indexed
if min(reviews['item_id']) != 0:
    reviews['item_id'] = reviews['item_id'] - 1
    print("Done")

Done


In [7]:
@dataclass
class Review:
    user_id: int
    item_id: int
    rating: int

In [8]:
USER_KEY = 'user_id'
ITEM_KEY = 'item_id'
RATING_KEY = 'rating'

In [9]:
user_to_reviews = defaultdict(list)
for _, row in reviews.iterrows():
    user_id, item_id, rating = row[USER_KEY], row[ITEM_KEY], row[RATING_KEY]
    user_to_reviews[user_id].append(Review(user_id, item_id, rating))

## Creating the score matrix

In [10]:
# users by items
X = np.zeros(shape=(reviews['user_id'].nunique(), reviews['item_id'].nunique()))

In [11]:
for _, row in reviews.iterrows():
    user_id, item_id, rating = row[USER_KEY], row[ITEM_KEY], row[RATING_KEY]
    X[user_id][item_id] = rating

## Train/Test Split

In [63]:
train_X = copy.deepcopy(X)
valid_X = np.zeros(shape=X.shape)
test_X = np.zeros(shape=X.shape)

for user_id, reviews in user_to_reviews.items():
    random.shuffle(reviews)
    # Leave one out for valid
    valid_review = reviews[0]
    train_X[valid_review.user_id][valid_review.item_id] = 0
    valid_X[valid_review.user_id][valid_review.item_id] = valid_review.rating
    
    # Leave one out for test
    test_review = reviews[1]
    train_X[test_review.user_id][test_review.item_id] = 0
    test_X[test_review.user_id][test_review.item_id] = test_review.rating
    
    # Rest for train

## Vectorized Matrix Factorization

In [34]:
def l2_regularization(values):
    return torch.sum(torch.square(values))

In [367]:
class VanillaMF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=20, regularization_constant=1e-6):
        super().__init__()
        self.user_factors = nn.Embedding(num_users, embedding_dim)
        self.item_factors = nn.Embedding(num_items, embedding_dim)
        self.regularization_constant = regularization_constant
        
    def forward(self, user: torch.LongTensor, item: torch.LongTensor):
        # (users, emb_dim) * (items, emb_dim) = (interactions, emb_dim)
        result_tensor = self.user_factors(user) * self.item_factors(item)
        user_latent_factors = self.user_factors(user)
        item_latent_factors = self.item_factors(item)
        pred_rating = user_latent_factors @ item_latent_factors.T
        # Sum along row
        # pred_rating = torch.sum(result_tensor, axis=1)
        pred_rating = 1 + 9 * torch.sigmoid(pred_rating)
        return pred_rating.diagonal()
    
    def loss(self, pred_rating, rating):
        # MSE
        MSE_loss = F.mse_loss(pred_rating, rating)
        
        # L2 Regularization
        sum_of_squared_values = l2_regularization(self.user_factors.weight) + l2_regularization(self.item_factors.weight)
        l2_penalty = (1/len(rating)) * self.regularization_constant * sum_of_squared_values
        
        # Total Loss
        total_loss = MSE_loss + l2_penalty
        return total_loss
    
    def predict_single_interaction(self, user_id: int, item_id: int):
        user = torch.LongTensor([user_id])
        item = torch.LongTensor([item_id])
        return self.forward(user, item)

In [349]:
def train_v2(train_X, valid_X, model, optimizer, n_epochs=10, batch_size=5):
    """Training Function, calculates training and validation loss"""
    
    for epoch in (range(1, n_epochs+1)):
        rows, cols = train_X.nonzero()
        p = np.random.permutation(len(rows))
        
        
        rows, cols = rows[p], cols[p]

        
        # Loss doesn't include regularization term
        total_train_loss = 0
        
        for i in tqdm(range(len(rows) // batch_size)):
            curr_rows = rows[i*batch_size:i*batch_size+batch_size]
            curr_cols = cols[i*batch_size:i*batch_size+batch_size]

            # Set gradients to zero
            optimizer.zero_grad()

            # Turn data into tensors
            # rating = torch.FloatTensor([[train_X[row, col] for row,col in zip(curr_rows, curr_cols)])
            rating = torch.FloatTensor([train_X[row, col] for row,col in zip(curr_rows, curr_cols)])
            users = torch.LongTensor(curr_rows)
            items = torch.LongTensor(curr_cols)

            # Predict and calculate loss
            pred_rating = model.forward(users, items)
            assert pred_rating.shape == rating.shape
            loss = model.loss(pred_rating, rating)

            # Backpropagate
            loss.backward()

            # Update the parameters
            optimizer.step()

            # Total MSE Loss (just for status updates) for this epoch
            total_train_loss += F.mse_loss(pred_rating, rating, reduction='sum')

        # Loss doesn't include regularization term
        rows, cols = valid_X.nonzero()
        rating = torch.FloatTensor([valid_X[row, col] for row, col in zip(rows, cols)])
        users = torch.LongTensor(rows)
        items = torch.LongTensor(cols)

        # Predict and calculate loss
        pred_rating = model.forward(users, items)
        
        total_valid_loss = F.mse_loss(pred_rating, rating, reduction='sum')
            
        
        print(f"Epoch {epoch} MSE Loss: {total_train_loss / len(train_X.nonzero()[0])}, valid MSE Loss: {total_valid_loss / len(valid_X.nonzero()[0])}")

In [293]:
embedding_dim=40
lr=1e-3
regularization_constant=0

model = VanillaMF(num_users=X.shape[0], num_items=X.shape[1], embedding_dim=embedding_dim, regularization_constant=regularization_constant)
# optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-2)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=1e-3)

In [294]:
# weight decay in regularization constant
embedding_dim=100
lr=1e-3
regularization_constant=0

model = VanillaMF(num_users=X.shape[0], num_items=X.shape[1], embedding_dim=embedding_dim, regularization_constant=regularization_constant)
# optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, momentum=0.9)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [368]:
# these parameter settings are pretty good, just adjust LR after you get low enough => MSE of 3.6)
# batch_size=64
# Adam
# weight decay in regularization constant
embedding_dim=200
lr=1e-2
regularization_constant=1e-2

model = VanillaMF(num_users=X.shape[0], num_items=X.shape[1], embedding_dim=embedding_dim, regularization_constant=regularization_constant)
# optimizer = torch.optim.Adagrad(model.parameters(), lr=lr, momentum=0.9)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [363]:
# train_X = copy.deepcopy(X)
# valid_X = np.zeros(shape=X.shape)
# test_X = np.zeros(shape=X.shape)

# for user_id, reviews in user_to_reviews.items():
#     random.shuffle(reviews)
#     # Leave one out for valid
#     valid_review = reviews[0]
#     train_X[valid_review.user_id][valid_review.item_id] = 0
#     valid_X[valid_review.user_id][valid_review.item_id] = valid_review.rating
    
#     # Leave one out for test
#     test_review = reviews[1]
#     train_X[test_review.user_id][test_review.item_id] = 0
#     test_X[test_review.user_id][test_review.item_id] = test_review.rating
    
#     # Rest for train

In [370]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
model.regularization_constant = 1e-2

In [385]:
train_v2(train_X, valid_X, model, optimizer, n_epochs=20, batch_size=64)

100%|████████████████████████████████████| 508/508 [00:02<00:00, 210.46it/s]


Epoch 1 MSE Loss: 0.3070699870586395, valid MSE Loss: 3.655904531478882


100%|████████████████████████████████████| 508/508 [00:02<00:00, 242.07it/s]


Epoch 2 MSE Loss: 0.31069415807724, valid MSE Loss: 3.6307990550994873


100%|████████████████████████████████████| 508/508 [00:02<00:00, 229.45it/s]


Epoch 3 MSE Loss: 0.30833128094673157, valid MSE Loss: 3.640995740890503


100%|████████████████████████████████████| 508/508 [00:02<00:00, 237.16it/s]


Epoch 4 MSE Loss: 0.30671894550323486, valid MSE Loss: 3.641130208969116


100%|████████████████████████████████████| 508/508 [00:02<00:00, 231.97it/s]


Epoch 5 MSE Loss: 0.3076803386211395, valid MSE Loss: 3.641362428665161


100%|████████████████████████████████████| 508/508 [00:02<00:00, 208.05it/s]


Epoch 6 MSE Loss: 0.3067213296890259, valid MSE Loss: 3.627647638320923


100%|████████████████████████████████████| 508/508 [00:03<00:00, 155.45it/s]


Epoch 7 MSE Loss: 0.30461814999580383, valid MSE Loss: 3.643526554107666


100%|████████████████████████████████████| 508/508 [00:03<00:00, 143.12it/s]


Epoch 8 MSE Loss: 0.30677783489227295, valid MSE Loss: 3.6434247493743896


100%|████████████████████████████████████| 508/508 [00:02<00:00, 190.89it/s]


Epoch 9 MSE Loss: 0.30403047800064087, valid MSE Loss: 3.6368014812469482


100%|████████████████████████████████████| 508/508 [00:02<00:00, 196.33it/s]


Epoch 10 MSE Loss: 0.306076318025589, valid MSE Loss: 3.644857406616211


100%|████████████████████████████████████| 508/508 [00:02<00:00, 175.84it/s]


Epoch 11 MSE Loss: 0.30747881531715393, valid MSE Loss: 3.6424314975738525


100%|████████████████████████████████████| 508/508 [00:03<00:00, 153.96it/s]


Epoch 12 MSE Loss: 0.30444827675819397, valid MSE Loss: 3.64949369430542


100%|████████████████████████████████████| 508/508 [00:02<00:00, 186.00it/s]


Epoch 13 MSE Loss: 0.3044379949569702, valid MSE Loss: 3.6454830169677734


100%|████████████████████████████████████| 508/508 [00:02<00:00, 209.69it/s]


Epoch 14 MSE Loss: 0.30440062284469604, valid MSE Loss: 3.647071123123169


100%|████████████████████████████████████| 508/508 [00:02<00:00, 194.69it/s]


Epoch 15 MSE Loss: 0.30392882227897644, valid MSE Loss: 3.638470411300659


100%|████████████████████████████████████| 508/508 [00:02<00:00, 208.10it/s]


Epoch 16 MSE Loss: 0.30227571725845337, valid MSE Loss: 3.6541049480438232


100%|████████████████████████████████████| 508/508 [00:02<00:00, 215.29it/s]


Epoch 17 MSE Loss: 0.30395784974098206, valid MSE Loss: 3.667353391647339


100%|████████████████████████████████████| 508/508 [00:02<00:00, 208.03it/s]


Epoch 18 MSE Loss: 0.30366766452789307, valid MSE Loss: 3.6400904655456543


100%|████████████████████████████████████| 508/508 [00:02<00:00, 218.17it/s]


Epoch 19 MSE Loss: 0.30209341645240784, valid MSE Loss: 3.6564972400665283


100%|████████████████████████████████████| 508/508 [00:02<00:00, 236.21it/s]


Epoch 20 MSE Loss: 0.30523982644081116, valid MSE Loss: 3.6384291648864746


In [None]:
train_v2(train_X, valid_X, model, optimizer, n_epochs=30)

In [383]:
my_X = test_X

In [384]:
rows, cols = my_X.nonzero()
rating = torch.FloatTensor([my_X[row, col] for row, col in zip(rows, cols)])
users = torch.LongTensor(rows)
items = torch.LongTensor(cols)

# Predict and calculate loss
pred_rating = model.forward(users, items)
total_test_loss = F.mse_loss(pred_rating, rating)
total_test_loss

tensor(3.4709, grad_fn=<MseLossBackward0>)

In [159]:
rating

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [28]:
rating = torch.FloatTensor([X[0,0]])

In [29]:
rating

tensor([8.])

In [17]:
row = torch.LongTensor([0])

In [18]:
col = torch.LongTensor([0])

In [19]:
rating, row, col

(8.0, tensor([0]), tensor([0]))