## Dataset source
https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Import from my module.
from utils import smooth_user_preference, cat_to_id_transform

In [7]:
# Load data
interactions_df = pd.read_csv('data/users_interactions.csv')

# Process interactions_df
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}
interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])

In [8]:
interactions_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0


In [9]:
# Set column name mappings.
USER_KEY = "personId"
ITEM_KEY = "contentId"
RATE_KEY = "eventStrength"

In [10]:
tmp = interactions_df.groupby([USER_KEY, ITEM_KEY]).size()
users_interactions_count_df = tmp.groupby(USER_KEY).size()

print('# users: %d' % len(users_interactions_count_df))

# users: 1895


In [11]:
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5]
users_with_enough_interactions_df = users_with_enough_interactions_df.reset_index()[[USER_KEY]]

print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users with at least 5 interactions: 1140


In [12]:
interactions_from_selected_users_df = interactions_df.merge(
    users_with_enough_interactions_df, 
    how = 'right',
    left_on = USER_KEY,
    right_on = USER_KEY,
)
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions from users with at least 5 interactions: 69868


In [13]:
interactions_full_df = interactions_from_selected_users_df \
                    .groupby([USER_KEY, ITEM_KEY])[RATE_KEY].sum() \
                    .apply(smooth_user_preference).reset_index()

print('# of unique user/item interactions: %d' % len(interactions_full_df))

# of unique user/item interactions: 39106


In [14]:
# Transform all ids to categories
u2idx, u_cat = cat_to_id_transform(interactions_full_df[USER_KEY])
i2idx, i_cat = cat_to_id_transform(interactions_full_df[ITEM_KEY])
interactions_full_df[USER_KEY] = u_cat
interactions_full_df[ITEM_KEY] = i_cat

In [146]:
interactions_train_df, interactions_test_df = train_test_split(
    interactions_full_df,
    stratify=interactions_full_df['personId'], 
    test_size=0.3,
    random_state=42
)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 27374
# interactions on Test set: 11732


In [147]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size, lam_u, lam_v):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)

        # initializing our matrices with normal distribution
        nn.init.normal_(self.user_emb.weight)
        self.user_emb.weight.mul(0.1)
        nn.init.normal_(self.item_emb.weight)
        self.item_emb.weight.mul(0.1)
        
        self.lam_u = lam_u
        self.lam_v = lam_v


    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        output = (u*v).sum(1)     # taking the dot product
        
        # Perform Frobenius norm (but without sqrt)
        u_reg = self.lam_u * torch.sum(u**2)
        v_reg = self.lam_v * torch.sum(v**2)

        return output, u_reg, v_reg

In [152]:
num_users = len(interactions_train_df[USER_KEY])
num_items = len(interactions_train_df[ITEM_KEY])
print(num_users)
print(num_items)
model = MF(num_users, num_items, emb_size=10, lam_u=0.01, lam_v=0.01)

# train_df, valid_df = train_test_split(dataset, test_size=0.2)
# resetting indices to avoid indexing errors
train_df = interactions_train_df.reset_index(drop=True)
test_df = interactions_test_df.reset_index(drop=True)

27374
27374


In [153]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [154]:
def train_epocs(model, epochs=150000, lr=0.01, wd=0.0):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
#     optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.8)
    model = model.to(device)
    model.train()
    for epoch in tqdm(range(epochs)):
        optimizer.zero_grad()
        
        user_tensor = torch.LongTensor(train_df[USER_KEY].values).to(device)
        item_tensor = torch.LongTensor(train_df[ITEM_KEY].values).to(device)
        ratings = torch.FloatTensor(train_df[RATE_KEY].values).to(device)
        # print(torch.max(item_tensor))
        # print(torch.min(item_tensor))
        
        y_hat, u_reg, v_reg = model(user_tensor, item_tensor)
        
        loss = F.mse_loss(y_hat, ratings) + u_reg + v_reg
        if epoch % 10000 == 0:
            print(f"Epoch: {epoch}, Loss: {loss}")

        loss.backward()
        optimizer.step()
        

def test(model):
    model.eval()
    user_tensor = torch.LongTensor(test_df[USER_KEY].values).to(device)
    item_tensor = torch.LongTensor(test_df[ITEM_KEY].values).to(device)
    ratings = torch.FloatTensor(test_df[RATE_KEY].values).to(device)
    y_hat, _, _ = model(user_tensor, item_tensor)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [155]:
# Start training
train_epocs(model, epochs=30000)

  0%|▍                                                                                                                          | 115/30000 [00:00<00:52, 572.32it/s]

Epoch: 0, Loss: 5500.333984375


 34%|████████████████████████████████████████▋                                                                                | 10077/30000 [00:15<00:30, 648.61it/s]

Epoch: 10000, Loss: 2.617297887802124


 67%|████████████████████████████████████████████████████████████████████████████████▉                                        | 20075/30000 [00:31<00:16, 615.03it/s]

Epoch: 20000, Loss: 2.6176130771636963


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30000/30000 [00:46<00:00, 644.72it/s]


In [156]:
# Start testing
test(model)

test loss 2.615 
