In [5]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SGD
from torch.utils.data import DataLoader, TensorDataset

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Column layouts for the three MovieLens 100k tables; names are supplied
# explicitly to read_csv below.
genre_cols = [
    "genre_unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
]
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
movies_cols = ['movie_id', 'title', 'release_date', "video_release_date", "imdb_url"]
movies_cols = movies_cols + genre_cols

# Read each table with its own delimiter (pipe for users/items, tab for ratings).
users = pd.read_csv('ml-100k/u.user', names=users_cols, sep='|', encoding='latin-1')
ratings = pd.read_csv('ml-100k/u.data', names=ratings_cols, sep='\t', encoding='latin-1')
movies = pd.read_csv('ml-100k/u.item', names=movies_cols, sep='|', encoding='latin-1')

In [3]:
# Basic dimensions of the ml-100k data.
movies_count = movies['movie_id'].unique().size
users_count = users['user_id'].unique().size
interactions_count = len(ratings['user_id'])

# Share of the (user x movie) grid that actually carries a rating.
density = interactions_count / (movies_count * users_count)

for label, value in (
    ('Movies count:', movies_count),
    ('Users count:', users_count),
    ('Interactions count: ', interactions_count),
    ("Data Sparsity:", (1 - density) * 100),
):
    print(label, value)

Movies count: 1682
Users count: 943
Interactions count:  100000
Data Sparsity: 93.69533063577546


In [4]:
def mark_genres(movies, genres, rng=None):
    """Add ``genre`` and ``all_genres`` label columns to ``movies`` in place.

    Each column named in ``genres`` is read as a 0/1 indicator. For every row:

    * ``genre`` receives one randomly chosen active genre,
    * ``all_genres`` receives all active genres joined with ``'-'``,

    and both receive ``'Other'`` when no indicator equals 1.

    Args:
        movies: DataFrame holding one indicator column per entry of ``genres``.
            It is mutated: the two label columns are added or overwritten.
        genres: Sequence of indicator column names.
        rng: Optional object exposing ``choice`` (e.g. ``np.random.default_rng(seed)``)
            used for the random pick. Defaults to NumPy's global random state,
            which makes the ``genre`` column vary between runs unless seeded.

    Returns:
        None.
    """
    sampler = np.random if rng is None else rng

    picked, combined = [], []
    # Single pass over the rows: both labels derive from the same active list.
    for flags in zip(*[movies[genre] for genre in genres]):
        active = [genre for genre, flag in zip(genres, flags) if flag == 1]
        if not active:
            picked.append('Other')
            combined.append('Other')
            continue
        picked.append(sampler.choice(active))
        combined.append('-'.join(active))

    movies['genre'] = picked
    movies['all_genres'] = combined
 
# Label every ml-100k movie using its one-hot genre indicator columns.
mark_genres(movies, genres=genre_cols)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   user_id         100000 non-null  int64
 1   movie_id        100000 non-null  int64
 2   rating          100000 non-null  int64
 3   unix_timestamp  100000 non-null  int64
dtypes: int64(4)
memory usage: 3.1 MB


In [58]:
def global_temporal_split(df, timestamp_col, user_id_col, movie_id_col, holdout_fraction=0.2):
    """Split interactions into train/test at a single global timestamp cutoff.

    The cutoff is the ``1 - holdout_fraction`` quantile of ``df[timestamp_col]``.
    Rows at or before the cutoff form the train set; rows strictly after it
    form the test set. Test rows whose user never appears in the train set are
    then dropped. Summary statistics are printed before and after that filter.

    Args:
        df: Interaction DataFrame.
        timestamp_col: Name of the column holding sortable timestamps.
        user_id_col: Name of the user identifier column.
        movie_id_col: Name of the item identifier column.
        holdout_fraction: Quantile-based share of the timeline reserved for test.

    Returns:
        ``(train, test)`` as independent copies, so callers can add columns
        to them without chained-assignment warnings.
    """
    cutoff_date = df[timestamp_col].quantile(q=1 - holdout_fraction)
    train = df[df[timestamp_col] <= cutoff_date].copy()
    test = df[df[timestamp_col] > cutoff_date].copy()

    users_train = train[user_id_col].unique()
    test_users_count = test[user_id_col].nunique(dropna=False)
    test_movies_count = test[movie_id_col].nunique(dropna=False)
    print("Split completed.")
    print(f"Train count: {len(train)}")
    print(f"Train users count: {len(users_train)}")
    print(f"Test count: {len(test)}")
    print(f"Test users count: {test_users_count}")
    print(f"Test movies count: {test_movies_count}")

    # Keep only test interactions from users that also appear in train.
    test = test[test[user_id_col].isin(users_train)].copy()
    test_users_count_after = test[user_id_col].nunique(dropna=False)
    print("Truncating test users invalid for CF completed.")
    print(f"Test count: {len(test)}")
    print(f"Test users count: {test_users_count_after}")

    # Report a real percentage, and avoid dividing by zero on an empty hold-out.
    if test_users_count:
        truncated_pct = 100 * (1 - test_users_count_after / test_users_count)
    else:
        truncated_pct = 0.0
    print(f"{truncated_pct:.2f}% of users truncated.")
    return train, test

In [29]:
# global_temporal_split requires the column names explicitly; the ml-100k
# ratings frame uses snake_case names. The hold-out share must be passed by
# keyword, otherwise it lands in the timestamp_col slot.
train_e_df, test_i_df = global_temporal_split(
    ratings,
    timestamp_col='unix_timestamp',
    user_id_col='user_id',
    movie_id_col='movie_id',
    holdout_fraction=0.1,
)

Split completed.
Train count: 90000
Train users count: 867
Test count: 10000
Test users count: 166
Test movies count: 1343
Truncating test users invalid for CF completed.
Test count: 2886
Test users count: 90
0.45783132530120485 users truncated.


In [10]:
# ml-1m users table. The "::" separator is multi-character, which only the
# python parser engine supports; request it explicitly instead of relying on
# pandas' implicit fallback (and its warning).
users_df = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    engine="python",
    names=['userId', 'gender', 'ageCode', 'occupationCode', 'zipCode'],
)

# Translate the numeric age codes into readable age brackets.
age_code_mapping = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}
users_df['age'] = users_df['ageCode'].map(age_code_mapping)

# Translate the numeric occupation codes into readable labels.
occupation_code_mapping = {
    0: "other",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}
users_df['occupation'] = users_df['occupationCode'].map(occupation_code_mapping)
users_df.head()

Unnamed: 0,userId,gender,ageCode,occupationCode,zipCode,age,occupation
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer


In [9]:
# ml-1m movies table. "::" is a multi-character separator, so the python
# parser engine is requested explicitly.
movies_df = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    engine="python",
    names=['movieId', 'title', 'genres'],
    encoding='latin-1',
)

# One row per (movie, genre): split the pipe-delimited genre string with the
# vectorised string accessor, then explode the resulting lists.
movies_genre_df = (
    movies_df
    .assign(genres=movies_df['genres'].str.split("|"))
    .explode('genres')
)
display(movies_df.head())
display(movies_genre_df.head())

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children's
0,1,Toy Story (1995),Comedy
1,2,Jumanji (1995),Adventure
1,2,Jumanji (1995),Children's


In [11]:
# ml-1m ratings table. "::" is a multi-character separator, so the python
# parser engine is requested explicitly rather than via implicit fallback.
ratings_df = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    engine="python",
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [12]:
# Print the column dtypes, non-null counts and memory footprint of the ratings frame.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   userId     1000209 non-null  int64
 1   movieId    1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [18]:
# Temporal split of the ml-1m ratings: rows after the 0.9 timestamp quantile
# become the test set.
split_columns = dict(timestamp_col='timestamp', user_id_col='userId', movie_id_col='movieId')
train_df, test_df = global_temporal_split(ratings_df, holdout_fraction=0.1, **split_columns)

Split completed.
Train count: 900188
Train users count: 6011
Test count: 100021
Test users count: 1209
Test movies count: 3407
Truncating test users invalid for CF completed.
Test count: 95812
Test users count: 1180
0.023986765922249798 users truncated.


In [None]:
# Implicit-feedback variants of the splits: ratings >= 3.5 become 1.0, all
# others 0.0. The full frames are kept (only `rating` is replaced), so the id
# columns stay available; mapping just the rating column would leave bare
# Series that cannot be indexed by 'userId' / 'movieId'.
train_i_df = train_df.assign(rating=(train_df['rating'] >= 3.5).astype(float))
test_i_df = test_df.assign(rating=(test_df['rating'] >= 3.5).astype(float))

In [69]:
class MFBasic(nn.Module):
    """Matrix factorisation with user and item bias terms.

    The score for a (user, item) pair is the dot product of their embeddings
    plus a per-user bias and a per-item bias.

    Args:
        num_users: Number of rows in the user embedding/bias tables.
        num_items: Number of rows in the item embedding/bias tables.
        embedding_dim: Size of each latent factor vector.
    """

    def __init__(self, num_users, num_items, embedding_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

    def forward(self, user_id, item_id):
        """Score each (user, item) pair.

        Args:
            user_id: 1-D integer tensor of user indices.
            item_id: 1-D integer tensor of item indices, same length as ``user_id``.

        Returns:
            1-D tensor with one score per pair.
        """
        interaction = (self.user_embedding(user_id) * self.item_embedding(item_id)).sum(dim=1)
        # Drop only the trailing size-1 bias dimension with the tensor's own
        # squeeze, so a batch of one keeps its batch axis and no NumPy
        # function is applied to autograd tensors.
        user_offset = self.user_bias(user_id).squeeze(-1)
        item_offset = self.item_bias(item_id).squeeze(-1)
        return (interaction + user_offset + item_offset).flatten()

In [66]:
def train_model(model, optimizer, train_loader, test_loader, num_epochs=10):
    """Train ``model`` with MSE loss and print train/test RMSE after every epoch.

    Args:
        model: Module called as ``model(user_id, item_id)`` returning one
            prediction per pair.
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable of ``(user_id, item_id, rating)`` batches used
            for gradient updates.
        test_loader: Iterable of ``(user_id, item_id, rating)`` batches used
            for evaluation only.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. One progress line is printed per epoch.
    """
    def _rmse(squared_error_sum, sample_count):
        # NaN instead of ZeroDivisionError when a loader yields no samples.
        if not sample_count:
            return float('nan')
        return np.sqrt(squared_error_sum / sample_count)

    for epoch in range(num_epochs):
        model.train()
        train_sq_err, train_seen = 0.0, 0
        for user_id, item_id, rating in train_loader:
            # Call the module (not .forward) so registered hooks run.
            prediction = model(user_id, item_id)
            loss = F.mse_loss(prediction, rating)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Weight the batch mean by batch size so a short final batch does
            # not skew the epoch RMSE.
            train_sq_err += loss.item() * rating.numel()
            train_seen += rating.numel()

        model.eval()
        test_sq_err, test_seen = 0.0, 0
        # Evaluation needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for user_id, item_id, rating in test_loader:
                prediction = model(user_id, item_id)
                test_sq_err += F.mse_loss(prediction, rating, reduction='sum').item()
                test_seen += rating.numel()

        print(f'Epoch {epoch+1}, Loss: {_rmse(train_sq_err, train_seen):.4f}, Test Loss: {_rmse(test_sq_err, test_seen):.4f}')

In [55]:
# Map each raw user id seen in training to a contiguous 0-based index, in
# order of first appearance.
train_user_ids = train_df['userId'].unique()
user_idx_mapping = dict(zip(train_user_ids, range(len(train_user_ids))))

In [61]:
# Map each raw movie id in the full ratings table to a contiguous 0-based
# index, in order of first appearance.
all_movie_ids = ratings_df['movieId'].unique()
movie_idx_mapping = dict(zip(all_movie_ids, range(len(all_movie_ids))))

In [57]:
# Attach the contiguous user index. `assign` builds new frames, so no column
# is written into a frame produced by boolean slicing and the cell gives the
# same result when re-run.
train_df = train_df.assign(userId_idx=train_df['userId'].map(user_idx_mapping))
test_df = test_df.assign(userId_idx=test_df['userId'].map(user_idx_mapping))

# An id missing from the mapping would surface as NaN and break the later
# integer cast; fail here with a clear message instead.
assert train_df['userId_idx'].notna().all(), "train_df has user ids missing from user_idx_mapping"
assert test_df['userId_idx'].notna().all(), "test_df has user ids missing from user_idx_mapping"

In [62]:
# Attach the contiguous movie index. `assign` builds new frames, so no column
# is written into a frame produced by boolean slicing and the cell gives the
# same result when re-run.
train_df = train_df.assign(movieId_idx=train_df['movieId'].map(movie_idx_mapping))
test_df = test_df.assign(movieId_idx=test_df['movieId'].map(movie_idx_mapping))

# An id missing from the mapping would surface as NaN and break the later
# integer cast; fail here with a clear message instead.
assert train_df['movieId_idx'].notna().all(), "train_df has movie ids missing from movie_idx_mapping"
assert test_df['movieId_idx'].notna().all(), "test_df has movie ids missing from movie_idx_mapping"

In [71]:
# Explicit-feedback run: fit the biased MF model on the raw ratings.
model = MFBasic(
    num_users=train_df['userId'].nunique(),
    num_items=ratings_df['movieId'].nunique(),
    embedding_dim=30,
)
optimizer = SGD(model.parameters(), lr=0.1)


def _as_rating_dataset(frame):
    """Pack the user index, movie index and rating columns of ``frame`` into a TensorDataset."""
    user_idx = torch.tensor(frame['userId_idx'].astype(int).values)
    movie_idx = torch.tensor(frame['movieId_idx'].astype(int).values)
    target = torch.tensor(frame['rating'].values, dtype=torch.float32)
    return TensorDataset(user_idx, movie_idx, target)


train_dataset = _as_rating_dataset(train_df)
test_dataset = _as_rating_dataset(test_df)

# Shuffle only the training batches; evaluation order is left fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=256)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=256)
train_model(model, optimizer, train_loader, test_loader, num_epochs=10)

Epoch 1, Loss: 5.0946, Test Loss: 4.3660
Epoch 2, Loss: 3.2657, Test Loss: 3.5204
Epoch 3, Loss: 2.5688, Test Loss: 3.0778
Epoch 4, Loss: 2.1884, Test Loss: 2.8000


KeyboardInterrupt: 

In [None]:
# Implicit-feedback run: same model, trained on binary "liked" labels
# (rating >= 3.5 -> 1.0, otherwise 0.0).
#
# The embedding tables are sized like the explicit run: users from the train
# split, movies from the full ratings table. movieId_idx comes from a mapping
# over all of ratings_df, so sizing the item table from train-only movies can
# leave indices out of range.
model_i = MFBasic(
    num_users=train_df['userId'].nunique(),
    num_items=ratings_df['movieId'].nunique(),
    embedding_dim=30,
)
optimizer = SGD(model_i.parameters(), lr=1.0, weight_decay=1e-4)

# Index columns and labels are taken from train_df / test_df directly, which
# carry userId_idx / movieId_idx, rather than from the derived implicit objects.
train_dataset = TensorDataset(
    torch.tensor(train_df['userId_idx'].astype(int).values),
    torch.tensor(train_df['movieId_idx'].astype(int).values),
    torch.tensor((train_df['rating'] >= 3.5).values, dtype=torch.float32)
)

test_dataset = TensorDataset(
    torch.tensor(test_df['userId_idx'].astype(int).values),
    torch.tensor(test_df['movieId_idx'].astype(int).values),
    torch.tensor((test_df['rating'] >= 3.5).values, dtype=torch.float32)
)
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=2048, shuffle=False)
train_model(model_i, optimizer, train_loader, test_loader, num_epochs=20)