## Kernelized PMF

This code follows the procedure described in the paper 'Kernelized Probabilistic Matrix Factorization: Exploiting Graphs and Side Information' by Tinghui Zhou et. al. It can be found at https://epubs.siam.org/doi/abs/10.1137/1.9781611972825.35

In [1]:
import torch
import numpy as np
import pandas as pd

In [2]:
#Reading data and creating the ratings matrix
df = pd.read_csv('ratings.csv')

In [3]:
# Filter sparse movies
min_movie_ratings = 10
filter_movies = (df['movieId'].value_counts()>min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

# Filter sparse users
min_user_ratings = 10
filter_users = (df['userId'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

df = df[(df['movieId'].isin(filter_movies)) & (df['userId'].isin(filter_users))]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings

In [4]:
# Shuffle DataFrame
df = df.drop('timestamp', axis=1).sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,userId,movieId,rating
0,239,2167,4.0
1,438,1517,3.5
2,185,1288,3.0
3,73,59258,4.5
4,216,671,2.0


In [5]:
from sklearn.model_selection import train_test_split
df_train , df_test = train_test_split(df, test_size = 0.2)
ratings = pd.pivot_table(df_train, index = 'userId', columns = 'movieId', values = 'rating')
ratings[ratings.isnull()] = -1
ratings=torch.FloatTensor(ratings.values)
non_zero_mask = (ratings != -1).type(torch.FloatTensor)
if(min(torch.sum(non_zero_mask,1))==0):
    print('Do another split')
else:
    print(f"Min no of rating by a user in train set: {min(torch.sum(non_zero_mask,1))}")      

Min no of rating by a user in train set: 3.0


In [6]:
ratings = pd.pivot_table(df_train, index = 'userId', columns = 'movieId', values = 'rating')

#Scaling
min_rating, max_rating = df['rating'].min(), df['rating'].max()
ratings = (ratings - min_rating) / (max_rating - min_rating)

ratings[ratings.isnull()] = -1
ratings = torch.FloatTensor(ratings.values)
ratings = ratings

ratings_original = pd.pivot_table(df_train, index = 'userId', columns = 'movieId', values = 'rating')
ratings_original[ratings_original.isnull()] = -1
ratings_original = torch.FloatTensor(ratings_original.values)
ratings_original = ratings_original


rate2 = pd.pivot_table(df, index = 'userId', columns = 'movieId', values = 'rating')
rate2 = torch.FloatTensor(rate2.values)
rate2 = rate2

rate = pd.pivot_table(df_train, index = 'userId', columns = 'movieId', values = 'rating')
rate[rate.isnull()] = -1
rate[rate >= 0] = np.nan
rate = rate + 1
rate = torch.FloatTensor(rate.values)
rate = rate

test = rate2 - rate
test[torch.isnan(test)] = -1

n_users, n_movies = ratings.shape
n_users, n_movies

(610, 2121)

#### Kernel Development

In [7]:
from scipy.spatial import distance
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
md = pd.read_csv('movies.csv')
md=md.drop('title', axis=1)
md.head()

Unnamed: 0,movieId,genres
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,2,Adventure|Children|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama|Romance
4,5,Comedy


In [9]:
md_filtrd=md[md['movieId'].isin(df['movieId'])]
print(md_filtrd.shape)

(2121, 2)


In [10]:
md_filtrd['genres']=md_filtrd['genres'].str.replace('|', ' ')
md_filtrd['genres']=md_filtrd['genres'].str.replace('-', '')
md_filtrd['genres']=md_filtrd['genres'].str.replace('(no genres listed)', 'none')
md_filtrd['genres']=md_filtrd['genres'].str.replace('IMAX', '')
md_filtrd.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,movieId,genres
0,1,Adventure Animation Children Comedy Fantasy
1,2,Adventure Children Fantasy
2,3,Comedy Romance
4,5,Comedy
5,6,Action Crime Thriller


In [11]:
# Create the bag of words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(md_filtrd['genres'])

# Get feature names
feature_names = count.get_feature_names()

# View feature names
feature_names

['action',
 'adventure',
 'animation',
 'children',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'fantasy',
 'filmnoir',
 'horror',
 'musical',
 'mystery',
 'romance',
 'scifi',
 'thriller',
 'war',
 'western']

In [12]:
# Show feature matrix
bag_of_words=bag_of_words.toarray()
print(bag_of_words.shape)
print(type(bag_of_words))

(2121, 18)
<class 'numpy.ndarray'>


In [13]:
# Adjacency Matrix using jaccard distance
A=np.zeros((n_movies, n_movies))
threshold=0.6
for i in range(n_movies):
    for j in range(n_movies):
        A[i,j]=int(distance.jaccard(bag_of_words[i],bag_of_words[j])<threshold)        

In [14]:
# Creating the Laplacian
D=np.sum(A,1).reshape((2121,1))
L=D-A #Laplacian
L.shape
#np.all(np.linalg.eigvals(L) > 0)

(2121, 2121)

In [15]:
# Diffusion Kernel
#beta=0.01
#Kv=np.exp(-beta*L)
#Sv=np.linalg.inv(Kv)

# Regularized Laplacian Kernel
I=np.eye(n_movies)
Kv=np.linalg.inv(I+0.1*L)
Sv=torch.from_numpy(I+0.1*L).float()

# Commute Time (CT) kernel
# Kv=np.linalg.pinv(L)
# Sv=torch.from_numpy(L).float()

#### Kernelized PMF Class

In [16]:
#Instead of returning the prediction, this returns the loss directly in the forward prop
class KernelizedPMFLoss(torch.nn.Module):
    def __init__(self, lam_u=0.5, lam_v=0.5):
        super().__init__()
        self.lam_u = lam_u
        self.lam_v = lam_v
    
    def forward(self, matrix, u_features, v_features):
        non_zero_mask = (matrix != -1).type(torch.FloatTensor)
        predicted = torch.sigmoid(torch.mm(u_features, v_features.t()))
        
        diff = ((matrix - predicted)**2)
        prediction_error = torch.sum(diff*non_zero_mask)

        u_regularization = self.lam_u * torch.sum(u_features.norm(dim=1)**2)
        v_regularization = self.lam_v * torch.trace(torch.mm(torch.mm(v_features.t(),Sv),v_features))
        
        return prediction_error + u_regularization + v_regularization

#### Gradient Descent to optimize parameters

In [17]:
latent_vectors, lam_u, lam_v = 25, 0.5, 0.5
user_features = torch.randn(n_users, latent_vectors, requires_grad=True)
user_features.data.mul_(0.01)
movie_features = np.random.multivariate_normal(np.zeros(n_movies), Kv, latent_vectors).T
movie_features = torch.from_numpy(movie_features).float().requires_grad_(True)
#movie_features.data.mul_(0.01)

PMF_model = KernelizedPMFLoss(lam_u, lam_v)
optimizer = torch.optim.Adam([user_features, movie_features], lr=0.01)
for step, epoch in enumerate(range(3000)):
    optimizer.zero_grad()
    loss = PMF_model(ratings, user_features, movie_features)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        print(f"Step {step}, {loss:.3f}")

  after removing the cwd from sys.path.


Step 0, 30072.535
Step 500, -3779196416.000
Step 1000, -14953871360.000
Step 1500, -32281690112.000
Step 2000, -54907912192.000
Step 2500, -82229436416.000


In [18]:
#Training error
predicted = torch.sigmoid(torch.mm(user_features, movie_features.t()))
predicted = (predicted*(max_rating - min_rating) + min_rating)

non_zero_mask = (ratings != -1).type(torch.FloatTensor)
torch.sqrt(torch.sum(((predicted - ratings_original) ** 2) * non_zero_mask)/len(df_train))

tensor(1.2079, grad_fn=<SqrtBackward>)

In [19]:
#Testing Error
non_zero_mask = (test != -1).type(torch.FloatTensor)
torch.sqrt(torch.sum(((predicted - test) ** 2) * non_zero_mask)/len(df_test))

tensor(1.4673, grad_fn=<SqrtBackward>)