In [65]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import pairwise_distances 

In [2]:
# Directory holding the ml-100k data files read below (u.user, u.data, u.item,
# ua.base, ua.test); the path is relative to the notebook's working directory.
INP_DIR = "data/download/ml-100k"

In [11]:
def read_users(filepath):
    """Read a pipe-separated, header-less user file into a DataFrame.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``age``, ``sex``, ``occupation``, ``zip_code``.
        ``zip_code`` is always returned as strings.
    """
    cols = ["user_id", "age", "sex", "occupation", "zip_code"]
    # Zip codes are identifiers, not numbers: forcing str keeps leading zeros
    # (e.g. "02215") instead of relying on type inference for the column.
    df = pd.read_csv(
        filepath,
        sep="|",
        names=cols,
        encoding="latin-1",
        dtype={"zip_code": str},
    )
    return df


def read_ratings(filepath):
    """Read a tab-separated, header-less ratings file into a DataFrame.

    The returned frame has the columns ``user_id``, ``movie_id``, ``rating``
    and ``unix_timestamp``, in that order. The file is decoded as latin-1.
    """
    rating_columns = ("user_id", "movie_id", "rating", "unix_timestamp")
    return pd.read_csv(
        filepath,
        names=list(rating_columns),
        sep="\t",
        encoding="latin-1",
    )


def read_items(filepath):
    """Read a pipe-separated, header-less item (movie) file into a DataFrame.

    The 24 column names are assigned positionally: five descriptive columns
    followed by nineteen columns named after genres. The file is decoded as
    latin-1.
    """
    info_columns = [
        "movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL",
    ]
    genre_columns = [
        "unknown", "Action", "Adventure", "Animation", "Children's",
        "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
        "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
        "War", "Western",
    ]
    return pd.read_csv(
        filepath,
        sep="|",
        names=info_columns + genre_columns,
        encoding="latin-1",
    )

In [17]:
# Load the user table from u.user and show its (rows, columns) shape.
users_df = read_users(os.path.join(INP_DIR, "u.user"))
users_df.shape

(943, 5)

In [18]:
# Preview the first rows of the user table.
users_df.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [48]:
# Total number of missing values across all columns of the user table.
users_df.isnull().sum().sum()

0

In [13]:
# Load the full ratings table from u.data and show its (rows, columns) shape.
ratings_df = read_ratings(os.path.join(INP_DIR, "u.data"))
ratings_df.shape

(100000, 4)

In [14]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [49]:
# Total number of missing values across all columns of the ratings table.
ratings_df.isnull().sum().sum()

0

In [15]:
# Load the item (movie) table from u.item and show its (rows, columns) shape.
items_df = read_items(os.path.join(INP_DIR, "u.item"))
items_df.shape

(1682, 24)

In [16]:
# Preview the first rows of the item table.
items_df.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [34]:
# Load the ua.base / ua.test files as the train and test rating tables
# (same column layout as u.data, read with the same helper).
ratings_df_train = read_ratings(os.path.join(INP_DIR, "ua.base"))
ratings_df_test = read_ratings(os.path.join(INP_DIR, "ua.test"))

# Compare the shapes of the full, train and test rating tables.
ratings_df.shape, ratings_df_train.shape, ratings_df_test.shape

((100000, 4), (90570, 4), (9430, 4))

The test set has 10 ratings per user.

In [25]:
# Mean number of test-set ratings per user.
ratings_df_test.groupby(["user_id"])["movie_id"].count().mean()

10.0

In [22]:
# Number of distinct per-user rating counts in the test set;
# a result of 1 means every user has the same number of test ratings.
ratings_df_test.groupby(["user_id"])["movie_id"].count().nunique()

1

# Collaborative filtering

In [53]:
# Build the user x movie rating matrix: one row per user_id, one column per
# movie_id, cell = rating; (user, movie) pairs without a rating become 0.
# NOTE(review): this uses the full ratings_df rather than ratings_df_train,
# so ratings from the test split are included -- confirm this is intended
# before evaluating predictions against ratings_df_test.
user_movie_matrix = (
    ratings_df
    .pivot_table(values="rating", index="user_id", columns="movie_id")
    .fillna(0)
)
user_movie_matrix.shape

(943, 1682)

In [54]:
# Preview the first rows of the user x movie rating matrix.
user_movie_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
# pairwise_distances treats each row of the matrix it receives as one vector
# and returns the matrix of pairwise *distances* between those rows.
# With metric="cosine" that distance is 1 - cosine similarity, so it is
# converted back to a similarity here; using the raw distances as weights
# would give the least similar users/items the largest influence.

user_similarity = 1 - pairwise_distances(user_movie_matrix, metric="cosine")

item_similarity = 1 - pairwise_distances(user_movie_matrix.T, metric="cosine")

In [77]:
# Shapes of the rating matrix and of the user-user and item-item matrices.
user_movie_matrix.shape, user_similarity.shape, item_similarity.shape

((943, 1682), (943, 943), (1682, 1682))

## Prediction

* For user-user similarity: $P_{u,i} = \frac{\sum_v S_{u, v} * R_{v, i}}{\sum_v S_{u, v}}$

* For item-item similarity: $P_{u,i} = \frac{\sum_n R_{u, n} * S_{n, i}}{\sum_n S_{n, i}}$

Note that S is a symmetric matrix.

In [83]:
def predict_user(user_movie_mat, similarity_mat):
    """Predict ratings as a similarity-weighted average over users.

    Parameters
    ----------
    user_movie_mat : array-like, shape (n_users, n_movies)
        Rating matrix with one row per user.
    similarity_mat : numpy.ndarray, shape (n_users, n_users)
        User-user weights; must support ``.sum(axis=1, keepdims=True)``.

    Returns
    -------
    Matrix of shape (n_users, n_movies): for each user, the weighted sum of
    all users' ratings divided by the sum of that user's weights.
    """
    weighted_ratings = similarity_mat.dot(user_movie_mat)
    # One normaliser per user (row), kept 2-D so it broadcasts across movies.
    weight_totals = similarity_mat.sum(axis=1, keepdims=True)
    return weighted_ratings / weight_totals


def predict_item(user_movie_mat, similarity_mat):
    """Predict ratings as a similarity-weighted average over items.

    Parameters
    ----------
    user_movie_mat : array-like, shape (n_users, n_movies)
        Rating matrix with one row per user; must provide ``.dot``.
    similarity_mat : numpy.ndarray, shape (n_movies, n_movies)
        Item-item weights; must support ``.sum(axis=0, keepdims=True)``.

    Returns
    -------
    Matrix of shape (n_users, n_movies): for each movie, the weighted sum of
    the user's ratings divided by the sum of that movie's weights.
    """
    weighted_ratings = user_movie_mat.dot(similarity_mat)
    # One normaliser per movie (column), kept 2-D so it broadcasts across users.
    weight_totals = similarity_mat.sum(axis=0, keepdims=True)
    return weighted_ratings / weight_totals

In [88]:
# Predicted ratings using user-user weights.
user_pred = predict_user(user_movie_matrix, user_similarity)

In [89]:
# Predicted ratings using item-item weights.
item_pred = predict_item(user_movie_matrix, item_similarity)