# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering.

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [2]:
from google.colab import drive
drive.mount('/content/drive')

from google.colab import files
uploaded = files.upload()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Saving u1.base to u1.base
Saving u1.test to u1.test
Saving u.data to u.data
Saving u.item to u.item


In [3]:
from math import sqrt
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics.pairwise import linear_kernel
from sklearn.neighbors import NearestNeighbors


# 1. load data
user_ratings_train = pd.read_csv('u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

user_ratings_test = pd.read_csv('u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])

movie_info =  pd.read_csv('u.item',
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

user_ratings_train = pd.merge(movie_info, user_ratings_train)
user_ratings_test = pd.merge(movie_info, user_ratings_test)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
user_ratings_train = user_ratings_train.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

user_ratings_test = user_ratings_test.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')




user_ratings_train = user_ratings_train.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index),
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

user_ratings_test = user_ratings_test.reindex(
                            index=user_ratings_train.index.union(user_ratings_test.index),
                            columns=user_ratings_train.columns.union(user_ratings_test.columns) )

print(user_ratings_train.shape)
print(user_ratings_test.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [4]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_absolute_error

# Replace missing values with mean value
user_ratings_train['average'] = user_ratings_train.mean(axis = 1)
user_ratings_train_filled = user_ratings_train.T.fillna(user_ratings_train['average'], axis = 0).T

# Pearson correlation
pearson_sim_train = 1 - pairwise_distances(user_ratings_train_filled, metric = "correlation")

# Train using 10 nearest neighbors
train_model = NearestNeighbors(n_neighbors = 10)
train_model.fit(pearson_sim_train)

neighbors_distance, neighbors_ind = train_model.kneighbors()
neighbors_ind += 1 #

predictions, actual = [], []

# predict the ratings
for user_id, row in user_ratings_test.iterrows():
    for movie, rating in row.iteritems():
        if not pd.isnull(rating):
            predicted_rating, sum_of_sim = 0, 0

            for x in range(0, 10):
                ngbh_id = neighbors_ind[user_id-1][x]
                nghb_rating = user_ratings_train.loc[ngbh_id,movie]

                if not pd.isnull(nghb_rating):
                    nghb_distance = neighbors_distance[user_id-1][x]
                    sum_of_sim += nghb_distance
                    predicted_rating += nghb_distance*(nghb_rating-user_ratings_train.loc[ngbh_id, 'average'])

            if (sum_of_sim != 0):
                predicted_rating = predicted_rating/sum_of_sim
                predicted_rating += user_ratings_train.loc[user_id, 'average']
                predictions.append(predicted_rating)
                actual.append(rating)

# Compute MAE
mae = mean_absolute_error(predictions, actual)
print('MAE: {}'.format(str(mae)))

  for movie, rating in row.iteritems():


MAE: 0.8316716024705649


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [5]:
# Transpose to get features for predictions
item_ratings_train = user_ratings_train.T
item_ratings_test = user_ratings_test.T

# Replace missing values with mean value
item_ratings_train['average'] = item_ratings_train.mean(axis = 1)
item_ratings_train_filled = item_ratings_train.T.fillna(item_ratings_train['average'], axis = 0).T

# Drop rows with null data
dropped_rows = item_ratings_train_filled[item_ratings_train_filled.isna().any(axis=1)]
item_ratings_train_noNan = item_ratings_train_filled.drop(dropped_rows.index)
item_ratings_test = item_ratings_test.drop(dropped_rows.index)

# Cosine similarity
pearson_sim_train = 1 - pairwise_distances(item_ratings_train_noNan, metric="cosine")

# Train using 10 nearest neighbors
train_model = NearestNeighbors(n_neighbors=10)
train_model.fit(pearson_sim_train)

neighbors_distance, neighbors_ind = train_model.kneighbors()
neighbors_ind += 1

predictions, actual = [], []

# predict the ratings
for movie_id, row in item_ratings_test.iterrows():
    item_id = item_ratings_test.index.get_loc(movie_id)

    for user, rating in row.iteritems():
        if not pd.isnull(rating):
            predicted_rating, sum_of_sim = 0, 0

            for x in range(0, 10):
                ngbh_id = neighbors_ind[item_id][x]

                if ngbh_id < len(item_ratings_train_noNan):
                    nghb_rating = item_ratings_train_noNan.iloc[ngbh_id].loc[user]

                    if not pd.isnull(nghb_rating):
                        nghb_distance = neighbors_distance[item_id][x]
                        sum_of_sim += nghb_distance
                        predicted_rating += nghb_distance*(nghb_rating)

            if (sum_of_sim != 0):
                predicted_rating = predicted_rating/sum_of_sim
                predictions.append(predicted_rating)
                actual.append(rating)

# Compute MAE
mae = mean_absolute_error(predictions, actual)
print('MAE: {}'.format(str(mae)))

  for user, rating in row.iteritems():


MAE: 1.0284405172262243
