# Lab 8: Recommender System

In this assignment, we will study how to do user-based collaborative filtering and item-based collaborative filtering. 

In [13]:
ml_100k = '/Users/omarshakir/Desktop/Movie Reccomender System/ml_100k'

## 1. Dataset

In this assignment, we will use MovieLens-100K dataset. It includes about 100,000 ratings from 1000 users on 1700 movies.  

In [15]:
# 1. load data
# Load training and test data for user-movie ratings and movie information
train_df = pd.read_csv('./ml-100k/u1.base',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])
test_df = pd.read_csv('./ml-100k/u1.test',
                            sep='\t',names=['user_id','movie_id','rating'], usecols=[0,1,2])
movie_info = pd.read_csv('./ml-100k/u.item', 
                          sep='|', names=['movie_id','title'], usecols=[0,1],
                          encoding="ISO-8859-1")

# Merge movie information with user-movie ratings
train_df = pd.merge(movie_info, train_df)
test_df = pd.merge(movie_info, test_df)

# 2. get the rating matrix. Each row is a user, and each column is a movie.
# Convert the merged user-movie ratings to a pivot table for easy use
train_df = train_df.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')
test_df = test_df.pivot_table(index=['user_id'],
                                        columns=['title'],
                                        values='rating')

# Reindex the training and test data to include all movies and users in both datasets
train_df = train_df.reindex(
                            index=train_df.index.union(test_df.index), 
                            columns=train_df.columns.union(test_df.columns) )
test_df = test_df.reindex(
                            index=train_df.index.union(test_df.index), 
                            columns=train_df.columns.union(test_df.columns) )

# Print the shape of the training and test data
print(train_df.shape)
print(test_df.shape)

(943, 1664)
(943, 1664)


## Task 1. User-based CF

* Use pearson correlation to get the similarity between different users.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [16]:
# your code

from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_absolute_error
import math

# Replace Nans with row avg
train_df['avg'] = train_df.mean(axis=1)
train_df_no_nan = train_df.T.fillna(train_df['avg'], axis=0).T

# Calculate Pearson similarity
pearson_sim_train = 1-pairwise_distances(train_df_no_nan, metric="correlation")

# Fit Nearest Neighbors model
train_model = NearestNeighbors(n_neighbors=10)
train_model.fit(pearson_sim_train)

# Get nearest neighbors
neighbors_distance, neighbors_ind = train_model.kneighbors()
neighbors_ind += 1 # +1 fixes off by one error since ids start at 1 instead of 0

# Predict ratings for test data
predictions = []
actual = []
for user_id, row in test_df.iterrows():
    for movie, rating in row.items():
        if not pd.isnull(rating):
            predicted_rating = 0
            sum_of_sim = 0
            for x in range(0,10):
                ngbh_id = neighbors_ind[user_id-1][x]
                nghb_rating = train_df.loc[ngbh_id,movie]
                if not pd.isnull(nghb_rating):
                    nghb_distance = neighbors_distance[user_id-1][x]
                    sum_of_sim += nghb_distance
                    predicted_rating += nghb_distance*(nghb_rating-train_df.loc[ngbh_id, 'avg'])
            if (sum_of_sim != 0):
                predicted_rating = predicted_rating/sum_of_sim
                predicted_rating += train_df.loc[user_id, 'avg']
                predictions.append(predicted_rating)
                actual.append(rating)

# Calculate MAE
mae = mean_absolute_error(predictions, actual)
print('MAE: ' + str(mae))

MAE: 0.8316716024705649


## Task 2. Item-based CF
* Use cosine similarity to get the similarity between different items.
* Based on the obtained similarity score, predict the ratings. You can use 5 nearest neighbors or 10 nearest neighbors.
* Compute MAE for the testing set.

In [17]:
# your code
ir_train = train_df.T
ir_test = test_df.T

# Replace Nans with row average
ir_train['avg'] = ir_train.mean(axis=1)
ir_train_no_nan = ir_train.T.fillna(ir_train['avg'], axis=0).T

# Remove rows with no data
dropped_rows = ir_train_no_nan[ir_train_no_nan.isna().any(axis=1)]
ir_train_no_nan = ir_train_no_nan.drop(dropped_rows.index)
ir_test = ir_test.drop(dropped_rows.index)

pearson_sim_train = 1-pairwise_distances(ir_train_no_nan, metric="cosine")

train_model = NearestNeighbors(n_neighbors=10)
train_model.fit(pearson_sim_train)

neighbors_distance, neighbors_ind = train_model.kneighbors()
neighbors_ind += 1 # +1 fixes off by one error since ids start at 1 instead of 0

predictions = []
actual = []

for movie_id, row in ir_test.iterrows():
    item_id = ir_test.index.get_loc(movie_id)
    for user, rating in row.items():
        if not pd.isnull(rating):
            predicted_rating = 0
            sum_of_sim = 0
            for x in range(0,10):
                ngbh_id = neighbors_ind[item_id][x]
                nghb_rating = ir_train.iloc[ngbh_id].loc[user]
                if not pd.isnull(nghb_rating):
                    nghb_distance = neighbors_distance[item_id][x]
                    sum_of_sim += nghb_distance
                    predicted_rating += nghb_distance*(nghb_rating)
            if (sum_of_sim != 0):
                predicted_rating = predicted_rating/sum_of_sim
                predictions.append(predicted_rating)
                actual.append(rating)

mae = mean_absolute_error(predictions, actual)
print('MAE: ' + str(mae))

MAE: 1.037308373538804
