# Assignment 3: Collaborative Filtering (user-user)

### Reading Data

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

In [2]:
# The CSV files carry no header row, so the column names are supplied explicitly.
tags = pd.read_csv(
    'data/movie-tags.csv',
    names=['movie_id', 'tag'],
    header=None,
)
movies = pd.read_csv(
    'data/movie-titles.csv',
    names=['movie_id', 'movie', 'genres'],
    header=None,
)
ratings = pd.read_csv(
    'data/ratings.csv',
    names=['user_id', 'movie_id', 'rating'],
    header=None,
)

### Score Adjustment

First, you will adjust each user’s rating vector by subtracting that user’s mean rating from each of their ratings (this corrects for the fact that some users think 5 stars is anything worth seeing and others think 3 stars is very good).

* computing each user's mean rating

In [3]:
# Group the ratings by user once; the grouped object is also used further down
# for per-user lookups via get_group().
users_ratings = ratings.groupby(by='user_id')
# transform('mean') yields a frame aligned row-for-row with `ratings`, where each
# value is replaced by the mean of that column within the row's user group.
means = users_ratings.transform('mean')

* subtracting each user's mean rating from their ratings

In [4]:
# Mean-center every rating by subtracting the rating user's own average.
# `assign` works on a copy, so the raw `ratings` frame is left untouched.
adjusted_ratings = ratings.assign(rating=ratings['rating'] - means['rating'])

* create the user-item rating matrix (rows: users, columns: movies; missing ratings are filled with 0)

In [5]:
# One row per user, one column per movie. A (user, movie) pair without a rating
# becomes 0, which on the mean-centered scale corresponds to the user's average.
user_item_matrix = (
    adjusted_ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

In [6]:
# Per-movie grouping of the mean-centered ratings; used to look up which users
# rated a given movie.
movie_ratings = adjusted_ratings.groupby(by='movie_id')

### Find Similar Users

* create pairwise cosine similarity matrix between users

In [7]:
# pdist returns the condensed vector of pairwise cosine *distances* between user
# rows; 1 - distance turns each entry into a cosine similarity, and squareform
# expands the vector into a symmetric users x users array.
# Note: squareform fills the diagonal with 0, so a user's similarity to
# themselves is 0 in this matrix, not 1.
similarities = squareform(1 - pdist(user_item_matrix, metric='cosine'))
similarity_matrix = pd.DataFrame(
    data=similarities,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

* find users who rated the target item

In [8]:
def find_users_rated_item(item_id):
    """Return the ids of the users who rated ``item_id``.

    The lookup goes through the ``movie_ratings`` grouping, and the ``user_id``
    column of the matching group is returned via ``.values``.
    """
    ratings_for_item = movie_ratings.get_group(item_id)
    rater_ids = ratings_for_item['user_id']
    return rater_ids.values

* keep only the users who rated the target item, then take the (up to) 30 of them most similar to the target user

In [9]:
def find_similar_users(target_user, target_item, user_limit=30):
    """Return the users most similar to ``target_user`` among those who rated ``target_item``.

    Parameters
    ----------
    target_user : id of the user to find neighbours for; must be a column
        label of ``similarity_matrix``.
    target_item : id of the movie that every returned neighbour must have rated.
    user_limit : int, maximum number of neighbours to return (default 30).

    Returns
    -------
    pandas.Series of similarity scores indexed by user id, containing at most
    ``user_limit`` entries, largest similarity first.
    """
    # Similarity of every user to the target user.
    similar_users = similarity_matrix[target_user]
    # Restrict the candidates to users who actually rated the item.
    users_rated_target_item = find_users_rated_item(target_item)
    similar_users = similar_users[users_rated_target_item]
    # Honour the caller-supplied limit (it used to be hard-coded to 30, which
    # silently ignored the ``user_limit`` argument).
    return similar_users.nlargest(user_limit)

### Combine Ratings

* combine the neighbours' mean-adjusted ratings with a weighted average, using cosine similarity as the weights

In [10]:
def get_adjusted_score(target_user, target_item):
    """Return the similarity-weighted average of the neighbours' adjusted ratings.

    The neighbours come from ``find_similar_users``. Their entries for
    ``target_item`` in ``user_item_matrix`` are weighted by similarity to
    ``target_user`` and normalised by the sum of the absolute similarities.
    """
    neighbour_weights = find_similar_users(target_user, target_item)
    neighbour_ratings = user_item_matrix.loc[neighbour_weights.index, target_item]
    weighted_total = sum(neighbour_ratings * neighbour_weights)
    weight_norm = sum(abs(neighbour_weights))
    return weighted_total / weight_norm

### Calculate Prediction Score

* re-adjust the prediction back to the target user’s original rating scale by adding the target user’s mean rating back into the prediction

In [11]:
def get_predictions(target_user, target_item=None, item_limit=10):
    """Predict ratings of ``target_user`` on the user's original rating scale.

    Parameters
    ----------
    target_user : id of the user to predict for.
    target_item : None, a single movie id, or a list of movie ids.
        * ``None`` -- score every movie in ``movies`` that the user has not rated.
        * ``list`` -- score exactly the listed movies.
        * anything else -- treated as one movie id.
    item_limit : int, maximum number of predictions returned when several
        movies are scored (default 10).

    Returns
    -------
    * ``None`` (after printing a message) if ``target_user`` is not a column of
      ``similarity_matrix``.
    * A single number when ``target_item`` is a single movie id.
    * Otherwise a pandas.Series indexed by movie id holding the ``item_limit``
      highest predicted ratings, largest first.
    """
    if target_user not in similarity_matrix:
        # A single pre-formatted argument keeps this line valid, with unchanged
        # output, under both the Python 2 print statement and Python 3 print().
        print("cannot find this target user:  %s" % (target_user,))
        return None

    # Look the user's ratings up once instead of once per candidate movie.
    target_user_ratings = users_ratings.get_group(target_user)
    target_user_rating_mean = target_user_ratings['rating'].mean()

    if target_item is None:
        # Set membership avoids scanning the user's rating array for every movie.
        already_rated = set(target_user_ratings['movie_id'].values)
        target_item = [x for x in movies['movie_id'].values
                       if x not in already_rated]

    if not isinstance(target_item, list):
        # Single movie: add the user's mean back onto the adjusted score.
        return target_user_rating_mean + get_adjusted_score(target_user, target_item)

    predictions = {}
    for item in target_item:
        mean_adjusted_rating = get_adjusted_score(target_user, item)
        predictions[item] = target_user_rating_mean + mean_adjusted_rating
    # An explicit float dtype keeps nlargest usable even when no movie was scored.
    return pd.Series(predictions, dtype=float).nlargest(item_limit)

## Predicting -- Usage Examples

In [12]:
# Highest predicted ratings (default item_limit of 10) among the movies that
# user 320 has not rated yet.
get_predictions(target_user=320)

858      4.561585
2360     4.555820
318      4.555679
8638     4.511598
7371     4.511268
922      4.503316
1217     4.497019
44555    4.490616
1089     4.479172
2859     4.467170
dtype: float64

In [13]:
# Predictions for an explicit list of movies, returned highest score first.
get_predictions(target_user=320, target_item=[11, 922, 123])

922    4.503316
123    4.441667
11     3.522166
dtype: float64