# Assignment 4: Collaborative Filtering (item-item)

### Reading Data

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

In [2]:
# Load the three CSV inputs. Each file is read as headerless
# (header=None), so the column names are supplied explicitly.
tags = pd.read_csv(
    'data/movie-tags.csv',
    names=['movie_id', 'tag'],
    header=None,
)
movies = pd.read_csv(
    'data/movie-titles.csv',
    names=['movie_id', 'movie', 'genres'],
    header=None,
)
ratings = pd.read_csv(
    'data/ratings.csv',
    names=['user_id', 'movie_id', 'rating'],
    header=None,
)

## 1. Compute Similarities:
compute the similarities between items and store them in a model:

* Normalize each item rating vector by subtracting the item’s mean rating from each rating prior to computing similarities

In [3]:
# Group every rating row by the movie it belongs to.
movies_ratings = ratings.groupby(by='movie_id')

# One row per movie holding the column means of its group; only the
# 'rating' column is read later (the 'user_id' mean is not meaningful).
item_mean_ratings = movies_ratings.agg('mean')

# Same per-movie means, but broadcast back so the result is indexed like
# `ratings` and can be subtracted from it row by row.
means = movies_ratings.transform('mean')

Subtract each item's mean rating from every rating given to that item.

In [4]:
# Mean-centered copy of `ratings`: the 'rating' column is replaced by
# (rating - mean rating of the rated movie); `ratings` itself is untouched.
adjusted_ratings = ratings.assign(rating=ratings['rating'] - means['rating'])

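Create the item × user matrix of mean-centered ratings (one row per movie, one column per user); entries for movies a user has not rated are filled with 0.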

In [5]:
# Rows are movie ids, columns are user ids, cells are mean-centered
# ratings. Missing (movie, user) pairs come out of the pivot as NaN and
# are replaced by 0.
item_user_matrix = (
    adjusted_ratings
    .pivot(index='movie_id', columns='user_id', values='rating')
    .fillna(0)
)

In [6]:
# Mean-centered ratings grouped per user; used later to test whether a
# user has any ratings and to look up the movies a user has rated.
user_ratings = adjusted_ratings.groupby(by='user_id')

* Use cosine similarity between normalized item rating vectors

Create the pairwise cosine similarity matrix between items (the rows of the item × user matrix).

In [7]:
# pdist returns the condensed vector of cosine distances between every
# pair of rows (items); 1 - distance turns them into similarities and
# squareform expands the vector to a square item x item array.
# NOTE: squareform fills the diagonal with zeros, so an item's similarity
# to itself is stored as 0 here rather than 1.
similarities = squareform(1.0 - pdist(item_user_matrix.values, metric='cosine'))

# Label both axes with the movie ids so columns can be looked up by id.
similarity_matrix = pd.DataFrame(
    data=similarities,
    index=item_user_matrix.index,
    columns=item_user_matrix.index,
)

## 2. Scoring items:
score the items using the weighted average of the user’s ratings for similar items.

In [8]:
def find_items_rated_by_user(user_id):
    """Return the movie ids rated by ``user_id`` as an array.

    The ids are read from the ``movie_id`` column of the user's group in
    the module-level ``user_ratings`` groupby object.
    """
    rows_of_user = user_ratings.get_group(user_id)
    rated_movie_ids = rows_of_user['movie_id']
    return rated_movie_ids.values

* Among the items rated by the target user, find the 20 (the default `item_neighbor_limit`) most similar to the target item, and keep only those with a positive similarity

In [9]:
def find_similar_items(target_user, target_item, item_neighbor_limit=20):
    """Return the neighbours of ``target_item`` among the user's rated items.

    The similarity column of ``target_item`` is restricted to the movies
    that ``target_user`` has rated, cut down to the
    ``item_neighbor_limit`` largest values, and finally filtered to
    strictly positive similarities.

    Returns a Series of similarities indexed by movie id.
    """
    rated_ids = find_items_rated_by_user(target_user)
    # Similarities between the target item and each item the user rated.
    candidates = similarity_matrix[target_item].loc[rated_ids]
    strongest = candidates.nlargest(item_neighbor_limit)
    # Non-positive similarities carry no useful signal for the weighting.
    return strongest[strongest.gt(0)]

### Combine Ratings

* Combine the user's ratings of the neighbouring items with a weighted average, using the cosine similarities as the weights

In [10]:
def get_adjusted_score(target_user, target_item):
    """Return the mean-centered predicted score of ``target_item`` for ``target_user``.

    The score is the similarity-weighted average of the user's
    mean-centered ratings (taken from ``item_user_matrix``) over the
    neighbours returned by ``find_similar_items``:

        sum(rating * similarity) / sum(|similarity|)

    When there are no neighbours (or their weights sum to zero) the
    function returns ``0.0`` instead of dividing by zero, i.e. no offset
    from the item's mean rating.
    """
    similarities = find_similar_items(target_user, target_item)

    # Guard against an empty neighbourhood: the original 0/0 raised
    # ZeroDivisionError in that case.
    weight_total = similarities.abs().sum()
    if weight_total == 0:
        return 0.0

    # The user's mean-centered ratings for exactly the neighbour items;
    # the product below aligns both Series on the movie id index.
    neighbor_ratings = item_user_matrix.loc[similarities.index, target_user]
    return (neighbor_ratings * similarities).sum() / weight_total

### Calculate Prediction Score

* Re-adjust the prediction by adding the target item's mean rating back to the weighted-average (mean-centered) score

In [11]:
def get_predictions(target_user, target_item=None, top_n=10):
    """Predict ratings of ``target_user`` for one item, several items, or all unrated items.

    Every prediction is the item's mean rating (``item_mean_ratings``)
    plus the mean-centered score from ``get_adjusted_score``.

    Parameters
    ----------
    target_user
        User id; must be a group key of ``user_ratings``.
    target_item
        * ``None`` (default): score every id in ``movies['movie_id']``
          that the user has not rated. Ids without an entry in
          ``item_mean_ratings`` are skipped because they cannot be scored.
        * list / tuple / set / ndarray / Series / Index: score each id.
        * anything else: treated as a single movie id.
    top_n
        Number of highest predictions to keep when several items are scored.

    Returns
    -------
    ``None`` if the user has no ratings (a message is printed), a single
    number for a single item, otherwise a Series of the ``top_n`` largest
    predictions indexed by movie id.
    """
    if target_user not in user_ratings.groups:
        # A single parenthesised argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("this user has no ratings:  %s" % (target_user,))
        return None

    def predict_one(item):
        # Item mean + similarity-weighted offset for this user.
        return item_mean_ratings.loc[item, 'rating'] + get_adjusted_score(target_user, item)

    if target_item is None:
        # Build the lookups once instead of once per candidate movie.
        already_rated = set(user_ratings.get_group(target_user)['movie_id'].values)
        scorable = set(item_mean_ratings.index)
        target_item = [movie_id for movie_id in movies['movie_id'].values
                       if movie_id not in already_rated and movie_id in scorable]
    elif not isinstance(target_item, (list, tuple, set, np.ndarray, pd.Series, pd.Index)):
        return predict_one(target_item)

    predictions = dict((item, predict_one(item)) for item in target_item)
    # dtype=float keeps nlargest usable when there is nothing to score.
    return pd.Series(predictions, dtype=float).nlargest(top_n)

## Predicting -- Usage Examples

In [12]:
# Highest predicted ratings for user 320 over the movies they have not rated.
get_predictions(target_user=320)

7502     4.484422
1224     4.422800
858      4.407754
318      4.402919
1203     4.385783
3462     4.378955
99114    4.375891
4973     4.375703
898      4.371050
922      4.356929
dtype: float64

In [13]:
# Predicted ratings for user 320 on an explicit list of movie ids.
get_predictions(target_user=320, target_item=[11, 922, 123])

922    4.356929
123    4.085160
11     3.282354
dtype: float64