In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from scipy.sparse import csr_matrix, lil_matrix
import time

In [2]:
def read_small():
    data_path = 'ml-latest-small/'
    movies = pd.read_csv(data_path + 'movies.csv')
    ratings = pd.read_csv(data_path + 'ratings.csv')
    return movies, ratings

def read_10M():
    data_path = 'ml-10M100K/'
    movies = pd.read_csv(data_path + 'movies.dat', delimiter='::', engine='python')
    ratings = pd.read_csv(data_path + 'ratings.dat', delimiter='::', engine='python')
    return movies, ratings

In [3]:
movies, ratings = read_small()

In [4]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [6]:
movie_count = len(movies)
user_count = max(ratings.userId)

mean_score = ratings.rating.values.mean()

In [7]:
all_scores = np.zeros((user_count, max(movies.movieId)), dtype=np.float32)
for (user, movie, rating, _) in ratings.values:
    all_scores[int(user)-1,int(movie)-1] = rating
all_scores = all_scores[:,movies.movieId.values.astype(int)-1]

In [8]:
def get_info(movieId, info='title'):
    return movies[info].values[np.where(movies.movieId.values == movieId)]

In [9]:
def get_user_ratings(userId):
    return ratings.values[np.where(ratings.userId == userId)][:,1:]

In [10]:
def user_baselines():
    all_scores_local = all_scores.copy()
    all_scores_local[all_scores_local == 0] = np.nan
    all_scores_local -= mean_score
    means = np.nanmean(all_scores_local, axis=1)
    return np.nan_to_num(means)

In [11]:
def item_baselines():
    usr_baselines = user_baselines()[:, np.newaxis]
    all_scores_local = all_scores.copy()
    all_scores_local[all_scores_local == 0] = np.nan
    all_scores_local -= usr_baselines + mean_score
    means = np.nanmean(all_scores_local, axis=0)
    return np.nan_to_num(means)

In [12]:
# ta funkcja zwraca szacowania nawet istniejących ocen

def baseline_predictors():
    usr_baselines = user_baselines()[:, np.newaxis]
    itm_baselines = item_baselines()
    scores = np.ones((user_count, movie_count)) * mean_score
    return scores + usr_baselines + itm_baselines

In [13]:
def recommend_cf_uu(userId, n, min_n=0, subtract_mean=True, div_by_std=True, weighted=True, metric='pearson'):
    assert not (div_by_std and not subtract_mean)
    all_scores_local = all_scores.copy()
    
    all_rated = all_scores_local > 0    
    user_rated = all_rated[userId]
    
    all_scores_local[all_scores_local == 0] = np.nan
    
    means = np.nanmean(all_scores_local, axis=1)
    user_mean = means[userId-1]
        
    stds = np.nanstd(all_scores_local, axis=1)
    user_std = stds[userId-1]
        
    all_scores_local = np.nan_to_num(all_scores_local)
    
    user_ratings = all_scores_local[userId-1]
    
    t0 = time.time()
    
    if metric == 'cosine':
        dists = cdist(user_ratings[np.newaxis] - user_mean, all_scores_local - means[:, np.newaxis], 
                      metric='cosine')[0]
    elif metric == 'pearson':
        user_centered = (user_ratings - user_mean) * user_rated
        all_centered = (all_scores_local - means[:, np.newaxis]) * all_rated
        numerator = (user_centered * all_centered).sum(axis=1)
        denominator = np.sqrt((user_centered**2).sum()) * np.sqrt((all_centered**2).sum(axis=1))
        dists = numerator / denominator
    
    print time.time()-t0
    
    order = dists.argsort()
    order = np.delete(order, np.where(order == userId-1))
    estimated_ratings = np.zeros(movie_count)
    ratings_gathered = np.zeros(movie_count, dtype=np.int8)
    denominators = np.zeros(movie_count)
    
    t0 = time.time()
    
    i = 0
    for u in order:
        mask = all_rated[u] * (ratings_gathered < n)
        weight = (1 - dists[u]) if weighted else 1.
        mean = means[u] if subtract_mean else 0.
        std = stds[u] if div_by_std else 1.
        estimated_ratings += (all_scores_local[u] - mean) / std * mask * weight
        ratings_gathered += mask
        denominators += np.abs(mask * weight)
        if all(ratings_gathered == n):
            break
        i += 1
            
    print time.time()-t0
                
    estimated_ratings /= denominators
    estimated_ratings[ratings_gathered < min_n] = np.nan
    estimated_ratings[user_rated] = np.nan
    
    if subtract_mean:
        multiplier = user_std if div_by_std else 1.        
        estimated_ratings = user_mean + multiplier * estimated_ratings
    
    movie_order = (-estimated_ratings).argsort()
    
    return movie_order + 1, estimated_ratings[movie_order], ratings_gathered[movie_order]

In [14]:
def calculate_similarities_cf_ii():
    return cdist(all_scores.T, all_scores.T, metric='cosine')

In [90]:
def recommend_cf_ii(userId, n, dists=None):
    if dists is None:
        raise NotImplementedError('distances have to be precomputed')

    user_scores = all_scores[userId]
    user_rated = user_scores > 0

    dists = dists[:, user_rated]
    if dists.shape[1] > n:
        order = dists.argpartition(axis=1, kth=n-1)[:, :n]
    else:
        order = np.indices((movie_count, dists.shape[1]))[1]
    weights = dists[np.arange(movie_count)[:, np.newaxis], order]
    
    base_user_preds = baseline_predictors()[userId]
    estimated_ratings = user_scores[user_rated][order] - base_user_preds[:, np.newaxis]
    estimated_ratings = (estimated_ratings * weights).sum(axis=1) / \
                        (np.abs(weights)).sum(axis=1) + base_user_preds
    estimated_ratings[user_rated] = np.nan
    
    movie_order = (-estimated_ratings).argsort()
    
    return movie_order + 1, estimated_ratings[movie_order]

In [16]:
def compare_genres(userID, recoms):
    genres = {}
    for c in map(lambda x: get_info(x,'genres'), ratings.movieId.values[np.where(ratings.userId.values == userID)]):
        for g in c[0].split('|'):
            if g not in genres:
                genres[g] = 0
            genres[g] += 1
            
    recom_genres = {}
    for c in movies.genres.values[recoms[:100]]:
        for g in c.split('|'):
            if g not in recom_genres:
                recom_genres[g] = 0
            recom_genres[g] += 1
            
    return sorted(genres.items(), key=lambda x: x[1], reverse=True), \
           sorted(recom_genres.items(), key=lambda x: x[1], reverse=True)

# testy uu

In [17]:
userId = 5

In [18]:
recoms, scores, sample_size = recommend_cf_uu(userId, 50, 10, subtract_mean=True, div_by_std=True, 
                                              weighted=True, metric='pearson')
scores[:10], sample_size[:10]

0.028088092804
0.063728094101


(array([ 4.57353462,  4.52208989,  4.43930143,  4.40418622,  4.40411773,
         4.40350698,  4.40028808,  4.39415057,  4.3912664 ,  4.38195306]),
 array([11, 50, 11, 22, 50, 50, 50, 12, 50, 50], dtype=int8))

In [19]:
genres, recom_genres = compare_genres(userId, recoms)
print genres
print ''
print recom_genres

[('Comedy', 55), ('Drama', 39), ('Romance', 37), ('Adventure', 20), ('Children', 14), ('Fantasy', 14), ('Action', 12), ('Thriller', 12), ('Musical', 10), ('Animation', 10), ('Crime', 10), ('Sci-Fi', 7), ('Mystery', 6), ('IMAX', 5), ('Horror', 4), ('War', 4), ('Documentary', 3)]

[('Drama', 58), ('Comedy', 40), ('Romance', 25), ('Thriller', 18), ('Action', 15), ('Adventure', 14), ('Crime', 13), ('Sci-Fi', 11), ('Mystery', 8), ('Fantasy', 8), ('Horror', 7), ('Musical', 5), ('War', 5), ('Film-Noir', 3), ('Animation', 3), ('Children', 3), ('Documentary', 1), ('Western', 1)]


# testy ii

In [20]:
dists = calculate_similarities_cf_ii()

In [64]:
userId = 50

In [96]:
t0 = time.time()
recoms, scores = recommend_cf_ii(userId, 50, dists=dists)
print time.time()-t0
scores[:10]

0.224896907806


array([ 4.04451237,  4.04451237,  4.04451237,  4.04451237,  4.04451237,
        4.04451237,  4.04451237,  4.04451237,  4.03757788,  4.0256001 ])

In [79]:
np.indices((3,4))[1]

array([[0, 1, 2, 3],
       [0, 1, 2, 3],
       [0, 1, 2, 3]])