In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
import time

In [2]:
def read_small():
    data_path = 'ml-latest-small/'
    movies = pd.read_csv(data_path + 'movies.csv')
    ratings = pd.read_csv(data_path + 'ratings.csv')
    min_rate = .5
    return movies, ratings, .5

def read_10M():
    data_path = 'ml-10M100K/'
    movies = pd.read_csv(data_path + 'movies.dat', delimiter='::', engine='python')
    ratings = pd.read_csv(data_path + 'ratings.dat', delimiter='::', engine='python')
    min_rate = .5
    return movies, ratings, .5

def read_1M():
    data_path = 'ml-1m/'
    movies = pd.read_csv(data_path + 'movies.dat', delimiter='::', engine='python')
    ratings = pd.read_csv(data_path + 'ratings.dat', delimiter='::', engine='python') 
    min_rate = 1.
    return movies, ratings, 1.

In [179]:
movies, ratings, min_rate = read_1M()

In [180]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [181]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [182]:
movie_count = len(movies)
user_count = max(ratings.userId)

mean_score = ratings.rating.values.mean()

In [183]:
all_scores = np.zeros((user_count, max(movies.movieId)))
for (user, movie, rating, _) in ratings.values:
    all_scores[int(user)-1,int(movie)-1] = rating
all_scores = all_scores[:,movies.movieId.values.astype(int)-1]

In [184]:
def train_test_split(scores, seed=1234, set_seed=True):
    N = scores.shape[0]
    test_size = N / 10
    if set_seed:
        np.random.seed(seed)
    test_idx = np.random.choice(N, test_size, replace=False)
    test_set = scores[test_idx]
    train_set = np.delete(scores, test_idx, axis=0)
    return train_set, test_set

def query_target_split(test_set, seed=1234, set_seed=True):
    query_set = test_set.copy()
    target_set = test_set.copy()
    if set_seed:
        np.random.seed(seed)
    for u in xrange(test_set.shape[0]):
        rated = np.where(test_set[u] > 0)[0]
        target = np.random.choice(rated, rated.size / 2, replace=False)
        query_set[u,target] = 0
    target_set[np.where(query_set > 0)] = 0
    return query_set, target_set

def evaluate_ratings(scores, target, method='nmae', min_rate=.5):
    mask = target > 0
    abs_dists = np.abs(scores * mask - target)
    if method == 'mae':
        return abs_dists.sum() / mask.sum()
    elif method == 'nmae':
        return abs_dists.sum() / mask.sum() / (5. - min_rate)
    elif method == 'rmse':
        return np.sqrt((abs_dists**2).sum() / mask.sum())
    else:
        raise Exception('Invalid argument value: method')
        
def compare_genres(userID, recoms):
    genres = {}
    for c in map(lambda x: get_info(x,'genres'), ratings.movieId.values[np.where(ratings.userId.values == userID)]):
        for g in c[0].split('|'):
            if g not in genres:
                genres[g] = 0
            genres[g] += 1
            
    recom_genres = {}
    for c in movies.genres.values[recoms[:100]]:
        for g in c.split('|'):
            if g not in recom_genres:
                recom_genres[g] = 0
            recom_genres[g] += 1
            
    return sorted(genres.items(), key=lambda x: x[1], reverse=True), \
           sorted(recom_genres.items(), key=lambda x: x[1], reverse=True)

In [185]:
def get_info(movieId, info='title'):
    return movies[info].values[np.where(movies.movieId.values == movieId)]

def get_user_ratings(userId):
    return ratings.values[np.where(ratings.userId == userId)][:,1:]

In [186]:
def user_baselines(train, query=None, mean_score=None):
    if query is None:
        query = train
    if mean_score is None:
        mean_score = train.sum() / np.count_nonzero(train)
    baselines = query.sum(axis=1) / (query > 0).sum(axis=1) - mean_score
    return np.nan_to_num(baselines)

def item_baselines(train, usr_baselines=None, mean_score=None):
    if mean_score is None:
        mean_score = train.sum() / np.count_nonzero(train)
    if usr_baselines is None:
        usr_baselines = user_baselines(train, mean_score=mean_score)
    nonzeros = (train > 0).sum(axis=0)
    usr_baselines = usr_baselines[:, np.newaxis] * (train > 0)
    preds = (train.sum(axis=0) - usr_baselines.sum(axis=0)) / nonzeros - mean_score
    return np.nan_to_num(preds)

# ta funkcja zwraca szacowania nawet istniejących ocen

def baseline_predictors(train, query=None, mean_score=None):
    if query is None:
        query = train
    if mean_score is None:
        mean_score = train.sum() / np.count_nonzero(train)
    usr_baselines = user_baselines(train, query, mean_score)
    itm_baselines = item_baselines(train, mean_score=mean_score)
    return mean_score + usr_baselines[:, np.newaxis] + itm_baselines

In [187]:
def per_user_means(scores):    
    return scores.sum(axis=1) / (scores > 0).sum(axis=1)

def per_user_stds(scores):
    means = per_user_means(scores)
    nonzeros = (scores > 0).sum(axis=1)
    sums = scores.sum(axis=1)
    sq_sums = (scores**2).sum(axis=1)
    return np.sqrt((sq_sums + nonzeros*means**2 - 2*means*sums) / nonzeros)

# user-user CF

In [188]:
def scores_cf_uu(train, query, target_inds, n, subtract_mean=True, div_by_std=True, metric='pearson'):
    assert not (div_by_std and not subtract_mean)
    
    ratings = []
    num_ratings = []
    
    train_rated = train > 0
    train_means = per_user_means(train)
    train_stds = per_user_stds(train)
    
    train_centered = (train - train_means[:, np.newaxis]) * train_rated
    
    query_means = per_user_means(query)
    query_stds = per_user_stds(query)
    
    baselines = baseline_predictors(train, query)
    
    for u_idx in xrange(query.shape[0]):

        user_ratings = query[u_idx]
        user_rated = user_ratings > 0
        user_mean = query_means[u_idx]
        user_std = query_stds[u_idx]
        user_baseline = baselines[u_idx]
        
        targets = np.where(target_inds[u_idx])[0]
        interesting_users_inds = np.unique(np.where(train_rated[:, targets])[0])
        train_cut = train_centered[interesting_users_inds]
        train_cut_rated = train_rated[interesting_users_inds]
        
        targets_ratings = train[interesting_users_inds][:, targets]
        
        user_centered = (user_ratings - user_mean) * user_rated

        if metric == 'cosine':
            dists = cdist(user_centered[np.newaxis], train_cut, metric='cosine')
        elif metric == 'pearson':
            numerator = user_centered.dot(train_cut.T)
            denom1 = (user_centered**2).dot(train_cut_rated.T)
            denom2 = user_rated.dot((train_cut**2).T)
            dists = (numerator / np.sqrt(denom1 * denom2))[np.newaxis]

        dists_zero_idx = np.where((dists == 0).T * train_cut_rated[:, targets])

        per_item_dists = dists.T * train_cut_rated[:, targets]
        per_item_dists[per_item_dists == 0] = np.nan
        per_item_dists[dists_zero_idx] = 0
        
        if per_item_dists.shape[0] > n:
            order = per_item_dists.argpartition(axis=0, kth=n-1)[:n]
        else:
            order = np.indices((per_item_dists.shape[0], targets.size))[0]
        
        weights = per_item_dists[order, np.arange(targets.size)]

        ratings_gathered = (~np.isnan(weights)).sum(axis=0)

        weights = np.nan_to_num(weights)
        
        scores = targets_ratings[order, np.arange(targets.size)]
        if subtract_mean:
            multiplier = train_stds[interesting_users_inds][order] if div_by_std else 1
            scores = (scores - train_means[interesting_users_inds][order]) / multiplier

        scores = (scores * weights).sum(axis=0) / (np.abs(weights)).sum(axis=0)
        
        estimated_ratings = np.zeros(query.shape[1])
        estimated_ratings[targets] = scores
        
        if subtract_mean:
            multiplier = user_std if div_by_std else 1.        
            estimated_ratings = user_mean + multiplier * estimated_ratings
            
        estimated_ratings[user_rated] = user_ratings[user_rated]

        # dla ocen nan dajemy baseline predictor

        estimated_ratings[np.isnan(estimated_ratings)] = user_baseline[np.isnan(estimated_ratings)]
        
        ratings.append(estimated_ratings[np.newaxis])
        
        user_num_ratings = np.zeros(query.shape[1])
        user_num_ratings[targets] = ratings_gathered
        num_ratings.append(user_num_ratings[np.newaxis])
        
    ratings = np.vstack(ratings)
    num_ratings = np.vstack(num_ratings)
        
    return ratings, num_ratings

In [189]:
def recommend_cf_uu(userId, n, scores, min_n=0, subtract_mean=True, div_by_std=True, metric='pearson'):
    
    estimated_ratings, ratings_gathered = scores_cf_uu(scores, scores[userId-1][np.newaxis], 
                                                       [np.ones(scores.shape[1])], n, subtract_mean, 
                                                       div_by_std, metric)
    estimated_ratings, ratings_gathered = estimated_ratings[0], ratings_gathered[0]
    estimated_ratings[scores[userId-1] > 0] = np.nan
    estimated_ratings[ratings_gathered < min_n] = np.nan
    
    movie_order = (-estimated_ratings).argsort()
    
    return movie_order + 1, estimated_ratings[movie_order], ratings_gathered[movie_order]

# item-item CF

In [190]:
def calculate_similarities_cf_ii(scores, nan_to_1=True):
    dists = cdist(scores.T, scores.T, metric='cosine')
    if nan_to_1:
        dists[np.isnan(dists)] = 1
    return dists

In [191]:
def scores_cf_ii(train, query, target_inds, n, dists=None):
    if dists is None:
        raise NotImplementedError('distances have to be precomputed')
    
    ratings = []
    
    baselines = baseline_predictors(train, query)
    
    for u_idx in xrange(query.shape[0]):

        user_ratings = query[u_idx]
        user_rated = user_ratings > 0
        user_baseline = baselines[u_idx]
        
        targets = np.where(target_inds[u_idx])[0]        

        user_dists = dists[targets][:, user_rated]
        if user_dists.shape[1] > n:
            order = user_dists.argpartition(axis=1, kth=n-1)[:, :n]
        else:
            order = np.indices((targets.size, user_dists.shape[1]))[1]
        weights = user_dists[np.arange(targets.size)[:, np.newaxis], order]

        scores = user_ratings[user_rated][order] - user_baseline[targets, np.newaxis]
        scores = (scores * weights).sum(axis=1) / (np.abs(weights)).sum(axis=1) + user_baseline[targets]
            
        estimated_ratings = np.zeros(query.shape[1])
        estimated_ratings[targets] = scores
        estimated_ratings[user_rated] = user_ratings[user_rated]
        
        ratings.append(estimated_ratings[np.newaxis])

    return np.vstack(ratings)

In [192]:
def recommend_cf_ii(userId, n, scores, dists=None):
    if dists is None:
        raise NotImplementedError('distances have to be precomputed')

    estimated_ratings = scores_cf_ii(scores, scores[userId-1][np.newaxis], [np.ones(scores.shape[1])], 
                                     n, dists=dists)[0]
    estimated_ratings[scores[userId-1] > 0] = np.nan
    
    movie_order = (-estimated_ratings).argsort()
    
    return movie_order + 1, estimated_ratings[movie_order]

# Slope One

In [227]:
def calculate_similarities_SO(scores, nan_to_0=True):    
    rated = scores > 0    
    dists = scores.T.dot(rated)
    dists = dists - dists.T
    dists /= rated.T.dot(rated.astype(np.float))
    return np.nan_to_num(dists)

In [222]:
def scores_SO_basic(train, query, target_inds, dists=None):
    if dists is None:
        raise NotImplementedError('distances have to be precomputed')
        
    ratings = []    
        
    for u_idx in xrange(query.shape[0]):
        user_ratings = query[u_idx]
        user_rated = user_ratings > 0
        
        targets = np.where(target_inds[u_idx])[0]        

        user_mean = user_ratings.sum() / user_rated.sum()
        scores = dists[targets][:, user_rated].mean(axis=1) + user_mean
        
        estimated_ratings = np.zeros(query.shape[1])
        estimated_ratings[targets] = scores
        estimated_ratings[user_rated] = user_ratings[user_rated]
        
        ratings.append(estimated_ratings[np.newaxis])
        
    return np.vstack(ratings)

In [232]:
def recommend_SO_basic(userId, scores, dists=None):
    if dists is None:
        raise NotImplementedError('distances have to be precomputed')

    estimated_ratings = scores_SO_basic(scores, scores[userId-1][np.newaxis], [np.ones(scores.shape[1])], dists)[0]
    estimated_ratings[scores[userId-1] > 0] = np.nan
    
    movie_order = (-estimated_ratings).argsort()
    
    return movie_order + 1, estimated_ratings[movie_order]

# testy

In [244]:
seed = np.random.randint(1, 1000000)
seed = 12340
train, test = train_test_split(all_scores, seed)
query, target = query_target_split(test, seed, True)

# baseline evaluation

In [245]:
evaluate_ratings(baseline_predictors(train, query), target, method='mae')

0.72610953612238482

# testy uu

In [246]:
userId = 55

t0 = time.time()
recoms, scores, sample_size = recommend_cf_uu(userId, 50, all_scores, 10, metric='cosine')
print time.time()-t0
scores[:10], recoms[:10]

1.6916987896


(array([ 4.93597315,  4.82404485,  4.81861098,  4.76318241,  4.76081769,
         4.75777036,  4.75484343,  4.75229947,  4.73198646,  4.73163966]),
 array([2837, 3270, 1221,  942, 2962, 1181,  258, 1133, 2129, 1243]))

In [247]:
t0 = time.time()
S = scores_cf_uu(train, query, target > 0, 50, metric='cosine')[0]
time.time() - t0

62.324748039245605

In [248]:
evaluate_ratings(S, target, method='mae')

0.68562928835160786

# testy ii

In [249]:
dists = calculate_similarities_cf_ii(all_scores)

In [250]:
userId = 55

t0 = time.time()
recoms, scores = recommend_cf_ii(userId, 50, all_scores, dists=dists)
print time.time()-t0
scores[:10], recoms[:10]

0.279159069061


(array([ 4.14650447,  4.14532208,  4.14211497,  4.14203507,  4.14176689,
         4.14122667,  4.14080789,  4.14073863,  4.14069204,  4.14055616]),
 array([2951,  161, 1132, 3204, 3673, 1080,  532, 1210, 1175, 1173]))

In [251]:
dists = calculate_similarities_cf_ii(train)

In [252]:
t0 = time.time()
S_ii = scores_cf_ii(train, query, target > 0, 40, dists)
time.time() - t0

0.694904088973999

In [253]:
evaluate_ratings(np.nan_to_num(S_ii), target, method='mae')

0.75744151162488427

# testy SO basic

In [254]:
dists = calculate_similarities_cf_ii(all_scores)

In [255]:
userId = 55

t0 = time.time()
recoms, scores = recommend_SO_basic(userId, all_scores, dists=dists)
print time.time()-t0
scores[:10], recoms[:10]

0.0242028236389


(array([ 5.12,  5.12,  5.12,  5.12,  5.12,  5.12,  5.12,  5.12,  5.12,  5.12]),
 array([2527, 1141, 1140, 1131, 1128, 1126, 2890, 1125, 1122, 1107]))

In [256]:
dists = calculate_similarities_SO(train)

In [257]:
t0 = time.time()
S = scores_SO_basic(train, query, target > 0, dists)
time.time() - t0

0.3181030750274658

In [258]:
evaluate_ratings(np.nan_to_num(S), target, method='mae')

0.70723413889974329