In [1]:
import numpy as np
import time
from testing import cross_valid, analyze_cross_valid_res, train_test_split, query_target_split, evaluate_ratings
from utils import read_10M, read_1M, read_small
from collaborative_filtering import recommend_cf_ii, recommend_cf_uu, calculate_similarities_cf_ii, \
                                    scores_cf_uu, scores_cf_ii
from slope_one import recommend_SO, calculate_similarities_SO, calculate_similarities_SO_bipolar, \
                      scores_SO_basic, scores_SO_weighted, scores_SO_bipolar
from baselines import baseline_predictors

Funkcje `scores_...` szacują oceny filmów o indeksach `target_inds`
dla użytkowników w `query` w oparciu o zbiór uczący `train`.

Funkcje `recommend_...` służą do polecania filmów, przykład użycia na dole tego notebooka.

Funkcje `calculate_similarities_...` obliczają macierze podobieństwa między filmami dla różnych metod.

In [2]:
# Load the data set. read_10M() (from utils) yields three values: the movies
# table (its movieId column is used below), the ratings table, and min_rate,
# which is forwarded unchanged to the cross-validation and scoring calls.
# NOTE(review): presumably the MovieLens 10M dump -- confirm in utils.
movies, ratings, min_rate = read_10M()

In [3]:
# Reverse lookup: movieId value -> index label of that row in `movies`.
# It is used below to pick the column of a movie in the rating matrix.
movie_index = dict(
    (movie_id, row_label)
    for row_label, movie_id in movies.movieId.to_dict().items()
)

In [4]:
# Dense user x movie rating matrix; a 0 entry means "no rating stored".
# Row = userId - 1, column = movie_index[movieId].
all_scores = np.zeros((int(ratings.userId.max()), len(movies)))
# Each ratings row has four fields; the fourth is not needed here. It is bound
# to a named throwaway rather than `_`, because IPython uses `_` for the last
# cell output and stops updating it once user code assigns to it.
for (user, movie, rating, _unused) in ratings.values:
    all_scores[int(user)-1, movie_index[int(movie)]] = rating
# Drop users without any rating (rows whose sum is not positive).
all_scores = all_scores[all_scores.sum(axis=1) > 0]

# cross validation

In [None]:
# to są główne testy, wyniki w raporcie

In [None]:
# Cross-validate the 'uucf' method with the Pearson metric and cf_n=10,
# then report the aggregated mean / std scores and the wall-clock time.
# NOTE(review): cf_n is presumably the neighbourhood size -- confirm in testing.cross_valid.
t0 = time.time()
mean_scores, std_scores = analyze_cross_valid_res(
    cross_valid(all_scores, method='uucf', min_rate=min_rate,
                cf_n=10, uucf_metric='pearson'))
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(mean_scores)
print(std_scores)
print(time.time() - t0)

In [None]:
# Cross-validate the 'uucf' method with the Pearson metric and cf_n=20,
# then report the aggregated mean / std scores and the wall-clock time.
# NOTE(review): cf_n is presumably the neighbourhood size -- confirm in testing.cross_valid.
t0 = time.time()
mean_scores, std_scores = analyze_cross_valid_res(
    cross_valid(all_scores, method='uucf', min_rate=min_rate,
                cf_n=20, uucf_metric='pearson'))
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(mean_scores)
print(std_scores)
print(time.time() - t0)

In [None]:
# Cross-validate the 'uucf' method with the Pearson metric and cf_n=30,
# then report the aggregated mean / std scores and the wall-clock time.
# NOTE(review): cf_n is presumably the neighbourhood size -- confirm in testing.cross_valid.
t0 = time.time()
mean_scores, std_scores = analyze_cross_valid_res(
    cross_valid(all_scores, method='uucf', min_rate=min_rate,
                cf_n=30, uucf_metric='pearson'))
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(mean_scores)
print(std_scores)
print(time.time() - t0)

# pozostałe testy

In [None]:
# podział na zbiór uczący i testowy

In [5]:
# Fixed seed, passed to both split helpers so the split can be repeated.
seed = 1234
# Split the full rating matrix into a training part and a test part.
train, test = train_test_split(all_scores, seed)
# Split the test part into `query` (passed to the scoring functions as input)
# and `target` (the ratings the predictions are evaluated against).
# NOTE(review): meaning of the positional True flag is not visible here -- confirm in testing.
query, target = query_target_split(test, seed, True)

## baseline evaluation

In [None]:
# ewaluacja na zbiorze testowym

In [6]:
# Score the baseline predictor (built from train, applied to the query users)
# against the held-out target ratings; method='mae' is presumably mean absolute error.
evaluate_ratings(baseline_predictors(train, query), target, method='mae')

0.68612817295587047

## testy uu

In [None]:
# rekomendacje

In [8]:
# User the user-user CF recommendations are produced for.
# NOTE(review): all_scores had empty rows removed, so confirm whether 55 is meant
# as a raw userId or as a row position in all_scores.
userId = 55

# NOTE(review): the positional 30 and 10 are not explained in this notebook --
# confirm their meaning in collaborative_filtering.recommend_cf_uu.
t0 = time.time()
recoms, scores, sample_size = recommend_cf_uu(userId, 30, all_scores, 10, metric='pearson')
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)
# Show the first ten scores together with the first ten recommended movie indices.
scores[:10], recoms[:10]

43.2127840519


(array([ 4.68933149,  4.65831542,  4.63180198,  4.62600546,  4.59492645,
         4.59287746,  4.58403634,  4.57123128,  4.56383638,  4.55288093]),
 array([8507, 9468, 5916, 6557,  906, 3650, 9369, 8576, 9656, 3049]))

In [None]:
# ewaluacja na zbiorze testowym

In [8]:
# Estimate ratings for the query users with user-user CF (Pearson metric) based on
# `train`. `target > 0` is a boolean mask selecting the entries to predict, and only
# the first element of the returned value is kept as S.
# NOTE(review): the positional 50 is presumably the neighbourhood size -- confirm.
t0 = time.time()
S = scores_cf_uu(train, query, target > 0, 50, metric='pearson', min_rate=min_rate)[0]
# Last expression: elapsed seconds, shown as the cell output.
time.time() - t0

0.3459289073944092

In [9]:
# Score the user-user CF estimates in S against the held-out target ratings
# (method='mae', presumably mean absolute error).
# NOTE: `S` is rebound by the later slope-one cells; run this right after the uu scoring cell.
evaluate_ratings(S, target, method='mae')

0.66482114440825846

## testy ii

In [None]:
# rekomendacje

In [8]:
# Item-item similarity matrix computed on the full rating matrix, with
# subtract_baselines=True; it feeds the recommendation demo in the next cell.
t0 = time.time()
dists = calculate_similarities_cf_ii(all_scores, subtract_baselines=True)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)

161.288729906


In [9]:
# User the item-item CF recommendations are produced for.
userId = 55

# Uses the precomputed similarity matrix `dists`.
# NOTE: `dists` is rebound further down with train-only similarities; rerun the
# all_scores similarity cell before this one if that evaluation cell ran in between.
# NOTE(review): meaning of the positional 30 is not visible here -- confirm.
t0 = time.time()
recoms, scores = recommend_cf_ii(userId, 30, all_scores, dists=dists)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)
# Show the first ten scores together with the first ten recommended movie indices.
scores[:10], recoms[:10]

8.01727485657


(array([ 5.86124955,  5.6582584 ,  5.51159073,  5.12848955,  5.01159073,
         4.8212078 ,  4.7521022 ,  4.69712561,  4.68013245,  4.67951706]),
 array([ 4361, 10616,  9408, 10626, 10337, 10545,  1390, 10303,  9767,  1205]))

In [None]:
# ewaluacja na zbiorze testowym

In [19]:
# Item-item similarities computed from the training split only, for the evaluation
# below. This rebinds `dists`, replacing the all_scores-based matrix of the demo.
# NOTE(review): called without subtract_baselines=True, unlike the demo cell --
# confirm this difference is intended.
dists = calculate_similarities_cf_ii(train)

In [20]:
# Estimate ratings for the query users with item-item CF, using the train-based
# similarities in `dists`. `target > 0` is a boolean mask selecting the entries to predict.
# NOTE(review): the positional 20 is presumably the neighbourhood size -- confirm.
t0 = time.time()
S_ii = scores_cf_ii(train, query, target > 0, 20, dists, min_rate=min_rate)
# Last expression: elapsed seconds, shown as the cell output.
time.time() - t0

0.16721701622009277

In [21]:
# Score the item-item CF estimates against the held-out target ratings
# (method='mae', presumably mean absolute error).
evaluate_ratings(S_ii, target, method='mae')

0.66497166549689679

## testy SO basic

In [None]:
# rekomendacje

In [5]:
# Slope-one similarity data computed on the full rating matrix; it feeds the
# 'basic' recommendation demo in the next cell.
t0 = time.time()
dists_cards = calculate_similarities_SO(all_scores)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)

259.405522108


In [6]:
# User the slope-one (mode='basic') recommendations are produced for.
userId = 55

# Uses the precomputed `dists_cards`.
# NOTE: `dists_cards` is rebound by later cells (train-only and bipolar variants);
# make sure the all_scores-based value is the one currently bound.
t0 = time.time()
recoms, scores = recommend_SO(userId, all_scores, dists_cards=dists_cards, mode='basic')
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)
# Show the first ten scores together with the first ten recommended movie indices.
scores[:10], recoms[:10]

7.79314899445


(array([ 4.67724868,  4.5625    ,  4.22222222,  4.21385666,  4.1912037 ,
         4.17361111,  4.16187209,  4.15883114,  4.14116622,  4.13657407]),
 array([ 9031,  9083,  9408,  9390, 10675,  8405,   315,  6557,    49,  9008]))

In [None]:
# ewaluacja na zbiorze testowym

In [22]:
# Slope-one similarity data computed from the training split only, for the
# evaluation below. This rebinds `dists_cards`, replacing the all_scores-based value.
dists_cards = calculate_similarities_SO(train)

In [23]:
# Estimate ratings for the query users with basic slope one, using the train-based
# `dists_cards`. `target > 0` is a boolean mask selecting the entries to predict.
# NOTE: this rebinds `S`, which earlier held the user-user CF estimates.
t0 = time.time()
S = scores_SO_basic(train, query, target > 0, dists_cards, min_rate=min_rate)
# Last expression: elapsed seconds, shown as the cell output.
time.time() - t0

0.22868895530700684

In [24]:
# Score the basic slope-one estimates against the held-out target ratings
# (method='mae', presumably mean absolute error). np.nan_to_num replaces any NaN
# estimates with 0 first, so those entries still count towards the error.
evaluate_ratings(np.nan_to_num(S), target, method='mae')

0.67124640091995291

## testy SO weighted

In [None]:
# rekomendacje

In [12]:
# Slope-one similarity data computed on the full rating matrix; it feeds the
# 'weighted' recommendation demo in the next cell. This is the same call as in the
# 'basic' section, repeated because `dists_cards` was rebound to the train-only
# value in between.
t0 = time.time()
dists_cards = calculate_similarities_SO(all_scores)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)

274.431782961


In [13]:
# User the slope-one (mode='weighted') recommendations are produced for.
userId = 55

# Uses the precomputed `dists_cards`.
# NOTE: `dists_cards` is rebound by other cells (train-only and bipolar variants);
# make sure the all_scores-based value is the one currently bound.
t0 = time.time()
recoms, scores = recommend_SO(userId, all_scores, dists_cards=dists_cards, mode='weighted')
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)
# Show the first ten scores together with the first ten recommended movie indices.
scores[:10], recoms[:10]

8.15755295753


(array([ 6.16666667,  5.23529412,  5.19736842,  5.14285714,  5.125     ,
         4.83333333,  4.7826087 ,  4.7375    ,  4.73529412,  4.68181818]),
 array([10628,  9408,  9083,  9911, 10614, 10442,  9031, 10588, 10337,  3141]))

In [None]:
# ewaluacja na zbiorze testowym

In [28]:
# Slope-one similarity data computed from the training split only, for the
# evaluation below. This rebinds `dists_cards`, replacing the all_scores-based value.
dists_cards = calculate_similarities_SO(train)

In [33]:
# Estimate ratings for the query users with weighted slope one, using the train-based
# `dists_cards`. `target > 0` is a boolean mask selecting the entries to predict.
# NOTE: this rebinds `S`, overwriting the estimates of the previous method.
t0 = time.time()
S = scores_SO_weighted(train, query, target > 0, dists_cards, min_rate=min_rate)
# Last expression: elapsed seconds, shown as the cell output.
time.time() - t0

0.21298503875732422

In [34]:
# Score the weighted slope-one estimates against the held-out target ratings
# (method='mae', presumably mean absolute error). np.nan_to_num replaces any NaN
# estimates with 0 first, so those entries still count towards the error.
evaluate_ratings(np.nan_to_num(S), target, method='mae')

0.66004597485397054

## testy SO bi-polar

In [None]:
# rekomendacje

In [14]:
# Bi-polar slope-one similarity data computed on the full rating matrix; it feeds
# the 'bipolar' recommendation demo in the next cell.
# NOTE(review): the evaluation cell further down unpacks the result of
# calculate_similarities_SO_bipolar into two values, so `dists_cards` is presumably
# a pair here -- confirm recommend_SO expects that for mode='bipolar'.
t0 = time.time()
dists_cards = calculate_similarities_SO_bipolar(all_scores)
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)

556.821367025


In [15]:
# User the slope-one (mode='bipolar') recommendations are produced for.
userId = 55

# Uses the `dists_cards` value produced by calculate_similarities_SO_bipolar.
# NOTE: the same name holds calculate_similarities_SO results in earlier sections;
# make sure the bipolar value is the one currently bound.
t0 = time.time()
recoms, scores = recommend_SO(userId, all_scores, dists_cards=dists_cards, mode='bipolar')
# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(time.time() - t0)
# Show the first ten scores together with the first ten recommended movie indices.
scores[:10], recoms[:10]

8.47992110252


(array([ 5.5       ,  5.15      ,  5.125     ,  5.05555556,  5.05      ,
         5.02941176,  4.94642857,  4.91666667,  4.91666667,  4.9       ]),
 array([10614,  9805,  9408,  9031,  3141,  9083,  9929, 10549, 10442,  9911]))

In [None]:
# ewaluacja na zbiorze testowym

In [35]:
# Bi-polar slope-one similarity data computed from the training split only; the
# call returns two values, both of which are passed to scores_SO_bipolar below.
# NOTE(review): the _l / _d suffixes presumably stand for "liked" / "disliked" -- confirm in slope_one.
dists_cards_l, dists_cards_d = calculate_similarities_SO_bipolar(train)

In [36]:
# Estimate ratings for the query users with bi-polar slope one, using both train-based
# similarity structures. `target > 0` is a boolean mask selecting the entries to predict.
# NOTE: this rebinds `S`, overwriting the estimates of the previous method.
t0 = time.time()
S = scores_SO_bipolar(train, query, target > 0, dists_cards_l, dists_cards_d, min_rate=min_rate)
# Last expression: elapsed seconds, shown as the cell output.
time.time() - t0

0.33114194869995117

In [37]:
# Score the bi-polar slope-one estimates against the held-out target ratings
# (method='mae', presumably mean absolute error). np.nan_to_num replaces any NaN
# estimates with 0 first, so those entries still count towards the error.
evaluate_ratings(np.nan_to_num(S), target, method='mae')

0.66642484426670612