In [1]:
import numpy as np
import time
from testing import cross_valid, analyze_cross_valid_res, train_test_split, query_target_split, evaluate_ratings
from utils import read_10M, read_1M, read_small
from collaborative_filtering import recommend_cf_ii, recommend_cf_uu, calculate_similarities_cf_ii, \
                                    scores_cf_uu, scores_cf_ii
from slope_one import recommend_SO, calculate_similarities_SO, calculate_similarities_SO_bipolar, \
                      scores_SO_basic, scores_SO_weighted, scores_SO_bipolar
from baselines import baseline_predictors

Funkcje `scores_...` szacują oceny filmów o indeksach `target_inds`
dla użytkowników w `query` w oparciu o zbiór uczący `train`.

Funkcje `recommend_...` służą do polecania filmów; przykłady użycia znajdują się w dalszej części tego notebooka.

Funkcje `calculate_similarities_...` obliczają macierze podobieństwa między filmami dla różnych metod.

In [2]:
# Load the data set through utils.read_10M. The call yields the movies table,
# the ratings table and `min_rate`; `min_rate` is forwarded unchanged to the
# cross-validation and scoring calls further down (presumably the lowest
# possible rating -- confirm in utils).
movies, ratings, min_rate = read_10M()

In [3]:
# Lookup table: movieId -> index label of that movie in `movies`.
# The label is used below as a column position of the rating matrix, which is
# only correct when `movies` carries a default 0..n-1 index.
# NOTE(review): confirm that read_10M returns such an index.
movie_index = dict(zip(movies.movieId, movies.index))

In [4]:
# Dense user x movie rating matrix. Entries start at 0 and are overwritten only
# where a rating exists. Rows are addressed by userId - 1, columns by the value
# stored in movie_index.
all_scores = np.zeros((max(ratings.userId), len(movies)))
for user_id, movie_id, score, _unused in ratings.values:
    all_scores[int(user_id) - 1, movie_index[int(movie_id)]] = score
# Drop users whose row sums to zero (no ratings recorded). After this step
# row i is no longer guaranteed to belong to userId i + 1.
has_ratings = all_scores.sum(axis=1) > 0
all_scores = all_scores[has_ratings]

# cross validation

In [None]:
# to są główne testy, wyniki w raporcie

In [13]:
# Cross-validation of user-user CF (method='uucf') with cf_n=30 and the cosine
# metric. cf_n is presumably the neighbourhood size -- confirm in
# testing.cross_valid. Slow: the recorded run of this cell took about 13,100 s.
t0 = time.time()
cv_results = cross_valid(all_scores, method='uucf', min_rate=min_rate,
                         cf_n=30, uucf_metric='cosine')
mean_scores, std_scores = analyze_cross_valid_res(cv_results)
# print(x) with a single argument prints the same text under Python 2 and
# Python 3; the statement form `print x` is a SyntaxError on Python 3.
print(mean_scores)
print(std_scores)
print(time.time() - t0)  # elapsed wall-clock seconds

Started fold 1 out of 10
Started fold 2 out of 10
Started fold 3 out of 10
Started fold 4 out of 10
Started fold 5 out of 10
Started fold 6 out of 10
Started fold 7 out of 10
Started fold 8 out of 10
Started fold 9 out of 10
Started fold 10 out of 10
[ 0.63429062  0.83307861]
[ 0.00423445  0.00520165]
13098.8459928


In [14]:
# Same cross-validation as the cf_n=30 run, with cf_n=20.
# The recorded run of this cell was interrupted (KeyboardInterrupt during
# fold 2), so the notebook holds no results for this setting.
t0 = time.time()
cv_results = cross_valid(all_scores, method='uucf', min_rate=min_rate,
                         cf_n=20, uucf_metric='cosine')
mean_scores, std_scores = analyze_cross_valid_res(cv_results)
# print(x) with a single argument prints the same text under Python 2 and
# Python 3; the statement form `print x` is a SyntaxError on Python 3.
print(mean_scores)
print(std_scores)
print(time.time() - t0)  # elapsed wall-clock seconds

Started fold 1 out of 10
Started fold 2 out of 10


KeyboardInterrupt: 

In [None]:
# Same cross-validation as the cf_n=30 run, with cf_n=10.
# This cell has no execution count in the saved notebook, i.e. it was not run.
t0 = time.time()
cv_results = cross_valid(all_scores, method='uucf', min_rate=min_rate,
                         cf_n=10, uucf_metric='cosine')
mean_scores, std_scores = analyze_cross_valid_res(cv_results)
# print(x) with a single argument prints the same text under Python 2 and
# Python 3; the statement form `print x` is a SyntaxError on Python 3.
print(mean_scores)
print(std_scores)
print(time.time() - t0)  # elapsed wall-clock seconds

# pozostałe testy

In [None]:
# podział na zbiór uczący i testowy

In [5]:
# One fixed seed is passed to both split helpers so the splits can be repeated
# (assuming the helpers are deterministic for a given seed -- confirm in testing).
seed = 1234
# Split the rating matrix into a training part and a test part.
train, test = train_test_split(all_scores, seed)
# Split the test part into `query` (handed to the predictors as input) and
# `target` (the ratings the predictions are scored against below).
# NOTE(review): the meaning of the positional `True` flag is not visible here --
# check query_target_split.
query, target = query_target_split(test, seed, True)

## baseline evaluation

In [None]:
# ewaluacja na zbiorze testowym

In [6]:
# Reference point for the other methods: score the baseline predictors on the
# held-out target ratings with method='mae'.
baseline_pred = baseline_predictors(train, query)
evaluate_ratings(baseline_pred, target, method='mae')

0.68612817295587047

## testy uu

In [None]:
# rekomendacje

In [7]:
# Recommendation demo for user-user CF on the full rating matrix.
# NOTE(review): the positional arguments 50 and 10 are passed as-is; their
# meaning is defined by recommend_cf_uu and is not visible here. Also confirm
# whether `userId` is a raw id or a row of all_scores (rows without ratings
# were dropped when all_scores was built).
userId = 55

t0 = time.time()
recoms, scores, sample_size = recommend_cf_uu(userId, 50, all_scores, 10, metric='cosine')
# print(x) with a single argument behaves identically on Python 2 and 3,
# while the statement form `print x` does not parse on Python 3.
print(time.time() - t0)  # elapsed wall-clock seconds
# Show the first ten scores together with the matching recommended items.
scores[:10], recoms[:10]

0.35307598114


(array([ 4.72342042,  4.59176516,  4.58948437,  4.54762371,  4.54109994,
         4.53453507,  4.51377053,  4.5108084 ,  4.47293052,  4.47084276]),
 array([ 284, 7364,  953, 3054, 8439, 6901, 1958, 2187, 3871, 1892]))

In [None]:
# ewaluacja na zbiorze testowym

In [8]:
# Timed user-user CF prediction for the test users (pearson metric).
# Only the first element of the value returned by scores_cf_uu is kept.
t0 = time.time()
# Mask of the target entries that hold a value above zero.
rated_mask = target > 0
S = scores_cf_uu(train, query, rated_mask, 50, metric='pearson', min_rate=min_rate)[0]
# Elapsed wall-clock seconds, displayed as the cell result.
time.time() - t0

0.3459289073944092

In [9]:
# Score the user-user CF predictions `S` against the held-out `target` ratings;
# method='mae' selects the metric (presumably mean absolute error).
evaluate_ratings(S, target, method='mae')

0.66482114440825846

## testy ii

In [None]:
# rekomendacje

In [10]:
# Item-item similarities computed from the full rating matrix; consumed by the
# recommend_cf_ii demo in the next cell.
dists = calculate_similarities_cf_ii(all_scores)

In [11]:
# Recommendation demo for item-item CF, reusing the precomputed `dists`.
# NOTE(review): the meaning of the positional argument 50 is defined by
# recommend_cf_ii and is not visible here.
userId = 55

t0 = time.time()
recoms, scores = recommend_cf_ii(userId, 50, all_scores, dists=dists)
# print(x) with a single argument behaves identically on Python 2 and 3,
# while the statement form `print x` does not parse on Python 3.
print(time.time() - t0)  # elapsed wall-clock seconds
# Show the first ten scores together with the matching recommended items.
scores[:10], recoms[:10]

0.202964067459


(array([ 6.64586466,  6.64586466,  6.64586466,  6.64586466,  6.64586466,
         6.64586466,  6.64586466,  6.64586466,  6.26360142,  5.54824561]),
 array([3611, 3261, 1174, 1244, 3751, 4131, 3830, 1413, 8485, 8529]))

In [None]:
# ewaluacja na zbiorze testowym

In [19]:
# Item-item similarities recomputed from the training split only, for the
# evaluation below. This rebinds `dists`, replacing the matrix that was built
# from all_scores for the recommendation demo.
dists = calculate_similarities_cf_ii(train)

In [20]:
# Timed item-item CF prediction for the test users, using the similarities
# computed from the training split.
t0 = time.time()
# Mask of the target entries that hold a value above zero.
rated_mask = target > 0
S_ii = scores_cf_ii(train, query, rated_mask, 20, dists, min_rate=min_rate)
# Elapsed wall-clock seconds, displayed as the cell result.
time.time() - t0

0.16721701622009277

In [21]:
# Score the item-item CF predictions `S_ii` against the held-out `target`
# ratings; method='mae' selects the metric (presumably mean absolute error).
evaluate_ratings(S_ii, target, method='mae')

0.66497166549689679

## testy SO basic

In [None]:
# rekomendacje

In [15]:
# Slope One similarity data computed from the full rating matrix; passed to
# recommend_SO (mode='basic') in the next cell.
dists_cards = calculate_similarities_SO(all_scores)

In [17]:
# Recommendation demo for Slope One in 'basic' mode on the full rating matrix,
# reusing the precomputed `dists_cards`.
userId = 55

t0 = time.time()
recoms, scores = recommend_SO(userId, all_scores, dists_cards=dists_cards, mode='basic')
# print(x) with a single argument behaves identically on Python 2 and 3,
# while the statement form `print x` does not parse on Python 3.
print(time.time() - t0)  # elapsed wall-clock seconds
# Show the first ten scores together with the matching recommended items.
scores[:10], recoms[:10]

0.275269031525


(array([ 8.        ,  7.5       ,  7.5       ,  7.        ,  7.        ,
         7.        ,  7.        ,  6.75      ,  6.75      ,  6.64586466]),
 array([9119, 8504, 9057, 5372,  356, 8185, 2323, 7903, 3728, 1174]))

In [None]:
# ewaluacja na zbiorze testowym

In [22]:
# Slope One similarity data recomputed from the training split only, for the
# evaluation below. This rebinds `dists_cards`, replacing the value built from
# all_scores for the recommendation demo.
dists_cards = calculate_similarities_SO(train)

In [23]:
# Timed basic Slope One prediction for the test users, using the similarity
# data computed from the training split.
t0 = time.time()
# Mask of the target entries that hold a value above zero.
rated_mask = target > 0
S = scores_SO_basic(train, query, rated_mask, dists_cards, min_rate=min_rate)
# Elapsed wall-clock seconds, displayed as the cell result.
time.time() - t0

0.22868895530700684

In [24]:
# np.nan_to_num replaces NaN entries of the basic Slope One predictions with 0
# before they are scored with method='mae'.
# NOTE(review): check how evaluate_ratings treats those zero-filled entries.
S_filled = np.nan_to_num(S)
evaluate_ratings(S_filled, target, method='mae')

0.67124640091995291

## testy SO weighted

In [None]:
# rekomendacje

In [25]:
# Slope One similarity data computed from the full rating matrix; passed to
# recommend_SO (mode='weighted') in the next cell. Same call as in the basic
# Slope One demo; needed again because `dists_cards` was rebound to the
# train-only value in between.
dists_cards = calculate_similarities_SO(all_scores)

In [27]:
# Recommendation demo for Slope One in 'weighted' mode on the full rating
# matrix, reusing the precomputed `dists_cards`.
userId = 55

t0 = time.time()
recoms, scores = recommend_SO(userId, all_scores, dists_cards=dists_cards, mode='weighted')
# print(x) with a single argument behaves identically on Python 2 and 3,
# while the statement form `print x` does not parse on Python 3.
print(time.time() - t0)  # elapsed wall-clock seconds
# Show the first ten scores together with the matching recommended items.
scores[:10], recoms[:10]

0.328321933746


(array([ 8.        ,  7.5       ,  7.5       ,  7.        ,  7.        ,
         7.        ,  7.        ,  6.75      ,  6.75      ,  6.64586466]),
 array([9119, 8504, 9057, 5372, 8185, 2323,  356, 3728, 7903, 1413]))

In [None]:
# ewaluacja na zbiorze testowym

In [28]:
# Slope One similarity data recomputed from the training split only, for the
# weighted Slope One evaluation below (rebinds `dists_cards`).
dists_cards = calculate_similarities_SO(train)

In [33]:
# Timed weighted Slope One prediction for the test users, using the similarity
# data computed from the training split.
t0 = time.time()
# Mask of the target entries that hold a value above zero.
rated_mask = target > 0
S = scores_SO_weighted(train, query, rated_mask, dists_cards, min_rate=min_rate)
# Elapsed wall-clock seconds, displayed as the cell result.
time.time() - t0

0.21298503875732422

In [34]:
# np.nan_to_num replaces NaN entries of the weighted Slope One predictions
# with 0 before they are scored with method='mae'.
# NOTE(review): check how evaluate_ratings treats those zero-filled entries.
S_filled = np.nan_to_num(S)
evaluate_ratings(S_filled, target, method='mae')

0.66004597485397054

## testy SO bi-polar

In [None]:
# rekomendacje

In [38]:
# Bi-polar Slope One similarity data computed from the full rating matrix.
# The evaluation cell further down unpacks the result of this helper into two
# values; here the result is kept as one object and handed whole to
# recommend_SO(mode='bipolar').
# NOTE(review): confirm recommend_SO expects the pair in that form.
dists_cards = calculate_similarities_SO_bipolar(all_scores)

In [39]:
# Recommendation demo for Slope One in 'bipolar' mode on the full rating
# matrix, reusing the precomputed `dists_cards`.
userId = 55

t0 = time.time()
recoms, scores = recommend_SO(userId, all_scores, dists_cards=dists_cards, mode='bipolar')
# print(x) with a single argument behaves identically on Python 2 and 3,
# while the statement form `print x` does not parse on Python 3.
print(time.time() - t0)  # elapsed wall-clock seconds
# Show the first ten scores together with the matching recommended items.
scores[:10], recoms[:10]

0.565158128738


(array([ 6.64586466,  6.64586466,  6.64586466,  6.64586466,  6.64586466,
         6.64586466,  6.64586466,  6.64586466,  6.5       ,  6.5       ]),
 array([1244, 1413, 3751, 3830, 1174, 3261, 4131, 3611, 6558, 7903]))

In [None]:
# ewaluacja na zbiorze testowym

In [35]:
# Bi-polar Slope One similarity data computed from the training split only.
# The helper returns two objects, both forwarded to scores_SO_bipolar below
# (the _l / _d suffixes presumably stand for "like" / "dislike" -- confirm in
# slope_one).
dists_cards_l, dists_cards_d = calculate_similarities_SO_bipolar(train)

In [36]:
# Timed bi-polar Slope One prediction for the test users, using both
# similarity objects computed from the training split.
t0 = time.time()
# Mask of the target entries that hold a value above zero.
rated_mask = target > 0
S = scores_SO_bipolar(train, query, rated_mask, dists_cards_l, dists_cards_d,
                      min_rate=min_rate)
# Elapsed wall-clock seconds, displayed as the cell result.
time.time() - t0

0.33114194869995117

In [37]:
# np.nan_to_num replaces NaN entries of the bi-polar Slope One predictions
# with 0 before they are scored with method='mae'.
# NOTE(review): check how evaluate_ratings treats those zero-filled entries.
S_filled = np.nan_to_num(S)
evaluate_ratings(S_filled, target, method='mae')

0.66642484426670612