# Surprise homework

In [1]:
import io
from collections import defaultdict
import pandas as pd
from surprise import Dataset
from surprise import SVD
from surprise import NormalPredictor
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split
from operator import itemgetter
import pandas as pd

### Load data

In [2]:
data = Dataset.load_builtin('ml-100k')

### Define algorithms

In [3]:
algos = {
    "SVD": SVD(),
    "Base": NormalPredictor(),
    "KNN_cos": KNNWithMeans(30, sim_options={'name': 'cosine'}, verbose=False),
    "KNN_msd": KNNWithMeans(30, sim_options={'name': 'msd'}, verbose=False),
    "KNN_pirs": KNNWithMeans(30, sim_options={'name': 'pearson_baseline'}, verbose=False)
}

### Select algorithm

In [4]:
RMSE = {}
for [name, algo] in algos.items():
    crv = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
    RMSE[name] = crv['test_rmse'].mean()
selected_algo = min(RMSE.items(), key=lambda unit: unit[1])
print("Лучший алгоритм и значение RMSE для него: ", selected_algo)

trainset, testset = train_test_split(data, test_size=.25)
selected_algo_obj = algos[selected_algo[0]]
selected_algo_obj.fit(trainset)
predictions = selected_algo_obj.test(testset)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9383  0.9366  0.9351  0.9271  0.9418  0.9358  0.0049  
Fit time          5.19    5.66    5.07    5.21    5.15    5.26    0.21    
Test time         0.19    0.16    0.19    0.20    0.27    0.20    0.03    
Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5221  1.5158  1.5316  1.5184  1.5151  1.5206  0.0060  
Fit time          0.15    0.15    0.15    0.15    0.16    0.15    0.00    
Test time         0.16    0.21    0.21    0.15    0.15    0.18    0.03    
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9524  0.9617  0.9607  0.9589  0.9559  0.9579  0.0034  
Fit time          1.25    1.38    1.14    1.04    1.08    1.18    0.12    
Test ti

### Calculate precision@k and recall@k

In [5]:
    def precision_recall_at_k(predictions, k=5, threshold=3.52):
        user_est_true = defaultdict(list)
        for uid, _, true_r, est, _ in predictions:
            user_est_true[uid].append((est, true_r))

        precisions = dict()
        recalls = dict()
        for uid, user_ratings in user_est_true.items():
            user_ratings.sort(key=lambda x: x[0], reverse=True)

            n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

            n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

            n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                                  for (est, true_r) in user_ratings[:k])

            precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

            recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

        return precisions, recalls

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=3.52)

print(sum(prec for prec in precisions.values()) / len(precisions))
print(sum(rec for rec in recalls.values()) / len(recalls))

0.738034641215977
0.36471649451618143


### Predict

In [6]:
variant = '25'
n = 5
trainset = data.build_full_trainset()
selected_algo_obj.fit(trainset)
testset = trainset.build_anti_testset()
predictions = selected_algo_obj.test(testset)
predictions_for_variant = list(filter(lambda unit: unit.uid == variant, predictions))  
best_predictions = sorted(predictions_for_variant, key=lambda unit: unit.est, reverse=True)[:n]

In [7]:
import os
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.item')
films = pd.read_csv(file_path, sep="|", encoding='latin-1')
info = {}
for id in list(map(lambda x: x.iid, best_predictions)):
       row = films.iloc[int(id) - 1] 
       info[id] = (row[1], row[2])
        
print("User 25")
for prediction in best_predictions:
    print("{} {} {}".format(prediction.iid, info[prediction.iid], prediction.est))

User 25
318 ('Everyone Says I Love You (1996)', '06-Dec-1996') 4.718652790494702
64 ("What's Eating Gilbert Grape (1993)", '01-Jan-1993') 4.658420096828414
178 ('Clockwork Orange, A (1971)', '01-Jan-1971') 4.606022918632068
963 ('Month by the Lake, A (1995)', '01-Jan-1995') 4.577345744625782
513 ('Annie Hall (1977)', '01-Jan-1977') 4.56513883270906
