In [1]:
# Use the MovieLens dataset.

from surprise import KNNBasic, SVD
from surprise import Dataset
from surprise import evaluate, print_perf

# Load the movielens-100k dataset (downloading it if needed)
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# Use the KNNBasic algorithm (collaborative filtering).
algo = KNNBasic()

# Evaluate the algorithm on every fold, reporting RMSE and MAE.
# NOTE(review): `evaluate`, `print_perf` and `data.split` appear to be the
# legacy Surprise API (newer releases expose
# `surprise.model_selection.cross_validate` instead) -- confirm that the
# installed Surprise version still provides them.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

# Print the per-fold and mean scores as a table.
print_perf(perf)



Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9860
MAE:  0.7813
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9817
MAE:  0.7751
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9967
MAE:  0.7878
------------
------------
Mean RMSE: 0.9881
Mean MAE : 0.7814
------------
------------
        Fold 1  Fold 2  Fold 3  Mean    
RMSE    0.9860  0.9817  0.9967  0.9881  
MAE     0.7813  0.7751  0.7878  0.7814  


In [2]:
from surprise import GridSearch

# Candidate SVD hyper-parameters: two values for each of the three settings,
# i.e. 2 * 2 * 2 = 8 combinations to evaluate.
# (`lr_all` / `reg_all` are presumably the global learning rate and
# regularisation term -- verify against the Surprise SVD documentation.)
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
# Score every combination with both RMSE and FCP.
# NOTE(review): `GridSearch` appears to be the legacy Surprise API (newer
# releases expose `surprise.model_selection.GridSearchCV`) -- confirm that the
# installed Surprise version still provides it.
grid_search = GridSearch(SVD, param_grid, measures=['RMSE', 'FCP'])
# Reload the dataset and split it into 3 folds again for this search.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# Run the search; results are stored on `grid_search`.
grid_search.evaluate(data)



Running grid search for the following parameter combinations:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}




Resulsts:
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.99716259717048483, 'FCP': 0.68395821875349549}
----------
{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 1.0032636390693888, 'FCP': 0.68668470289303662}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.9735888606178883, 'FCP': 0.69462933940887694}
----------
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.9823400741837881, 'FCP': 0.69543242031508734}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}
{'RMSE': 0.977912390413051, 'FCP': 0.69301935358231692}
----------
{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}
{'RMSE': 0.98608479433042318, 'FCP': 0.69396866273192759}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
{'RMSE': 0.96370500486701161, 'FCP': 0.69880576174198328}
----------
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}
{'RMSE': 0.97355008552714217, 'FCP': 0.69901571453127698}
----------


In [3]:
# For each accuracy measure, print the best score found by the grid search
# followed by the parameter combination that produced that score.
for measure in ('RMSE', 'FCP'):
    print(grid_search.best_score[measure])
    print(grid_search.best_params[measure])

0.963705004867
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
0.699015714531
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}


In [4]:
import pandas as pd

# Tabulate the grid-search results for inspection. In the displayed frame each
# row is one parameter combination, with its parameters and its scores held
# as dicts in the `params` and `scores` columns.
results_df = pd.DataFrame.from_dict(grid_search.cv_results)
results_df

Unnamed: 0,params,scores
0,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}","{'RMSE': 0.99716259717, 'FCP': 0.683958218753}"
1,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}","{'RMSE': 1.00326363907, 'FCP': 0.686684702893}"
2,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}","{'RMSE': 0.973588860618, 'FCP': 0.694629339409}"
3,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}","{'RMSE': 0.982340074184, 'FCP': 0.695432420315}"
4,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}","{'RMSE': 0.977912390413, 'FCP': 0.693019353582}"
5,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.6}","{'RMSE': 0.98608479433, 'FCP': 0.693968662732}"
6,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}","{'RMSE': 0.963705004867, 'FCP': 0.698805761742}"
7,"{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.6}","{'RMSE': 0.973550085527, 'FCP': 0.699015714531}"


In [5]:
from __future__ import (absolute_import, division, print_function, unicode_literals)
import os
import io
from surprise import KNNBaseline
from surprise import Dataset

def read_item_names(file_name='./ml-100k/u.item'):
    """Build raw-id <-> title lookup tables from a MovieLens ``u.item`` file.

    Every line of the file is split on ``|``; the first field is taken as the
    raw item id and the second field as the movie title.

    Args:
        file_name: Path of the ``u.item`` file to read. Defaults to
            ``'./ml-100k/u.item'``, the location this function always used
            before the path became a parameter.

    Returns:
        A tuple ``(rid_to_name, name_to_rid)`` of two dicts: raw item id
        (as a string) -> title, and title -> raw item id. If several lines
        share a title, the id from the last such line is kept in
        ``name_to_rid``.

    Raises:
        IOError / OSError: If ``file_name`` cannot be opened.
    """
    rid_to_name = {}
    name_to_rid = {}

    # The file is decoded as ISO-8859-1 rather than UTF-8.
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.split('|')
            # Blank or malformed lines have no title field; skip them
            # instead of failing with an IndexError.
            if len(fields) < 2:
                continue
            raw_id, title = fields[0], fields[1]
            rid_to_name[raw_id] = title
            name_to_rid[title] = raw_id

    return rid_to_name, name_to_rid

# Train a KNNBaseline model on the complete ml-100k dataset; no folds are
# held out here because the model is only used to look up neighbours.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
# Pearson-baseline similarity; `user_based: False` presumably makes the
# similarities item-item rather than user-user -- verify against the
# Surprise similarity-options documentation.
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
# NOTE(review): `train` appears to be the legacy method name (newer Surprise
# releases use `fit`) -- confirm against the installed version.
algo.train(trainset)



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1132a3b38>

In [6]:
# Load the raw-id <-> title lookup tables.
rid_to_name, name_to_rid = read_item_names()

# Raw item id of the movie whose neighbours will be queried.
# NOTE(review): the variable is named `toy_story_*`, but the title looked up
# here is 'Now and Then (1995)' -- confirm which movie is intended.
toy_story_raw_id = name_to_rid['Now and Then (1995)']
toy_story_raw_id

'1053'

In [7]:
# Convert the raw item id into the trainset's inner item id, which is the id
# passed to `algo.get_neighbors` in the next cell.
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)
toy_story_inner_id

961

In [8]:
# Ask the trained model for the 10 nearest neighbours of the query item;
# the result is a list of inner item ids.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)
toy_story_neighbors

[291, 82, 366, 528, 179, 101, 556, 310, 431, 543]

In [9]:
# Translate the neighbours' inner ids back to raw ids, then to movie titles.
iid_list = [algo.trainset.to_raw_iid(inner_id) for inner_id in toy_story_neighbors]
result = [rid_to_name[rid] for rid in iid_list]

# The `toy_story_*` variables actually hold whichever movie was looked up
# earlier ('Now and Then (1995)' in this notebook), so the header is built
# from the real title and neighbour count instead of a hard-coded
# "Toy Story" label.
query_title = rid_to_name[toy_story_raw_id]
print('The {} nearest neighbors of {} are:'.format(len(result), query_title))
for movie in result:
    print(movie)

The 10 nearest neighbors of Toy Story are:
While You Were Sleeping (1995)
Batman (1989)
Dave (1993)
Mrs. Doubtfire (1993)
Groundhog Day (1993)
Raiders of the Lost Ark (1981)
Maverick (1994)
French Kiss (1995)
Stand by Me (1986)
Net, The (1995)
