In [1]:
#import numpy as np
#import scipy
#import scipy.io
#import scipy.sparse as sp
#import csv

#import matplotlib.pyplot as plt
#import seaborn as sns
#sns.set_style("whitegrid")

import os
import random

from surprise.dataset import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import GridSearch
from surprise import BaselineOnly

from numpy.random import RandomState

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Load and prepare data

### Training and testing data

#### Load the data

In [2]:
# Directory prefixed to the input file names below when the datasets are loaded.
DATA_PATH = '../data/surprise/'
# NOTE(review): not referenced in the cells visible here; presumably the
# directory results are written to. Confirm against the submission section.
OUTPUT_DATA_PATH = '../data/surprise/'

# File names of the three rating files, each located under DATA_PATH.
SAMPLE_DATA_FILE = 'movielens100k.csv'
TRAIN_DATA_FILE = 'data_train.csv'
SUBMISSION_DATA_FILE = 'sample_submission.csv'

In [3]:
# Define paths to dataset files.
# os.path.join inserts a separator only when DATA_PATH does not already end
# with one, so the paths stay valid whether or not DATA_PATH has a trailing
# slash (plain string concatenation would silently glue the names together).
sample_file_path = os.path.expanduser(os.path.join(DATA_PATH, SAMPLE_DATA_FILE))
train_file_path = os.path.expanduser(os.path.join(DATA_PATH, TRAIN_DATA_FILE))
submission_file_path = os.path.expanduser(os.path.join(DATA_PATH, SUBMISSION_DATA_FILE))

# Define a reader: every line holds comma-separated fields in the order
# item, user, rating.
reader = Reader(line_format='item user rating', sep=',')

# Load datasets; all three files are parsed with the same reader.
sample_data = Dataset.load_from_file(sample_file_path, reader=reader)
train_data = Dataset.load_from_file(train_file_path, reader=reader)
submission_data = Dataset.load_from_file(submission_file_path, reader=reader)

In [4]:
# Define folds for cross-validation.
# The `random` module is re-seeded with the same value right before each split,
# so both datasets are split starting from an identical generator state.
for _dataset in (sample_data, train_data):
    random.seed(50)
    _dataset.split(n_folds=5)

# Trainset built from the sample data; created after sample_data has been
# split, exactly as before.
sample_trainset = sample_data.build_full_trainset()

Example:

### Slope One

In [8]:
from surprise import SlopeOne

In [9]:
# NOTE(review): rand_state is created but never passed to SlopeOne() or
# evaluate() in this cell, so it does not appear to influence the run.
# Confirm whether seeding was intended here.
rand_state = RandomState(0)
# Slope One instantiated with no arguments, i.e. the library defaults.
so_alg = SlopeOne()
# Evaluate on train_data reporting RMSE only; the result is kept in so_perf.
so_perf = evaluate(so_alg, train_data, measures=['RMSE'])

Evaluating RMSE of algorithm SlopeOne.

------------
Fold 1
RMSE: 1.0028
------------
Fold 2
RMSE: 1.0027
------------
Fold 3
RMSE: 1.0000
------------
Fold 4
RMSE: 1.0029
------------
Fold 5
RMSE: 0.9990
------------
------------
Mean RMSE: 1.0015
------------
------------


### CoClustering

In [10]:
from surprise import CoClustering

In [11]:
# NOTE(review): rand_state is created but never passed to CoClustering() or
# evaluate() in this cell. Constructing a RandomState object does not by
# itself seed any other generator; confirm whether seeding was intended.
rand_state = RandomState(0)
# Co-clustering instantiated with no arguments, i.e. the library defaults.
cc_alg = CoClustering()
# Evaluate on train_data reporting RMSE only; the result is kept in cc_perf.
cc_perf = evaluate(cc_alg, train_data, measures=['RMSE'])

Evaluating RMSE of algorithm CoClustering.

------------
Fold 1
RMSE: 1.0126
------------
Fold 2
RMSE: 1.0131
------------
Fold 3
RMSE: 1.0094
------------
Fold 4
RMSE: 1.0126
------------
Fold 5
RMSE: 1.0092
------------
------------
Mean RMSE: 1.0114
------------
------------


In [None]:
# Coarse sweep: the same candidate values are tried for n_cltr_u and n_cltr_i,
# with the number of epochs held at 30.
param_grid = dict(
    n_cltr_u=[2, 3, 4, 6, 10, 20, 30, 40],
    n_cltr_i=[2, 3, 4, 6, 10, 20, 30, 40],
    n_epochs=[30],
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(CoClustering, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

In [None]:
# Narrower sweep: n_cltr_i fixed at 3, n_cltr_u varied around 30, 30 epochs.
param_grid = dict(
    n_cltr_u=[25, 30, 35],
    n_cltr_i=[3],
    n_epochs=[30],
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(CoClustering, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

In [None]:
# Larger n_cltr_u values (60-90) with n_cltr_i in {3, 2}, trained for 100 epochs.
param_grid = dict(
    n_cltr_u=[60, 75, 90],
    n_cltr_i=[3, 2],
    n_epochs=[100],
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(CoClustering, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

### kNN with Baseline

In [14]:
from surprise import KNNBaseline

In [None]:
# NOTE(review): rand_state is created but never passed to KNNBaseline() or
# evaluate() in this cell, so it does not appear to influence the run.
# Confirm whether seeding was intended here.
rand_state = RandomState(0)
# KNNBaseline instantiated with no arguments, i.e. the library defaults.
knnb_alg = KNNBaseline()
# Evaluate on train_data reporting RMSE only; the result is kept in knnb_perf.
knnb_perf = evaluate(knnb_alg, train_data, measures=['RMSE'])

In [33]:
# Coarse sweep over k and min_k with pearson_baseline similarity, trying both
# settings of user_based.
param_grid = dict(
    k=[10, 20, 30, 40, 60],
    min_k=[0, 1, 5],
    sim_options=dict(
        name=['pearson_baseline'],
        user_based=[True, False],
    ),
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(KNNBaseline, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

In [None]:
# k held at 60; min_k raised to 10 and 15, both settings of user_based.
param_grid = dict(
    k=[60],
    min_k=[10, 15],
    sim_options=dict(
        name=['pearson_baseline'],
        user_based=[True, False],
    ),
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(KNNBaseline, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

In [None]:
# k held at 60; min_k raised further to 20 and 25, both settings of user_based.
param_grid = dict(
    k=[60],
    min_k=[20, 25],
    sim_options=dict(
        name=['pearson_baseline'],
        user_based=[True, False],
    ),
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(KNNBaseline, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

In [45]:
# user_based=False only: larger k (120, 100) with min_k held at 20.
param_grid = dict(
    k=[120, 100],
    min_k=[20],
    sim_options=dict(
        name=['pearson_baseline'],
        user_based=[False],
    ),
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(KNNBaseline, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

[{'k': 120, 'min_k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}, {'k': 100, 'min_k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}]
------------
Parameters combination 1 of 2
params:  {'k': 120, 'min_k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
------------
Mean RMSE: 0.9935
------------
------------
Parameters combination 2 of 2
par

In [46]:
# user_based=True only: much larger k (300-500) with min_k held at 20.
param_grid = dict(
    k=[300, 400, 500],
    min_k=[20],
    sim_options=dict(
        name=['pearson_baseline'],
        user_based=[True],
    ),
)

# Score every parameter combination by RMSE on train_data.
grid_search = GridSearch(KNNBaseline, param_grid, measures=['RMSE'])
grid_search.evaluate(train_data)

[{'k': 300, 'min_k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}, {'k': 400, 'min_k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}, {'k': 500, 'min_k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}]
------------
Parameters combination 1 of 3
params:  {'k': 300, 'min_k': 20, 'sim_options': {'name': 'pearson_baseline', 'user_based': True}}
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
---

### Create submission