# Getting Started

## Basic usage

### Automatic cross-validation

In [1]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


# Fetch the built-in movielens-100k ratings. If the files are not present
# locally, Surprise offers to download them first.
data = Dataset.load_builtin('ml-100k')

# The SVD algorithm, with its default settings.
algo = SVD()

# 5-fold cross-validation reporting RMSE and MAE. verbose=True prints the
# per-fold table; the call is left as the last expression so the notebook
# also displays the returned dict of scores and timings.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/nicky/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9290  0.9270  0.9379  0.9429  0.9399  0.9353  0.0062  
MAE (testset)     0.7340  0.7316  0.7401  0.7412  0.7378  0.7369  0.0036  
Fit time          3.77    3.84    3.81    3.77    3.83    3.80    0.03    
Test time         0.14    0.13    0.11    0.12    0.14    0.13    0.01    


{'test_rmse': array([0.9290046 , 0.92696979, 0.93787357, 0.94289768, 0.93992605]),
 'test_mae': array([0.73402329, 0.73155848, 0.74011006, 0.74119297, 0.73778748]),
 'fit_time': (3.7664849758148193,
  3.836266040802002,
  3.809438943862915,
  3.7721452713012695,
  3.8252220153808594),
 'test_time': (0.14040493965148926,
  0.13493990898132324,
  0.11177706718444824,
  0.12076091766357422,
  0.13738393783569336)}

### Train-test split and the fit() method

In [3]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import train_test_split

# Fetch the built-in movielens-100k ratings (downloaded if needed).
data = Dataset.load_builtin('ml-100k')

# Randomly split the ratings into a trainset and a testset;
# the testset holds 25% of the ratings.
trainset, testset = train_test_split(data, test_size=0.25)

# Create an SVD model with default settings, fit it on the trainset,
# then predict the ratings of the testset.
algo = SVD()
algo.fit(trainset)
predictions = algo.test(testset)

# RMSE of those predictions: printed by the call and, as the last
# expression of the cell, also displayed as its value.
accuracy.rmse(predictions)

RMSE: 0.9313


0.9313360520457532

### Train on a whole trainset and the predict() method

In [5]:
from surprise import KNNBasic
from surprise import Dataset

# The algorithm to train: KNNBasic with its default settings.
algo = KNNBasic()

# Fetch the built-in movielens-100k ratings.
data = Dataset.load_builtin('ml-100k')

# No train/test split here: build one trainset from the full dataset.
trainset = data.build_full_trainset()

# Train the algorithm. fit() is kept as the last expression, so the
# notebook displays the fitted algorithm object as the cell's result.
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x104a5b220>

In [6]:
# Raw ids are the ids as they appear in the ratings file.
# They are **strings**, so they are written as string literals.
uid = '196'  # raw user id
iid = '302'  # raw item id

# Get a prediction for this specific user and item from the algorithm
# trained in the previous cell. verbose=True prints the prediction; the
# r_ui=4 value is echoed next to the estimate in that printed line
# (presumably the known true rating -- confirm against the predict() docs).
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.06   {'actual_k': 40, 'was_impossible': False}


## Use a custom dataset

In [10]:
import os

from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# A custom dataset needs a reader that describes the file layout. In the
# movielens-100k file each line reads 'user item rating timestamp', with
# the fields separated by '\t' characters.
reader = Reader(line_format='user item rating timestamp', sep='\t')

# Location of the ratings file, with '~' expanded to the home directory.
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')

# Load the file through that reader.
data = Dataset.load_from_file(file_path, reader=reader)

# The dataset can now be used like any other, e.g. with cross_validate.
# Only verbose=True is passed; the printed header shows this evaluates
# RMSE and MAE on 5 splits.
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9357  0.9467  0.9344  0.9493  0.9552  0.9443  0.0080  
MAE (testset)     0.7402  0.7497  0.7434  0.7548  0.7549  0.7486  0.0059  
Fit time          0.17    0.18    0.20    0.19    0.19    0.19    0.01    
Test time         0.12    0.07    0.11    0.11    0.12    0.11    0.02    


{'test_rmse': array([0.93569216, 0.9467341 , 0.93438614, 0.94930261, 0.95517661]),
 'test_mae': array([0.74020606, 0.749749  , 0.74339053, 0.75480618, 0.7548673 ]),
 'fit_time': (0.17069387435913086,
  0.1827390193939209,
  0.20243096351623535,
  0.19364500045776367,
  0.19159889221191406),
 'test_time': (0.11597681045532227,
  0.07389020919799805,
  0.11450505256652832,
  0.11461114883422852,
  0.11800599098205566)}

In [11]:
# Bare expression: the notebook displays the repr of the dataset object
# loaded in the previous cell (a DatasetAutoFolds instance, per the output).
data

<surprise.dataset.DatasetAutoFolds at 0x104adbaf0>