In [1]:
import pandas as pd
from src.constant import *
from src.utils import *

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise import KNNBasic
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from surprise.model_selection import LeaveOneOut
from src.data_loader import load_raw_data, RawDataPaths
from src.data_split import data_split, left_out_split
from src.models import KNNRecommender
from src.evaluation import evaluate_predictions, get_hit_rate
from collections import defaultdict


# Data Source

`movielens/1m-ratings`: This dataset contains 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 MovieLens users who joined MovieLens. It is the largest dataset that contains demographic data.
Each user has rated at least 20 movies. Ratings are in whole-star increments. In demographic data, age values are divided into ranges and the lowest age value for each range is used in the data instead of the actual values.

`movielens/1m-movies`: This dataset contains data of approximately 3,900 movies rated in the 1m dataset.

# Load data
Download the MovieLens 1M dataset from [groupLens](https://grouplens.org/datasets/movielens/1m/)

## Movies

In [2]:
movie_df = pd.read_csv("data/raw/ml-1m/movies.dat", delimiter='::', header=None, engine='python', encoding='latin-1')

In [3]:
movie_df.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# movie
movie_df = pd.read_csv("data/raw/ml-1m/movies.dat",  delimiter='::', header=None, engine='python', encoding='latin-1')
# remove year from title and add year column
movie_df[3] = movie_df[1].str.extract(r"\((\d{4})\)$")
movie_df[1] = movie_df[1].str.replace(r"\(\d{4}\)$", "", regex=True)
movie_df[2] = movie_df[2].str.split("|")
movie_df.columns = MOVIE_COLUMN_NAMES
print(movie_df.shape)
movie_df.head()

(3883, 4)


Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,"[Animation, Children's, Comedy]",1995
1,2,Jumanji,"[Adventure, Children's, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama]",1995
4,5,Father of the Bride Part II,[Comedy],1995


## Rating
UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

In [5]:
rating_df = pd.read_csv("data/raw/ml-1m/ratings.dat", delimiter='::', header=None, engine='python', encoding='latin-1')
rating_df.columns = RATING_COLUMN_NAMES
print(rating_df.shape)
rating_df.head()

(1000209, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## User
All demographic information is provided voluntarily by the users and is
not checked for accuracy.  Only users who have provided some demographic
information are included in this data set.
- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:
	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

In [6]:
user_df = pd.read_csv("data/raw/ml-1m/users.dat", delimiter='::', header=None, engine='python', encoding='latin-1')
user_df.columns = USER_COLUMN_NAMES
# map occupation to occupation name
user_df[USER_COLUMN_NAMES[3]] = user_df[USER_COLUMN_NAMES[3]].map(OCCUPATION_MAP)
print(user_df.shape)
user_df.head()

(6040, 5)


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,K-12 student,48067
1,2,M,56,self-employed,70072
2,3,M,25,scientist,55117
3,4,M,45,executive/managerial,2460
4,5,M,25,writer,55455


In [7]:
# join three tables
rating_df = rating_df.merge(user_df, on=USER_COLUMN_NAMES[0], how='left')
full_df = rating_df.merge(movie_df, on=MOVIE_COLUMN_NAMES[0], how='left')
print(full_df.shape)
full_df.head()

(1000209, 11)


Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code,title,genres,year
0,1,1193,5,978300760,F,1,K-12 student,48067,One Flew Over the Cuckoo's Nest,[Drama],1975
1,1,661,3,978302109,F,1,K-12 student,48067,James and the Giant Peach,"[Animation, Children's, Musical]",1996
2,1,914,3,978301968,F,1,K-12 student,48067,My Fair Lady,"[Musical, Romance]",1964
3,1,3408,4,978300275,F,1,K-12 student,48067,Erin Brockovich,[Drama],2000
4,1,2355,5,978824291,F,1,K-12 student,48067,"Bug's Life, A","[Animation, Children's, Comedy]",1998


In [8]:
full_df[full_df[USER_COLUMN_NAMES[0]] == 1]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code,title,genres,year
0,1,1193,5,978300760,F,1,K-12 student,48067,One Flew Over the Cuckoo's Nest,[Drama],1975
1,1,661,3,978302109,F,1,K-12 student,48067,James and the Giant Peach,"[Animation, Children's, Musical]",1996
2,1,914,3,978301968,F,1,K-12 student,48067,My Fair Lady,"[Musical, Romance]",1964
3,1,3408,4,978300275,F,1,K-12 student,48067,Erin Brockovich,[Drama],2000
4,1,2355,5,978824291,F,1,K-12 student,48067,"Bug's Life, A","[Animation, Children's, Comedy]",1998
5,1,1197,3,978302268,F,1,K-12 student,48067,"Princess Bride, The","[Action, Adventure, Comedy, Romance]",1987
6,1,1287,5,978302039,F,1,K-12 student,48067,Ben-Hur,"[Action, Adventure, Drama]",1959
7,1,2804,5,978300719,F,1,K-12 student,48067,"Christmas Story, A","[Comedy, Drama]",1983
8,1,594,4,978302268,F,1,K-12 student,48067,Snow White and the Seven Dwarfs,"[Animation, Children's, Musical]",1937
9,1,919,4,978301368,F,1,K-12 student,48067,"Wizard of Oz, The","[Adventure, Children's, Drama, Musical]",1939


In [9]:
rating_df[rating_df[RATING_COLUMN_NAMES[0]] == 1]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code
0,1,1193,5,978300760,F,1,K-12 student,48067
1,1,661,3,978302109,F,1,K-12 student,48067
2,1,914,3,978301968,F,1,K-12 student,48067
3,1,3408,4,978300275,F,1,K-12 student,48067
4,1,2355,5,978824291,F,1,K-12 student,48067
5,1,1197,3,978302268,F,1,K-12 student,48067
6,1,1287,5,978302039,F,1,K-12 student,48067
7,1,2804,5,978300719,F,1,K-12 student,48067
8,1,594,4,978302268,F,1,K-12 student,48067
9,1,919,4,978301368,F,1,K-12 student,48067


# Train Test split

In [10]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(rating_df[['user_id', 'movie_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25, random_state=17)
LOOCV = LeaveOneOut(n_splits=1, random_state=1)
train_loocv, test_loocv = list(LOOCV.split(data))[0]

# Evaluation Functions

In [None]:
model = KNNRecommender()
hit_rate = get_hit_rate(model, train_loocv, test_loocv)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [1]:
from collections import defaultdict

from surprise import Dataset, SVD


def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


# First train an SVD algorithm on the movielens dataset.
data = Dataset.load_builtin("ml-100k")
trainset = data.build_full_trainset()
algo = SVD()
algo.fit(trainset)

# Than predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Dataset ml-100k could not be found. Do you want to download it? [Y/n] Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /Users/Placebo/.surprise_data/ml-100k
196 ['483', '114', '64', '272', '134', '408', '169', '515', '83', '963']
186 ['195', '318', '199', '190', '165', '661', '515', '134', '169', '480']
22 ['64', '22', '178', '169', '114', '87', '615', '489', '483', '480']
244 ['515', '285', '137', '187', '316', '98', '408', '483', '59', '127']
166 ['174', '98', '408', '22', '515', '318', '496', '603', '489', '169']
298 ['408', '64', '963', '12', '272', '191', '313', '480', '513', '316']
115 ['134', '179', '169', '408', '199', '483', '191', '135', '285', '320']
253 ['603', '963', '169', '357', '199', '251', '143', '480', '513', '651']
305 ['647', '114', '657', '603', '136', '513', '525', '185', '124', '487']
6 ['179', '603', '923', '

[Prediction(uid='196', iid='302', r_ui=3.52986, est=4.164942007389914, details={'was_impossible': False}),
 Prediction(uid='196', iid='377', r_ui=3.52986, est=2.557990260218335, details={'was_impossible': False}),
 Prediction(uid='196', iid='51', r_ui=3.52986, est=3.5537011912154037, details={'was_impossible': False}),
 Prediction(uid='196', iid='346', r_ui=3.52986, est=3.6423348952753827, details={'was_impossible': False}),
 Prediction(uid='196', iid='474', r_ui=3.52986, est=4.16108548824682, details={'was_impossible': False}),
 Prediction(uid='196', iid='265', r_ui=3.52986, est=3.6483812638404296, details={'was_impossible': False}),
 Prediction(uid='196', iid='465', r_ui=3.52986, est=3.5739513014155806, details={'was_impossible': False}),
 Prediction(uid='196', iid='451', r_ui=3.52986, est=3.003367902073778, details={'was_impossible': False}),
 Prediction(uid='196', iid='86', r_ui=3.52986, est=4.1850227609161825, details={'was_impossible': False}),
 Prediction(uid='196', iid='1014', 