## Домашнее задание

Для датасета *MovieLens_1M* получить RMSE на тестовой выборке 0.87 или ниже

In [1]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [25]:
movies = pd.read_table('MovieLens_1M/movies.dat', sep='::', names=['movieId', 'title', 'genres'], engine='python')
ratings = pd.read_table('MovieLens_1M/ratings.dat', sep='::', names=['userId', 'movieId', 'rating', 'timestamp'], engine='python')

In [21]:
# movies = pd.read_csv('MovieLens/movies.csv')
# ratings = pd.read_csv('MovieLens/ratings.csv')

In [26]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.shape

(1000209, 6)

In [27]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [28]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [29]:
trainset, testset = train_test_split(data, test_size=0.15)

In [8]:
def calculate_rmse(i, ctr, user_based, trainset, testset):
    print(f'#{ctr}: n_neighbors={i};\t', end='')
    algo = KNNWithMeans(k=i, sim_options={'name': 'pearson_baseline', 'user_based': user_based}, verbose=False)
    algo.fit(trainset)
    test_pred = algo.test(testset)
    return accuracy.rmse(test_pred, verbose=True)

def processing_data_analysis(n, approach, data):
    for ctr, i in enumerate(range(0, n + 1, 5)):
        trainset, testset = train_test_split(data, test_size=0.15)
        rmse = calculate_rmse(i, ctr, approach == 'user_based', trainset, testset)
        if rmse <= 0.87:
            rmses[approach].append(rmse)

In [30]:
from time import time

rmses = dict({
    'user_based': list(),
    'item_based': list()
})

n = 51

ptime = time()
print('Processing user-based approach....')
processing_data_analysis(n, 'user_based', data)
print(f'Elapsed: {time()-ptime:.4f}')

print(40*'-')
      
ptime = time()
print('\nProcessing item-based approach....')
processing_data_analysis(n, 'item_based', data)
print(f'Elapsed: {time()-ptime:.4f}')

Processing user-based approach....
#0: n_neighbors=0;	RMSE: 1.0373
#1: n_neighbors=5;	RMSE: 0.9420
#2: n_neighbors=10;	RMSE: 0.9085
#3: n_neighbors=15;	RMSE: 0.8978
#4: n_neighbors=20;	RMSE: 0.8932
#5: n_neighbors=25;	RMSE: 0.8912
#6: n_neighbors=30;	RMSE: 0.8906
#7: n_neighbors=35;	RMSE: 0.8877
#8: n_neighbors=40;	RMSE: 0.8883
#9: n_neighbors=45;	RMSE: 0.8858
#10: n_neighbors=50;	RMSE: 0.8850
Elapsed: 2074.0813
----------------------------------------

Processing item-based approach....
#0: n_neighbors=0;	RMSE: 0.9800
#1: n_neighbors=5;	RMSE: 0.8876
#2: n_neighbors=10;	RMSE: 0.8619
#3: n_neighbors=15;	RMSE: 0.8555
#4: n_neighbors=20;	RMSE: 0.8533
#5: n_neighbors=25;	RMSE: 0.8535
#6: n_neighbors=30;	RMSE: 0.8541
#7: n_neighbors=35;	RMSE: 0.8563
#8: n_neighbors=40;	RMSE: 0.8585
#9: n_neighbors=45;	RMSE: 0.8556
#10: n_neighbors=50;	RMSE: 0.8561
Elapsed: 931.8066


In [31]:
for key, value in rmses.items():
    if value:
        print(f'Best RMSE for {key} approach is: {min(value):.4f}')

Best RMSE for item_based approach is: 0.8533
