In [125]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import surprise as sp
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV, RandomizedSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm, tqdm_notebook

import os

## Ejemplo  Surprise
### https://surprise.readthedocs.io/en/stable/

In [2]:
# Read the labelled training ratings and the ratings file to be predicted,
# in that order, into two DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/ratings_train.csv', './data/ratings_test.csv')
)

In [38]:
# Hold out part of the labelled ratings for local validation. The fixed seed keeps
# the split (and therefore every error figure computed from it) reproducible across
# kernel restarts.
# NOTE(review): the recorded output of this cell shows a ~10% hold-out
# (770089 / 85509 rows), which does not match test_size=0.30 -- confirm which
# value is intended.
trainset, testset = train_test_split(df_train, test_size=0.30, random_state=42)
# Alternative: skip the split and use the full train / test files instead.
#trainset = df_train
#testset = df_test
print(trainset.shape, testset.shape)

(770089, 10) (85509, 10)


In [39]:
# Rating scale (used as the minimum and maximum rating); hard-coded here, although
# it could be read from the ratings file instead.
reader = sp.reader.Reader(rating_scale=(0.5, 5))

# Only the user, movie and rating columns are handed to Surprise.
rating_columns = ['userID', 'movieID', 'rating']
data = sp.dataset.Dataset.load_from_df(trainset[rating_columns], reader)
#testset= sp.dataset.Dataset.load_from_df(testset[['userID', 'movieID', 'rating']], reader)

In [40]:
# Similarity options for the neighbourhood model.
#   name:       similarity measure; could also be "cosine", "MSD" or "pearson"
#   user_based: True compares users with users, False compares items with items
#               (both are worth trying)
sim_options = dict(
    name='pearson_baseline',
    user_based=False,
    #shrinkage=1,  # no shrinkage
)

# Prediction algorithm. Alternatives that were tried:
#   sp.NormalPredictor()  -> draws a random number
#   sp.prediction_algorithms.knns.KNNBasic(k=40, min_k=10, sim_options=sim_options)
# min_k is the minimum number of neighbours.
algo = KNNWithMeans(k=30, min_k=4, sim_options=sim_options)
# Cross-validation variant (n_jobs=-1 uses every processor):
# sp.model_selection.validation.cross_validate(algo, data, measures=['rmse', 'mae'], cv=5, n_jobs=-1)


In [41]:
# Build a Surprise trainset from everything loaded into `data`.
# NOTE: this rebinds `trainset`, which until this point held the pandas DataFrame
# produced by the split. Re-running the cell that builds `data` after this one will
# therefore index into the Surprise object instead of the DataFrame -- restart and
# run top to bottom when re-executing.
trainset = data.build_full_trainset()

In [42]:
# Fit the model on the full trainset.
# NOTE(review): the commented-out `.test(testset)` chain suggests `fit` hands back
# the fitted algorithm, so `predictions` here is not a list of predictions --
# confirm before using it as one.
predictions = algo.fit(trainset)#.test(testset)
#accuracy.rmse(predictions)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [43]:
#uid = str(33349)  # raw user id (as in the ratings file). They are **strings**!
#iid = str(2791)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
#pred_1 = algo.predict(uid, iid, r_ui=4, verbose=True)

In [44]:
# Number of rows that will be predicted in the next cell.
len(testset)

85509

In [45]:
# Predict a rating for every (user, movie) pair in `testset` and collect one
# {"ID", "rating"} record per row.
list_pred = []

# itertuples keeps each column's own dtype and is much faster than iterrows, which
# coerces every row into a single-dtype Series (the reason the ID previously needed
# an explicit cast back to int). Only the three columns the loop reads are iterated.
eval_rows = testset[["ID", "userID", "movieID"]].itertuples(index=False)

# A bare iterator has no length, so pass `total` to get a real progress bar
# instead of an indeterminate one.
for row in tqdm_notebook(eval_rows, total=len(testset)):
    pred = algo.predict(row.userID, row.movieID, verbose=False)
    list_pred.append({
        "ID": int(row.ID),
        # Estimated rating, rounded to two decimals.
        "rating": np.around(pred.est, 2),
    })
   

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [46]:
# Turn the list of per-row prediction records into a single table and show how
# many rows it has.
df_list_pred = pd.DataFrame(data=list_pred)
len(df_list_pred.index)

85509

In [27]:
### Cosine: 0.7726, pearson: 0.75, MSD_True: 0.6851 , MSD_False=0.7023
###
### KNNWithMeans
### MSD_True (min=5, max=40): 0.6612
### MSD_True (min=2, max=40): 0.6611
### MSD_True (min=1, max=30): 0.6640
### MSD_True (min=2, max=50): 0.6603
### pearson_baseline_True (min=2, max=50): 0.6299 >> 0.77689
### pearson_baseline_True (min=5, max=30): 0.6287
### pearson_baseline_True (min=5, max=40): 0.6276  
### pearson_baseline_True (min=5, max=50): 0.6276
### pearson_baseline_False (min=5, max=50): 0.6033 >>>>>> 0.75570
### pearson_baseline_False (min=6, max=25): 0.6021 
### pearson_baseline_False (min=5, max=15): 0.6081
### pearson_baseline_False (min=4, max=30): 0.6015 <=======================
### pearson_baseline_False (min=2, max=30): 0.6015
### pearson_baseline_True (min=5, max=40, 'shrinkage': 0): 0.6425
### pearson_baseline_True (min=5, max=40, 'shrinkage': 1): 0.6403
### KNNWithZScore
### pearson_baseline_False (min=6, max=40): 0.6021

In [36]:
# Compare the held-out true ratings against the rounded estimates.
mse = mean_squared_error(testset.rating, df_list_pred.rating)
print('Mean squared error: %.4f' % mse)

Mean squared error: 0.6015


## Corrida sobre TEST entrenando con todo TRAIN

In [47]:
# Write the predictions as a submission file. The output folder is created first so
# the cell does not fail when ./results does not exist yet.
# NOTE(review): with the split cell as written, `df_list_pred` holds predictions for
# the hold-out part of the training data; to predict the real test file the
# commented-out `trainset = df_train` / `testset = df_test` lines have to be
# enabled first -- confirm before submitting.
os.makedirs("./results", exist_ok=True)
df_list_pred.to_csv("./results/ignacio_submit7.csv", index=False)