In [11]:
import pandas as pd
import numpy as np
import math

import time

In [12]:
# Importando .csv
train = pd.read_csv('dataset/train_data.csv')

In [13]:
# Implementando baseline
def baseline(train):
    mean = train["rating"].mean(skipna = True)
    users = train.pivot(index='movie_id', columns='user_id', values='rating')
    items = train.pivot(index='user_id', columns='movie_id', values='rating')
    nusers = train.values[:, 0].max()+1
    nitems = train.values[:, 1].max()+1
    bu = np.full(nusers, np.nan, dtype=float)
    bi = np.full(nitems, np.nan, dtype=float)

    for i in items.columns.values:
        bi[i] = np.nanmean(items[i] - mean)

    aux = bi[~np.isnan(bi)]
    for u in users.columns.values:
        bu[u] = np.nanmean(users[u] - mean - aux)

    bi = np.nan_to_num(bi)
    bu = np.nan_to_num(bu)
    
    return { "bu" : bu, "bi": bi, "mean": mean }

In [14]:
# Predizendo nota
def predict(model, u, i):
    return model["mean"] + model["bu"][u] + model["bi"][i]

In [15]:
# Avaliando predições
def rmse(model, test):
    sum_err = 0
    for t in test:
        u = t[0]
        i = t[1]
        r_ui = t[2]
        pred = predict(model, u, i)
        error = (r_ui - pred)**2
        sum_err += error
    return math.sqrt(sum_err/len(test))

In [16]:
def cross_validation(model, data, k_folds = 10):
    folds = np.array_split(data.sample(frac=1), k_folds)
    for i in range(k_folds):
        train = folds.copy()
        test = folds[i].values
        del train[i]
        train = pd.concat(train, sort=False)
        model_k = model(train)
        print("Iteração {0}: {1}".format(i+1, rmse(model_k, test)))

def validation(model, data, train_split = 0.75):
    data = data.sample(frac=1)
    train_len = int(train_split * len(data))
    train = data[:train_len]
    test = data[train_len:].values

    start_time = time.time()
    model = model(train)
    print("Tempo de treinamento em segundos: ", time.time() - start_time)
    start_time = time.time()
    print("RMSE :", rmse(model, test))
    print("Tempo de predição em segundos: ", time.time() - start_time)

In [17]:
# baseline = baseline(train)

In [18]:
# rmse(baseline, train.values[401838:])

In [19]:
validation(baseline, train, 0.75)

Tempo de treinamento em segundos:  2.885563850402832
RMSE : 0.90971936077354
Tempo de predição em segundos:  0.5346899032592773


## Execução 1 treino 75%

Tempo de treinamento em segundos: 3.1792304515838623  <br />
RMSE : 0.9093269779125659  <br />
Tempo de predição em segundos:  0.6293540000915527  <br />