# Filtragem Baseada em Conteudo
## Linear

In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Importando .csv
r = pd.read_csv('dataset/train_data.csv', header=None,  skiprows=[0], usecols=[0,1,2])
m = pd.read_csv('dataset/movies_data.csv')

# Obtendo generos dos filmes
genres = pd.get_dummies(m.set_index(['movie_id']).genres.str.split('|', expand=True).stack(dropna=False)).sum(level=0)

In [3]:
# Função para cria modelo
def fbc_linear(train, test, features, lr = 0.05, reg = 0.002, miter = 10):
    nusers = np.append(train[:,0], test[:,0]).max()
    nitems = np.append(train[:,1], test[:,1]).max()
    nfeatures = len(features[1])
    features = np.hstack((features,np.ones((len(features),1))))
    profiles = np.random.normal(loc = 0, scale = 0.1, size=(nusers, nfeatures+1))
    error = list()
    for l in range(0, miter):
        sq_error = 0
        for j in range(0, len(train)):
            u = train[j, 0]-1
            i = train[j, 1]-1
            r_ui = train[j, 2]-1
            e_ui = np.dot(profiles[u, ], features[i, ]) - r_ui
            sq_error += e_ui**2
            for k in range(nfeatures-1):
                profiles[u, k] = profiles[u, k] - lr * (e_ui * features[i, k] + reg * profiles[u, k])
            k = nfeatures
            profiles[u, k] = profiles[u, k] - lr * (e_ui * features[i, k])
        error.append(math.sqrt(sq_error/len(train)))
    return { "profiles": profiles, "error": error }



In [4]:
# Predizendo nota
def predict(model, user, item, features):
    features = np.hstack((features,np.ones((len(features),1))))
    return np.dot(model["profiles"][user-1, ], features[item-1, ])

In [5]:
# Avaliando predições
def rmse(model, test, features):
    sum_err = 0
    for t in test:
        u = t[0]
        i = t[1]
        r_ui = t[2]
        pred = predict(model, u, i, features)
        error = (r_ui - pred)**2
        sum_err += error
    return math.sqrt(sum_err/len(test))

In [6]:
model = fbc_linear(r.values[:482205], r.values[482205:], genres.values)

In [7]:
rmse(model, r.values[482205:], genres.values)

3.808252453567712