# Filtragem Baseada em Conteudo
## KNN

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
import math

In [2]:
# Não precisamos considerar viús do usuário
# A similariadade será feita com Jaccard e os metadados servem para calcular a similaridade dos itens

In [3]:
# Importando .csv
r = pd.read_csv('dataset/train_data.csv', header=None,  skiprows=[0], usecols=[0,1,2])
m = pd.read_csv('dataset/movies_data.csv')

# Obtendo generos dos filmes
genres = pd.get_dummies(m.set_index(['movie_id']).genres.str.split('|', expand=True).stack(dropna=False)).sum(level=0)

In [4]:
# Função para cria modelo
def fbc_knn(train, features, K = 4):
    ratings = r.pivot(index=1, columns=0, values=2)
    ratings.fillna(0.0, inplace=True)
    # Calculando similaridade Jaccard
    sim = genres.T.corr(method=distance.jaccard) # features.corr(method=distance.jaccard)
    sim.fillna(0.0, inplace=True)

    return { "sim": sim, "K": K, "ratings": ratings }

In [5]:
model = fbc_knn(r, genres)

In [6]:
# Função de predição
def predict(model, user, item, k = 5):
    sim_items = np.argsort(-model["sim"][item])
    rated_items = model["ratings"][user].index[model["ratings"][user] > 0].tolist()
    sim_k = np.intersect1d(sim_items, rated_items)
    sim_k = [x for x in sim_items if x in sim_k][:k]
    sumSim = 0.0
    sumWeight = 0.0
    for j in sim_k:
        sumSim += model["sim"][item][j]
        sumWeight += model["sim"][item][j] * model["ratings"][user][j]
    if sumSim == 0.0:
        return 0

    return sumWeight/sumSim

In [7]:
# Avaliando predições
def rmse(model, test):
    sum_err = 0
    for t in test:
        u = t[0]
        i = t[1]
        r_ui = t[2]
        pred = predict(model, u, i)
        error = (r_ui - pred)**2
        sum_err += error
    return math.sqrt(sum_err/len(test))

In [8]:
rmse(model, r.values[535000:])

1.1847087913695233