In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import distance
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

import time

In [None]:
# Training ratings: skip the CSV's own header row and keep only the first three
# columns, addressed by position (the functions below read them as
# 0 = user, 1 = item, 2 = rating).
train = pd.read_csv(
    "dataset/train_data.csv",
    usecols=[0, 1, 2],
    skiprows=[0],
    header=None,
)
# Test file, read with pandas defaults (header row kept as column names).
test = pd.read_csv("dataset/test_data.csv")

In [None]:
# Movies table; fbc_knn uses its "movie_id" column to select the rows of the
# similarity matrix.
train_movies = pd.read_csv("dataset/movies_data.csv")

# Review table; its "movie_id" and "text" columns are used to build the TF-IDF
# features further down.
mv = pd.read_csv("dataset/movie_reviews.csv")

In [None]:
def fbc_knn(train, features, k = 5):
    """Build a content-based item-kNN model from ratings and item features.

    Relies on the notebook globals ``mv`` (reviews) and ``train_movies``
    (movies), both of which must have a ``movie_id`` column.

    Parameters
    ----------
    train : pandas.DataFrame
        Ratings with positional column names: 0 = user, 1 = item, 2 = rating.
        Each (user, item) pair must appear at most once (``pivot`` raises
        otherwise).
    features : array-like or sparse matrix
        One feature row per distinct ``mv["movie_id"]``, in ascending
        movie_id order. This is the order produced by
        ``mv.groupby("movie_id")``, which is how the notebook builds it.
    k : int
        Neighbourhood size, stored in the model under ``"K"``.

    Returns
    -------
    dict
        ``"sim"``: item-item cosine similarity, rows restricted to the ids in
        ``train_movies`` and columns labelled with every id in ``mv``;
        ``"K"``: ``k``; ``"ratings"``: item x user rating matrix where 0.0
        means "not rated".
    """
    ratings = train.pivot(index=1, columns=0, values=2).fillna(0.0)

    # groupby() sorts its keys and drops missing ones, so the feature rows are
    # in sorted movie_id order, not in order of first appearance in the CSV.
    movie_ids = np.sort(mv["movie_id"].dropna().unique())

    # Label BOTH axes with movie ids: predict() looks an item up by column
    # (`item in sim`, `sim[item]`), so positional column labels would make it
    # read the wrong movie.
    sim = pd.DataFrame(
        data=cosine_similarity(csr_matrix(features)),
        index=movie_ids,
        columns=movie_ids,
    )

    # Keep only the movies listed in train_movies as candidate neighbours;
    # movies without review features get an all-zero similarity row.
    sim = sim.reindex(index=train_movies["movie_id"].unique()).fillna(0.0)

    return { "sim": sim, "K": k, "ratings": ratings }

In [None]:
def predict(model, user, item, k = 5):
    """Predict the rating `user` would give `item`.

    The prediction is the similarity-weighted average of the user's ratings
    over the (at most) `k` items most similar to `item` among those the user
    has rated.

    Parameters
    ----------
    model : dict
        Must provide ``"sim"`` (DataFrame; column `item` holds the
        similarities of `item`, indexed by candidate item id) and
        ``"ratings"`` (DataFrame; column `user` holds that user's ratings,
        indexed by item id, with values <= 0 meaning "not rated").
    user, item
        Labels looked up in the columns of ``ratings`` and ``sim``.
    k : int
        Maximum number of neighbours to use.

    Returns
    -------
    number
        The predicted rating, or 0 when the user or item is unknown, when
        there is no usable neighbour, or when all neighbour similarities
        sum to zero.
    """
    sim = model["sim"]
    ratings = model["ratings"]
    if item not in sim or user not in ratings:
        return 0

    user_ratings = ratings[user]
    rated_items = user_ratings[user_ratings > 0].index

    # Only items the user actually rated can act as neighbours; among those,
    # keep the k with the highest similarity to `item`.
    item_sims = sim[item]
    neighbours = item_sims[item_sims.index.isin(rated_items)].nlargest(k)

    sum_sim = neighbours.sum()
    if sum_sim == 0.0:
        return 0

    sum_weight = (neighbours * user_ratings.loc[neighbours.index]).sum()
    return sum_weight / sum_sim

In [None]:
# Evaluating predictions
def rmse(model, test, k = 5):
    """Root-mean-square error of `predict` over the rows of `test`.

    Each row is read positionally: row[0] is the user, row[1] the item and
    row[2] the true rating. `k` is passed through to `predict` unchanged.
    Raises ZeroDivisionError when `test` is empty.
    """
    squared_errors = (
        (row[2] - predict(model, row[0], row[1], k)) ** 2
        for row in test
    )
    return math.sqrt(sum(squared_errors) / len(test))

In [None]:
def validation(model, data, features,  train_split = 0.75, k = 5, random_state = None):
    """Shuffle `data`, fit on the first part, and report RMSE on the rest.

    Parameters
    ----------
    model : callable
        Model factory called as ``model(train, features)``; `k` is NOT passed
        to it, only to the evaluation.
    data : pandas.DataFrame
        Ratings; rows are shuffled and then split by position.
    features
        Passed through unchanged to `model`.
    train_split : float
        Fraction of the shuffled rows used for training.
    k : int
        Neighbourhood size forwarded to `rmse`.
    random_state : optional
        Seed forwarded to ``DataFrame.sample`` so the split (and therefore
        the reported RMSE) can be reproduced. ``None`` keeps the previous
        unseeded behaviour.

    Prints the training time, the RMSE and the prediction time; returns None.
    """
    shuffled = data.sample(frac=1, random_state=random_state)
    train_len = int(train_split * len(shuffled))
    train = shuffled.iloc[:train_len]
    test = shuffled.iloc[train_len:].values

    # perf_counter is monotonic, so the elapsed times cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    fitted_model = model(train, features)
    print("Tempo de treinamento em segundos: ", time.perf_counter() - start_time)
    start_time = time.perf_counter()
    print("RMSE :", rmse(fitted_model, test, k))
    print("Tempo de predição em segundos: ", time.perf_counter() - start_time)

In [None]:
# Build one text document per movie by concatenating all of its reviews
# (each followed by a space), then fit TF-IDF on those documents. The result
# is used directly as the feature matrix; no dense DataFrame is built.
v = TfidfVectorizer()
reviews_by_movie = mv.groupby("movie_id")["text"]
movie_documents = reviews_by_movie.apply(lambda texts: (texts + " ").sum())
features = v.fit_transform(movie_documents)

# Train on 93% of the shuffled ratings and evaluate on the rest with k = 5.
validation(fbc_knn, train, features, train_split=0.93, k=5)

## Execução de teste com distância de cosseno k=10 treino=93%
Tempo de treinamento em segundos:  5.410625219345093 <br>
RMSE : 2.0719085230048155 <br>
Tempo de predição em segundos:  569.2161264419556 <br>

## Execução de teste com similaridade de cosseno k=10 treino=93%
Tempo de treinamento em segundos:  5.874245882034302 <br>
RMSE : 2.0298423556333227 <br>
Tempo de predição em segundos:  493.27765107154846 <br>

## Execução de teste com similaridade de cosseno k=5 treino=93%
Tempo de treinamento em segundos:  5.544472694396973 <br>
RMSE : 2.0474095654885294 <br>
Tempo de predição em segundos:  512.1445598602295 <br>

In [None]:
# tfidf = pd.DataFrame(data=sim, index=mv["movie_id"].unique())
# tfidf = pd.DataFrame(data=tfidf, index=train_movies["movie_id"].unique())
# tfidf = tfidf.fillna(0)

In [None]:
# df_tfidf = pd.read_csv('dataset/tfidf_movies.csv')
# df_tfidf = pd.DataFrame(data=df_tfidf.values, index=mv["movie_id"].unique(), columns=df_tfidf.columns)
# tfidf = pd.DataFrame(data=df_tfidf, index=train_movies["movie_id"].unique(), columns=df_tfidf.columns)

In [None]:
# sim = tfidf.T.corr(method=distance.jaccard)