### Importar base de dados e instalar framework de recomendação

### Importar bibliotecas

In [None]:
! wget https://github.com/mmanzato/MBABigData/raw/master/ml-20m-compact.tar.gz
! tar -xvzf ml-20m-compact.tar.gz
! pip install caserecommender

In [None]:
import pandas as pd
import numpy as np

### Explorar Dados

In [None]:
# Load the movies sample file from the dataset folder and show its last five rows.
movies = pd.read_csv(filepath_or_buffer='./dataset/movies_sample.csv')
movies.tail(n=5)

In [None]:
# Load the ratings sample file from the dataset folder and show its first five rows.
ratings = pd.read_csv(filepath_or_buffer='./dataset/ratings_sample.csv')
ratings.head(n=5)

In [None]:
# Bar chart of how often each rating value occurs (bars follow value_counts() order);
# the trailing ';' hides the Axes repr.
ratings['rating'].value_counts().plot.bar(color=['r', 'g', 'y', 'c', 'b']);

In [None]:
# Keep only the user / item / rating columns of the ratings table.
df = ratings.loc[:, ['userId', 'movieId', 'rating']]
df.tail(n=5)

In [None]:
# Attach each movie's title. No `on=` is given, so pandas joins on the columns the two
# frames share (here: movieId), using its default inner join.
df = pd.merge(df, movies[['movieId', 'title']])
df.head(n=5)

### Número de usuários, itens e interações

In [None]:
# Report the number of distinct users, distinct items and total interaction rows in df.
print(
    "\nNúmero de usuários: {}\nNúmero de itens: {}\nNúmero de interações: {}\n".format(
        df['userId'].nunique(),
        df['movieId'].nunique(),
        len(df),
    )
)

### Mapeamento em idx

In [None]:
# Dense 0-based indices for users and items, numbered in order of first appearance in df.
map_users = {raw_user: position for position, raw_user in enumerate(pd.unique(df['userId']))}
map_items = {raw_item: position for position, raw_item in enumerate(pd.unique(df['movieId']))}

In [None]:
# Replace the raw ids with their dense indices.
# NOTE: this overwrites the raw id columns, so re-running this cell without rebuilding df
# first would push already-mapped values through the maps a second time.
df = df.assign(
    userId=df['userId'].map(map_users),
    movieId=df['movieId'].map(map_items),
)
df.head(n=5)

In [None]:
# Lookup table from (mapped) movieId to title.
# Built with one pass over the two columns instead of df.iterrows(), which materialises a
# Series for every row and is very slow on large frames. When a movieId appears in several
# rows the last one wins, exactly as with a row-by-row assignment loop.
map_title = dict(zip(df['movieId'], df['title']))

In [None]:
# Sanity check: number of titles in the lookup, then the title stored under index 100.
print(len(map_title), map_title[100], sep='\n')

In [None]:
# Number of interactions per user (count() reports the non-null entries of each remaining column).
df.groupby(by='userId').count()

### Divisão do dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the interaction rows for testing; the fixed random_state makes the
# split repeatable across runs.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Persist the split as header-less, tab-separated files for the recommender cells below.
# Only the userId / movieId / rating triple is written: the free-text title column is
# display metadata and is kept out of the recommender input files (the in-memory
# train/test frames still carry it).
train.to_csv('train.txt', columns=['userId', 'movieId', 'rating'], index=False, header=False, sep='\t')
test.to_csv('test.txt', columns=['userId', 'movieId', 'rating'], index=False, header=False, sep='\t')

In [None]:
# List the working directory, e.g. to check that train.txt and test.txt were written.
! ls -l

In [None]:
# Row counts of the test and training partitions, in that order.
len(test), len(train)

### Recomendadores

### Prever notas

In [None]:
from caserec.recommenders.rating_prediction.most_popular import MostPopular

In [None]:
# Run the rating-prediction MostPopular recommender on the train/test files written earlier.
# The third argument is presumably the output path: the next cell reads 'out_mp_pred.txt'.
MostPopular('train.txt', 'test.txt', 'out_mp_pred.txt').compute()

In [None]:
# Load the tab-separated prediction file, naming its three columns, and preview it.
df_pred = pd.read_csv(
    'out_mp_pred.txt',
    sep='\t',
    names=['userId', 'movieId', 'pred'],
)
df_pred.head(n=5)

In [None]:
# Test-set rows belonging to the user with mapped index 0.
test.loc[test['userId'] == 0]

In [None]:
# Join predictions with the held-out rows. With no `on=` given, pandas joins on the columns
# both frames share and defaults to an inner join, so only pairs present in both are kept.
df_pred = pd.merge(df_pred, test)

In [None]:
# Distribution of the true ratings among the predicted pairs.
# sort_index() orders the bars by rating value instead of by frequency, so the x-axis reads
# as a rating scale; the trailing ';' suppresses the Axes repr.
df_pred.rating.value_counts().sort_index().plot(kind='bar');

In [None]:
# Snap each prediction to the nearest multiple of 0.5 (double, round, halve).
df_pred['pred'] = (df_pred['pred'] * 2).round() / 2
# Plot the distribution of the snapped predictions; sort_index() orders the bars by
# predicted value instead of by frequency so the x-axis reads as a rating scale.
df_pred.pred.value_counts().sort_index().plot(kind='bar');

### Top N

In [None]:
from caserec.recommenders.item_recommendation.most_popular import MostPopular as MPR

In [None]:
# Run the item-recommendation MostPopular (imported as MPR) on the same train/test files.
# NOTE(review): this reuses 'out_mp_pred.txt', the same path passed to the rating-prediction
# run earlier, so that earlier output is presumably overwritten — re-running the cell that
# loads df_pred after this one would then read rankings instead of rating predictions.
MPR('train.txt', 'test.txt', 'out_mp_pred.txt').compute()

In [None]:
# Same recommender with as_binary=True, using a separate third-argument path.
# NOTE(review): as_binary presumably makes the recommender treat every interaction as
# implicit feedback rather than using the rating value — confirm against the library docs.
MPR('train.txt', 'test.txt', 'out_mp_pred_binary.txt', as_binary=True).compute()

In [None]:
# Load the top-N output as (user, item, score) rows. The user column is named 'userId',
# matching the naming used by every other frame in this notebook (was 'user_id').
ranking = pd.read_csv('out_mp_pred.txt', sep='\t', names=['userId', 'movieId', 'score'])
# Translate mapped item indices back to titles for readability.
ranking['title'] = ranking.movieId.map(map_title)
ranking.head(15)

In [None]:
# Load the output of the as_binary run as (user, item, score) rows.
ranking = pd.read_csv(
    'out_mp_pred_binary.txt',
    sep='\t',
    names=['userId', 'movieId', 'score'],
)
# Add a title column by looking each mapped item index up in map_title.
ranking = ranking.assign(title=ranking['movieId'].map(map_title))
ranking.head(n=15)