# 06. Model Training

### Importando as bibliotecas

In [1]:
import sys
sys.path.append('..')
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import functions.fn_stats as fn_stats
import functions.fn_charts as fn_charts
import params.consts as consts
from sklearn.neighbors import NearestNeighbors
import joblib

### Lendo o dataset tratado e visualizando o overview atual

In [2]:
# Load the processed ("lean") dataset from the path configured in params.consts.
df = pd.read_csv(consts.DATASET_LEAN)

In [3]:
# Display an overview of the dataset (the rendered table is truncated by pandas).
df

Unnamed: 0,userId,rating,original_language,original_title,vote_count
0,229,1.0,en,Finding Nemo,6292.0
1,229,3.0,en,Million Dollar Baby,2519.0
2,229,3.0,en,Memento,4168.0
3,229,3.0,en,Raiders of the Lost Ark,3949.0
4,229,4.0,en,Predator,2129.0
...,...,...,...,...,...
189877,270887,5.0,en,Transformers: Revenge of the Fallen,3192.0
189878,270887,4.0,en,Bad Boys II,1588.0
189879,270887,5.0,en,Cars 2,2088.0
189880,270887,5.0,en,In Time,3512.0


In [4]:
# Show each column's name, non-null count and dtype, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189882 entries, 0 to 189881
Data columns (total 5 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   userId             189882 non-null  int64  
 1   rating             189882 non-null  float64
 2   original_language  189882 non-null  object 
 3   original_title     189882 non-null  object 
 4   vote_count         189882 non-null  float64
dtypes: float64(2), int64(1), object(2)
memory usage: 7.2+ MB


In [5]:
# Project helper from functions.fn_stats; per the original note it displays
# summary statistics for the numeric columns (implementation not visible here).
fn_stats.describe(df)

Unnamed: 0,userId,rating,vote_count
count,189882.0,189882.0,189882.0
mean,133140.57,3.21,2575.61
std,77788.21,1.08,1637.11
min,229.0,0.5,1005.0
25%,65876.0,2.5,1424.0
50%,132001.0,3.0,1974.0
75%,200222.0,4.0,3198.0
max,270887.0,5.0,14075.0


In [6]:
# Summary statistics (count, unique, top, freq) for the non-numeric columns.
df.describe(exclude='number')

Unnamed: 0,original_language,original_title
count,189882,189882
unique,1,457
top,en,Terminator 3: Rise of the Machines
freq,189882,2399


In [7]:
# Count the null values in each column.
df.isnull().sum()

userId               0
rating               0
original_language    0
original_title       0
vote_count           0
dtype: int64

### Transformando o dataset em formato pivot

In [8]:
# Reshape the ratings into a title-by-user matrix: one row per original_title,
# one column per userId, each cell holding that user's rating for the title.
# (title, user) pairs without a rating come out as NaN and are replaced by 0.
df_pivot = (
    df.pivot_table(index='original_title', columns='userId', values='rating')
    .fillna(0)
)

# Preview the first 5 rows of the pivoted matrix.
df_pivot.head()

userId,229,231,741,836,1104,1136,1243,1380,1652,1846,...,269632,269750,269913,270071,270123,270213,270237,270564,270654,270887
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0
127 Hours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.5,2.0,0.0,0.0,0.0,0.0
2 Fast 2 Furious,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Salvando o dataset pivot

In [40]:
# Save the pivoted dataset as CSV at the path configured in params.consts.
df_pivot.to_csv(consts.DATASET_PIVOT)

### Transformando o dataset em uma matriz esparsa

In [41]:
# Convert the pivot table's values to a CSR sparse matrix; the matrix is mostly
# zeros (unrated titles), so the compressed representation is far more compact.
df_sparse = csr_matrix(df_pivot.values)

### Criando o modelo KNN

In [42]:
# Create the nearest-neighbors model using brute-force search; every other
# parameter is left at its scikit-learn default.
model = NearestNeighbors(algorithm='brute')

### Treinando o modelo

In [43]:
# Fit the model on the sparse title-by-user rating matrix (one sample per title).
model.fit(df_sparse)

### Persistindo (salvando) o modelo treinado

In [44]:
# Persist the fitted model to the two paths configured in params.consts.
# Both files are written by joblib.dump, so they differ only in file name/extension.
joblib.dump(model, consts.MODEL_RECOMENDATION_JOBLIB)
# As the cell's last expression, this call's return value (the list of written
# file names) is what the notebook displays.
joblib.dump(model, consts.MODEL_RECOMENDATION_PKL)

['../models/model_recomendation.pkl']

In [46]:
# Sanity-check the fitted model: take the rating row of one title, shaped as a
# single query sample, and ask for its nearest neighbors.
query_vector = df_pivot.filter(items=['Toy Story'], axis=0).values.reshape(1, -1)
distances, sugestions = model.kneighbors(query_vector)

# kneighbors returns one row of neighbor positions per query sample; map each
# row of positions back to the movie titles held in the pivot index.
for neighbor_positions in sugestions:
    print(df_pivot.index[neighbor_positions])

Index(['Toy Story', 'Meet the Fockers', 'Top Gun',
       'Harry Potter and the Chamber of Secrets',
       'Austin Powers: International Man of Mystery'],
      dtype='object', name='original_title')
