In [None]:
# Carregar os datasets
#Se usarmos o Google Colab
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Baixa os dados de treinamento
#Link do arquivo no Drive: https://drive.google.com/file/d/1QtndkGEtJM5jwlR_HYtu3ob7uKSZ0FBP/view

link = 'https://drive.google.com/file/d/1QtndkGEtJM5jwlR_HYtu3ob7uKSZ0FBP/view'
id = link.split("/")[-2]
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('aug_train.csv')
df_train = pd.read_csv('aug_train.csv')
df_train.shape

In [None]:
# Baixa os dados de teste
link = 'https://drive.google.com/file/d/1FisazChSyPNGQf32tVseN2D3iLBTYMwJ/view?usp=sharing'
id = link.split("/")[-2]
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('aug_test.csv')
df_test = pd.read_csv('aug_test.csv')
df_test.shape

(78273, 11)

In [None]:
# Merge dos datasets 
df_train.sample(3)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
231602,194004,Male,67,1,28.0,0,1-2 Year,Yes,48376.0,26.0,76,0
373978,375708,Male,21,1,33.0,1,< 1 Year,No,29347.0,152.0,104,0
27992,276790,Male,47,1,28.0,1,1-2 Year,No,24558.0,26.0,259,0


In [None]:
df_test.sample(2)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
51955,462484,Male,47,1,28.0,1,1-2 Year,No,31779.0,26.0,261
23464,35745,Female,39,1,28.0,0,1-2 Year,Yes,29753.0,124.0,140


In [None]:
df_test['Response'] = 2 # Criando coluna "Response" no dataset de teste para concatenar com treino
frames = [df_train, df_test]  
df_full = pd.concat(frames)
df_full.sample(3)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
34895,54085,Male,26,1,19.0,1,< 1 Year,No,50316.0,152.0,183,0
171801,88199,Male,53,1,28.0,1,1-2 Year,No,29990.0,26.0,273,0
321686,470881,Female,32,1,8.0,1,1-2 Year,No,42205.0,7.0,170,0


In [None]:
df_train['Response'].value_counts()

0    319553
1     62601
Name: Response, dtype: int64

In [None]:
# Executar a instalação, reiniciar ambiente de execução, comentar essa célula e executar tudo novamente
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [None]:
from pandas_profiling import ProfileReport

profile = ProfileReport(df_full, title="Pandas Profiling Report", explorative=True)

In [None]:
profile.to_widgets()

#### feature engineering

In [None]:
import pandas as pd
import seaborn as sn
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_val_predict 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (roc_curve,
                             recall_score,
                             roc_auc_score,
                             accuracy_score,
                             precision_score,
                             f1_score)
from sklearn.ensemble import (AdaBoostClassifier, 
                              GradientBoostingClassifier, 
                              ExtraTreesClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

##### Transformações de categorias para numeros sequenciais

In [None]:
# Label encoder transforma variáveis categoricas em numeros sequenciais
le = preprocessing.LabelEncoder() # Instancia objeto chamado LE para o Label Encoder
df_full['Gender'] = le.fit_transform(df_full['Gender'])

In [None]:
df_full['Vehicle_Damage'] = df_full['Vehicle_Damage'].astype('category') # transforma variável em categorica
df_full['Vehicle_Damage'] = df_full['Vehicle_Damage'].cat.codes # gera numeros sequenciais para categorias

##### Get Dummies 
 
 Transforma variável categorica em colunas binárias

In [None]:
df_full = pd.get_dummies(df_full, columns=['Vehicle_Age', 'Driving_License'])

In [None]:
df_full['Meses_segurado'] = round(df_full['Vintage']/30)

#### Divisão do dataset para treinamento

In [None]:
df_full.set_index('id')

In [None]:
df_train = df_full.loc[df_full['Response'] != 2] # Tudo que é diferente de 2 volta para o dataset de treinamento
df_test = df_full.loc[df_full['Response'] == 2] # Igual a 2 é o dataset de teste
df_test.drop('Response', axis=1, inplace=True) # Apaga coluna Response do dataset de teste. axis=1(coluna), inplace=True persiste em memoria

In [None]:
X = df_train.drop('Response', axis=1)
y = df_train['Response']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

##### Random Forest com parametros aleatórios

In [None]:
# Random Forest com parametros aleatórios
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

rf = RandomForestClassifier(n_estimators=3,       # Numero de arvores do algoritmo (padrão 100) 
                            max_depth = 2,        # Padrão None, numeros inteiros. profundidade máxima da arvore
                            n_jobs=10,            # Numero de execuções paralelas
                            min_samples_split=10, # Padrão 2, numeros inteiros numero de divisão de um nó
                            min_samples_leaf=4,   # Padrão 1, numeros inteiros. Numero minimo necessário para estar em um nó folha
                            max_features='sqrt')  # “sqrt”, “log2”, None, ou valor inteiro limitado ao numero de features

In [None]:
rf.fit(X_train, y_train) # Treinamento do modelo

RandomForestClassifier(max_depth=2, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=10, n_estimators=3, n_jobs=10)

In [None]:
y_pred = rf.predict(X_test) # predict no dataset de test

In [None]:
def metrics(y_pred, y_test):
  print('Acurácia {}'.format(accuracy_score(y_test, y_pred)))
  print('Recall {}'.format(recall_score(y_test, y_pred)))
  print('F1 Score {}'.format(f1_score(y_test, y_pred)))
  print('Precision {}'.format(precision_score(y_test, y_pred)))

In [None]:
metrics(y_pred, y_test)

Acurácia 0.8344483501530786
Recall 0.0
F1 Score 0.0
Precision 0.0


## Tuning de Modelos

##### Random Search

In [None]:
n_estimator = [1, 50, 100, 150, 200, 250, 300, 400, 500]
min_samples_split = [2, 4, 8, 16]
min_samples_leaf = [1, 2, 4, 8]
max_features = ['auto', 'sqrt', 'log2', None]
max_depth = [2, 4, 6, 8, 10]

# Dicionário com os parametros
rf_params = {
    'n_estimators':n_estimator,
    'min_samples_split': min_samples_split,
    'min_samples_leaf':min_samples_leaf,
    'max_features':max_features,
    'max_depth':max_depth
}

In [None]:
rf_tuned = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=37), 
                              param_distributions=rf_params, # passagem dos parametros
                              cv=5, # validação cruzada 
                              n_iter=5, # numero de interações
                              scoring='precision', # metrica de avaliação 
                              n_jobs=-1, # uso de processadores 
                              verbose=0) # log

In [None]:
rf_tuned.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=37),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'max_depth': [2, 4, 6, 8, 10],
                                        'max_features': ['auto', 'sqrt', 'log2',
                                                         None],
                                        'min_samples_leaf': [1, 2, 4, 8],
                                        'min_samples_split': [2, 4, 8, 16],
                                        'n_estimators': [1, 50, 100, 150, 200,
                                                         250, 300, 400, 500]},
                   scoring='precision')

In [None]:
y_pred = rf_tuned.predict(X_test)

In [None]:
metrics(y_pred, y_test)

Acurácia 0.8972149293047355
Recall 0.3791359325605901
F1 Score 0.5498166259168704
Precision 1.0


In [None]:
print('Melhor número de árvores: {}'.format(rf_tuned.best_params_['n_estimators']))
print('Melhor número número mínimo de amostras necessárias para dividir um nó interno: {}'.format(rf_tuned.best_params_['min_samples_split']))
print('Melhor número mínimo de amostras necessárias para estar em um nó da folha: {}'.format(rf_tuned.best_params_['min_samples_leaf']))
print('Melhor número de variáveis a serem considerados ao procurar a melhor divisão: {}'.format(rf_tuned.best_params_['max_features']))
print('Melhor nível de profundidade máxima da arvore: {}'.format(rf_tuned.best_params_['max_depth']))

Melhor número de árvores: 400
Melhor número número mínimo de amostras necessárias para dividir um nó interno: 8
Melhor número mínimo de amostras necessárias para estar em um nó da folha: 4
Melhor número de variáveis a serem considerados ao procurar a melhor divisão: log2
Melhor nível de profundidade máxima da arvore: 10


In [None]:
import joblib 
joblib.dump(rf_tuned, '/content/rf_tuned.pkl')
joblib.dump(rf_tuned, '/content/rf_tuned.joblib')

['/content/rf_tuned.joblib']