## Pacotes

In [None]:
import zipfile
import time

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import model_selection, ensemble, tree, neighbors, gaussian_process
from fancyimpute import SoftImpute

## Funções e Variáveis

In [None]:
# Candidate list inspired by the scikit-learn classifier comparison example:
# https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

# Classifier classes (not instances); each one is instantiated with its
# default arguments at evaluation time.
# NOTE(review): GaussianProcessClassifier scales poorly with the number of
# samples -- presumably the reason the evaluation loop tolerates failures;
# confirm it is still wanted here.
modelos = [
    ensemble.GradientBoostingClassifier,
    ensemble.RandomForestClassifier,
    tree.DecisionTreeClassifier,
    neighbors.KNeighborsClassifier,
    gaussian_process.GaussianProcessClassifier
]

## Datasets

In [None]:
# Small dataset: each archive member is a tab-separated file.
# The archive and the member stream are opened in a `with` statement so both
# handles are closed as soon as pandas has finished parsing them.
with zipfile.ZipFile('../../dist/orange_small_train.data.zip') as train_zip, \
        train_zip.open('orange_small_train.data') as train_file:
    df_small_train = pd.read_csv(train_file, sep='\t')

with zipfile.ZipFile('../../dist/orange_small_test.data.zip') as test_zip, \
        test_zip.open('orange_small_test.data') as test_file:
    df_small_test = pd.read_csv(test_file, sep='\t')

# Labels: one header-less file per target, attached to the training frame as
# extra columns.
df_small_train['appetency'] = pd.read_csv('../../dist/orange_small_train_appetency.labels',header = None)
df_small_train['churn'] = pd.read_csv('../../dist/orange_small_train_churn.labels',header = None)
df_small_train['upselling'] = pd.read_csv('../../dist/orange_small_train_upselling.labels',header = None)

In [None]:
# Summary statistics of the raw training frame (labels included).
df_small_train.describe()

# Data Cleaning, Preparation & Missing Values

##  #6 - Selecionar fatos com 30%+ de dados

In [None]:
# Work on a copy and keep only the columns whose share of non-null values is
# at least 30%.
df = df_small_train.copy()
fill_ratio = df.count() / len(df)
keep_mask = (fill_ratio >= 0.3).values
df_v2 = df.loc[:, keep_mask]

# Report the discarded columns on a single line, each followed by ", ".
print("Lista de fatos excluídos:", end=" ")

for dropped_name in df.columns[~keep_mask]:
    print(dropped_name, end=", ")

In [None]:
# Preview the first rows of the filtered frame.
df_v2.head()

## #7 - Codificação das variáveis categóricas em códigos inteiros para os modelos

In [None]:
# Encode every object-typed column as integer category codes on a copy of the
# filtered frame. (For a quick experiment, start from
# df_v2.iloc[0:100].copy() instead.)
df_v3 = df_v2.copy()

object_columns = df_v3.select_dtypes(include=['object']).columns
for cat_feature in object_columns:
    # pd.Categorical marks missing entries with code -1; turn those back into
    # NaN so they are still treated as missing afterwards.
    codes = pd.Series(pd.Categorical(df_v3[cat_feature]).codes, index=df_v3.index)
    df_v3[cat_feature] = codes.replace(-1, np.nan)

df_v3.head()

## #8 - Tratamento de missing values

In [None]:
# Fill the remaining gaps with SoftImpute and wrap the returned matrix in a
# DataFrame that reuses the column labels of df_v3.
# NOTE(review): the three label columns are still part of df_v3 here, so they
# take part in the imputation -- confirm this is intended.
imp_cols = df_v3.columns.values
imputer = SoftImpute()
imputed_matrix = imputer.fit_transform(df_v3)
df_v4 = pd.DataFrame(imputed_matrix, columns=imp_cols)

In [None]:
# Summary statistics after imputation.
df_v4.describe()

# Exploratory Data Analysis

## #9 - Histogramas

In [None]:
# One 50-bin histogram per column of the imputed frame; the trailing semicolon
# keeps the notebook from echoing the returned value.
df_v4.hist(figsize=(32, 40), bins=50, xlabelsize=8, ylabelsize=8);

## #10 - Regressão linear (out)

In [None]:
# features_to_analyse = df_v4.columns

# fig, ax = plt.subplots(round(len(features_to_analyse) / 3), 3, figsize = (32,40))

# for i, ax in enumerate(fig.axes):
#     if i < len(features_to_analyse) - 1:
#         sns.regplot(x=features_to_analyse[i],y='appetency', data=df_v4[features_to_analyse], ax=ax)

## #11 - Matriz de correlação

In [None]:
# Pairwise correlation between all columns of the imputed frame.
corr = df_v4.corr()

# Mask for the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the heatmap with the mask and a square aspect ratio; colours saturate
# at 0.3 and are centred on 0.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,annot=False, annot_kws={"size": 8}, 
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Correlation of every feature with the three targets: drop the targets' own
# rows, keep only the targets' columns, then summarise.
target_names = ['appetency','churn','upselling']
var_interesse = corr.drop(index=target_names)[target_names]
var_interesse.describe()

# Model Selection & Building

In [None]:
# Prepare the training matrix: every imputed column except the three targets.
lista_var_interesse = ['appetency','churn','upselling']
df_v5 = df_v4.drop(columns=lista_var_interesse)

# Result table. The first row is the header; every later row is one
# (target, model) evaluation. 'target' is appended as the last column so rows
# from different targets can be told apart while the original column
# positions stay unchanged.
resultados = [['status','model','mean','std','time','target']]

# Score each model on each target with 10-fold cross-validation.
for var_interesse in lista_var_interesse:
    print('='*100)
    print(var_interesse)
    print('='*100)

    Y = df_v4[var_interesse].values
    for var in modelos:
        start = time.time()
        print(var)
        try:
            clf = var()
            # error_score='raise' surfaces fit failures here so they are
            # recorded as an 'erro' row below.
            scores = model_selection.cross_val_score(clf, df_v5, Y, cv=10, error_score='raise')
        except Exception as exc:
            # Best-effort sweep: one failing model must not stop the others,
            # but the cause is reported so it can be diagnosed.
            print('>> Validar parâmetros.')
            print('>>', type(exc).__name__, exc)
            resultados.append(['erro',var.__name__,None,None,time.time() - start,var_interesse])
        else:
            print('Mean score: ',np.mean(scores), '/ Std Score: ',np.std(scores))
            resultados.append(['ok',var.__name__,np.mean(scores),np.std(scores),time.time() - start,var_interesse])
        finally:
            print('-'*100)

In [None]:
# Build the results table: the first entry of `resultados` is the header row,
# the remaining entries are the data rows. Passing the header through
# `columns=` also works when no data rows were collected.
df_final = pd.DataFrame(resultados[1:], columns=resultados[0])

# Write the table to Excel. The context manager saves and closes the workbook
# on exit; `ExcelWriter.save()` was removed in pandas 2.0.
with pd.ExcelWriter('../../dist/resultados_modelos.xlsx', engine='xlsxwriter') as writer:
    df_final.to_excel(writer, sheet_name='Sheet1', index=False)

In [None]:
# Show the results ordered by mean cross-validation score, best first.
df_final.sort_values(by='mean', ascending=False)