## import

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

from yellowbrick.regressor import ResidualsPlot

import lime
import lime.lime_tabular

## Carregamento dos dados (DataFrames)

In [None]:
# Read the three Excel workbooks under ../../dist into DataFrames, in order:
# production records, predictive-analysis input, prescriptive-analysis input
# (roles as suggested by the file names).
df_registros_prod, df_analise_preditiva, df_analise_prescritiva = map(
    pd.read_excel,
    (
        "../../dist/registros-prod.xlsx",
        "../../dist/analise-preditiva.xlsx",
        "../../dist/analise-prescritiva.xlsx",
    ),
)

## Funções auxiliares

In [None]:
def fnc_Dummies(df):
    """Label-encode every object-dtype column and return the result as a new DataFrame.

    Despite the name, no one-hot ("dummy") columns are created: each object
    column is replaced by the integer codes produced by ``pd.Categorical``.
    Missing values, which ``pd.Categorical`` encodes as ``-1``, become ``NaN``
    (this turns the affected column into a float column).

    Args:
        df: DataFrame to encode. It is left untouched; the encoding is applied
            to a copy so callers keep their original data.

    Returns:
        pandas.DataFrame: copy of ``df`` whose object columns hold category
        codes; all other columns are unchanged.
    """
    # Work on a copy: assigning into `df` directly would silently overwrite the
    # caller's categorical columns.
    encoded = df.copy()
    for cat_feature in encoded.select_dtypes(include=['object']).columns:
        encoded[cat_feature] = pd.Categorical(encoded[cat_feature]).codes
        # -1 is the sentinel code for "missing"; expose it as NaN instead
        encoded[cat_feature] = encoded[cat_feature].replace(-1, np.nan)
    return pd.DataFrame(encoded)

## 1. Análise Descritiva

In [None]:
# Original data types: column dtypes and non-null counts of the raw frame
df_registros_prod.info()

In [None]:
# Categorical conversion: encode object columns on a copy so the raw frame
# stays available, then re-check the resulting dtypes
df_registros_prod_v2 = fnc_Dummies(df_registros_prod.copy())
df_registros_prod_v2.info()

In [None]:
# Preview the first rows of the encoded frame
df_registros_prod_v2.head()

In [None]:
# Basic descriptive statistics of the features
df_registros_prod_v2.describe()

In [None]:
# Histograms (30 bins) via DataFrame.hist; the trailing ';' hides the axes repr
df_registros_prod_v2.hist(bins=30);

In [None]:
# Simple linear regression, every column against every column:
# one figure per target column, one regression panel per column of the frame.
lista = df_registros_prod_v2.columns
features_to_analyse = lista  # loop-invariant: the same columns serve as x-axis candidates

# Size the grid from the number of columns (two panels per row). A fixed 2x2
# grid would silently drop every column after the fourth.
panels_per_row = 2
panel_rows = int(np.ceil(len(features_to_analyse) / panels_per_row))

for var_interesse in lista:
    fig, axes = plt.subplots(panel_rows, panels_per_row, figsize=(10, 5 * panel_rows))
    for i, ax in enumerate(fig.axes):
        if i < len(features_to_analyse):
            sns.regplot(x=features_to_analyse[i], y=var_interesse, data=df_registros_prod_v2, ax=ax)
        else:
            # Odd column count: hide the unused trailing panel
            ax.set_visible(False)

## 2. Análise Diagnóstica

In [None]:
# Pearson correlation matrix
corr = df_registros_prod_v2.corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# Builtin `bool` is used as dtype: the `np.bool` alias was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots()

# Custom diverging colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw the masked heatmap on the axes created above, with square cells and
# the coefficient annotated in each cell
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, annot=True, annot_kws={"size": 15},
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

In [None]:
# Show the correlation matrix as a table
corr

In [None]:
# Box plots of PESO_BOMBOM per feature bucket (8 bins from pd.cut),
# overlaid with the ideal weight band
for var_boxplot in ('QTD_CHOC', 'VAR_1', 'VAR_2'):
    plt.figure(figsize=(10, 6))
    buckets = pd.cut(df_registros_prod_v2[var_boxplot], 8)
    ax = sns.boxplot(x=buckets, y='PESO_BOMBOM', data=df_registros_prod_v2)

    # Ideal weight band, 9-10 g: a rectangle from y=9 to y=10 starting at x=-1, 100 units wide
    rect = plt.Rectangle((-1, 9), 100, 1, color='green', alpha=0.1, ec='red')
    ax.add_patch(rect)
    ax.set_title(var_boxplot + ' vs PESO_BOMBOM')

    # NOTE(review): styles whatever is in `ax.artists`; whether the boxes live
    # there depends on the matplotlib/seaborn versions - confirm
    plt.setp(ax.artists, alpha=.5, linewidth=2, edgecolor="k")
    plt.xticks(rotation=45)

## 3. Análise Preditiva

In [None]:
# Predictive model - GradientBoostingRegressor
df_mqo_v3 = df_registros_prod_v2

# Training variables
feature_names = ['QTD_CHOC', 'VAR_1', 'VAR_2']
target_name = ['PESO_BOMBOM']

X = df_mqo_v3[feature_names]
y = df_mqo_v3[target_name].values.ravel()  # flatten the single-column target to 1-D

# Train/test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

# Seed the estimator as well, so a Restart & Run All reproduces the same model
model = GradientBoostingRegressor(random_state=50)

# Residuals plot: fit on the training split, score on the test split
visualizer = ResidualsPlot(model)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)

# Yellowbrick 1.0 renamed `poof()` to `show()`; fall back to the old name on
# older releases so the cell runs on either
render_plot = getattr(visualizer, 'show', None) or visualizer.poof
render_plot();

In [None]:
# Mean squared error
# Hold-out error first: fit on the training split only and score on rows the
# model has not seen. This is the number that estimates generalisation.
model.fit(X_train, y_train)
print('MSE (hold-out): ', mean_squared_error(y_test, model.predict(X_test)))

# Final model: refit the same estimator on the full dataset. The MSE below is
# in-sample (scored on the training data) and therefore optimistic.
train_model = model.fit(X,y)
y_true = y
y_pred = train_model.predict(X)
print('MSE: ', mean_squared_error(y_true, y_pred))

In [None]:
# Side-by-side comparison: distribution of observed vs predicted values
df_metric = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
df_metric.hist()

In [None]:
# Encode a copy of the dataset; copying here keeps `df_analise_preditiva`
# as loaded instead of having its object columns overwritten
df_analise_preditiva_v2 = fnc_Dummies(df_analise_preditiva.copy())

# Variables for prediction (same columns the model was trained on)
feature_names = ['QTD_CHOC', 'VAR_1', 'VAR_2']
X_2 = df_analise_preditiva_v2[feature_names]

# Predict with the fitted model and store the result as a new column
df_analise_preditiva_v2['PESO_BOMBOM'] = model.predict(X_2)

In [None]:
# Summary statistics of the scored dataset, including the predicted PESO_BOMBOM
df_analise_preditiva_v2.describe()

In [None]:
# Save predictions. The context manager writes and closes the workbook on
# exit; `ExcelWriter.save()` no longer exists in pandas 2.x.
with pd.ExcelWriter('../../dist/analise-preditiva-new.xlsx', engine='xlsxwriter') as writer:
    df_analise_preditiva_v2.to_excel(writer, sheet_name='Sheet1', index=False)

## Bônus - lime

In [None]:
# Build the LIME tabular explainer from the training split (regression mode)
explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train.values,
    feature_names=list(X_train.columns),
    # Use the target list defined with the model. The previous value was a loop
    # variable left over from the regression-plot cell, i.e. hidden state.
    class_names=target_name,
    verbose=True,
    mode='regression'
)

# Seeded wrapper around the explainer
def explain(instance, predict_fn, **kwargs):
    """Explain one instance with the module-level ``explainer`` under a fixed NumPy seed.

    Args:
        instance: the row to explain, passed through to ``explain_instance``.
        predict_fn: prediction function passed through to ``explain_instance``.
        **kwargs: extra keyword arguments forwarded unchanged.

    Returns:
        Whatever ``explainer.explain_instance`` returns.
    """
    # Reset NumPy's global seed before every call
    np.random.seed(50)
    explanation = explainer.explain_instance(instance, predict_fn, **kwargs)
    return explanation

# Explain a single test-set row with the fitted model's predict function,
# keeping the 3 most relevant features
# NOTE(review): positional index 99 assumes X_test has at least 100 rows - confirm
i = 99
exp = explain(X_test.values[i], train_model.predict, num_features=3)

In [None]:
# Results: render the explanation inline, including the feature-value table
exp.show_in_notebook(show_table=True)

## 4. Análise Prescritiva