In [None]:
#### Carregando dados

import warnings
warnings.filterwarnings('ignore')

import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns; sns.set()

from google.cloud.bigquery import Client, QueryJobConfig
# Pull the whole hackathon table from BigQuery into a pandas DataFrame.
client = Client()
query = "SELECT * FROM `hacka-dados.hacka_dados.tabela_dados_hacka`"
job = client.query(query)  # submit the query job
df = job.to_dataframe()    # materialise the job result as a DataFrame

## Limpeza

#### Notação de células vazias

def uniform_missing_values(df, notacoes=('nan', '')):
    """Replace textual missing-value markers with NaN in object columns.

    Only columns whose dtype is ``object`` are touched. The frame is
    modified in place and also returned, so both ``f(df)`` and
    ``df = f(df)`` work.

    Args:
        df: DataFrame to clean.
        notacoes: Iterable of cell values that should be treated as
            missing. Defaults to ``('nan', '')``.

    Returns:
        The same DataFrame object, with the markers replaced by ``np.nan``.
    """
    # Immutable default + local copy: the default can never be shared or
    # mutated across calls.
    markers = list(notacoes)
    if not markers:
        # Nothing to replace; avoid calling replace() with an empty list.
        return df

    obj_cols = df.columns[df.dtypes == 'object']
    for col in obj_cols:
        # One replace() call handles every marker at once.
        df[col] = df[col].replace(markers, np.nan)
    return df

# Normalise the textual missing-value markers to NaN across the loaded table.
missing_markers = ['nan', '']
df = uniform_missing_values(df, notacoes=missing_markers)

# Descomente para conferir os valores distintos de cada coluna de texto (antes ou depois da substituição).
# for col in df.columns[df.dtypes=='object']: print(f'{col}: {df[col].unique()}'); print()

#### Células vazias e nulas por coluna

# Fraction of missing (NaN) cells and of zero-valued cells per column,
# each sorted in descending order. Values are fractions in [0, 1].
empty = df.isna().sum().nlargest(df.shape[1]) / df.shape[0]
null = (df==0).sum().nlargest(df.shape[1]) / df.shape[0]

fig, ax = plt.subplots(figsize=(6, 3.5))
# Draw on `ax` explicitly instead of relying on the implicit current axes.
empty.plot(ax=ax)
null.plot(ax=ax)
# Combined curve: zeros are re-aligned to the column order of `empty`.
(empty + null.loc[empty.index]).plot(ax=ax)
ax.set(
    xticks=[],
    xlabel='Colunas',
    ylabel='Células (%)',
    title='Células vazias e nulas por coluna (%)',
)
# The third label was an unterminated string literal (SyntaxError).
ax.legend(['Vazios (%)', 'Nulos (%)', 'Vazios+Nulos (%)'])
plt.show()

# Columns with the largest share of zero-valued cells.
null.head(3)

#### Células vazias e nulas por linha

# Fraction of missing (NaN) cells and of zero-valued cells in each row.
# No pre-sorting is needed: percentiles do not depend on the input order.
empty = df.isna().sum(axis=1) / df.shape[1]
null = (df==0).sum(axis=1) / df.shape[1]

fig, ax = plt.subplots(figsize=(6, 3.5))
# x = percentile rank over the rows (0..99), y = cell fraction at that rank.
ax.plot(np.percentile(empty, range(100)))
ax.plot(np.percentile(null, range(100)))
ax.set(
    xticks=[],
    # Axis labels were swapped: rows are on x, cell fraction is on y.
    xlabel='Linhas (%)',
    ylabel='Células (%)',
    title='Percentil vazios/nulos por linha (%/%)',
)
ax.legend(['Vazios (%)', 'Nulos (%) dentre não vazios'])
plt.show()

---
## Modelagem preliminar de classificação da renovação

# Pessoal, não se assustem: copiei de outro projeto meu para datasets desbalanceados.

### Transformação de dados

# Work on a copy so the cleaned `df` stays untouched.
data = df.copy()

# Column names grouped by dtype.
int_cols, obj_cols, float_cols = (
    data.select_dtypes([datatype]).columns.tolist() for datatype in ('int64', 'object', 'float')
)

# import preprocessing functions
from sklearn.preprocessing import LabelEncoder as le, MinMaxScaler as mms

#### Label Encode Categorical (+ Integer) Columns
for col in obj_cols + int_cols:
    # NOTE(review): the original comment says the encoder copes with missing
    # values by itself -- confirm for the installed scikit-learn version.
    data[col] = le().fit_transform(data[col])

if float_cols:
    # Fill missing float values with 0 by assigning the result back.
    # The previous `data[col].fillna(0, inplace=True)` was chained in-place
    # assignment: it acts on a temporary Series and leaves `data` unchanged
    # under pandas Copy-on-Write.
    data[float_cols] = data[float_cols].fillna(0)

    #### Scale Float Columns
    # MinMaxScaler's default range is [0, 1], which also removes negatives.
    data[float_cols] = mms().fit_transform(data[float_cols])

### Defina a variável dependente

target = 'fl_renovou'

# Split the transformed table into the target vector Y and the features X.
Y = data[target].copy()
# `data.drop(target, 1)` passed the axis positionally, which pandas 2.0 no
# longer accepts; name the columns explicitly. drop() already returns a new
# frame, so no extra copy is needed.
X = data.drop(columns=target)

# Show how many rows fall in each class of the target variable.
display(Y.value_counts().rename('Target variable class count').to_frame())

### Under sampling

from módulos.splitter import UnderSampleSplit
from módulos.cv_samplers import print_cls_cnt

# Project-local under-sampling splitter.
# NOTE(review): parameter semantics live in `módulos.splitter`; presumably an
# 80/20 shuffled split drawn without replacement -- confirm against that module.
uss = UnderSampleSplit(
    train_size=0.8,
    train_prct=1,
    test_size=0.2,
    test_prct=None,
    replace=False,
    shuffle=True,
    random_state=None,
)

# One seeded train/test partition of the target's index.
train_index, test_index = uss.train_test_undersample(Y, random_state=0)
xt, yt = X.loc[train_index], Y.loc[train_index]
xe, ye = X.loc[test_index], Y.loc[test_index]
print_cls_cnt(Y, train_index, test_index)

# Under-sampled cross-validation splits reused by the scoring cells below.
cv = uss.split(Y, n_splits=10, param_list=None)
# To force the same test set on the first splits, uncomment:
# for i in range(5): cv[i] = (cv[i][0], test_index)

# Index labels that were not drawn into the training sample.
left_index = set(Y.index) - set(train_index)

### Modelos e métricas de avaliação

# define model collection
# Registry of scikit-learn classifiers, keyed by class name.
from sklearn.utils import all_estimators
classifiers = dict(all_estimators('classifier'))

#### define score functions
import sklearn.metrics as metrics

# Per-class scorers: the same metric evaluated with each label as positive.
recall_0 = metrics.make_scorer(metrics.recall_score, pos_label=0)
recall_1 = metrics.make_scorer(metrics.recall_score, pos_label=1)
precision_0 = metrics.make_scorer(metrics.precision_score, pos_label=0)
precision_1 = metrics.make_scorer(metrics.precision_score, pos_label=1)

# Built-in scorer names first, then the per-class scorers.
scoring = {
    'accuracy': 'accuracy',
    'f1_macro': 'f1_macro',
    'recall_macro': 'recall_macro',
    'precision_macro': 'precision_macro',
    'recall_0': recall_0,
    'recall_1': recall_1,
    'precision_0': precision_0,
    'precision_1': precision_1,
}

### Comparando shuffle splits aleatórios

# Estimator under evaluation.
# Other candidates: 'DecisionTreeClassifier', 'LinearSVC', 'LogisticRegression'.
modelname = 'GradientBoostingClassifier'
Model = classifiers[modelname]
model = Model(n_estimators=100, random_state=0)

#### score cross validation splits
from sklearn.model_selection import cross_validate

# Evaluate every scorer in `scoring` on each under-sampled split, in parallel.
model_scrs = cross_validate(
    model, X, Y,
    cv=cv, scoring=scoring, groups=None,
    n_jobs=-1, pre_dispatch='all', verbose=5,
)

# One row per CV split, one column per scorer (test-set scores only).
scrs_df = pd.DataFrame(
    {name: model_scrs[f'test_{name}'] for name in scoring}
).rename_axis('cv split')

#### Scores médios

# Mean and variance of each score across the CV splits.
# String names replace `np.mean` / `np.var`: pandas treats the numpy
# callables as deprecated aliases for Series.mean / Series.var, and a future
# version would call np.var directly (ddof=0), changing the variance shown.
scrs_df.agg(['mean', 'var'])

#### Scores per split

scrs_df

### Curvas de aprendizado Shuffle splits

from sklearn.model_selection import learning_curve

# Squaring the evenly spaced fractions concentrates points at small train sizes.
order = 2
train_sizes = np.linspace(0.1, 1, 11) ** order

# One recall learning curve per class, computed over the same CV splits.
lc_0, lc_1 = (
    learning_curve(
        model, X, Y,
        train_sizes=train_sizes,
        cv=cv,
        scoring=class_scorer,
        groups=None,
        shuffle=True,
        random_state=0,
        n_jobs=-1,
        pre_dispatch='all',
        verbose=5,
        error_score='raise',
    )
    for class_scorer in (recall_0, recall_1)
)

### Plot recall learning curves
# One figure per class: train-set curves on the left, test-set curves on the right.
for curve, cls_label in zip((lc_0, lc_1), ('class 0', 'class 1')):
    # Each learning_curve result is indexed as (sizes, train scores, test scores).
    sizes = curve[0]
    train_lc = pd.DataFrame(curve[1], index=sizes)
    test_lc = pd.DataFrame(curve[2], index=sizes)

    fig, ax = plt.subplots(1, 2, figsize=(9, 3), tight_layout=True)
    panels = zip(ax, (train_lc, test_lc), ('train set', 'test set'))
    for axis, scores, label in panels:
        axis.plot(scores)
        axis.set(
            title=f'Cross Validation Splits´ Learning Curves - {label}',
            ylabel=f'Recall - {cls_label}',
            xlabel='Train size (nº of samples)',
        )
    plt.show()

---
#### Instalações

!pip install imblearn