## 0.0 - Imports

In [1]:
import joblib
import pandas as pd
import pandera
from sklearn.model_selection import train_test_split
from pandera import Check, Column, DataFrameSchema
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

## 0.1 - Utils

In [7]:
# Columns kept from the raw file: the label first, then the model features.
columns_to_use = [
    'target',
    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
    'Idade',
    'NumeroDeVezes30-59DiasAtrasoNaoPior',
    'TaxaDeEndividamento',
    'RendaMensal',
    'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
    'NumeroDeVezes90DiasAtraso',
    'NumeroDeEmprestimosOuLinhasImobiliarias',
    'NumeroDeVezes60-89DiasAtrasoNaoPior',
    'NumeroDeDependentes',
]

## 1.0 - Data Load

<p>É importante a criação desta etapa de Data Load porque é bem possível que apenas esta parte mude em seu projeto.</p>
<p>Ex.:</p>
    <p style="padding-left: 50px">O projeto criado estava lendo os dados de um arquivo .csv e agora a fonte dos dados muda para um banco de dados PostgreSQL.</p>

In [4]:
class DataLoad:
    """Loads the raw dataset from a CSV file.

    Keeping the load step in its own class means only this class has to
    change if the data source changes.
    """

    def __init__(self, path: str = '../data/train.csv') -> None:
        """
        Args:
            path: Location of the CSV file to read. The default is the path
                that was previously hard-coded, so ``DataLoad()`` behaves
                exactly as before.
        """
        self.path = path

    def load_data(self) -> pd.DataFrame:
        """Read the CSV file at ``self.path``.

        Returns:
            pandas DataFrame with the file contents as parsed by
            ``pd.read_csv`` with its default options.
        """
        return pd.read_csv(self.path)

In [5]:
# Create the loader object used by the next cell to read the data.
dl = DataLoad()

In [9]:
# Load the data, keep only the columns named in columns_to_use (in that
# order), and preview the first rows.
df = dl.load_data().loc[:, columns_to_use]
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## 2.0 - Data Validation

In [12]:
class DataValidation:
    """Validates a DataFrame before it is used downstream.

    Two checks are run: the frame must contain exactly the expected columns
    (``check_shape_data``), and every column must satisfy the pandera schema
    declared in ``check_columns``.
    """

    def __init__(self, columns_to_use) -> None:
        """
        Args:
            columns_to_use: Names of the columns the incoming DataFrame is
                expected to have.
        """
        self.columns_to_use = columns_to_use

    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        """Check that ``dataframe`` has exactly the expected columns.

        The frame is only inspected, never modified. Column order is not
        enforced, but missing, unexpected or duplicated columns fail the
        check.

        Args:
            dataframe: Frame to inspect.

        Returns:
            True when the frame's column names match ``columns_to_use``,
            False otherwise (including when ``dataframe`` cannot be
            inspected at all).
        """
        try:
            print('Validação Iniciou')
            expected = list(self.columns_to_use)
            received = list(dataframe.columns)
        except Exception as e:
            # Kept broad on purpose: callers rely on a boolean answer, not an
            # exception, when the input is not a usable DataFrame.
            print(f'Validação errou: {e}')
            return False

        missing = [name for name in expected if name not in received]
        unexpected = [name for name in received if name not in expected]
        # The length comparison catches duplicated column names, which the
        # two membership tests above cannot see.
        if missing or unexpected or len(received) != len(expected):
            print(
                f'Validação errou: colunas ausentes: {missing}; '
                f'colunas inesperadas: {unexpected}; '
                f'esperadas {len(expected)} colunas, recebidas {len(received)}'
            )
            return False
        return True

    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        """Validate column dtypes and values against the pandera schema.

        ``target`` is coerced to int and must be 0 or 1; the remaining
        columns are only checked for dtype and may contain nulls.

        Args:
            dataframe: Frame to validate. It is not modified; the coerced
                copy returned by pandera is discarded.

        Returns:
            True when the frame satisfies the schema, False otherwise. On
            failure the offending cases are printed.
        """
        schema = DataFrameSchema(
            {
                # Checks are passed by keyword: the third positional argument
                # of Column is not a checks slot.
                'target': Column(int, checks=Check.isin([0, 1]), coerce=True),
                'TaxaDeUtilizacaoDeLinhasNaoGarantidas': Column(float, nullable=True),
                'Idade': Column(int, nullable=True),
                'NumeroDeVezes30-59DiasAtrasoNaoPior': Column(int, nullable=True),
                'TaxaDeEndividamento': Column(float, nullable=True),
                'RendaMensal': Column(float, nullable=True),
                'NumeroDeLinhasDeCreditoEEmprestimosAbertos': Column(int, nullable=True),
                'NumeroDeVezes90DiasAtraso': Column(int, nullable=True),
                'NumeroDeEmprestimosOuLinhasImobiliarias': Column(int, nullable=True),
                'NumeroDeVezes60-89DiasAtrasoNaoPior': Column(int, nullable=True),
                'NumeroDeDependentes': Column(float, nullable=True)
            }
        )

        try:
            # lazy=True collects every failure and raises SchemaErrors, whose
            # failure_cases attribute lists all of them at once.
            schema.validate(dataframe, lazy=True)
        except pandera.errors.SchemaErrors as exc:
            print("Validation columns failed...")
            print(exc.failure_cases)
            return False

        print("Validation columns passed...")
        return True

    def run(self, dataframe: pd.DataFrame) -> bool:
        """Run both checks in order and report the overall result.

        ``check_columns`` only runs when ``check_shape_data`` succeeds.

        Args:
            dataframe: Frame to validate.

        Returns:
            True when both checks pass, False otherwise.
        """
        if self.check_shape_data(dataframe) and self.check_columns(dataframe):
            print('Validação com sucesso')
            return True
        else:
            print('Validação Falhou!')
            return False

In [13]:
# Build the validator with the list of expected column names.
dv = DataValidation(columns_to_use)

In [14]:
# Run the validation checks on the loaded frame; the boolean result is the cell output.
dv.run(df)

Validação Iniciou
Validation columns passed...
Validação com sucesso


True