## 0.0 - Imports

In [4]:
import joblib
import pandas as pd
import pandera
from sklearn.model_selection import train_test_split
from pandera import Check, Column, DataFrameSchema
from sklearn.pipeline import Pipeline
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

## 0.1 - Utils

In [5]:
# Columns kept from the raw dataset: the label first, then the ten features.
columns_to_use = [
    'target',
    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
    'Idade',
    'NumeroDeVezes30-59DiasAtrasoNaoPior',
    'TaxaDeEndividamento',
    'RendaMensal',
    'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
    'NumeroDeVezes90DiasAtraso',
    'NumeroDeEmprestimosOuLinhasImobiliarias',
    'NumeroDeVezes60-89DiasAtrasoNaoPior',
    'NumeroDeDependentes',
]

## 1.0 - Data Load

<p>É importante a criação desta etapa de Data Load porque é bem possível que apenas esta parte mude em seu projeto.</p>
<p>Ex.:</p>
    <p style="padding-left: 50px">O projeto criado estava lendo os dados de um arquivo .csv e agora a fonte de dados passa a ser um banco de dados PostgreSQL.</p>

In [6]:
class DataLoad:
    """Load the raw dataset from a CSV file.

    The data source is isolated in this class so that only this stage has to
    change when the data moves somewhere else.
    """

    def __init__(self, path: str = '../data/train.csv') -> None:
        """
        Args:
            path: Location of the CSV file to read. Defaults to the training
                file used by this notebook, so ``DataLoad()`` keeps working.
        """
        self.path = path

    def load_data(self) -> pd.DataFrame:
        """
        Read the CSV file at ``self.path``.

        Returns:
            pandas DataFrame with the file's contents.
        """
        return pd.read_csv(self.path)

In [7]:
dl = DataLoad()

In [8]:
# Load the data and keep only the columns listed in `columns_to_use`.
df = dl.load_data()[columns_to_use]
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


## 2.0 - Data Validation

In [9]:
class DataValidation:
    """Validate a DataFrame before it enters the rest of the pipeline.

    Two checks are performed: the column names must match the expected list,
    and every column must satisfy the pandera schema declared in
    ``check_columns``.
    """

    def __init__(self, columns_to_use) -> None:
        """
        Args:
            columns_to_use: Expected column names, in the expected order.
        """
        self.columns_to_use = columns_to_use

    def check_shape_data(self, dataframe: pd.DataFrame) -> bool:
        """Check that ``dataframe`` has exactly the expected columns.

        The frame is only inspected, never modified.

        Returns:
            True when the column names match ``columns_to_use`` (same names,
            same order); False otherwise, including when ``dataframe`` cannot
            be inspected at all.
        """
        try:
            print('Validação Iniciou')
            expected = list(self.columns_to_use)
            found = list(dataframe.columns)
            if found != expected:
                print(
                    f'Validação errou: colunas esperadas {expected}, '
                    f'colunas encontradas {found}'
                )
                return False
            return True
        except Exception as e:
            # Best-effort: an input that cannot be inspected fails validation
            # instead of crashing the notebook.
            print(f'Validação errou: {e}')
            return False

    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        """Validate column dtypes and values against the pandera schema.

        Returns:
            True when the schema validates; False when pandera reports
            failures (the failure cases are printed).
        """
        schema = DataFrameSchema(
            {
                # Binary label: coerced to int and restricted to {0, 1}.
                'target': Column(int, Check.isin([0, 1]), coerce=True),
                'TaxaDeUtilizacaoDeLinhasNaoGarantidas': Column(float, nullable=True),
                'Idade': Column(int, nullable=True),
                'NumeroDeVezes30-59DiasAtrasoNaoPior': Column(int, nullable=True),
                'TaxaDeEndividamento': Column(float, nullable=True),
                'RendaMensal': Column(float, nullable=True),
                'NumeroDeLinhasDeCreditoEEmprestimosAbertos': Column(int, nullable=True),
                'NumeroDeVezes90DiasAtraso': Column(int, nullable=True),
                'NumeroDeEmprestimosOuLinhasImobiliarias': Column(int, nullable=True),
                'NumeroDeVezes60-89DiasAtrasoNaoPior': Column(int, nullable=True),
                'NumeroDeDependentes': Column(float, nullable=True)
            }
        )

        try:
            # lazy=True collects every failure and raises a single
            # SchemaErrors instead of stopping at the first problem.
            schema.validate(dataframe, lazy=True)
            print("Validation columns passed...")
            return True
        except pandera.errors.SchemaErrors as exc:
            print("Validation columns failed...")
            print(exc.failure_cases)
            return False

    def run(self, dataframe: pd.DataFrame) -> bool:
        """Run both checks; the schema check only runs if the column check passes.

        Returns:
            True when both checks pass, False otherwise.
        """
        if self.check_shape_data(dataframe) and self.check_columns(dataframe):
            print('Validação com sucesso')
            return True
        else:
            print('Validação Falhou!')
            return False

In [10]:
dv = DataValidation(columns_to_use)

In [11]:
dv.run(df)

Validação Iniciou
Validation columns passed...
Validação com sucesso


True

## 3.0 - Data Transformation

In [12]:
class DataTransformation:
    """Split a labelled DataFrame into training and validation sets."""

    def __init__(self, dataframe: pd.DataFrame, target_name: str) -> None:
        """
        Args:
            dataframe: Full dataset, features and target together.
            target_name: Name of the target column inside ``dataframe``.
        """
        self.dataframe = dataframe
        self.target_name = target_name

    def train_test_spliting(self, test_size: float = 0.25, random_state: int = 42):
        """Split features and target into stratified train/validation parts.

        Args:
            test_size: Fraction of rows placed in the validation set.
            random_state: Seed for the shuffle, fixed by default so the split
                (and every metric computed from it) is the same on each run.
                Pass ``None`` to get a different split on every call.

        Returns:
            Tuple ``(X_train, X_valid, y_train, y_valid)``.
        """
        X = self.dataframe.drop(self.target_name, axis=1)
        y = self.dataframe[self.target_name]

        # stratify=y keeps the class proportions equal in both parts.
        X_train, X_valid, y_train, y_valid = train_test_split(
            X,
            y,
            test_size=test_size,
            stratify=y,
            random_state=random_state,
        )

        return X_train, X_valid, y_train, y_valid

In [13]:
dt = DataTransformation(df, 'target')

In [14]:
X_train, X_valid, y_train, y_valid = dt.train_test_spliting()

In [15]:
X_train.shape

(112500, 10)

In [17]:
X_valid.shape

(37500, 10)

## 4.0 - Data Preprocessing

In [18]:
class DataProcessing:
    """Fit a preprocessing pipeline on a DataFrame and transform that DataFrame."""

    def __init__(self, dataframe: pd.DataFrame, pipe: Pipeline) -> None:
        """
        Args:
            dataframe: Data used both to fit the pipeline and as input to ``run``.
            pipe: Pipeline to fit; it is fitted in place.
        """
        self.dataframe = dataframe
        self.pipe = pipe
        # Objects used by the most recent fit; lets pipeline() skip a redundant refit.
        self._fit_pipe = None
        self._fit_dataframe = None

    def pipeline(self, refit: bool = False):
        """Return the pipeline fitted on ``self.dataframe``.

        The fit runs on the first call, and again only when ``self.pipe`` or
        ``self.dataframe`` has been replaced by another object or when
        ``refit`` is True. Repeated calls therefore do not repeat the fit.

        Args:
            refit: Force a new fit, e.g. after the DataFrame was modified in place.

        Returns:
            The fitted pipeline (the same object as ``self.pipe``).
        """
        already_fitted = (
            self._fit_pipe is self.pipe
            and self._fit_dataframe is self.dataframe
        )
        if refit or not already_fitted:
            self.pipe.fit(self.dataframe)
            self._fit_pipe = self.pipe
            self._fit_dataframe = self.dataframe
        return self.pipe

    def run(self):
        """Fit the pipeline if needed and return the transformed ``self.dataframe``."""
        print('Preprocessador Iniciou...')
        trained_pipeline = self.pipeline()
        data_preprocessed = trained_pipeline.transform(self.dataframe)
        return data_preprocessed

In [19]:
# Preprocessing steps, applied in order: impute -> discretise -> scale.
pipe = Pipeline(
    steps=[
        # Fill missing values in the two listed columns (imputer's default settings).
        (
            'imputer',
            MeanMedianImputer(variables=['RendaMensal', 'NumeroDeDependentes']),
        ),
        # Bin the three listed continuous columns (discretiser's default settings).
        (
            'discretizer',
            EqualFrequencyDiscretiser(
                variables=[
                    'TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                    'TaxaDeEndividamento',
                    'RendaMensal',
                ]
            ),
        ),
        # StandardScaler through the feature_engine wrapper; no `variables` is passed.
        ('scaler', SklearnTransformerWrapper(StandardScaler())),
    ]
)

In [20]:
dp = DataProcessing(X_train, pipe)

In [21]:
x_train_processed = dp.run()

Preprocessador Iniciou...


In [22]:
X_train.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
11794,1.0,51,0,716.0,,4,0,0,0,0.0
20170,0.024781,64,0,7781.0,,14,0,4,0,1.0
48593,0.343776,23,0,0.239044,250.0,5,0,0,0,0.0
10245,1.0,28,98,0.0,1700.0,0,98,0,98,0.0
27729,0.604061,39,0,0.147104,3228.0,6,1,0,0,3.0


In [23]:
x_train_processed.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
11794,1.566699,-0.085477,-0.100617,1.219439,-0.137498,-0.866464,-0.063765,-0.907348,-0.058123,-0.666475
20170,-0.870388,0.794217,-0.100617,1.567737,-0.137498,1.08149,-0.063765,2.660299,-0.058123,0.236945
48593,0.522233,-1.980202,-0.100617,-0.522055,-1.529647,-0.671668,-0.063765,-0.907348,-0.058123,-0.666475
10245,1.566699,-1.641858,23.262003,-1.566951,-1.529647,-1.645646,23.43191,-0.907348,23.512937,-0.666475
27729,0.870388,-0.897502,-0.100617,-0.870354,-1.18161,-0.476873,0.175986,-0.907348,-0.058123,2.043785


In [24]:
# Persist the pipeline returned by `dp.pipeline()` to 'preprocessor.joblib'.
joblib.dump(dp.pipeline(), 'preprocessor.joblib')

['preprocessor.joblib']

## 5.0 - Train Models

In [None]:
class TrainModels:
    """Fit a model, persist it with joblib, and predict from the persisted file."""

    def __init__(self, dados_X: pd.DataFrame,
                 dados_y: pd.Series,
                 model_path: str = 'modelo.joblib') -> None:
        """
        Args:
            dados_X: Training features.
            dados_y: Training target.
            model_path: File where the fitted model is written by ``train``
                and read back by ``predict``.
        """
        self.dados_X = dados_X
        self.dados_y = dados_y
        self.model_path = model_path

    def train(self, model):
        """Fit ``model`` on the stored training data and save it to ``model_path``.

        Args:
            model: Estimator exposing ``fit``; ``predict`` later requires it to
                expose ``predict_proba`` as well.
        """
        model.fit(self.dados_X, self.dados_y)
        joblib.dump(model, self.model_path)

    def predict(self, dados_para_prever: pd.DataFrame):
        """Load the saved model and return its ``predict_proba`` output.

        Args:
            dados_para_prever: Features to score, already preprocessed.

        Returns:
            Whatever the model's ``predict_proba`` returns for the input.
        """
        model_fitted = self._load_model()
        dados_pred = model_fitted.predict_proba(dados_para_prever)
        return dados_pred

    def _load_model(self):
        """Read the model back from ``model_path``."""
        # NOTE: joblib files are pickle-based; only load files this notebook
        # (or another trusted source) produced.
        model = joblib.load(self.model_path)
        return model

In [None]:
tm = TrainModels(dados_X=x_train_processed, dados_y=y_train)

In [None]:
tm.train(model=LogisticRegression())

In [None]:
y_train_pred = tm.predict(x_train_processed)

In [None]:
y_train_pred

array([[0.86287652, 0.13712348],
       [0.98208053, 0.01791947],
       [0.90694381, 0.09305619],
       ...,
       [0.989915  , 0.010085  ],
       [0.94596469, 0.05403531],
       [0.89512581, 0.10487419]])

## 6.0 - Models Evaluation

In [None]:
# Get the fitted preprocessing pipeline from `dp`; it transforms the validation features below.
preprocessor = dp.pipeline()

In [None]:
preprocessor

In [None]:
X_valid_processed = preprocessor.transform(X_valid)

In [None]:
y_valid_pred = tm.predict(X_valid_processed)

In [None]:
class ModelEvaluation:
    """Score predictions against the true labels."""

    def __init__(self) -> None:
        """Nothing to configure; the evaluator holds no state."""

    def eval_metrics(self, dados_reais, dados_preditos):
        """Return the ROC AUC of ``dados_preditos`` measured against ``dados_reais``.

        Args:
            dados_reais: True labels.
            dados_preditos: Predicted scores for the same rows.

        Returns:
            The value computed by ``roc_auc_score``.
        """
        return roc_auc_score(dados_reais, dados_preditos)

In [None]:
me = ModelEvaluation()

In [None]:
me.eval_metrics(y_train, y_train_pred[:, 1])

0.7973704635482746

In [None]:
me.eval_metrics(y_valid, y_valid_pred[:, 1])

0.78555542254399

In [None]:
# Show the validation predictions; `y_valid_pred` is assigned in the
# `tm.predict(X_valid_processed)` cell, which must run before this one
# (otherwise this cell raises NameError).
y_valid_pred

NameError: name 'y_valid_pred' is not defined