## AAAAA

Estou usando este arquivo apenas para desenvolver ideias de maneira rápida sobre o funcionamento do fluxo de dados; provavelmente todas essas funções foram movidas para uma lib.

O objetivo aqui é criar uma função que facilite o acesso aos datasets processados de teste e treino. É importante fazê-los passar pelos mesmos caminhos para evitar divergência no tratamento dos dados de treino e de teste.

In [63]:
import pandas as pd
import numpy as np
import joblib

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

Em um projeto maior, esses valores provavelmente ficariam salvos em arquivos YAML.

In [35]:

# --- File locations -------------------------------------------------------
# Where the fitted scaler is persisted between training and prediction runs.
SCALER_PATH = '../models/scaler.joblib'
# Raw CSV inputs for training and for prediction.
TRAINING_FILE_BASE_PATH = '../data/Abandono_clientes.csv'
TEST_FILE_BASE_PATH = '../data/Abandono_teste.csv'

# --- Column lists ---------------------------------------------------------
# Feature columns expected in a frame used for prediction.
REQUIRED_PREDICT_COLUMNS = [
    'CreditScore',
    'Geography',
    'Gender',
    'Age',
    'Tenure',
    'Balance',
    'NumOfProducts',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
]

# Training frames need the same features plus the 'Exited' column.
REQUIRED_TRAINING_COLUMNS = REQUIRED_PREDICT_COLUMNS + ['Exited']

# Numeric feature columns.
NUMERICS = [
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary',
]

# Every allowed value for each categorical column.
CATEGORICALS = dict(
    Geography=['France', 'Germany', 'Spain'],
    Gender=['Male', 'Female'],
    HasCrCard=[0, 1],
    IsActiveMember=[0, 1],
)

In [131]:
def load_training_data() -> pd.core.frame.DataFrame:
    """
    Read the raw training CSV from disk.

    Parameters:
        None

    Returns:
        Pandas DataFrame parsed from TRAINING_FILE_BASE_PATH using the
        default ``pd.read_csv`` settings
    """
    return pd.read_csv(TRAINING_FILE_BASE_PATH)

def load_data() -> pd.core.frame.DataFrame:
    """
    Read the raw prediction CSV from disk.

    Parameters:
        None

    Returns:
        Pandas DataFrame parsed from TEST_FILE_BASE_PATH; this file is read
        with ';' as the field separator
    """
    return pd.read_csv(TEST_FILE_BASE_PATH, sep=';')

def assert_categoricals(df: pd.core.frame.DataFrame) -> bool:
    """
    Check that every categorical column holds only known values.

    Parameters:
        df: Pandas DataFrame containing every column named in CATEGORICALS

    Returns:
        True when each distinct value of each categorical column is listed
        in CATEGORICALS; False (after printing the offending value) as soon
        as an unknown value is found
    """
    for column, allowed_values in CATEGORICALS.items():
        # Only the distinct values need to be inspected
        for observed in df[column].unique():
            if observed in allowed_values:
                continue
            print('{} value for column {}, is not contained in CATEGORICALS dict'
                  .format(observed, column))
            return False
    return True

def assert_data(df: pd.core.frame.DataFrame) -> None:
    """
    Run the sanity checks on a dataframe and print the outcome.

    Two checks are performed: every column listed in
    REQUIRED_TRAINING_COLUMNS must be present, and the categorical columns
    must only contain values listed in CATEGORICALS. A line is printed for
    each problem found; 'All assertions passed' is printed only when no
    check failed.

    Parameters:
        df: Training data or Test data, as Pandas DataFrame

    Returns:
        None
    """
    # Report every missing required column, not just the first one
    missing_columns = [
        column for column in REQUIRED_TRAINING_COLUMNS if column not in df.columns
    ]
    for column in missing_columns:
        print("Missing Column: {}".format(column))

    assertions_pass = not missing_columns

    # assert_categoricals indexes df by every CATEGORICALS key, so it can
    # only run when all of those columns exist (otherwise it raises KeyError)
    if all(column in df.columns for column in CATEGORICALS):
        # Call the check first so its diagnostic is printed even when a
        # column was already reported missing
        assertions_pass = assert_categoricals(df) and assertions_pass
    else:
        assertions_pass = False

    if assertions_pass:
        print('All assertions passed')

    return None
            
def treat_categorical_data(data: pd.core.frame.DataFrame, training = False) -> pd.core.frame.DataFrame:
    """
    Return the categorical columns of ``data`` with string columns one-hot encoded.

    String-valued columns are pinned to the full set of values declared in
    CATEGORICALS before encoding, so the output always has one indicator
    column per declared value, even when a value does not occur in ``data``.
    This keeps training and prediction frames on the same column layout.

    Parameters:
        data: Pandas DataFrame containing every column named in CATEGORICALS
        training: not used by this function; accepted so the signature
            matches treat_numeric_data

    Returns:
        New Pandas DataFrame with the columns produced by ``pd.get_dummies``;
        ``data`` itself is not modified

    Raises:
        AssertionError: if a categorical column holds a value that is not
            listed in CATEGORICALS
    """
    # Explicit raise instead of an `assert` statement, which is stripped
    # when Python runs with -O
    if not assert_categoricals(data):
        raise AssertionError('Unexpected value found in a categorical column')

    # Work on a copy so the caller's frame is never overwritten
    encoded = data[list(CATEGORICALS)].copy()

    for column, allowed_values in CATEGORICALS.items():
        # Numeric flag columns (0/1) are passed through untouched, exactly as
        # get_dummies does for non-object columns
        if all(isinstance(value, str) for value in allowed_values):
            # Sorted categories reproduce the alphabetical column order that
            # get_dummies yields for plain string columns
            encoded[column] = pd.Categorical(
                encoded[column], categories=sorted(allowed_values)
            )

    return pd.get_dummies(encoded)

def treat_numeric_data(data: pd.core.frame.DataFrame, training = False) -> pd.core.frame.DataFrame:
    """
    Return the NUMERICS columns of ``data`` rescaled with a MinMaxScaler.

    Parameters:
        data: Pandas DataFrame containing every column named in NUMERICS
        training: when True, a new MinMaxScaler is fitted on ``data`` and
            saved to SCALER_PATH; when False, the scaler previously saved at
            SCALER_PATH is loaded and reused

    Returns:
        New Pandas DataFrame holding the scaled NUMERICS columns, indexed
        like ``data``; ``data`` itself is not modified
    """
    numeric_frame = data[NUMERICS]

    if training:
        scaler = MinMaxScaler()
        scaler.fit(numeric_frame)
        # Persist the fitted scaler so prediction data is scaled identically
        joblib.dump(scaler, SCALER_PATH)
    else:
        # NOTE: joblib.load unpickles the file; only load scaler files that
        # come from a trusted location
        scaler = joblib.load(SCALER_PATH)

    # Keep the input index: the result is later concatenated column-wise with
    # other frames derived from the same data, which aligns rows on the index
    return pd.DataFrame(
        scaler.transform(numeric_frame),
        columns=numeric_frame.columns,
        index=data.index,
    )


def treat_data(data: pd.core.frame.DataFrame, training=False) -> pd.core.frame.DataFrame:
    """
    Build the model-ready feature frame from raw data.

    Parameters:
        data: Pandas DataFrame with the raw numeric and categorical columns
        training: forwarded unchanged to treat_numeric_data and
            treat_categorical_data

    Returns:
        Pandas DataFrame with the output of treat_numeric_data followed by
        the output of treat_categorical_data, joined column-wise
    """
    # Work on a private copy of the caller's frame
    snapshot = data.copy()

    numeric_part = treat_numeric_data(snapshot, training=training)
    categorical_part = treat_categorical_data(snapshot, training=training)

    return pd.concat([numeric_part, categorical_part], axis=1)

def get_training_dataset() -> pd.core.frame.DataFrame:
    """
    Load, treat, label and shuffle the training data.

    Returns:
        Pandas DataFrame with the treated feature columns plus a 'Labels'
        column copied from 'Exited', with rows in random order and a fresh
        0..n-1 index
    """
    raw = load_training_data()

    # training=True tells the treatment step to reset the scaler for this run
    features = treat_data(raw, training=True)
    features['Labels'] = raw['Exited']

    # frac=1 samples every row, i.e. a full shuffle
    shuffled = features.sample(frac=1)
    return shuffled.reset_index(drop=True)
    
    
def get_test_dataset() -> pd.core.frame.DataFrame:
    """
    Load the prediction data and run it through the shared treatment step.

    Returns:
        Pandas DataFrame returned by treat_data with its default
        training=False
    """
    raw = load_data()
    return treat_data(raw)
    

In [132]:
# Build and display the training dataset (treat_data is called with training=True here)
get_training_dataset()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,HasCrCard,IsActiveMember,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male,Labels
0,1.000,0.486486,0.4,0.482079,0.000000,0.334790,1,0,0,0,1,1,0,0
1,0.688,0.162162,1.0,0.576665,0.000000,0.158985,1,1,1,0,0,0,1,0
2,0.580,0.202703,0.8,0.325539,0.333333,0.174587,0,0,0,1,0,0,1,0
3,0.626,0.459459,0.0,0.543243,0.000000,0.722978,1,0,0,0,1,0,1,1
4,0.294,0.351351,0.2,0.483264,0.000000,0.398437,0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.564,0.189189,0.6,0.444760,0.000000,0.851938,1,1,1,0,0,0,1,0
9996,0.628,0.378378,0.2,0.000000,0.000000,0.887142,1,1,1,0,0,0,1,1
9997,0.188,0.216216,0.2,0.575210,0.000000,0.563336,1,0,1,0,0,0,1,0
9998,0.678,0.175676,0.4,0.000000,0.333333,0.683057,1,1,0,0,1,1,0,0


In [118]:
# Build and display the prediction dataset (treat_data is called with its default training=False)
get_test_dataset()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,EstimatedSalary,HasCrCard,IsActiveMember,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,0.430,0.175676,0.1,0.000000,0.000000,0.102167,0,1,1,0,0,0,1
1,0.438,0.216216,0.4,0.000000,0.000000,0.020174,0,1,1,0,0,0,1
2,0.638,0.027027,0.7,0.000000,0.333333,0.644197,1,0,1,0,0,1,0
3,0.688,0.283784,0.4,0.690541,0.000000,0.406446,1,1,1,0,0,0,1
4,0.308,0.135135,1.0,0.435601,0.000000,0.937997,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.362,0.216216,1.0,0.471533,0.000000,0.132420,1,0,1,0,0,1,0
996,0.450,0.418919,0.2,0.545332,0.000000,0.012382,1,0,0,1,0,0,1
997,0.340,0.756757,0.4,0.000000,0.000000,0.133669,0,0,1,0,0,1,0
998,0.650,0.067568,0.8,0.000000,0.333333,0.811731,0,0,0,0,1,0,1
