In [48]:
import os

import pandas as pd
import numpy as np
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

# Plotly in offline mode, initialised so figures can be rendered in the notebook
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objects as go
import plotly.tools as tls
# Import warnings and suppress every warning for the rest of the session
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from category_encoders import TargetEncoder, LeaveOneOutEncoder, WOEEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer

import configparser

# Access Config

In [49]:
# Load the notebook settings.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty list means nothing was loaded.
# Fail here rather than with an opaque KeyError on the first config lookup.
config = configparser.ConfigParser()
loaded_config_files = config.read("./notebooks_config.ini")
if not loaded_config_files:
    raise FileNotFoundError("Could not read config file: ./notebooks_config.ini")
loaded_config_files

['./notebooks_config.ini']

# Access Data

In [3]:
# Location of the raw dataset comes from the [PATH] section of the config
raw_data_path = config['PATH']['RAW_DATA_PATH']
data = pd.read_parquet(raw_data_path)

# DATA TRANSFORMATIONS

In [4]:
# Normalise every column label to lower case
data.columns = list(map(str.lower, data.columns))

# Remove the `over18`, `employeecount`, `standardhours` and `employeenumber` columns
dropped_columns = ['over18', 'employeecount', 'standardhours', 'employeenumber']
data.drop(columns=dropped_columns, inplace=True)

## 1. SAVE INTERIM Data

In [5]:
# Persist the cleaned frame as the interim dataset.
# The target folder is created first (as is done for the processed and
# estimator outputs) so the write does not fail on a fresh checkout where the
# interim directory does not exist yet.
interim_data_dir = config['PATH']['INTERIM_DATA_PATH']
os.makedirs(interim_data_dir, exist_ok=True)
interim_data_path = os.path.join(interim_data_dir, "interim_data.parquet")
data.to_parquet(interim_data_path, index=False)

# CHOOSE CATEGORICAL AND NUMERICAL DATA

In [6]:
# Features treated as categorical (includes integer-coded ordinal ratings)
categorical_columns = [
    'businesstravel',
    'department',
    'education',
    'environmentsatisfaction',
    'educationfield',
    'gender',
    'jobrole',
    'jobinvolvement',
    'joblevel',
    'maritalstatus',
    'jobsatisfaction',
    'overtime',
    'performancerating',
    'relationshipsatisfaction',
    'stockoptionlevel',
    'trainingtimeslastyear',
    'worklifebalance',
]

# Features treated as numerical
numerical_columns = [
    'age',
    'dailyrate',
    'distancefromhome',
    'hourlyrate',
    'monthlyincome',
    'monthlyrate',
    'percentsalaryhike',
    'numcompaniesworked',
    'totalworkingyears',
    'yearsatcompany',
    'yearsincurrentrole',
    'yearssincelastpromotion',
    'yearswithcurrmanager',
]

# Kept as a one-element list so that indexing a frame with it yields a DataFrame
target_colummns = ["attrition"]

# SPLIT DATA INTO TRAIN AND TEST

## 1. Read Interim Data

In [7]:
# Reload the interim snapshot from the exact file that was written earlier
# (`interim_data_path`), not from the interim directory as a whole, so any
# other file placed in that folder can never be read in by accident.
data = pd.read_parquet(interim_data_path)

## 2. Choose Relevant Columns

In [8]:
# Feature matrix: numerical columns first, followed by the categorical ones
feature_columns = numerical_columns + categorical_columns
X = data[feature_columns]

# Target is selected with a list, so `y` is a single-column DataFrame
y = data[target_colummns]

## 3. Split the Data into Train and Test

In [9]:
# Fraction of rows held out for testing, read from the [ML] config section
split_ration = config.getfloat('ML', 'train_test_split_ratio')

# Fixed random_state so the partition is identical on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_ration,
    random_state=42,
)

## Save the Train and Test

In [10]:
# Root folder for the processed data, with one sub-folder per split
processed_train_path = config.get('PATH', 'PROCESSED_DATA_PATH')
train_path = os.path.join(processed_train_path, "train")
test_path = os.path.join(processed_train_path, "test")

for split_dir in (train_path, test_path):
    os.makedirs(split_dir, exist_ok=True)

# (frame, destination folder, file name) — training files first, then testing
split_outputs = [
    (X_train, train_path, "x_train.parquet"),
    (y_train, train_path, "y_train.parquet"),
    (X_test, test_path, "x_test.parquet"),
    (y_test, test_path, "y_test.parquet"),
]
for frame, target_dir, file_name in split_outputs:
    frame.to_parquet(os.path.join(target_dir, file_name), index=False)

# FEATURE ENGINEERING

## TARGET COLUMN - LABEL ENCODING



In [11]:
# Collapse the single-column target frame to one dimension before encoding
y_train = np.squeeze(y_train)

# Fit the encoder on the training labels. The encoded labels are kept in a
# named variable (previously they were only displayed and then lost) and are
# still shown as the cell output.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_train_encoded

array([0, 0, 0, ..., 1, 0, 0])

## Save the Label Encoder Estimator

In [16]:
# File name and folder for the pickled encoder both come from the config
label_encoder_estimator_name = config.get('ML', 'label_encoder_estimator_name')
estimators_dir = config.get('PATH', 'ESTIMATORS_PATH')
label_encoder_estimator_path = os.path.join(estimators_dir, label_encoder_estimator_name)

# Make sure the folder exists before writing into it
os.makedirs(estimators_dir, exist_ok=True)

# Serialise the fitted LabelEncoder
with open(label_encoder_estimator_path, 'wb') as fid:
    pickle.dump(le, fid)

## NUMERICAL COLUMNS ESTIMATOR

In [46]:
# Registry of the numerical preprocessing steps that can be requested by name
# (see `make_numerical_pipeline`); each name maps to a ready-made estimator.
numerical_lookup_dict = dict(
    simple_imputer=SimpleImputer(strategy="mean"),
    standard_scaler=StandardScaler(),
    min_max_scaler=MinMaxScaler(),
)
def make_numerical_pipeline(config):
    """Build the preprocessing pipeline for the numerical columns.

    The step names are read from the comma-separated ``numerical_estimators``
    option of the ``[ML]`` config section and resolved through
    ``numerical_lookup_dict``. Steps are added in the order they are listed.

    Parameters
    ----------
    config : configparser.ConfigParser
        Parsed notebook configuration.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline whose steps are ``(name, estimator)`` pairs, named after the
        keys used in the config.

    Raises
    ------
    ValueError
        If no estimator is configured or a configured name is not a key of
        ``numerical_lookup_dict``.
    """
    configured = config.get('ML', 'numerical_estimators')
    # Tolerate spaces around the commas and stray/trailing commas.
    names = [name.strip() for name in configured.split(",") if name.strip()]
    if not names:
        raise ValueError("No numerical estimators configured in [ML] numerical_estimators")

    unknown = [name for name in names if name not in numerical_lookup_dict]
    if unknown:
        raise ValueError(
            f"Unknown numerical estimators {unknown}; "
            f"expected any of {sorted(numerical_lookup_dict)}"
        )

    # Pipeline steps must be (name, estimator) tuples; appending bare
    # estimators to `.steps` produces a pipeline that cannot be fitted.
    # NOTE: the estimator instances stored in the lookup dict are reused as-is,
    # so pipelines built from the same dict share those objects.
    return Pipeline(steps=[(name, numerical_lookup_dict[name]) for name in names])

## CATEGORICAL COLUMNS ESTIMATORS

In [50]:
# Registry of the categorical encoders that can be requested by name
# (see `make_categorical_pipeline`); each name maps to a ready-made encoder.
categorical_lookup_dict = dict(
    one_hot_encoding=OneHotEncoder(handle_unknown="ignore"),
    ordinal_encoding=OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
    target_encoding=TargetEncoder(cols=categorical_columns, min_samples_leaf=5, smoothing=5),
    leave_one_out_encoding=LeaveOneOutEncoder(sigma=0.05, cols=categorical_columns),
    woe_encoding=WOEEncoder(sigma=0.5, cols=categorical_columns),
)

def make_categorical_pipeline(config):
    """Build the preprocessing pipeline for the categorical columns.

    The step names are read from the comma-separated ``categorical_estimators``
    option of the ``[ML]`` config section and resolved through
    ``categorical_lookup_dict``. Steps are added in the order they are listed.

    Parameters
    ----------
    config : configparser.ConfigParser
        Parsed notebook configuration.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline whose steps are ``(name, estimator)`` pairs, named after the
        keys used in the config.

    Raises
    ------
    ValueError
        If no estimator is configured or a configured name is not a key of
        ``categorical_lookup_dict``.
    """
    configured = config.get('ML', 'categorical_estimators')
    # Tolerate spaces around the commas and stray/trailing commas.
    names = [name.strip() for name in configured.split(",") if name.strip()]
    if not names:
        raise ValueError("No categorical estimators configured in [ML] categorical_estimators")

    unknown = [name for name in names if name not in categorical_lookup_dict]
    if unknown:
        raise ValueError(
            f"Unknown categorical estimators {unknown}; "
            f"expected any of {sorted(categorical_lookup_dict)}"
        )

    # Pipeline steps must be (name, estimator) tuples. The previous code called
    # the non-existent `dict.ge`, so every invocation failed.
    # NOTE: the encoder instances stored in the lookup dict are reused as-is,
    # so pipelines built from the same dict share those objects.
    return Pipeline(steps=[(name, categorical_lookup_dict[name]) for name in names])