# Preprocessing

Data source: Statlog (Australian Credit Approval) dataset, UCI Machine Learning Repository — http://archive.ics.uci.edu/ml/datasets/Statlog+%28Australian+Credit+Approval%29

In [1]:
import os
import numpy as np
import pandas as pd
import config as cfg

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from pandas_profiling import ProfileReport

# Lift pandas' column-display limit so wide frames are rendered without
# eliding columns in the cell outputs below.
pd.set_option("display.max_columns", None)

### Train/test split

In [2]:
# The raw file is whitespace-separated and read without a header row, so the
# fifteen column names A1..A15 are supplied explicitly.
raw_path = os.path.join("Data", "data_original", "australian.dat")
column_names = [f"A{i}" for i in range(1, 16)]
df = pd.read_csv(raw_path, sep=r"\s+", header=None, names=column_names)

# Derive the target BAD as the complement of the class column A15
# (BAD = 1 - A15), then drop A15 so only the derived target remains.
df = df.assign(BAD=1 - df["A15"]).drop(columns=["A15"])

df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,BAD
0,1,22.08,11.460,2,4,4,1.585,0,0,0,1,2,100,1213,1
1,0,22.67,7.000,2,8,4,0.165,0,0,0,0,2,160,1,1
2,0,29.58,1.750,1,4,4,1.250,0,0,0,1,2,280,1,1
3,0,21.67,11.500,1,5,3,0.000,1,1,11,1,2,0,1,0
4,1,20.17,8.170,2,6,4,1.960,1,1,14,0,2,60,159,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,31.57,10.500,2,14,4,6.500,1,0,0,0,2,0,1,0
686,1,20.67,0.415,2,8,4,0.125,0,0,0,0,2,0,45,1
687,0,18.83,9.540,2,6,4,0.085,1,0,0,0,2,100,1,0
688,0,27.42,14.500,2,14,8,3.085,1,1,1,0,2,120,12,0


In [3]:
# Mean of the BAD column over the full dataset, i.e. the share of rows with BAD == 1
# when BAD only holds 0/1 values.
bad_rate = df["BAD"].mean()
print("Bad rate:", bad_rate)

Bad rate: 0.5550724637681159


In [4]:
# Separate features from the target. One-hot encoding is applied ONCE, before
# the split: encoding the two splits independently would give train and test
# different column sets whenever a category is missing from one of them.
# (For purely numeric features get_dummies returns the frame unchanged.)
X = pd.get_dummies(df.drop(columns=["BAD"]))
y = df["BAD"]

# Stratify on the target so both splits keep the overall bad rate; test size
# and seed come from the shared config module.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=cfg.TEST_SIZE, random_state=cfg.SEED, stratify=y
)

# Optional class re-balancing of the training split (currently disabled).
# rus = RandomUnderSampler(sampling_strategy=cfg.SAMPLING_STRATEGY)
# X_train, y_train = rus.fit_resample(X_train, y_train)

# Create the output directories up front: to_csv / to_file do not create
# missing parent directories, so a fresh checkout would otherwise fail here.
out_dir = os.path.join("Data", "data_preprocessed")
os.makedirs(out_dir, exist_ok=True)
X_train.to_csv(os.path.join(out_dir, "X_train.csv"), index=False)
X_test.to_csv(os.path.join(out_dir, "X_test.csv"), index=False)
y_train.to_csv(os.path.join(out_dir, "y_train.csv"), index=False)
y_test.to_csv(os.path.join(out_dir, "y_test.csv"), index=False)

# Minimal profiling report of the training features only.
os.makedirs("Results", exist_ok=True)
ProfileReport(X_train, minimal=True).to_file(os.path.join("Results", "X_train.html"))

Summarize dataset:   0%|          | 0/23 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Sanity check of the split: (rows, columns) of each feature frame and the
# mean of the training target.
for label, frame in (("X_train:", X_train), ("X_test:", X_test)):
    print(label, frame.shape)
print("Bad rate:", y_train.mean())

X_train: (517, 14)
X_test: (173, 14)
Bad rate: 0.5551257253384912


### Train/test split with binning

In [6]:
# Number of quantile bins requested for each column that gets discretised.
# NOTE(review): A10 is binned into a single quantile, which turns it into a
# constant column — confirm this is intended.
# NOTE(review): bin edges are computed on the full dataset, i.e. before the
# train/test split below — confirm this is acceptable for the evaluation.
quantile_bins = {
    "A2": 10,
    "A3": 10,
    "A5": 6,
    "A7": 8,
    "A10": 1,
    "A13": 3,
    "A14": 2,
}

# Work on a copy so the un-binned frame `df` stays intact.
df_binned = df.copy()
for column, n_bins in quantile_bins.items():
    # Replace the numeric values with their quantile-interval category.
    df_binned[column] = pd.qcut(df_binned[column], n_bins)

df_binned

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,BAD
0,1,"(21.734, 23.33]","(9.432, 11.852]",2,"(3.0, 6.0]",4,"(1.0, 1.625]",0,0,"(-0.001, 67.0]",1,2,"(-0.001, 100.0]","(6.0, 100001.0]",1
1,0,"(21.734, 23.33]","(6.0, 9.432]",2,"(6.0, 8.0]",4,"(0.04, 0.165]",0,0,"(-0.001, 67.0]",0,2,"(100.0, 216.0]","(0.999, 6.0]",1
2,0,"(28.625, 31.92]","(1.25, 1.934]",1,"(3.0, 6.0]",4,"(1.0, 1.625]",0,0,"(-0.001, 67.0]",1,2,"(216.0, 2000.0]","(0.999, 6.0]",1
3,0,"(19.314, 21.734]","(9.432, 11.852]",1,"(3.0, 6.0]",3,"(-0.001, 0.04]",1,1,"(-0.001, 67.0]",1,2,"(-0.001, 100.0]","(0.999, 6.0]",0
4,1,"(19.314, 21.734]","(6.0, 9.432]",2,"(3.0, 6.0]",4,"(1.625, 2.625]",1,1,"(-0.001, 67.0]",0,2,"(-0.001, 100.0]","(6.0, 100001.0]",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,"(28.625, 31.92]","(9.432, 11.852]",2,"(11.0, 14.0]",4,"(5.085, 28.5]",1,0,"(-0.001, 67.0]",0,2,"(-0.001, 100.0]","(0.999, 6.0]",0
686,1,"(19.314, 21.734]","(0.375, 0.75]",2,"(6.0, 8.0]",4,"(0.04, 0.165]",0,0,"(-0.001, 67.0]",0,2,"(-0.001, 100.0]","(6.0, 100001.0]",1
687,0,"(13.749, 19.314]","(9.432, 11.852]",2,"(3.0, 6.0]",4,"(0.04, 0.165]",1,0,"(-0.001, 67.0]",0,2,"(-0.001, 100.0]","(0.999, 6.0]",0
688,0,"(25.67, 28.625]","(11.852, 28.0]",2,"(11.0, 14.0]",8,"(2.625, 5.085]",1,1,"(-0.001, 67.0]",0,2,"(100.0, 216.0]","(6.0, 100001.0]",0


In [7]:
# Mean of the BAD column in the binned frame; binning does not touch BAD, so
# this should match the un-binned bad rate.
binned_bad_rate = df_binned["BAD"].mean()
print("Bad rate:", binned_bad_rate)

Bad rate: 0.5550724637681159


In [8]:
# Separate the binned features from the target.
X = df_binned.drop(columns=["BAD"])
y = df_binned["BAD"]

# Same stratified split settings (test size, seed) as for the un-binned data,
# taken from the shared config module.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=cfg.TEST_SIZE, random_state=cfg.SEED, stratify=y
)

# Optional class re-balancing of the training split (currently disabled).
# rus = RandomUnderSampler(sampling_strategy=cfg.SAMPLING_STRATEGY)
# X_train, y_train = rus.fit_resample(X_train, y_train)

# Create the output directory up front: to_csv does not create missing parent
# directories, so a fresh checkout would otherwise fail here.
out_dir = os.path.join("Data", "data_preprocessed_binned")
os.makedirs(out_dir, exist_ok=True)
X_train.to_csv(os.path.join(out_dir, "X_train.csv"), index=False)
X_test.to_csv(os.path.join(out_dir, "X_test.csv"), index=False)
y_train.to_csv(os.path.join(out_dir, "y_train.csv"), index=False)
y_test.to_csv(os.path.join(out_dir, "y_test.csv"), index=False)

In [9]:
# Sanity check of the binned split: (rows, columns) of each feature frame and
# the mean of the training target.
for label, frame in (("X_train:", X_train), ("X_test:", X_test)):
    print(label, frame.shape)
print("Bad rate:", y_train.mean())

X_train: (517, 14)
X_test: (173, 14)
Bad rate: 0.5551257253384912
