# Data Preprocessing

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Every path below is relative, so the working directory matters. Unless the
# kernel already sits in a directory named 'credit-now' (presumably the project
# root), step two levels up. os.path.basename honours the platform's path
# separator, whereas splitting on '/' never matches on Windows.
root_dir = '../../' if os.path.basename(os.getcwd()) != 'credit-now' else './'
os.chdir(root_dir)

# Raw training data, shown here only as a quick sanity check.
df = pd.read_csv('original_data/train.csv')
df.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [2]:
def load_data(name='train', test_size=0.3, encoding=True) -> tuple:
    """Load a credit dataset and optionally split and encode it.

    Args:
        name: Dataset prefix. A truthy value reads
            ``credit_data/{name}_data.csv``. A falsy value falls back to
            ``'train'``, reads ``original_data/train.csv`` and runs it through
            ``preprocess_data``.
        test_size: Forwarded to ``train_test_split`` (stratified on the label,
            ``random_state=0``). A falsy value disables the split.
        encoding: When true, load ``credit_data/{name}_pipe.pkl``, fit it on
            the training portion and apply it to the returned features.

    Returns:
        ``(train_data, test_data, train_label, test_label)`` when ``test_size``
        is truthy, otherwise ``(train_data, train_label)``. Labels are a 2-D
        array built from the ``credit`` column. The ``index`` and ``credit``
        columns are only dropped from the features when a split is requested.
    """
    if not name:
        name = 'train'
        features = preprocess_data(pd.read_csv(f'original_data/{name}.csv'))
    else:
        features = pd.read_csv(f'credit_data/{name}_data.csv')
    labels = np.array(features[['credit']])

    # NOTE: joblib.load unpickles arbitrary objects; only point this at
    # pipeline files produced by this project.
    pipe = joblib.load(f'credit_data/{name}_pipe.pkl') if encoding else None

    if not test_size:
        # No hold-out set requested: nothing to copy or transform besides the
        # single frame that is returned.
        if pipe is not None:
            features = pipe.fit_transform(features)
        return features, labels

    features = features.drop(['index', 'credit'], axis=1)
    train_data, test_data, train_label, test_label = \
        model_selection.train_test_split(features, labels, test_size=test_size,
                                         random_state=0, stratify=labels)

    if pipe is not None:
        # Fit on the training portion only, then reuse that fit for the test
        # portion.
        train_data = pipe.fit_transform(train_data)
        test_data = pipe.transform(test_data)

    return train_data, test_data, train_label, test_label

In [3]:
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw credit frame and integer-encode its categorical columns.

    Steps, in order:
      * drop ``FLAG_MOBIL`` and cast ``credit`` to int;
      * keep rows that have an ``occyp_type`` or a positive ``DAYS_EMPLOYED``,
        then label the remaining missing occupations ``'Unemployeed'``;
      * cap ``child_num`` at 4 and ``family_size`` at 6 (cast to int);
      * convert the negative day/month offsets to positive years
        (non-negative inputs become 0) and rename them to ``age``,
        ``employed_year`` and ``begin_year``;
      * replace category labels with the codes from ``get_category_dict``.

    Args:
        df: Raw frame as read from ``original_data/train.csv``. Not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    data = df.drop(['FLAG_MOBIL'], axis=1)
    data['credit'] = data['credit'].astype(int)

    keep = data['occyp_type'].notnull() | (data['DAYS_EMPLOYED'] > 0)
    # Explicit copy so the column writes below target an independent frame
    # instead of a filtered slice (avoids SettingWithCopyWarning).
    data = data[keep].copy()
    data['occyp_type'] = data['occyp_type'].fillna('Unemployeed')

    data['child_num'] = data['child_num'].clip(upper=4)
    data['family_size'] = data['family_size'].clip(upper=6).astype(int)

    # Negative offsets -> positive years; anything not below zero maps to 0.
    for column, divisor in (('DAYS_BIRTH', 365), ('DAYS_EMPLOYED', 365), ('begin_month', 12)):
        raw = data[column]
        data[column] = (-raw / divisor).where(raw < 0, 0)
    data = data.rename(columns={'DAYS_BIRTH':'age','DAYS_EMPLOYED':'employed_year',
                                'begin_month':'begin_year'})

    # Assign the result back: `data[column].replace(..., inplace=True)` acts on
    # a temporary Series and does not update `data` under pandas Copy-on-Write.
    for column, cat_dict in get_category_dict().items():
        data[column] = data[column].replace(cat_dict)

    return data

In [4]:
def get_category_dict() -> dict:
    """Return the label -> integer code mapping for each categorical column.

    Codes are consecutive integers starting at 0, assigned in the order the
    labels are listed below.
    """
    def _codes(*labels) -> dict:
        # Number the labels 0, 1, 2, ... in the order given.
        return {label: code for code, label in enumerate(labels)}

    return {
        'gender': _codes('M', 'F'),
        'car': _codes('N', 'Y'),
        'reality': _codes('N', 'Y'),
        'income_type': _codes('Working', 'Commercial associate', 'Pensioner',
                              'State servant', 'Student'),
        'edu_type': _codes('Secondary / secondary special', 'Higher education',
                           'Incomplete higher', 'Lower secondary', 'Academic degree'),
        'family_type': _codes('Married', 'Single / not married', 'Civil marriage',
                              'Separated', 'Widow'),
        'house_type': _codes('House / apartment', 'With parents', 'Municipal apartment',
                             'Rented apartment', 'Office apartment', 'Co-op apartment'),
        'occyp_type': _codes('Unemployeed', 'Laborers', 'Core staff', 'Sales staff',
                             'Managers', 'Drivers', 'High skill tech staff', 'Accountants',
                             'Medicine staff', 'Cooking staff', 'Security staff',
                             'Cleaning staff', 'Private service staff', 'Low-skill Laborers',
                             'Waiters/barmen staff', 'Secretaries', 'Realty agents',
                             'HR staff', 'IT staff'),
    }

In [5]:
train_data, train_label = load_data(name=None, test_size=None, pipeline=False)
train_data.head()
# train_label.shape

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,age,employed_year,work_phone,phone,email,occyp_type,family_size,begin_year,credit
1,1,1,0,1,1,247500.0,1,0,2,0,31.178082,4.219178,0,0,1,1,3,0.416667,1
2,2,0,1,1,0,450000.0,0,1,0,0,52.293151,12.147945,0,1,0,4,2,1.833333,2
3,3,1,0,1,0,202500.0,1,0,0,0,41.336986,5.731507,0,1,0,3,2,3.083333,0
4,4,1,1,1,0,157500.0,3,1,0,0,41.19726,5.767123,0,0,0,4,2,2.166667,2
5,5,1,0,1,2,270000.0,0,0,0,0,36.747945,13.687671,0,0,1,6,4,1.5,1


In [6]:
# DataFrame.to_csv does not create missing directories, and this is the first
# write into credit_data/, so make sure the folder exists.
os.makedirs('credit_data', exist_ok=True)
# Stored with `index` as the leading CSV column; load_data(name='train')
# reads this file back.
train_data.set_index('index').to_csv('credit_data/train_data.csv')

## Data Encoding

In [7]:
# Continuous columns are standardised.
numerical_features = ['income_total', 'age', 'employed_year', 'begin_year']
numerical_transformer = StandardScaler()

# Integer-coded categorical columns are one-hot encoded; with
# handle_unknown='ignore' a category unseen during fit does not raise at
# transform time.
categorical_features = [
    'gender', 'car', 'reality', 'child_num',
    'income_type', 'edu_type', 'family_type', 'house_type',
    'work_phone', 'phone', 'email', 'occyp_type', 'family_size',
]
categorical_transformer = OneHotEncoder(handle_unknown='ignore', categories='auto')

# Columns not listed in either group are dropped by the ColumnTransformer
# default (remainder='drop').
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Single-step pipeline wrapping the column-wise preprocessing.
pipe = Pipeline(steps=[('preprocessor', preprocessor)])

In [8]:
# joblib is already imported in the notebook's first cell, so no re-import here.
# The pipeline is saved unfitted; load_data() loads it from
# 'credit_data/{name}_pipe.pkl' and fits it on the training portion.
joblib.dump(pipe, 'credit_data/train_pipe.pkl', compress=True)

['credit_data/train_pipe.pkl']

## All Categorical

In [9]:
df = pd.read_csv('original_data/train.csv')

# Reuse the shared cleaning + category-encoding routine rather than repeating
# it line by line; it yields the continuous columns as
# income_total / age / employed_year / begin_year.
data = preprocess_data(df)

# Discretise each continuous column into 10 equal-width bins labelled 0-9,
# then rename it to reflect that it now holds a range code.
range_columns = {'income_total': 'income_range', 'age': 'age_range',
                 'employed_year': 'employed_range', 'begin_year': 'begin_range'}
for column in range_columns:
    data[column] = pd.cut(data[column], 10, labels=list(range(10)))
data = data.rename(columns=range_columns)

data.head()

Unnamed: 0,index,gender,car,reality,child_num,income_range,income_type,edu_type,family_type,house_type,age_range,employed_range,work_phone,phone,email,occyp_type,family_size,begin_range,credit
1,1,1,0,1,1,1,1,0,2,0,2,0,0,0,1,1,3,0,1
2,2,0,1,1,0,2,0,1,0,0,6,2,0,1,0,4,2,3,2
3,3,1,0,1,0,1,1,0,0,0,4,1,0,1,0,3,2,6,0
4,4,1,1,1,0,0,3,1,0,0,4,1,0,0,0,4,2,4,2
5,5,1,0,1,2,1,0,0,0,0,3,3,0,0,1,6,4,2,1


In [10]:
# Write the binned frame to disk with `index` as the CSV's leading column;
# load_data(name='train_cat') reads 'credit_data/train_cat_data.csv' back.
data.set_index('index').to_csv('credit_data/train_cat_data.csv')

In [11]:
# In this variant every feature is categorical (the continuous ones were
# binned above), so all of them go through the one-hot encoder.
categorical_features = [
    'gender', 'car', 'reality', 'child_num', 'income_range',
    'income_type', 'edu_type', 'family_type', 'house_type',
    'age_range', 'employed_range', 'work_phone', 'phone', 'email',
    'occyp_type', 'family_size', 'begin_range',
]
categorical_transformer = OneHotEncoder(handle_unknown='ignore', categories='auto')

cat_preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
])

# Single-step pipeline wrapping the one-hot preprocessing.
cat_pipe = Pipeline(steps=[('preprocessor', cat_preprocessor)])

In [12]:
# joblib is already imported in the notebook's first cell, so no re-import here.
# Saved unfitted; load_data(name='train_cat') loads
# 'credit_data/train_cat_pipe.pkl' and fits it on the training portion.
joblib.dump(cat_pipe, 'credit_data/train_cat_pipe.pkl', compress=True)

['credit_data/train_cat_pipe.pkl']

## For Analysis

In [13]:
df = pd.read_csv('original_data/train.csv')

# Human-readable variant for analysis: category labels stay as text and the
# time columns become whole numbers.
data = df.drop(['FLAG_MOBIL'], axis=1)
data['income_total'] = data['income_total'].astype(int)
data['credit'] = data['credit'].astype(int)

keep = data['occyp_type'].notnull() | (data['DAYS_EMPLOYED'] > 0)
# Explicit copy so the column writes below target an independent frame
# instead of a filtered slice (avoids SettingWithCopyWarning).
data = data[keep].copy()
data['occyp_type'] = data['occyp_type'].fillna('Unemployeed')

data['child_num'] = data['child_num'].clip(upper=4)
data['family_size'] = data['family_size'].clip(upper=6).astype(int)

# Negative day offsets -> whole years (floor); anything not below zero -> 0.
for column in ('DAYS_BIRTH', 'DAYS_EMPLOYED'):
    raw = data[column]
    data[column] = (-raw // 365).where(raw < 0, 0)

# Negative month offsets -> positive whole months; anything else -> 0.
raw = data['begin_month']
data['begin_month'] = (-raw).where(raw < 0, 0).astype(int)

data = data.rename(columns={'DAYS_BIRTH':'age','DAYS_EMPLOYED':'employed_year'})

data.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,age,employed_year,work_phone,phone,email,occyp_type,family_size,begin_month,credit
1,1,F,N,Y,1,247500,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,31,4,0,0,1,Laborers,3,5,1
2,2,M,Y,Y,0,450000,Working,Higher education,Married,House / apartment,52,12,0,1,0,Managers,2,22,2
3,3,F,N,Y,0,202500,Commercial associate,Secondary / secondary special,Married,House / apartment,41,5,0,1,0,Sales staff,2,37,0
4,4,F,Y,Y,0,157500,State servant,Higher education,Married,House / apartment,41,5,0,0,0,Managers,2,26,2
5,5,F,N,Y,2,270000,Working,Secondary / secondary special,Married,House / apartment,36,13,0,0,1,High skill tech staff,4,18,1


In [14]:
# Write the analysis frame to disk with `index` as the CSV's leading column.
data.set_index('index').to_csv('credit_data/analysis_data.csv')