# Encoding and Scaling


In [1]:
import pandas as pd
# Read the processed German credit data into `df` and display it.
df = pd.read_csv(
    filepath_or_buffer='../data/processed/german_credit_data_processed.csv',
)
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,little,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,little,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,little,little,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,little,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad


In [2]:
# List the distinct labels present in 'Saving accounts' (order of first appearance).
pd.unique(df['Saving accounts'])

array(['little', 'quite rich', 'rich', 'moderate'], dtype=object)

In [3]:
# List the distinct labels present in 'Checking account' (order of first appearance).
pd.unique(df['Checking account'])

array(['little', 'moderate', 'rich'], dtype=object)

In [4]:
# Ordinal encoding tables: each label is assigned its 1-based rank, with the
# labels listed from the lowest balance level to the highest.
mapping_savings = {
    level: rank
    for rank, level in enumerate(('little', 'moderate', 'quite rich', 'rich'), start=1)
}
mapping_checking = {
    level: rank
    for rank, level in enumerate(('little', 'moderate', 'rich'), start=1)
}

In [5]:
# Replace the account-level labels with their ordinal codes.
#
# Series.map() turns every value that is not a key of the dict into NaN without
# any warning. That silently corrupts the column when the data contains an
# unexpected label, and it wipes the column entirely if this cell is run a
# second time (the already-encoded integers are not keys of the mapping).
# Each column is therefore validated before it is written back to `df`.
for column, mapping in (
    ('Saving accounts', mapping_savings),
    ('Checking account', mapping_checking),
):
    encoded = df[column].map(mapping)
    # Values that were present in the source column but have no ordinal code.
    # Pre-existing missing values are left alone, exactly as plain .map() would.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} contains values with no ordinal code: "
            f"{unmapped.tolist()} (was this cell already executed?)"
        )
    df[column] = encoded
df

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,1,1,1169,6,radio/TV,good
1,22,female,2,own,1,2,5951,48,radio/TV,bad
2,49,male,1,own,1,1,2096,12,education,good
3,45,male,2,free,1,1,7882,42,furniture/equipment,good
4,53,male,2,free,1,1,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...
995,31,female,1,own,1,1,1736,12,furniture/equipment,good
996,40,male,3,own,1,1,3857,30,car,good
997,38,male,2,own,1,1,804,12,radio/TV,good
998,23,male,2,free,1,1,1845,45,radio/TV,bad


In [6]:
# One-hot encode the nominal columns as 0/1 integer indicators.
# drop_first=True omits the first level of each column, so that level is
# represented by all of the column's remaining indicators being 0.
categorical_columns = ['Sex', 'Housing', 'Purpose']
df = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)
df

Unnamed: 0,Age,Job,Saving accounts,Checking account,Credit amount,Duration,Risk,Sex_male,Housing_own,Housing_rent,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,67,2,1,1,1169,6,good,1,1,0,0,0,0,0,1,0,0
1,22,2,1,2,5951,48,bad,0,1,0,0,0,0,0,1,0,0
2,49,1,1,1,2096,12,good,1,1,0,0,0,1,0,0,0,0
3,45,2,1,1,7882,42,good,1,0,0,0,0,0,1,0,0,0
4,53,2,1,1,4870,24,bad,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,1,1,1,1736,12,good,0,1,0,0,0,0,1,0,0,0
996,40,3,1,1,3857,30,good,1,1,0,1,0,0,0,0,0,0
997,38,2,1,1,804,12,good,1,1,0,0,0,0,0,1,0,0
998,23,2,1,1,1845,45,bad,1,0,0,0,0,0,0,1,0,0


In [7]:
# Encode the target column: 'good' -> 1, 'bad' -> 0.
#
# Series.map() silently turns any label outside the dict into NaN, and running
# this cell twice would map the already-encoded 0/1 values to NaN as well.
# The result is validated before it replaces the original column.
risk_encoded = df['Risk'].map({'good': 1, 'bad': 0})
# Labels that were present in 'Risk' but have no code; existing missing values
# are left untouched, exactly as plain .map() would leave them.
risk_unmapped = df.loc[risk_encoded.isna() & df['Risk'].notna(), 'Risk'].unique()
if len(risk_unmapped) > 0:
    raise ValueError(
        f"Column 'Risk' contains values with no code: {risk_unmapped.tolist()} "
        "(was this cell already executed?)"
    )
df['Risk'] = risk_encoded
df

Unnamed: 0,Age,Job,Saving accounts,Checking account,Credit amount,Duration,Risk,Sex_male,Housing_own,Housing_rent,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,67,2,1,1,1169,6,1,1,1,0,0,0,0,0,1,0,0
1,22,2,1,2,5951,48,0,0,1,0,0,0,0,0,1,0,0
2,49,1,1,1,2096,12,1,1,1,0,0,0,1,0,0,0,0
3,45,2,1,1,7882,42,1,1,0,0,0,0,0,1,0,0,0
4,53,2,1,1,4870,24,0,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,1,1,1,1736,12,1,0,1,0,0,0,0,1,0,0,0
996,40,3,1,1,3857,30,1,1,1,0,1,0,0,0,0,0,0
997,38,2,1,1,804,12,1,1,1,0,0,0,0,0,1,0,0
998,23,2,1,1,1845,45,0,1,0,0,0,0,0,0,1,0,0


In [8]:
# Inspect the dtype of every column after the encoding steps above.
df.dtypes

Age                            int64
Job                            int64
Saving accounts                int64
Checking account               int64
Credit amount                  int64
Duration                       int64
Risk                           int64
Sex_male                       int64
Housing_own                    int64
Housing_rent                   int64
Purpose_car                    int64
Purpose_domestic appliances    int64
Purpose_education              int64
Purpose_furniture/equipment    int64
Purpose_radio/TV               int64
Purpose_repairs                int64
Purpose_vacation/others        int64
dtype: object

In [9]:
from sklearn.preprocessing import StandardScaler 

# Standardise the continuous features in place; every other column is left as is.
# NOTE(review): the scaler is fitted on the whole frame. If a train/test split
# is made later, fit on the training rows only to avoid leakage — confirm
# against the downstream modelling notebook.
cols_to_scale = ['Age', 'Credit amount', 'Duration']
scaler = StandardScaler()

scaler.fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])
df

Unnamed: 0,Age,Job,Saving accounts,Checking account,Credit amount,Duration,Risk,Sex_male,Housing_own,Housing_rent,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others
0,2.766456,2,1,1,-0.745131,-1.236478,1,1,1,0,0,0,0,0,1,0,0
1,-1.191404,2,1,2,0.949817,2.248194,0,0,1,0,0,0,0,0,1,0,0
2,1.183312,1,1,1,-0.416562,-0.738668,1,1,1,0,0,0,1,0,0,0,0
3,0.831502,2,1,1,1.634247,1.750384,1,1,0,0,0,0,0,1,0,0,0
4,1.535122,2,1,1,0.566664,0.256953,0,1,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-0.399832,1,1,1,-0.544162,-0.738668,1,0,1,0,0,0,0,1,0,0,0
996,0.391740,3,1,1,0.207612,0.754763,1,1,1,0,1,0,0,0,0,0,0
997,0.215835,2,1,1,-0.874503,-0.738668,1,1,1,0,0,0,0,0,1,0,0
998,-1.103451,2,1,1,-0.505528,1.999289,0,1,0,0,0,0,0,0,1,0,0


In [11]:
# Persist the encoded and scaled frame; index=False keeps the row index out of the file.
df.to_csv(
    path_or_buf='../data/processed/german_credit_data_preprocessed_final.csv',
    index=False,
)