In [1]:
#importing packages 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np

#setting default precision: floats are rendered with two decimal places in pandas output
pd.options.display.float_format = "{:.2f}".format
# configure seaborn plot styling for the notebook (color_codes enabled explicitly)
sns.set(color_codes=True)
%matplotlib inline 

  import pandas.util.testing as tm


# Pre-processing of train

In [104]:
# Load the raw training data; the first CSV column becomes the row index.
# Note: train_df holds the file path, df_train holds the DataFrame.
train_df = "train.csv"
df_train = pd.read_csv(filepath_or_buffer=train_df, index_col=0)
df_train.head()

Unnamed: 0,minority,sex,ZIP,rent,education,age,income,loan_size,payment_timing,year,job_stability,default,occupation
0,1,0,MT04PA,1,57.23,36.05,205168.02,7600.29,3.3,0,3.02,True,MZ10CD
1,1,0,MT04PA,1,45.89,59.53,187530.41,5534.27,3.84,0,5.94,True,MZ10CD
2,1,0,MT04PA,1,46.78,67.34,196912.01,2009.9,2.06,0,2.19,True,MZ10CD
3,1,0,MT04PA,1,41.78,24.07,132911.65,3112.28,3.94,0,1.73,True,MZ10CD
4,1,0,MT04PA,1,41.74,47.5,161162.55,1372.08,3.71,0,0.88,True,MZ10CD


In [105]:
# Reorder the columns so that the target ('default') is the last one.
cols = df_train.columns.tolist()   # current column order
cols.remove('default')             # take the target out of its present position...
cols.append('default')             # ...and put it back at the end
df_train = df_train[cols]
df_train.head()

Unnamed: 0,minority,sex,ZIP,rent,education,age,income,loan_size,payment_timing,year,job_stability,occupation,default
0,1,0,MT04PA,1,57.23,36.05,205168.02,7600.29,3.3,0,3.02,MZ10CD,True
1,1,0,MT04PA,1,45.89,59.53,187530.41,5534.27,3.84,0,5.94,MZ10CD,True
2,1,0,MT04PA,1,46.78,67.34,196912.01,2009.9,2.06,0,2.19,MZ10CD,True
3,1,0,MT04PA,1,41.78,24.07,132911.65,3112.28,3.94,0,1.73,MZ10CD,True
4,1,0,MT04PA,1,41.74,47.5,161162.55,1372.08,3.71,0,0.88,MZ10CD,True


In [106]:
# Cast ZIP and occupation to categorical dtype and the boolean target to integer,
# all in one astype call.
df_train = df_train.astype({
    'ZIP': 'category',
    'occupation': 'category',
    'default': int,
})

In [107]:
# Outlier filter based on z-scores (signed number of standard deviations by which a
# value lies above its column mean): a row is kept only if its largest absolute
# z-score over all numeric columns is below 3.
# NOTE(review): 'default' was cast to int in an earlier cell, so the target column is
# part of numeric_cols and takes part in this filter - confirm that this is intended.
numeric_cols = df_train.select_dtypes(include=[np.number]).columns
z_values = df_train[numeric_cols].apply(stats.zscore).abs()

# Keep the per-row maximum in its own Series instead of a temporary 'outliers' column,
# and take an explicit copy of the filtered rows so that later column assignments do
# not operate on a slice of the unfiltered frame (avoids SettingWithCopy warnings).
max_abs_z = z_values.max(axis=1)
df_train = df_train.loc[max_abs_z < 3].copy()

#shape after 8k outliers have been removed
df_train.shape

(471136, 13)

In [108]:
#convert categorical values to numeric
# The lookup tables map each label to an integer code. Codes are assigned in sorted
# label order so the encoding is identical on every kernel run; iterating a bare set()
# of strings yields an order that can change between Python sessions.
# zip_values / occupation_values are reused later to encode the test set.
zip_values = {label: code for code, label in enumerate(sorted(set(df_train['ZIP'])))}
occupation_values = {label: code for code, label in enumerate(sorted(set(df_train['occupation'])))}

# Map through plain object values (not the categorical dtype) so the result is an
# ordinary integer column.
df_train['ZIP'] = df_train['ZIP'].astype(object).map(zip_values).astype('int64')
df_train['occupation'] = df_train['occupation'].astype(object).map(occupation_values).astype('int64')

In [109]:
# Preview the training frame after ZIP and occupation were replaced by integer codes.
df_train.head()

Unnamed: 0,minority,sex,ZIP,rent,education,age,income,loan_size,payment_timing,year,job_stability,occupation,default
0,1,0,1,1,57.23,36.05,205168.02,7600.29,3.3,0,3.02,1,1
1,1,0,1,1,45.89,59.53,187530.41,5534.27,3.84,0,5.94,1,1
2,1,0,1,1,46.78,67.34,196912.01,2009.9,2.06,0,2.19,1,1
3,1,0,1,1,41.78,24.07,132911.65,3112.28,3.94,0,1.73,1,1
4,1,0,1,1,41.74,47.5,161162.55,1372.08,3.71,0,0.88,1,1


In [110]:
#export pre-processed train (the row index is written as the first CSV column)
df_train.to_csv('train_preprocessed.csv')

# Pre-processing of test

In [111]:
# Load the raw test data (first CSV column as row index) and discard the redundant
# 'Unnamed: 0.1' column in the same step.
test_df = "test.csv"
df_test = pd.read_csv(test_df, index_col=0).drop(columns='Unnamed: 0.1')
df_test.head()

Unnamed: 0,minority,sex,ZIP,rent,education,age,income,loan_size,payment_timing,job_stability,year,default,occupation
0,1,0,MT04PA,1,51.27,25.71,166455.21,8064.95,3.87,43.76,30,True,MZ10CD
1,0,0,MT04PA,0,58.88,39.69,216752.89,7166.7,3.81,46.9,30,False,MZ01CD
2,0,0,MT04PA,0,56.5,25.85,183764.48,3322.05,3.5,63.45,30,False,MZ01CD
3,1,0,MT04PA,1,47.07,26.38,154057.0,15.22,3.54,56.24,30,False,MZ10CD
4,1,0,MT04PA,1,48.92,18.78,143463.04,7860.53,3.66,49.88,30,False,MZ10CD


In [112]:
# Reorder the columns so that the target ('default') is the last one.
cols = df_test.columns.tolist()    # current column order
cols.remove('default')             # take the target out of its present position...
cols.append('default')             # ...and put it back at the end
df_test = df_test[cols]
df_test.head()

Unnamed: 0,minority,sex,ZIP,rent,education,age,income,loan_size,payment_timing,job_stability,year,occupation,default
0,1,0,MT04PA,1,51.27,25.71,166455.21,8064.95,3.87,43.76,30,MZ10CD,True
1,0,0,MT04PA,0,58.88,39.69,216752.89,7166.7,3.81,46.9,30,MZ01CD,False
2,0,0,MT04PA,0,56.5,25.85,183764.48,3322.05,3.5,63.45,30,MZ01CD,False
3,1,0,MT04PA,1,47.07,26.38,154057.0,15.22,3.54,56.24,30,MZ10CD,False
4,1,0,MT04PA,1,48.92,18.78,143463.04,7860.53,3.66,49.88,30,MZ10CD,False


In [113]:
# Cast ZIP and occupation to categorical dtype and the boolean target to integer,
# all in one astype call.
df_test = df_test.astype({
    'ZIP': 'category',
    'occupation': 'category',
    'default': int,
})

In [114]:
# Preview the test frame after the dtype conversion ('default' is now 0/1).
df_test.head()

Unnamed: 0,minority,sex,ZIP,rent,education,age,income,loan_size,payment_timing,job_stability,year,occupation,default
0,1,0,MT04PA,1,51.27,25.71,166455.21,8064.95,3.87,43.76,30,MZ10CD,1
1,0,0,MT04PA,0,58.88,39.69,216752.89,7166.7,3.81,46.9,30,MZ01CD,0
2,0,0,MT04PA,0,56.5,25.85,183764.48,3322.05,3.5,63.45,30,MZ01CD,0
3,1,0,MT04PA,1,47.07,26.38,154057.0,15.22,3.54,56.24,30,MZ10CD,0
4,1,0,MT04PA,1,48.92,18.78,143463.04,7860.53,3.66,49.88,30,MZ10CD,0


In [115]:
# (rows, columns) of the test frame.
df_test.shape

(160000, 13)

In [116]:
# Encode the test categoricals with the lookup tables built from the training data
# (zip_values / occupation_values) so that both files share the same integer codes.
for column, lookup in (('ZIP', zip_values), ('occupation', occupation_values)):
    # A label missing from the training lookup cannot be encoded; report which ones
    # instead of failing with a bare KeyError from inside the mapping.
    unseen = set(df_test[column]) - set(lookup)
    if unseen:
        raise KeyError("{}: labels missing from the training lookup: {}".format(
            column, sorted(map(str, unseen))))
    # Map through plain object values so the result is an ordinary integer column.
    df_test[column] = df_test[column].astype(object).map(lookup).astype('int64')

In [117]:
# Preview the test frame after ZIP and occupation were replaced by integer codes.
df_test.head()

Unnamed: 0,minority,sex,ZIP,rent,education,age,income,loan_size,payment_timing,job_stability,year,occupation,default
0,1,0,1,1,51.27,25.71,166455.21,8064.95,3.87,43.76,30,1,1
1,0,0,1,0,58.88,39.69,216752.89,7166.7,3.81,46.9,30,0,0
2,0,0,1,0,56.5,25.85,183764.48,3322.05,3.5,63.45,30,0,0
3,1,0,1,1,47.07,26.38,154057.0,15.22,3.54,56.24,30,1,0
4,1,0,1,1,48.92,18.78,143463.04,7860.53,3.66,49.88,30,1,0


In [118]:
#export pre-processed test (the row index is written as the first CSV column)
df_test.to_csv('test_preprocessed.csv')