In [3]:
#Imports
import pandas as pd
import numpy as np
import os
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import time

#scikit-learn packages
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

<b>Load Data</b>

In [13]:
# Read the training applications from disk
train_csv_path = r'./data/application_train.csv'
app_train_df = pd.read_csv(train_csv_path)

# Keep an untouched snapshot of the raw frame for later use
# (DataFrame.copy() makes a deep copy by default)
original_app_train_df = app_train_df.copy()



In [14]:
# Report the size of the training frame: rows first, then attribute (column) count
train_row_count, train_attr_count = app_train_df.shape
print('Training data:')
print(f'Rows: {train_row_count}')
print(f'Attributes: {train_attr_count}')

Training data:
Rows: 307511
Attributes: 122


In [15]:
# Read the testing applications from disk
test_csv_path = r'./data/application_test.csv'
app_test_df = pd.read_csv(test_csv_path)

# Keep an untouched snapshot of the raw frame for later use
# (DataFrame.copy() makes a deep copy by default)
original_app_test_df = app_test_df.copy()

In [16]:
# Report the size of the testing frame: rows first, then attribute (column) count
test_row_count, test_attr_count = app_test_df.shape
print('Testing data:')
print(f'Rows: {test_row_count}')
print(f'Attributes: {test_attr_count}')

Testing data:
Rows: 48744
Attributes: 121


<b>Exploratory Data Analysis (EDA)</b>
Looking into the data

In [19]:
# Summarise missing data in the training frame: null count and null percentage per column

# Absolute number of nulls in each column
missing_val = app_train_df.isnull().sum()

# The same counts expressed as a percentage of all rows
missing_val_percent = 100 * missing_val / len(app_train_df)

# Place counts and percentages side by side, then give the two columns readable names
missing_val_table = pd.concat([missing_val, missing_val_percent], axis=1)
misssing_val_table1 = missing_val_table.rename(
    columns={0: 'Missing Values', 1: '% Missing Values'})

# Keep only the columns that actually have gaps, ordered from most to least missing
has_missing = misssing_val_table1.iloc[:, 1] != 0
misssing_val_table1 = (
    misssing_val_table1[has_missing]
    .sort_values('% Missing Values', ascending=False)
    .round(1)
)

# Short summary of how many columns are affected
total_cols = app_train_df.shape[1]
cols_with_missing = misssing_val_table1.shape[0]
print("The dataframe has {} columns.\n".format(str(total_cols)),
      "There are {} columns that have missing values.\n".format(str(cols_with_missing)),
      "There are {} columns that have no missing values".format(int(total_cols) - int(cols_with_missing)))

# Last expression of the cell: render the table as the cell output
misssing_val_table1

The dataframe has 122 columns.
 There are 67 columns that have missing values.
 There are 55 columns that have no missing values


Unnamed: 0,Missing Values,% Missing Values
COMMONAREA_MEDI,214865,69.9
COMMONAREA_AVG,214865,69.9
COMMONAREA_MODE,214865,69.9
NONLIVINGAPARTMENTS_MEDI,213514,69.4
NONLIVINGAPARTMENTS_MODE,213514,69.4
NONLIVINGAPARTMENTS_AVG,213514,69.4
FONDKAPREMONT_MODE,210295,68.4
LIVINGAPARTMENTS_MODE,210199,68.4
LIVINGAPARTMENTS_MEDI,210199,68.4
LIVINGAPARTMENTS_AVG,210199,68.4


<b>Looking into Attributes (columns) and their Datatypes</b>

In [21]:
# Count how many training columns there are of each dtype
dtype_counts = app_train_df.dtypes.value_counts()

# Turn the counts into a small table with the dtype as a regular column
train_dtypes = pd.DataFrame(dtype_counts).reset_index()

train_dtypes

Unnamed: 0,index,0
0,float64,65
1,int64,41
2,object,16


In [22]:
# Group the column names by their dtype and print one list per dtype
column_names = app_train_df.columns.to_series()
columns = column_names.groupby(app_train_df.dtypes).groups

for data_type, dtype_columns in columns.items():
    print('\nData Type {} Columns:'.format(data_type))
    pprint(list(dtype_columns))


Data Type int64 Columns:
['SK_ID_CURR',
 'TARGET',
 'CNT_CHILDREN',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21']

Data Type float64 Columns:
['AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',


<b>Encoding Categorical Attributes</b>
Converting categorical values either to 0/1 (label encoding, for two-category attributes) or to one new column per unique category value (one-hot encoding).

1. Label encoding

In [24]:
# Label-encode the categorical columns that have at most two distinct values.
# The encoder is fitted on the training data only and then applied to both frames.
le = LabelEncoder()
le_cnt = 0

for c in app_train_df:
    # Only text (object) columns are candidates for encoding
    if app_train_df[c].dtype != 'object':
        continue

    # Skip columns with more than two distinct values
    # (Series.unique() reports NaN as a value, so it counts towards the limit)
    if len(app_train_df[c].unique()) > 2:
        continue

    print(f'{c} was encoded')

    # Learn the category -> integer mapping from the training column
    le.fit(app_train_df[c])

    # Apply that same mapping to the training and the testing column
    app_train_df[c] = le.transform(app_train_df[c])
    app_test_df[c] = le.transform(app_test_df[c])

    le_cnt += 1

print(f'{le_cnt} columns were encoded')
            
            

NAME_CONTRACT_TYPE was encoded
FLAG_OWN_CAR was encoded
FLAG_OWN_REALTY was encoded
3 columns were encoded


2. One-hot encoding

In [26]:
# One-hot encode the remaining categorical columns of both frames
app_train_df = pd.get_dummies(app_train_df)
app_test_df = pd.get_dummies(app_test_df)

print('Shapes after One-hot encoding')
print(f'Training data frame: {app_train_df.shape}')
print(f'Testing data frame: {app_test_df.shape}')

# Set the target aside: the inner join below keeps only columns present in both
# frames, and TARGET is written back onto the training frame afterwards
labels = app_train_df['TARGET']

# Align training and testing data so both keep exactly the columns they share
app_train_df, app_test_df = app_train_df.align(app_test_df, join='inner', axis=1)

# Restore the target column on the training frame
app_train_df['TARGET'] = labels

print('Shapes Aligning')
print(f'Training data frame: {app_train_df.shape}')
print(f'Testing data frame: {app_test_df.shape}')

Shapes after One-hot encoding
Training data frame: (307511, 243)
Testing data frame: (48744, 239)
Shapes Aligning
Training data frame: (307511, 240)
Testing data frame: (48744, 239)
