In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
plt.rc("font", size=13)
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the training CSV (path is relative to the notebook's working directory);
# header=0 takes the column names from the first line of the file.
train_data = pd.read_csv("data/bank_data.csv", header=0)
# Report the raw size before any cleaning: shape[0] is the row count, shape[-1] the column count.
print(f"Training file: {train_data.shape[0]} rows, {train_data.shape[-1]} columns")

Training file: 100514 rows, 19 columns


In [3]:
# Read the test CSV (path is relative to the notebook's working directory);
# header=0 takes the column names from the first line of the file.
test_data = pd.read_csv("data/credit_test.csv", header=0)
# Report the raw size before any cleaning: shape[0] is the row count, shape[-1] the column count.
print(f"Test file: {test_data.shape[0]} rows, {test_data.shape[-1]} columns")

Test file: 10353 rows, 18 columns


## Training data

In [4]:
# Remove the two identifier columns, then drop rows that are exact duplicates
# of an earlier row (rows that differed only by their IDs collapse into one).
# The frame is rebound instead of mutated in place, and errors='ignore' lets the
# cell be re-run without a KeyError once the ID columns are already gone.
train_data = (
    train_data
    .drop(columns=['Loan ID', 'Customer ID'], errors='ignore')
    .drop_duplicates()
)

In [5]:
# Missing-value count per column of the training frame (displayed as the cell output).
train_data.isna().sum()

Loan Status                         1
Current Loan Amount                 1
Term                                1
Credit Score                    19155
Annual Income                   19155
Years in current job             3803
Home Ownership                      1
Purpose                             1
Monthly Debt                        1
Years of Credit History             1
Months since last delinquent    48338
Number of Open Accounts             1
Number of Credit Problems           1
Current Credit Balance              1
Maximum Open Credit                 3
Bankruptcies                      191
Tax Liens                          10
dtype: int64

In [6]:
# 'Months since last delinquent' has by far the most missing values in the null
# counts shown above, so the whole column is removed rather than its rows.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
# Rows are then dropped when any of the listed columns is missing; a single
# dropna over the list removes exactly the rows the four separate calls did.
train_data = (
    train_data
    .drop(columns=['Months since last delinquent'], errors='ignore')
    .dropna(
        axis=0,
        subset=['Credit Score', 'Years in current job', 'Bankruptcies', 'Maximum Open Credit'],
    )
)

In [7]:
# Re-check the per-column missing-value counts after cleaning the training frame.
train_data.isna().sum()

Loan Status                  0
Current Loan Amount          0
Term                         0
Credit Score                 0
Annual Income                0
Years in current job         0
Home Ownership               0
Purpose                      0
Monthly Debt                 0
Years of Credit History      0
Number of Open Accounts      0
Number of Credit Problems    0
Current Credit Balance       0
Maximum Open Credit          0
Bankruptcies                 0
Tax Liens                    0
dtype: int64

In [8]:
# Size of the cleaned training frame. The feature count is the column count
# minus one because 'Loan Status' (the label) is still a column of train_data.
print(f"Training set: {train_data.shape[0]} elements, {train_data.shape[-1] -1} features") # one column is for labels

Training set: 67490 elements, 15 features


## Test data

In [9]:
# Remove the two identifier columns, then drop rows that are exact duplicates
# of an earlier row (rows that differed only by their IDs collapse into one).
# The frame is rebound instead of mutated in place, and errors='ignore' lets the
# cell be re-run without a KeyError once the ID columns are already gone.
test_data = (
    test_data
    .drop(columns=['Loan ID', 'Customer ID'], errors='ignore')
    .drop_duplicates()
)

In [10]:
# Missing-value count per column of the test frame (displayed as the cell output).
test_data.isna().sum()

Current Loan Amount                1
Term                               1
Credit Score                    1982
Annual Income                   1982
Years in current job             428
Home Ownership                     1
Purpose                            1
Monthly Debt                       1
Years of Credit History            1
Months since last delinquent    5307
Number of Open Accounts            1
Number of Credit Problems          1
Current Credit Balance             1
Maximum Open Credit                1
Bankruptcies                      23
Tax Liens                          2
dtype: int64

In [11]:
# 'Months since last delinquent' has by far the most missing values in the null
# counts shown above, so the whole column is removed rather than its rows.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
# Rows are then dropped when any of the listed columns is missing; a single
# dropna over the list removes exactly the rows the four separate calls did.
test_data = (
    test_data
    .drop(columns=['Months since last delinquent'], errors='ignore')
    .dropna(
        axis=0,
        subset=['Credit Score', 'Years in current job', 'Bankruptcies', 'Maximum Open Credit'],
    )
)

In [12]:
# Re-check the per-column missing-value counts after cleaning the test frame.
test_data.isna().sum()

Current Loan Amount          0
Term                         0
Credit Score                 0
Annual Income                0
Years in current job         0
Home Ownership               0
Purpose                      0
Monthly Debt                 0
Years of Credit History      0
Number of Open Accounts      0
Number of Credit Problems    0
Current Credit Balance       0
Maximum Open Credit          0
Bankruptcies                 0
Tax Liens                    0
dtype: int64

In [13]:
# Size of the cleaned test frame. No column is subtracted here: the test file's
# null-count listing above shows no 'Loan Status' column, so every column is a feature.
print(f"Test set: {test_data.shape[0]} elements, {test_data.shape[-1]} features")

Test set: 7658 elements, 15 features


## Label encoding, train/validation split and scaling

In [14]:
# Integer-encode the label column and the categorical feature columns of train_data.
# Each column gets its own fitted LabelEncoder, stored in `encoders`, so every
# mapping stays available for inverse_transform or for encoding other data
# (a single encoder re-fitted column after column remembers only the last one).
# NOTE(review): LabelEncoder numbers classes in sorted order of their values, so
# the codes for 'Years in current job' presumably do not follow tenure order;
# confirm whether an explicit ordinal mapping is wanted instead.
encoders = {}
for column in ['Loan Status', 'Term', 'Years in current job', 'Home Ownership', 'Purpose']:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column])
    encoders[column] = le
# After the loop `le` is the encoder fitted on 'Purpose', the last column processed.

In [15]:
# Feature matrix: every column of train_data except the label.
X = train_data.drop(columns='Loan Status').values
# Labels are kept as a 2-D (n_rows, 1) array because a list of column names is selected.
y = train_data[["Loan Status"]].values

# Hold out 20% of the labelled rows for evaluation. Note that X_test / y_test are
# carved out of train_data; they are not rows of the separate test_data frame.
# random_state pins the shuffle so the split (and every downstream metric) is
# identical on each Restart & Run All.
# NOTE(review): consider stratify=y if the 'Loan Status' classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training split only and
# then applied to the hold-out split, so no hold-out statistics leak into training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)