In [7]:
#imports
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

#pandas profiling
from pandas_profiling import ProfileReport

#sklearn stuff
#from feature_engine.discretisers import EqualFrequencyDiscretiser
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, SVC

<b>Data Import</b>

In [8]:
# Load the raw credit data. skiprows=1 discards the first line of the file,
# so the file's second line supplies the column names.
credit = pd.read_csv(filepath_or_buffer='credit.csv', skiprows=1)

<b>Preprocessing</b>

In [9]:
# Copy the target into a short 'default' column (appended last), drop the ID
# column and the long-named original target, then remove duplicate rows.
credit = (
    credit
    .assign(default=credit['default payment next month'])
    .drop(columns=['ID', 'default payment next month'])
    .drop_duplicates()
)
# Per-column count of missing values
nulls = credit.isna().sum()
nulls

LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
default      0
dtype: int64

In [10]:
# Show each column's dtype; the object-typed columns are label-encoded in the cells below
credit.dtypes

LIMIT_BAL     int64
SEX          object
EDUCATION    object
MARRIAGE      int64
AGE           int64
PAY_0         int64
PAY_2         int64
PAY_3         int64
PAY_4         int64
PAY_5         int64
PAY_6         int64
BILL_AMT1     int64
BILL_AMT2     int64
BILL_AMT3     int64
BILL_AMT4     int64
BILL_AMT5     int64
BILL_AMT6     int64
PAY_AMT1      int64
PAY_AMT2      int64
PAY_AMT3      int64
PAY_AMT4      int64
PAY_AMT5      int64
PAY_AMT6      int64
default      object
dtype: object

In [11]:
# Replace the object-typed SEX column with integer class codes
le = LabelEncoder()
credit['SEX'] = le.fit_transform(credit['SEX'])

In [12]:
# Replace the object-typed target column 'default' with integer class codes
le = LabelEncoder()
credit['default'] = le.fit_transform(credit['default'])

In [13]:
# Replace the object-typed EDUCATION column with integer class codes
le = LabelEncoder()
credit['EDUCATION'] = le.fit_transform(credit['EDUCATION'])

In [14]:
#Discretize amounts (disabled: quartile binning of LIMIT_BAL is currently not applied)
#credit['LIMIT_BAL'] = pd.qcut(credit['LIMIT_BAL'],q=4)

In [15]:
# Label-encode LIMIT_BAL: each distinct value is replaced by its index among
# the sorted distinct values.
# NOTE(review): LIMIT_BAL is already int64 before this cell, so this swaps the
# amounts for rank-like codes - presumably a leftover from the disabled qcut
# binning; confirm it is still intended.
le = LabelEncoder()
credit['LIMIT_BAL'] = le.fit_transform(credit['LIMIT_BAL'])

In [16]:
#re-check dtypes
# After the encoding cells, no object-typed columns should remain
credit.dtypes

LIMIT_BAL    int64
SEX          int32
EDUCATION    int32
MARRIAGE     int64
AGE          int64
PAY_0        int64
PAY_2        int64
PAY_3        int64
PAY_4        int64
PAY_5        int64
PAY_6        int64
BILL_AMT1    int64
BILL_AMT2    int64
BILL_AMT3    int64
BILL_AMT4    int64
BILL_AMT5    int64
BILL_AMT6    int64
PAY_AMT1     int64
PAY_AMT2     int64
PAY_AMT3     int64
PAY_AMT4     int64
PAY_AMT5     int64
PAY_AMT6     int64
default      int32
dtype: object

<b>Descriptive Stats</b>

<b>Correlation</b>

<b>Feature Selection via Filtering</b>

In [None]:
# Preview the first rows of the preprocessed frame
credit.head()

<b>Modelling - Classification</b>

In [17]:
# Features: every column except the target. Selecting by name (instead of the
# positional slice 0:23) keeps the target out of X even if columns are added,
# removed or reordered upstream.
X = credit.drop(columns=['default'])
# Dependent variable
y = credit['default']
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=123)

In [18]:
# Preview the first rows of the feature matrix
X.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
0,2,0,3,1,24,2,2,-1,-1,-2,...,689,0,0,0,0,689,0,0,0,0
1,12,0,3,2,26,-1,2,0,0,0,...,2682,3272,3455,3261,0,1000,1000,1000,0,2000
2,9,0,3,2,34,0,0,0,0,0,...,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000
3,5,0,3,1,37,0,0,0,0,0,...,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000
4,5,1,3,1,57,-1,0,-1,0,0,...,35835,20940,19146,19131,2000,36681,10000,9000,689,679


In [19]:
# Preview the first rows of the training features (row labels come from the shuffled split)
X_train.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
16383,2,0,1,2,46,1,2,0,0,0,...,18067,18917,18717,17144,0,1800,1600,400,0,0
137,21,0,3,1,33,0,0,0,0,0,...,8792,9189,4404,5708,1500,1500,1000,500,2000,546
12720,2,1,3,1,32,0,0,0,0,0,...,11127,12928,13721,14492,1500,1300,2000,1000,1000,1000
10509,15,1,3,1,39,2,2,0,0,0,...,116491,118839,126102,123914,0,4225,4282,9207,0,4679
6679,11,0,0,2,25,1,2,-1,-1,0,...,1426,5660,5660,0,5,1426,5660,0,0,0


In [20]:
# Preview the first training targets; row labels match those of X_train
y_train.head()

16383    1
137      1
12720    1
10509    0
6679     1
Name: default, dtype: int32

In [21]:
# Candidate classifiers as (display name, unfitted estimator) pairs.
# The two tree ensembles are stochastic, so they are seeded (same value as the
# train/test split) to make the cross-validation scores repeatable between runs.
algos_Class = [
    ('Random Forest', RandomForestClassifier(random_state=123)),
    ('SVM', SVC()),
    ('GBM', GradientBoostingClassifier(random_state=123)),
]

In [22]:
#Classification
# 3-fold cross-validation of every candidate; names[i] labels results[i].
# Scoring uses the full X / y, i.e. it includes the rows held out as X_test.
names = [label for label, _ in algos_Class]
results = [cross_val_score(estimator, X, y, cv=3) for _, estimator in algos_Class]



In [23]:
# Report the mean cross-validation score of each candidate
for label, fold_scores in zip(names, results):
    print(label, fold_scores.mean())

Random Forest 0.796563000103745
SVM 0.7799099333656289
GBM 0.8198902085579413


In [24]:
#Modelling
# 500-tree random forest; max_features=None lets every split consider all features.
# random_state pins the forest's randomness so the fitted model, and the accuracy
# reported from it, are repeatable between runs.
algo = RandomForestClassifier(n_estimators=500, max_features=None, random_state=123)
model = algo.fit(X_train, y_train)

In [25]:
# Predict the held-out rows with the fitted model
preds = model.predict(X_test)
# Fraction of test rows classified correctly
test_accuracy = accuracy_score(y_test, preds)
print('Accuracy Score is: %.3f' % test_accuracy)

Accuracy Score is: 0.813
