In [2]:
#imports
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from math import sqrt

#pandas profiling
from pandas_profiling import ProfileReport

#sklearn stuff
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR, SVC

<b>Data Import</b>

In [3]:
#data import
credit = pd.read_csv('credit.csv', skiprows=1)

<b>Preprocessing</b>

In [4]:
#Rename col
credit['default'] = credit['default payment next month']
#drop unneeded cols
credit = credit.drop(['ID', 'default payment next month'], axis=1)
#drop duplicates
credit = credit.drop_duplicates()
#check for nulls
nulls = credit.isnull().sum()
nulls

LIMIT_BAL    0
SEX          0
EDUCATION    0
MARRIAGE     0
AGE          0
PAY_0        0
PAY_2        0
PAY_3        0
PAY_4        0
PAY_5        0
PAY_6        0
BILL_AMT1    0
BILL_AMT2    0
BILL_AMT3    0
BILL_AMT4    0
BILL_AMT5    0
BILL_AMT6    0
PAY_AMT1     0
PAY_AMT2     0
PAY_AMT3     0
PAY_AMT4     0
PAY_AMT5     0
PAY_AMT6     0
default      0
dtype: int64

In [5]:
#Bin the limits into quartiles and name them
df_bin_limits = credit['Limit bins'] = pd.qcut(credit['LIMIT_BAL'], q=4,
                                       labels = ['Bronze','Silver','Gold','Diamond'])
df_bin_limits

0        Bronze
1        Silver
2        Silver
3        Bronze
4        Bronze
          ...  
29995      Gold
29996      Gold
29997    Bronze
29998    Silver
29999    Bronze
Name: LIMIT_BAL, Length: 29965, dtype: category
Categories (4, object): [Bronze < Silver < Gold < Diamond]

In [6]:
credit

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default,Limit bins
0,20000,female,university,1,24,2,2,-1,-1,-2,...,0,0,0,689,0,0,0,0,default,Bronze
1,120000,female,university,2,26,-1,2,0,0,0,...,3455,3261,0,1000,1000,1000,0,2000,default,Silver
2,90000,female,university,2,34,0,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,not default,Silver
3,50000,female,university,1,37,0,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,not default,Bronze
4,50000,male,university,1,57,-1,0,-1,0,0,...,19146,19131,2000,36681,10000,9000,689,679,not default,Bronze
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,220000,male,high school,1,39,0,0,0,0,0,...,31237,15980,8500,20000,5003,3047,5000,1000,not default,Gold
29996,150000,male,high school,2,43,-1,-1,-1,-1,0,...,5190,0,1837,3526,8998,129,0,0,not default,Gold
29997,30000,male,university,2,37,4,3,2,-1,0,...,20582,19357,0,0,22000,4200,2000,3100,default,Bronze
29998,80000,male,high school,1,41,1,-1,0,0,0,...,11855,48944,85900,3409,1178,1926,52964,1804,default,Silver


In [7]:
credit['SEX'].describe()

count      29965
unique         2
top       female
freq       18091
Name: SEX, dtype: object

In [8]:
#Encode as needed
le = LabelEncoder()
le.fit(credit['SEX'])
credit['SEX'] = le.transform(credit['SEX'])
#credit['SEX']

In [9]:
credit['EDUCATION'].describe()

count          29965
unique             4
top       university
freq           14019
Name: EDUCATION, dtype: object

In [10]:
#Encode as needed
le = LabelEncoder()
le.fit(credit['EDUCATION'])
credit['EDUCATION'] = le.transform(credit['EDUCATION'])
#credit['EDUCATION']

In [11]:
credit['default'].describe()

count           29965
unique              2
top       not default
freq            23335
Name: default, dtype: object

In [12]:
#Encode as needed
le = LabelEncoder()
le.fit(credit['default'])
credit['default'] = le.transform(credit['default'])
#credit['default']

In [13]:
credit['Limit bins'].describe()

count      29965
unique         4
top       Bronze
freq        7673
Name: Limit bins, dtype: object

In [14]:
#Encode as needed
le = LabelEncoder()
le.fit(credit['Limit bins'])
credit['Limit bins'] = le.transform(credit['Limit bins'])
credit['Limit bins']

0        0
1        3
2        3
3        0
4        0
        ..
29995    2
29996    2
29997    0
29998    3
29999    0
Name: Limit bins, Length: 29965, dtype: int32

<b>Descriptive Stats</b>

In [15]:
credit.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default,Limit bins
0,20000,0,3,1,24,2,2,-1,-1,-2,...,0,0,0,689,0,0,0,0,0,0
1,120000,0,3,2,26,-1,2,0,0,0,...,3455,3261,0,1000,1000,1000,0,2000,0,3
2,90000,0,3,2,34,0,0,0,0,0,...,14948,15549,1518,1500,1000,1000,1000,5000,1,3
3,50000,0,3,1,37,0,0,0,0,0,...,28959,29547,2000,2019,1200,1100,1069,1000,1,0
4,50000,1,3,1,57,-1,0,-1,0,0,...,19146,19131,2000,36681,10000,9000,689,679,1,0


In [16]:
#shift pay features for -2
credit['PAY_0'] = credit['PAY_0'].replace(-2,0)
credit['PAY_2'] = credit['PAY_2'].replace(-2,0)
credit['PAY_3'] = credit['PAY_3'].replace(-2,0)
credit['PAY_4'] = credit['PAY_4'].replace(-2,0)
credit['PAY_5'] = credit['PAY_5'].replace(-2,0)
credit['PAY_6'] = credit['PAY_6'].replace(-2,0)

#shift pay features for -1
credit['PAY_0'] = credit['PAY_0'].replace(-1,0)
credit['PAY_2'] = credit['PAY_2'].replace(-1,0)
credit['PAY_3'] = credit['PAY_3'].replace(-1,0)
credit['PAY_4'] = credit['PAY_4'].replace(-1,0)
credit['PAY_5'] = credit['PAY_5'].replace(-1,0)
credit['PAY_6'] = credit['PAY_6'].replace(-1,0)

<b>All numbers - good to go! - Pre-processing complete</b>

<b>Feature Selection via Filtering</b>

In [22]:
#features
X = credit.drop(['Limit bins', 'LIMIT_BAL'], axis=1)
#dep Var
y = credit['Limit bins']
#split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state = 123)

In [23]:
X.head()

Unnamed: 0,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,0,3,1,24,2,2,0,0,0,0,...,0,0,0,0,689,0,0,0,0,0
1,0,3,2,26,0,2,0,0,0,2,...,3272,3455,3261,0,1000,1000,1000,0,2000,0
2,0,3,2,34,0,0,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,1
3,0,3,1,37,0,0,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,1
4,1,3,1,57,0,0,0,0,0,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,1


<b>Choosing a model</b>

In [27]:
algos_Class = []
algos_Class.append(('Random Forest', RandomForestClassifier()))
algos_Class.append(('SVM', SVC()))
algos_Class.append(('GBM', GradientBoostingClassifier()))

In [28]:
#Regression
results = []
names = []
for name, model in algos_Class:
        result = cross_val_score(model, X,y, cv=3, scoring='accuracy')
        names.append(name)
        results.append(result)



In [29]:
for i in range(len(names)):
    print(names[i],results[i].mean())

Random Forest 0.5508093123431589
SVM 0.26671125094358406
GBM 0.591356187657826


In [45]:
#Modelling
algo = RandomForestClassifier(n_estimators=200)
model = algo.fit(X_train, y_train)

In [46]:
#predictions
preds = model.predict(X_test)