## Data Understanding

In [3]:
#Load dataset
import pandas as pd
data = pd.read_excel('raw-dataset.xlsx')

In [4]:
#Data columns and rows
print(data.shape)
list(data.columns)

(29366, 18)


['durasi',
 'interest',
 'net_income',
 'additional_income',
 'plafon',
 'new_debitur',
 'biz_ownership',
 'loyal_customer',
 'liquidation_value',
 'age',
 'marriage_status',
 'gender',
 'edu_code',
 'edu',
 'job_code',
 'job_kind',
 'monthly_installment',
 'Good_Not_Good']

In [5]:
#Print first columns
data.head()

Unnamed: 0,durasi,interest,net_income,additional_income,plafon,new_debitur,biz_ownership,loyal_customer,liquidation_value,age,marriage_status,gender,edu_code,edu,job_code,job_kind,monthly_installment,Good_Not_Good
0,24,8.0,28710000000.0,0.0,15000000.0,,0,0,,54.0,B,F,S1,Sarjana,ADMI,Administrasi Umum / Supervisor,778700.0,1
1,24,8.0,50500000000.0,0.0,50000000.0,,0,0,,63.0,B,F,S1,Sarjana,ADMI,Administrasi Umum / Supervisor,2683300.0,1
2,24,8.0,11000000000.0,0.0,5000000.0,,1,0,,50.0,B,F,S1,Sarjana,ADMI,Administrasi Umum / Supervisor,270800.0,1
3,18,8.0,15900000000.0,0.0,15000000.0,,1,0,,50.0,B,F,S1,Sarjana,ADMI,Administrasi Umum / Supervisor,1013300.0,1
4,12,8.0,513000000000.0,0.0,35000000.0,,0,0,,52.0,B,F,SD,SD,ADMI,Administrasi Umum / Supervisor,3336700.0,1


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29366 entries, 0 to 29365
Data columns (total 18 columns):
durasi                 29366 non-null int64
interest               29366 non-null float64
net_income             29366 non-null float64
additional_income      29366 non-null float64
plafon                 29366 non-null float64
new_debitur            116 non-null float64
biz_ownership          29366 non-null int64
loyal_customer         29366 non-null int64
liquidation_value      107 non-null float64
age                    29255 non-null float64
marriage_status        29097 non-null object
gender                 29161 non-null object
edu_code               28891 non-null object
edu                    28891 non-null object
job_code               29101 non-null object
job_kind               27150 non-null object
monthly_installment    28524 non-null float64
Good_Not_Good          29366 non-null int64
dtypes: float64(8), int64(4), object(6)
memory usage: 4.0+ MB


## Data Preparation

In [7]:
#Remove unnecessary columns
data.drop(['job_kind'], axis = 1, inplace = True) #already represented in job code
data.drop(['edu'], axis = 1, inplace = True) #already represented in edu code
data.drop(['liquidation_value'], axis = 1, inplace = True) #insufficient data
data.drop(['new_debitur'], axis = 1, inplace = True) #insufficient data

In [8]:
#Handling missing values

#filling with mean
from sklearn.preprocessing import Imputer
imp=Imputer(missing_values="NaN", strategy="mean" )
data["age"]=imp.fit_transform(data[["age"]]).ravel()
data["monthly_installment"]=imp.fit_transform(data[["monthly_installment"]]).ravel()




In [9]:
#Categorical missing values
import numpy 
from sklearn.base import TransformerMixin
class SeriesImputer(TransformerMixin):
    def __init__(self):
        """Impute missing values.
        If the Series is of dtype Object, then impute with the most frequent object.
        If the Series is not of dtype Object, then impute with the mean.  
        """
    def fit(self, X, y=None):
        if   X.dtype == numpy.dtype('O'): self.fill = X.value_counts().index[0]
        else                            : self.fill = X.mean()
        return self
    def transform(self, X, y=None):
       return X.fillna(self.fill)

In [10]:
a  = SeriesImputer()   # Initialize the imputer

#filling with modus for edu code
a.fit(data["edu_code"])              # Fit the imputer
newdata = a.transform(data["edu_code"])   # Get a new series
data["edu_code"]=newdata

In [11]:
#Remove rows with other missing values
data.dropna(inplace=True)
# summarize the number of rows and columns in the dataset
print(data.shape)

(29096, 14)


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29096 entries, 0 to 29100
Data columns (total 14 columns):
durasi                 29096 non-null int64
interest               29096 non-null float64
net_income             29096 non-null float64
additional_income      29096 non-null float64
plafon                 29096 non-null float64
biz_ownership          29096 non-null int64
loyal_customer         29096 non-null int64
age                    29096 non-null float64
marriage_status        29096 non-null object
gender                 29096 non-null object
edu_code               29096 non-null object
job_code               29096 non-null object
monthly_installment    29096 non-null float64
Good_Not_Good          29096 non-null int64
dtypes: float64(6), int64(4), object(4)
memory usage: 3.3+ MB


In [13]:
#Class distribution
data=data.rename(columns={'Good_Not_Good':'good'})
data.good.value_counts()

1    27679
0     1417
Name: good, dtype: int64

In [14]:
#List predictors
X_features = list(data.columns)
X_features.remove('good')
X_features

['durasi',
 'interest',
 'net_income',
 'additional_income',
 'plafon',
 'biz_ownership',
 'loyal_customer',
 'age',
 'marriage_status',
 'gender',
 'edu_code',
 'job_code',
 'monthly_installment']

In [15]:
#Variable output / class
Y_feature=data['good']

In [16]:
#One-hot-encoding : listing all categorical columns
credit_df_complete = pd.get_dummies(data[X_features])
list(credit_df_complete)

['durasi',
 'interest',
 'net_income',
 'additional_income',
 'plafon',
 'biz_ownership',
 'loyal_customer',
 'age',
 'monthly_installment',
 'marriage_status_B',
 'marriage_status_D',
 'marriage_status_J',
 'marriage_status_K',
 'gender_F',
 'gender_M',
 'edu_code_S1',
 'edu_code_S2',
 'edu_code_S3',
 'edu_code_SD',
 'edu_code_SM',
 'edu_code_SU',
 'job_code_ADMI',
 'job_code_AKUN',
 'job_code_BUMN',
 'job_code_DAGA',
 'job_code_DKTR',
 'job_code_EXEC',
 'job_code_GURU',
 'job_code_GUSW',
 'job_code_IBRT',
 'job_code_KOMP',
 'job_code_KONS',
 'job_code_MAHA',
 'job_code_MILD',
 'job_code_MILL',
 'job_code_MILP',
 'job_code_MILU',
 'job_code_PELA',
 'job_code_PEMI',
 'job_code_PENG',
 'job_code_PENS',
 'job_code_PGCR',
 'job_code_PNSI',
 'job_code_PROD',
 'job_code_PROF',
 'job_code_RISE',
 'job_code_SALE',
 'job_code_SENI',
 'job_code_SERV',
 'job_code_SWAS',
 'job_code_TECH',
 'job_code_WIRA',
 'job_code_ZZZZZ']

In [17]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from sklearn.preprocessing import LabelEncoder

In [18]:
#Set predictors and target
predictors = credit_df_complete
target = Y_feature

In [19]:
#label encoding
def dummyEncode(df):
        columnsToEncode = list(df.select_dtypes(include=['category','object']))
        le = LabelEncoder()
        for feature in columnsToEncode:
            try:
                df[feature] = le.fit_transform(df[feature])
            except:
                print('Error encoding '+feature)
        return df

dummyEncode(predictors)


Unnamed: 0,durasi,interest,net_income,additional_income,plafon,biz_ownership,loyal_customer,age,monthly_installment,marriage_status_B,...,job_code_PROD,job_code_PROF,job_code_RISE,job_code_SALE,job_code_SENI,job_code_SERV,job_code_SWAS,job_code_TECH,job_code_WIRA,job_code_ZZZZZ
0,24,8.0,2.871000e+10,0.0,15000000.0,0,0,54.0,778700.0,1,...,0,0,0,0,0,0,0,0,0,0
1,24,8.0,5.050000e+10,0.0,50000000.0,0,0,63.0,2683300.0,1,...,0,0,0,0,0,0,0,0,0,0
2,24,8.0,1.100000e+10,0.0,5000000.0,1,0,50.0,270800.0,1,...,0,0,0,0,0,0,0,0,0,0
3,18,8.0,1.590000e+10,0.0,15000000.0,1,0,50.0,1013300.0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,8.0,5.130000e+11,0.0,35000000.0,0,0,52.0,3336700.0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29096,24,8.0,1.650000e+10,0.0,10000000.0,0,0,55.0,536700.0,0,...,0,0,0,0,0,0,0,0,0,1
29097,24,8.0,1.290000e+10,0.0,15000000.0,1,0,36.0,778700.0,0,...,0,0,0,0,0,0,0,0,0,1
29098,36,8.0,9.150000e+09,0.0,30000000.0,0,0,48.0,1193300.0,0,...,0,0,0,0,0,0,0,0,0,1
29099,6,8.0,2.921000e+10,0.0,5000000.0,0,0,36.0,5542700.0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
##Split training and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(predictors,target,stratify=target,test_size=0.2)

In [22]:
##Sampling
from imblearn.over_sampling import RandomOverSampler
sampletech = RandomOverSampler(random_state=0)
X_resampled, y_resampled = sampletech.fit_sample(X_train,
y_train)
from collections import Counter
print('Resampled dataset shape {}'.format(Counter(y_resampled)))

Resampled dataset shape Counter({1: 22142, 0: 22142})


In [None]:
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = data[X_train.good==1s]
df_minority = data[X_.good==0]

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=26665,    # to match minority class
                                 random_state=123) # reproducible results
df_minority_upsampled

# Downsample minority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,     # sample with replacement
                                 n_samples=1404,    # to match minority class
                                 random_state=123) # reproducible results
df_majority_downsampled

import pandas as pd
# Combine majority class with upsampled minority class
#ups=[df_minority_upsampled,df_minority]
dfnew=df_majority.append(df_minority_upsampled)
dfnewz=df_minority.append(df_majority_downsampled)

In [23]:
##Normalization
from sklearn.preprocessing import StandardScaler
#Normalize data
scaler = StandardScaler()
# Fit only to the training data
scaler.fit(X_train)
# Now apply the transformations to the data:
X_trainf = scaler.transform(X_resampled)
X_testf = scaler.transform(X_test)
y_train = y_resampled

In [27]:
##Import library
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer

##Deciding the model configuration
#make scorer for grid search
def tp(y_true, y_pred):
    error= confusion_matrix(y_true,y_pred)[0,0]/(confusion_matrix(y_true, y_pred)[0,0] + confusion_matrix(y_true, y_pred)[0,1])
    return error

specificity = make_scorer(tp, greater_is_better=True)
scoring={'Accuracy': make_scorer(accuracy_score),'Precision': make_scorer(precision_score),'Recall': make_scorer(recall_score),'Specificity': specificity}

#Grid search parameters
parameters = {'activation':('logistic','tanh',
'relu'),'alpha':(0.00001,0.0001,0.001,0.01,0.1,1)}

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
#Model configuration
model = GridSearchCV(MLPClassifier(hidden_layer_sizes=(607,121),max_iter=500,
                                   solver='adam',random_state=42,verbose=True),
                                param_grid=parameters,scoring=scoring,refit=False,return_train_score=False)

#Fit the model
model.fit(X_trainf,y_train)

#Results
results = model.cv_results_



Iteration 1, loss = 0.69286869
Iteration 2, loss = 0.68489179
Iteration 3, loss = 0.68336554
Iteration 4, loss = 0.68285728
Iteration 5, loss = 0.68171499
Iteration 6, loss = 0.68104150
Iteration 7, loss = 0.67986242
Iteration 8, loss = 0.67925871
Iteration 9, loss = 0.67865801
Iteration 10, loss = 0.67784799
Iteration 11, loss = 0.67696239
Iteration 12, loss = 0.67590879
Iteration 13, loss = 0.67520367
Iteration 14, loss = 0.67475149
Iteration 15, loss = 0.67409883
Iteration 16, loss = 0.67358829
Iteration 17, loss = 0.67261805
Iteration 18, loss = 0.67230687
Iteration 19, loss = 0.67162161
Iteration 20, loss = 0.67025749
Iteration 21, loss = 0.67004705
Iteration 22, loss = 0.66877484
Iteration 23, loss = 0.66846705
Iteration 24, loss = 0.66764999
Iteration 25, loss = 0.66686163
Iteration 26, loss = 0.66637866
Iteration 27, loss = 0.66548964
Iteration 28, loss = 0.66465563
Iteration 29, loss = 0.66421865
Iteration 30, loss = 0.66323005
Iteration 31, loss = 0.66307915
Iteration 32, los

Iteration 253, loss = 0.31224728
Iteration 254, loss = 0.31185492
Iteration 255, loss = 0.31174661
Iteration 256, loss = 0.31061388
Iteration 257, loss = 0.30986993
Iteration 258, loss = 0.30877550
Iteration 259, loss = 0.30798434
Iteration 260, loss = 0.30911814
Iteration 261, loss = 0.30660712
Iteration 262, loss = 0.30532023
Iteration 263, loss = 0.30552066
Iteration 264, loss = 0.30274603
Iteration 265, loss = 0.30384540
Iteration 266, loss = 0.30287038
Iteration 267, loss = 0.30127817
Iteration 268, loss = 0.30083608
Iteration 269, loss = 0.30101108
Iteration 270, loss = 0.30020108
Iteration 271, loss = 0.30037624
Iteration 272, loss = 0.29923290
Iteration 273, loss = 0.29825190
Iteration 274, loss = 0.29832701
Iteration 275, loss = 0.29608605
Iteration 276, loss = 0.29474062
Iteration 277, loss = 0.29610776
Iteration 278, loss = 0.29434685
Iteration 279, loss = 0.29319982
Iteration 280, loss = 0.29359556
Iteration 281, loss = 0.29262566
Iteration 282, loss = 0.29167737
Iteration 



Iteration 1, loss = 0.69257359
Iteration 2, loss = 0.68531849
Iteration 3, loss = 0.68431978
Iteration 4, loss = 0.68364146
Iteration 5, loss = 0.68290524
Iteration 6, loss = 0.68186211
Iteration 7, loss = 0.68046507
Iteration 8, loss = 0.67947670
Iteration 9, loss = 0.67899256
Iteration 10, loss = 0.67827845
Iteration 11, loss = 0.67692381
Iteration 12, loss = 0.67664785
Iteration 13, loss = 0.67577904
Iteration 14, loss = 0.67502177
Iteration 15, loss = 0.67487587
Iteration 16, loss = 0.67404903
Iteration 17, loss = 0.67338579
Iteration 18, loss = 0.67244465
Iteration 19, loss = 0.67219233
Iteration 20, loss = 0.67035142
Iteration 21, loss = 0.66969990
Iteration 22, loss = 0.66907482
Iteration 23, loss = 0.66799539
Iteration 24, loss = 0.66687937
Iteration 25, loss = 0.66616861
Iteration 26, loss = 0.66577764
Iteration 27, loss = 0.66473609
Iteration 28, loss = 0.66370290
Iteration 29, loss = 0.66295335
Iteration 30, loss = 0.66128396
Iteration 31, loss = 0.66094686
Iteration 32, los

Iteration 253, loss = 0.33317131
Iteration 254, loss = 0.33324045
Iteration 255, loss = 0.33235722
Iteration 256, loss = 0.33206037
Iteration 257, loss = 0.33259811
Iteration 258, loss = 0.33014747
Iteration 259, loss = 0.33029777
Iteration 260, loss = 0.32984191
Iteration 261, loss = 0.32892855
Iteration 262, loss = 0.32785554
Iteration 263, loss = 0.32750857
Iteration 264, loss = 0.32729153
Iteration 265, loss = 0.32652742
Iteration 266, loss = 0.32637704
Iteration 267, loss = 0.32401180
Iteration 268, loss = 0.32458592
Iteration 269, loss = 0.32279628
Iteration 270, loss = 0.32171952
Iteration 271, loss = 0.32209966
Iteration 272, loss = 0.32300542
Iteration 273, loss = 0.32118978
Iteration 274, loss = 0.32029763
Iteration 275, loss = 0.31847184
Iteration 276, loss = 0.31873620
Iteration 277, loss = 0.31854051
Iteration 278, loss = 0.31742702
Iteration 279, loss = 0.31627588
Iteration 280, loss = 0.31633413
Iteration 281, loss = 0.31607196
Iteration 282, loss = 0.31432362
Iteration 



Iteration 1, loss = 0.69257683
Iteration 2, loss = 0.68552278
Iteration 3, loss = 0.68320572
Iteration 4, loss = 0.68145468
Iteration 5, loss = 0.68113202
Iteration 6, loss = 0.68013501
Iteration 7, loss = 0.67933595
Iteration 8, loss = 0.67867099
Iteration 9, loss = 0.67759937
Iteration 10, loss = 0.67749000




Iteration 1, loss = 0.69289293
Iteration 2, loss = 0.68492086
Iteration 3, loss = 0.68340007
Iteration 4, loss = 0.68289615
Iteration 5, loss = 0.68175921
Iteration 6, loss = 0.68109524
Iteration 7, loss = 0.67992396
Iteration 8, loss = 0.67933431
Iteration 9, loss = 0.67874243
Iteration 10, loss = 0.67794239
Iteration 11, loss = 0.67706437
Iteration 12, loss = 0.67602138
Iteration 13, loss = 0.67532949




Iteration 1, loss = 0.69259769
Iteration 2, loss = 0.68534657




Iteration 1, loss = 0.69260056
Iteration 2, loss = 0.68555068
Iteration 3, loss = 0.68324064




Iteration 1, loss = 0.69312616
Iteration 2, loss = 0.68519848
Iteration 3, loss = 0.68373013


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Nurlaili\Anaconda3\envs\iroschoolpython\lib\site-packages\IPython\core\interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-27-87b481c3eb46>", line 31, in <module>
    model.fit(X_trainf,y_train)
  File "C:\Users\Nurlaili\Anaconda3\envs\iroschoolpython\lib\site-packages\sklearn\model_selection\_search.py", line 688, in fit
    self._run_search(evaluate_candidates)
  File "C:\Users\Nurlaili\Anaconda3\envs\iroschoolpython\lib\site-packages\sklearn\model_selection\_search.py", line 1149, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "C:\Users\Nurlaili\Anaconda3\envs\iroschoolpython\lib\site-packages\sklearn\model_selection\_search.py", line 667, in evaluate_candidates
    cv.split(X, y, groups)))
  File "C:\Users\Nurlaili\Anaconda3\envs\iroschoolpython\lib\site-packages\joblib\parallel.py", line 1007, in __call__
    while self.dispatch_o

KeyboardInterrupt: 

In [28]:
model

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08,
                                     hidden_layer_sizes=(607, 121),
                                     learning_rate='constant',
                                     learning_rate_init=0.001, max_iter=500,
                                     momentum=0.9, n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     ra...
                                     warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'activation': ('logistic', 'tanh', 'relu'),
                         'alpha': (1e-05, 0.0001, 0.001, 0.01, 0.1, 1)},
             pre_dispatch='2*n_jobs', refit=Fa

In [29]:
##Testing
#Model configuration
model=MLPClassifier(hidden_layer_sizes=(607,121),activation='tanh',max_iter=500,solver='adam',verbose=True,alpha=0.0001)

#Fit model
model.fit(X_trainf,y_train)
loss_values = model.loss_curve_

#Test the model
y_pred = model.predict(X_testf)

#Print final result
print(confusion_matrix(y_test,y_pred))

Iteration 1, loss = 0.68701128
Iteration 2, loss = 0.66445532
Iteration 3, loss = 0.64690401
Iteration 4, loss = 0.62965586
Iteration 5, loss = 0.61237938
Iteration 6, loss = 0.59662466
Iteration 7, loss = 0.57839911
Iteration 8, loss = 0.56273056
Iteration 9, loss = 0.54880677
Iteration 10, loss = 0.53557674
Iteration 11, loss = 0.52350572
Iteration 12, loss = 0.50940764
Iteration 13, loss = 0.49755262
Iteration 14, loss = 0.48479831
Iteration 15, loss = 0.47396151
Iteration 16, loss = 0.46427547
Iteration 17, loss = 0.45450955
Iteration 18, loss = 0.44230875
Iteration 19, loss = 0.43313205
Iteration 20, loss = 0.42417624
Iteration 21, loss = 0.41459916
Iteration 22, loss = 0.40567104
Iteration 23, loss = 0.39855962
Iteration 24, loss = 0.38915468
Iteration 25, loss = 0.38276164
Iteration 26, loss = 0.37263301
Iteration 27, loss = 0.36758591
Iteration 28, loss = 0.35839380
Iteration 29, loss = 0.35330703
Iteration 30, loss = 0.34609610
Iteration 31, loss = 0.33903606
Iteration 32, los

In [32]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.03      0.05      0.03       283
           1       0.95      0.91      0.93      5537

    accuracy                           0.86      5820
   macro avg       0.49      0.48      0.48      5820
weighted avg       0.90      0.86      0.88      5820

