In [1]:
import pandas as pd
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
train_dataset = pd.read_csv('train_data.csv')
test_dataset = pd.read_csv('test_data.csv')

In [3]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204428 entries, 0 to 1204427
Data columns (total 26 columns):
addr_state             1204428 non-null object
annual_inc             1204428 non-null float64
issue_d                1204428 non-null object
application_type       1204428 non-null object
dti                    1204428 non-null float64
emp_length             1204428 non-null object
emp_title              1204428 non-null object
funded_amnt            1204428 non-null float64
funded_amnt_inv        1204428 non-null float64
grade                  1204428 non-null object
home_ownership         1204428 non-null object
id                     1204428 non-null int64
initial_list_status    1204428 non-null object
loan_amnt              1204428 non-null float64
member_id              1204428 non-null int64
policy_code            1204428 non-null float64
pub_rec                1204428 non-null float64
purpose                1204428 non-null object
sub_grade              1204428 non-n

In [4]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246318 entries, 0 to 246317
Data columns (total 26 columns):
addr_state             246318 non-null object
annual_inc             246318 non-null float64
issue_d                246318 non-null object
application_type       246318 non-null object
dti                    246318 non-null float64
emp_length             246318 non-null object
emp_title              246318 non-null object
funded_amnt            246318 non-null float64
funded_amnt_inv        246318 non-null float64
grade                  246318 non-null object
home_ownership         246318 non-null object
id                     246318 non-null int64
initial_list_status    246318 non-null object
loan_amnt              246318 non-null float64
member_id              246318 non-null int64
policy_code            246318 non-null float64
pub_rec                246318 non-null float64
purpose                246318 non-null object
sub_grade              246318 non-null object
term      

In [5]:
train_dataset.isna().sum()

addr_state             0
annual_inc             0
issue_d                0
application_type       0
dti                    0
emp_length             0
emp_title              0
funded_amnt            0
funded_amnt_inv        0
grade                  0
home_ownership         0
id                     0
initial_list_status    0
loan_amnt              0
member_id              0
policy_code            0
pub_rec                0
purpose                0
sub_grade              0
term                   0
title                  0
total_acc              0
total_pymnt            0
zip_code               0
verification_status    0
default_ind            0
dtype: int64

In [6]:
test_dataset.isna().sum()

addr_state             0
annual_inc             0
issue_d                0
application_type       0
dti                    0
emp_length             0
emp_title              0
funded_amnt            0
funded_amnt_inv        0
grade                  0
home_ownership         0
id                     0
initial_list_status    0
loan_amnt              0
member_id              0
policy_code            0
pub_rec                0
purpose                0
sub_grade              0
term                   0
title                  0
total_acc              0
total_pymnt            0
zip_code               0
verification_status    0
default_ind            0
dtype: int64

In [7]:
dataset = pd.concat([train_dataset, test_dataset], ignore_index=True)

In [8]:
from sklearn.preprocessing import LabelEncoder

category_colmap = {}
for col in dataset.select_dtypes('object').columns:
    print(col)
    le_enc = LabelEncoder()
    category_colmap[col] = le_enc
    dataset[col] = le_enc.fit_transform(dataset[col])

# ============== inverse transformation, getting true class name of labels ================ #
#category_colmap['column_name'].inverse_transform([0])  


addr_state
issue_d
application_type
emp_length
emp_title
grade
home_ownership
initial_list_status
purpose
sub_grade
term
title
zip_code
verification_status


In [9]:
train_dataset = dataset.iloc[0:len(train_dataset)].reset_index(drop=True)
test_dataset = dataset.iloc[len(train_dataset):].reset_index(drop=True)

In [10]:
from sklearn.model_selection import train_test_split

x_features = ['addr_state', 'annual_inc', 'issue_d', 'application_type', 'dti',
       'emp_length', 'emp_title', 'funded_amnt', 'funded_amnt_inv', 'grade',
       'home_ownership', 'id', 'initial_list_status', 'loan_amnt', 'member_id',
       'policy_code', 'pub_rec', 'purpose', 'sub_grade', 'term', 'title',
       'total_acc', 'total_pymnt', 'zip_code', 'verification_status']
x = train_dataset[x_features]

y_predictor = ['default_ind']
y = train_dataset[y_predictor]

x_train, x_valid, y_train, y_valid = train_test_split(x,y,test_size=0.3)
x_test, y_test = test_dataset[x_features], test_dataset[y_predictor]


In [11]:
traindata_dist = dict(y_train['default_ind'].value_counts())
validitydata_dist = dict(y_valid['default_ind'].value_counts())
testdata_dist = dict(y_test['default_ind'].value_counts())

print('Train Data Class Distribution     # Count :{}, 0/1 Ratio :{}, Class-0 % :{}, Class-1 % :{}'.format(traindata_dist, round((traindata_dist[0] / traindata_dist[1]),3), round((traindata_dist[0]/len(y_train)),3), round((traindata_dist[1]/len(y_train)),3)))
print('Valiadity Data Class Distribution # Count :{}, 0/1 Ratio :{}, Class-0 % :{}, Class-1 % :{}'.format(validitydata_dist, round((validitydata_dist[0]/ validitydata_dist[1]),3), round((validitydata_dist[0]/len(y_valid)),3), round((validitydata_dist[1]/len(y_valid)),3)))
print('Test Data Class Distribution      # Count :{}, 0/1 Ratio :{}, Class-0 % :{}, Class-1 % :{}'.format(testdata_dist, round((testdata_dist[0] / testdata_dist[1]),3), round((testdata_dist[0]/len(y_test)),3), round((testdata_dist[1]/len(y_test)),3)))


Train Data Class Distribution     # Count :{1: 477384, 0: 365715}, 0/1 Ratio :0.766, Class-0 % :0.434, Class-1 % :0.566
Valiadity Data Class Distribution # Count :{1: 205192, 0: 156137}, 0/1 Ratio :0.761, Class-0 % :0.432, Class-1 % :0.568
Test Data Class Distribution      # Count :{0: 241726, 1: 4592}, 0/1 Ratio :52.641, Class-0 % :0.981, Class-1 % :0.019


In [12]:
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, f1_score, accuracy_score

def TestReport(model_name, y_true, y_pred):
    #print(confusion_matrix(y_test, y_pred))
    #print(classification_report(y_test, y_pred))
    precision, recall, fbeta_score, support = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    f1_ = f1_score(y_true, y_pred,average='weighted')
    acc_scr = accuracy_score(y_true, y_pred)
    print('{} \t |--> Precision : {} Recall : {} F1-Score : {} Accuracy-Score : {}'.format(model_name, round(precision,3), round(recall,3), round(f1_,3), round(acc_scr,3)))


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
kn_clf = KNeighborsClassifier()
nb_clf = GaussianNB()
vt_clf_model = VotingClassifier(estimators=[('lr',lr_clf),('dt',dt_clf),('rf',rf_clf),('kn',kn_clf),('nb',nb_clf)], voting='soft')


for clf_model in (lr_clf, dt_clf, rf_clf, kn_clf, nb_clf, vt_clf_model):
    clf_model.fit(x_train, y_train)
    y_pred = clf_model.predict(x_valid)
    TestReport(clf_model.__class__.__name__, y_valid, y_pred)


LogisticRegression 	 |--> Precision : 0.797 Recall : 0.796 F1-Score : 0.797 Accuracy-Score : 0.796
DecisionTreeClassifier 	 |--> Precision : 0.995 Recall : 0.995 F1-Score : 0.995 Accuracy-Score : 0.995
RandomForestClassifier 	 |--> Precision : 0.999 Recall : 0.999 F1-Score : 0.999 Accuracy-Score : 0.999
KNeighborsClassifier 	 |--> Precision : 0.918 Recall : 0.904 F1-Score : 0.902 Accuracy-Score : 0.904
GaussianNB 	 |--> Precision : 0.706 Recall : 0.707 F1-Score : 0.702 Accuracy-Score : 0.707
VotingClassifier 	 |--> Precision : 0.987 Recall : 0.987 F1-Score : 0.987 Accuracy-Score : 0.987


In [33]:
for col in test_dataset.select_dtypes('object'):
    print(col)
    test_dataset[col] = category_colmap[col].transform(test_dataset[col])
    
x_test, y_test = test_dataset[x_features], test_dataset[y_predictor]

In [34]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246318 entries, 0 to 246317
Data columns (total 26 columns):
addr_state             246318 non-null int64
annual_inc             246318 non-null float64
issue_d                246318 non-null int64
application_type       246318 non-null int64
dti                    246318 non-null float64
emp_length             246318 non-null int64
emp_title              246318 non-null int64
funded_amnt            246318 non-null float64
funded_amnt_inv        246318 non-null float64
grade                  246318 non-null int64
home_ownership         246318 non-null int64
id                     246318 non-null int64
initial_list_status    246318 non-null int64
loan_amnt              246318 non-null float64
member_id              246318 non-null int64
policy_code            246318 non-null float64
pub_rec                246318 non-null float64
purpose                246318 non-null int64
sub_grade              246318 non-null int64
term                

In [40]:
for clf_model in (lr_clf, dt_clf, rf_clf, kn_clf, nb_clf, vt_clf_model):
    y_pred = clf_model.predict(x_test)
    TestReport(clf_model.__class__.__name__, y_test, y_pred)
    

LogisticRegression 	 |--> Precision : 0.967 Recall : 0.854 F1-Score : 0.905 Accuracy-Score : 0.854
DecisionTreeClassifier 	 |--> Precision : 0.972 Recall : 0.516 F1-Score : 0.664 Accuracy-Score : 0.516
RandomForestClassifier 	 |--> Precision : 0.975 Recall : 0.594 F1-Score : 0.728 Accuracy-Score : 0.594
KNeighborsClassifier 	 |--> Precision : 0.965 Recall : 0.98 F1-Score : 0.972 Accuracy-Score : 0.98
GaussianNB 	 |--> Precision : 0.964 Recall : 0.98 F1-Score : 0.972 Accuracy-Score : 0.98
VotingClassifier 	 |--> Precision : 0.968 Recall : 0.958 F1-Score : 0.963 Accuracy-Score : 0.958
