In [1]:
import pandas as pd
from src.transform import *
import seaborn as sns

from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,ParameterGrid, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score, auc, roc_curve
from xgboost import XGBClassifier
pd.options.display.float_format = "{:,.2f}".format


  from pandas import MultiIndex, Int64Index


In [2]:
df = pd.read_csv('loan.csv')


In [3]:

dates = ['issue_d','earliest_cr_line','last_pymnt_d']
default_columns = ['Charged Off', 'Default', 'Does not meet the credit policy. Status:Charged Off', 'Late (31-120 days)']
valid = default_columns + ['Fully Paid','Does not meet the credit policy. Status:Fully Paid']

#remove current loans without feedback
dfv = df[df.loan_status.isin(valid)].copy()

for d in dates:
    dfv[d] = pd.to_datetime(dfv[d],format='%b-%Y')




In [4]:
dfv = transform_categorical(dfv)
dfv = transform_dates(dfv)
dfv = transform_numerical(dfv)
dfv = transform_target(dfv,default_columns)

dfv.head()

Unnamed: 0,term,int_rate,installment,grade,sub_grade,home_ownership,annual_inc,issue_d,loan_status,dti,earliest_cr_line,recoveries,last_pymnt_d,application_type,expected_payment_dt,relationship_months,low_inc,VIP,target
100,36,22.35,1151.16,4,D5,MORTGAGE,100000.0,2018-12-01,Fully Paid,30.46,2012-01-01,0.0,2019-01-01,Joint App,2021-12-01,83.0,0,0,0
152,60,16.14,975.71,3,C4,MORTGAGE,45000.0,2018-12-01,Fully Paid,50.53,2009-06-01,0.0,2019-02-01,Joint App,2023-12-01,114.0,0,0,0
170,36,7.56,622.68,1,A3,MORTGAGE,100000.0,2018-12-01,Fully Paid,18.92,1999-02-01,0.0,2019-02-01,Joint App,2021-12-01,238.0,0,0,0
186,36,11.31,147.99,2,B3,RENT,38500.0,2018-12-01,Fully Paid,4.64,2003-12-01,0.0,2019-02-01,Individual,2021-12-01,180.0,0,0,0
215,36,27.27,345.18,5,E5,MORTGAGE,450000.0,2018-12-01,Fully Paid,12.37,1997-10-01,0.0,2019-02-01,Joint App,2021-12-01,254.0,0,0,0


In [5]:
# we have 284k data with label 1, lets sample the same 

In [6]:
# check pipeline ->
dfv.drop(dates+['recoveries', 'expected_payment_dt','sub_grade','loan_status'], axis =1, inplace = True)
dfv.fillna({'relationship_months':0,'annual_inc':0}, inplace = True)
dfv.dtypes

term                     int64
int_rate               float64
installment            float64
grade                    int64
home_ownership          object
annual_inc             float64
dti                    float64
application_type        object
relationship_months    float64
low_inc                  int64
VIP                      int64
target                   int64
dtype: object

In [7]:
non_default = dfv[dfv.target == 0].sample(n=dfv.target.sum(), random_state= 27, axis = 0)
default = dfv[dfv.target == 1]
training_data =  pd.concat([default,non_default])

In [8]:
y = training_data['target']
X = training_data.drop(['target'],1)

  X = training_data.drop(['target'],1)


In [9]:
num = X.select_dtypes('float64').columns
text = X.select_dtypes('object').columns
int = X.select_dtypes('int64').columns



In [29]:
numerical_pieline = Pipeline( steps = [

    ('imput', SimpleImputer(missing_values=np.NaN, strategy='median')),
    ('scaler', StandardScaler() ),
    ])

categorical_pipeline = Pipeline( steps =[
    ('imput', SimpleImputer(missing_values=np.NaN, strategy="most_frequent")),
    ('ohc', OneHotEncoder())
])

int_pipeline = Pipeline( steps =[
    ('imput', SimpleImputer(missing_values=np.NaN, strategy="most_frequent"))
    ])


preprocessor = ColumnTransformer(transformers = [
    ('categorical', categorical_pipeline, text),
    ('num', numerical_pieline,  num),
    ('int', int_pipeline, int)
])

model_pipeline_log = Pipeline(steps = [
    ('prepoc', preprocessor),
    ('clf', LogisticRegression(solver='liblinear'))
    

])


model_pipeline_xgb = Pipeline(steps = [
    ('prepoc', preprocessor),
    ('clf', XGBClassifier(use_label_encoder=False, objective ='binary:logistic', eval_metric = "logloss"))
    

])

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y,                                    
                                     test_size=0.2,
                                     random_state=0)  


In [31]:
model_pipeline_log.fit(X_train,y_train)
y_pred_log = model_pipeline_log.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_log)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_log)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_log)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_log)))

#Dummy Classifier Confusion matrix
from sklearn.metrics import confusion_matrix
print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred_log)))
fpr, tpr, thresholds = roc_curve(y_test, y_pred_log)
auc(fpr, tpr)


Accuracy Score : 0.6452636761680353
Precision Score : 0.652754964909661
Recall Score : 0.616626994622234
F1 Score : 0.6341768593992257
Confusion Matrix : 
[[38419 18604]
 [21743 34972]]


0.6451863380946605

In [32]:
model_pipeline_xgb.fit(X_train,y_train)
y_pred_xgb = model_pipeline_xgb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_xgb)))
print('Precision Score : ' + str(precision_score(y_test,y_pred_xgb)))
print('Recall Score : ' + str(recall_score(y_test,y_pred_xgb)))
print('F1 Score : ' + str(f1_score(y_test,y_pred_xgb)))

#Dummy Classifier Confusion matrix
from sklearn.metrics import confusion_matrix
print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred_xgb)))
fpr_x, tpr_x, thresholds_x = roc_curve(y_test, y_pred_xgb)
auc(fpr_x, tpr_x)

Accuracy Score : 0.6572033972814715
Precision Score : 0.6491233973819699
Recall Score : 0.6802433218725205
F1 Score : 0.6643191073534855
Confusion Matrix : 
[[36169 20854]
 [18135 38580]]


0.657265620391217

In [37]:
PD_xgb = model_pipeline_xgb.predict_proba(X_test)[:,1]
PD_log = model_pipeline_log.predict_proba(X_test)[:,1]

In [47]:
PD = pd.concat([pd.Series(y_test.values),pd.Series(PD_xgb), pd.Series(PD_log)], axis = 1)

In [48]:
PD

Unnamed: 0,0,1,2
0,0,0.39,0.45
1,0,0.50,0.54
2,0,0.64,0.55
3,0,0.18,0.30
4,1,0.32,0.24
...,...,...,...
113733,0,0.27,0.34
113734,1,0.84,0.84
113735,0,0.10,0.23
113736,1,0.40,0.40


In [None]:
#Choose XGBoost

In [49]:
# #Not Successfull :(

# grid_clf_acc = GridSearchCV(model_pipeline,param_grid = param_grid,scoring = 'precision')
# grid_clf_acc.fit(X_train, y_train)

# #Predict values based on new parameters
# y_pred_acc = grid_clf_acc.predict(X_test)

# # New Model Evaluation metrics 
# print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred_acc)))
# print('Precision Score : ' + str(precision_score(y_test,y_pred_acc)))
# print('Recall Score : ' + str(recall_score(y_test,y_pred_acc)))
# print('F1 Score : ' + str(f1_score(y_test,y_pred_acc)))

# #Logistic Regression (Grid Search) Confusion matrix
# confusion_matrix(y_test,y_pred_acc)

# #Training not successfull

In [None]:
#https://medium.com/ml-byte-size/how-does-decision-tree-output-predict-proba-12c78634c9d5