# Read Data from feather

In [None]:
import feather

In [None]:
# Load the prepared loans table from its feather file on disk.
loans_path = './loans_modelling_data'
loans = feather.read_dataframe(loans_path)

In [None]:
# Display the (rows, columns) dimensions of the loaded frame.
loans.shape

# Data Preprocessing for Evaluation

## Splitting data into train and test

In [None]:
# Time-ordered hold-out: rows whose issue_d is below the 0.9 quantile of
# issue_d form the training partition; rows at or above it form the test one.
issue_cutoff = loans['issue_d'].quantile(0.9)
before_cutoff = loans['issue_d'] < issue_cutoff
from_cutoff_on = loans['issue_d'] >= issue_cutoff
loans_train = loans.loc[before_cutoff]
loans_test = loans.loc[from_cutoff_on]

In [None]:
# Sanity check: the two partitions together should account for every row.
n_partitioned = len(loans_train) + len(loans_test)
n_total = len(loans)
print('Number of loans in the partition:   ', n_partitioned)
print('Number of loans in the full dataset:', n_total)

In [None]:
# Fraction of all rows that ended up in the test partition.
len(loans_test) / len(loans)

In [None]:
# Drop the name bound to the full frame; only the two partitions are used below.
del loans

In [None]:

# issue_d was only needed to build the split, so remove it from both partitions.
loans_train = loans_train.drop(columns='issue_d')
loans_test = loans_test.drop(columns='issue_d')

In [None]:
# The prediction target is the charged_off column of each partition.
y_train, y_test = loans_train['charged_off'], loans_test['charged_off']

In [None]:
# Feature matrices: every remaining column except the charged_off target.
X_train = loans_train.drop(columns='charged_off')
X_test = loans_test.drop(columns='charged_off')

In [None]:
# The partitions have been split into X_* / y_*; drop the combined frames' names.
del loans_train, loans_test

## Modelling

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Two-step pipeline: impute missing values, then fit a random forest.
rfc_steps = [
    ('imputer', SimpleImputer(copy=False)),
    ('model', RandomForestClassifier(n_jobs=-1, random_state=1)),
]
pipeline_rfc = Pipeline(steps=rfc_steps)

In [None]:
# Search grid for the pipeline; keys address the 'model' step as model__<param>.
param_grid_rfc = dict(
    model__n_estimators=[50],  # The number of randomized trees to build
)

In [None]:
# 5-fold grid search over the pipeline, scored by ROC AUC and run serially
# (the forest inside the pipeline already uses n_jobs=-1).
grid_rfc = GridSearchCV(
    estimator=pipeline_rfc,
    param_grid=param_grid_rfc,
    scoring='roc_auc',
    cv=5,
    n_jobs=1,
    pre_dispatch=1,
    return_train_score=False,
    verbose=1,
)

In [None]:
# Run the cross-validated grid search on the training partition.
grid_rfc.fit(X_train, y_train)

In [None]:
# Display the best cross-validated score found by the search (scoring='roc_auc').
grid_rfc.best_score_

### ACCURACY, F1 SCORE AND ROC AUC FOR TRAIN DATA

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [None]:
# Fill missing values with per-column means learned from the training data
# only. The same training means are applied to the test set, so both sets are
# imputed consistently and no statistics computed from the test set are used.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)  # To remove error missing values
X_test = X_test.fillna(train_means)

In [None]:
# See the initial model performance.
# All three metrics are scored in a single cross-validation run, so each fold's
# forest is fitted once instead of once per metric. The estimator seed and the
# unshuffled stratified folds are the same as before, so the printed means are
# unchanged.
from sklearn.model_selection import cross_validate

clf = RandomForestClassifier(random_state=10)
cv_scores = cross_validate(clf, X_train, y_train,
                           cv=StratifiedKFold(n_splits=5),
                           scoring=['accuracy', 'f1', 'roc_auc'])
print('Acc:', cv_scores['test_accuracy'].mean())
print('F1:', cv_scores['test_f1'].mean())
print('ROC AUC:', cv_scores['test_roc_auc'].mean())

### Prediction on test data

In [None]:
from sklearn import model_selection
# random forest model creation
# The seed is fixed so the test-set metrics computed from this model are
# reproducible across runs, like the other seeded forests in this notebook.
rfc = RandomForestClassifier(random_state=10)
rfc.fit(X_train, y_train)
# predictions (hard class labels for the test partition)
rfc_predict = rfc.predict(X_test)

In [None]:
# Score the test set with the probability in column 1 of predict_proba
# (presumably the charged-off class; confirm against rfc.classes_).
test_proba = rfc.predict_proba(X_test)
y_score = test_proba[:, 1]
roc_auc_score(y_test, y_score)

#### Confusion Matrix for test data

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# 10-fold cross-validated ROC AUC of the forest on the training data.
rfc_cv_score = cross_val_score(
    estimator=rfc,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=10,
)

In [None]:
# Compute the test-set summaries first, then print the report section by section.
test_confusion = confusion_matrix(y_test, rfc_predict)
test_report = classification_report(y_test, rfc_predict)
mean_cv_auc = rfc_cv_score.mean()

print("=== Confusion Matrix ===")
print(test_confusion)
print('\n')

print("=== Classification Report ===")
print(test_report)
print('\n')

print("=== All AUC Scores ===")
print(rfc_cv_score)
print('\n')

print("=== Mean AUC Score ===")
print("Mean AUC Score - Random Forest: ", mean_cv_auc)

#### ROC Curve

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import roc_curve

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Column 1 of predict_proba is used as the score for the ROC curve on the test set.
test_class_proba = rfc.predict_proba(X_test)
y_pred_proba = test_class_proba[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

In [None]:
# Area under the ROC curve, computed from the (fpr, tpr) points.
roc_auc = metrics.auc(fpr, tpr)

In [None]:
# Draw the ROC curve (blue) with its AUC in the legend, plus the dashed red
# diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')

# Titles, axis labels and limits.
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

In [None]:
# Display the test-set ROC AUC computed directly from the scores.
roc_auc_score(y_test,y_pred_proba)

#### Precision Recall Curve

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
from sklearn.metrics import auc

In [None]:
# Precision/recall pairs for the test-set scores.
# NOTE: this rebinds `thresholds`, replacing the ROC thresholds assigned earlier.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

In [None]:
# Area under the precision-recall curve: recall on the x axis, precision on y.
auc_prc = auc(x=recall, y=precision)
print(auc_prc)

In [None]:
# Plot the test-set precision-recall curve with a no-skill reference line.
plt.figure(figsize=(10, 8))
# The no-skill precision is the positive-class rate of the test labels, not a
# fixed 0.5: a classifier that ignores the features achieves precision equal
# to the fraction of positives at every recall level.
# NOTE(review): assumes y_test is encoded as 0/1 (or boolean) -- confirm.
baseline = y_test.mean()
plt.plot([0, 1], [baseline, baseline], 'k--',
         label='No-skill baseline = %0.2f' % baseline)
plt.plot(recall, precision, 'b', label='Precision Recall Curve = %0.2f' % auc_prc)
plt.legend(loc='lower right')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PRC curve')
plt.show()