In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression
import random
import math
from statistics import mean, stdev
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load and show the data
data = pd.read_csv("Bank_Credit_Cards.csv", low_memory = False)
data = data.drop(columns = ['CARD-DELIVERY','INCREASE-LIMIT-FLAG','INCREASE-LIMIT-DATE','NO-OF-CARDS','WRITE-OFF-DATE',
                            'FREE-FEE-FLAG','CYCLE-CODE',' 4Y-AMT-CASH ','AT-ANNIV-DATE'])
data.shape

(11329, 23)

## DATA CLEANING

In [3]:
# Remove comma and blankspace
data["AUTO-PAY"] = data[" OUT-BANK-ACCOUNT "].str.replace(" ","")
data["AUTO-PAY"] = data["AUTO-PAY"].str.replace(",","")

# Autopay = 1; NO autopay = 0
data["AUTO-PAY"] = data[["AUTO-PAY"]].applymap(lambda x: 1 if len(x) > 9 else 0)

data["AUTO-PAY"].astype("category")
np.unique(data["AUTO-PAY"], return_counts = True)

(array([0, 1], dtype=int64), array([6745, 4584], dtype=int64))

In [4]:
data["OCCUPATION-CODE"] = data[["OCCUPATION-CODE"]].applymap(lambda x: 0 if x == 0 else 1)
data["OCCUPATION-CODE"].astype("category")
np.unique(data["OCCUPATION-CODE"], return_counts = True)

(array([0, 1], dtype=int64), array([11226,   103], dtype=int64))

In [5]:
# If Man, then equals 1; If Women, then equals 0
data["SEX"] = data[["SEX"]].applymap(lambda x: 0 if x=="F" else 1)

data["SEX"].astype("category")
np.unique(data["SEX"], return_counts = True)

(array([0, 1], dtype=int64), array([3578, 7751], dtype=int64))

In [6]:
# If no Black List, equals 0; Blacklisted, equals 1
data["BLACK-LIST"] = data[["BLACK-LIST-CODE"]].applymap(lambda x: 0 if x==0 else 1)

data["BLACK-LIST"].astype("category")
np.unique(data["BLACK-LIST"], return_counts = True)

(array([0, 1], dtype=int64), array([10758,   571], dtype=int64))

In [7]:
# Converting Date data from integer to datetime 
# Randomly assigning a birthdate to those invalid value
data["B-DATE"] = data[["BIRTH-DATE"]].applymap(lambda x: 19000101 if x < 19100000 or x > 19820000 else x)
data["B-DATE"] = pd.to_datetime(data["B-DATE"], format='%Y%m%d')

# Computing Age and assign random age to those does not have valid value
from datetime import date
  
def calculateAge(birthDate):
    today = date.today()
    if birthDate.year != 1900: 
        # data was 20 years ago, age is thus deducted by 20
        age = today.year - birthDate.year - ((today.month, today.day) < (birthDate.month, birthDate.day)) - 20
    else: age = random.randint(18, 80)
    return age

data["AGE"] = data[["B-DATE"]].applymap(lambda x: calculateAge(x))
np.unique(data["AGE"], return_counts = True)

(array([19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
        53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
        70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82], dtype=int64),
 array([  2,   9,  21,  38,  63,  90, 108, 151, 176, 194, 245, 265, 283,
        287, 346, 406, 367, 418, 388, 409, 404, 389, 408, 411, 418, 427,
        387, 370, 335, 327, 289, 262, 275, 260, 239, 254, 263, 196, 197,
        110, 112, 130, 113, 118,  86,  67,  48,  46,  27,  20,  26,  19,
          6,   4,   4,   1,   2,   3,   3,   3,   2,   1,   1], dtype=int64))

In [8]:
# Normalizing age values
min_age = min(data["AGE"])
max_age = max(data["AGE"])
data["SCALED-AGE"] = data[["AGE"]].applymap(lambda x: (x-min_age)/(max_age-min_age))
np.unique(data["SCALED-AGE"], return_counts = True)

(array([0.        , 0.01587302, 0.03174603, 0.04761905, 0.06349206,
        0.07936508, 0.0952381 , 0.11111111, 0.12698413, 0.14285714,
        0.15873016, 0.17460317, 0.19047619, 0.20634921, 0.22222222,
        0.23809524, 0.25396825, 0.26984127, 0.28571429, 0.3015873 ,
        0.31746032, 0.33333333, 0.34920635, 0.36507937, 0.38095238,
        0.3968254 , 0.41269841, 0.42857143, 0.44444444, 0.46031746,
        0.47619048, 0.49206349, 0.50793651, 0.52380952, 0.53968254,
        0.55555556, 0.57142857, 0.58730159, 0.6031746 , 0.61904762,
        0.63492063, 0.65079365, 0.66666667, 0.68253968, 0.6984127 ,
        0.71428571, 0.73015873, 0.74603175, 0.76190476, 0.77777778,
        0.79365079, 0.80952381, 0.82539683, 0.84126984, 0.85714286,
        0.87301587, 0.88888889, 0.9047619 , 0.92063492, 0.93650794,
        0.95238095, 0.96825397, 1.        ]),
 array([  2,   9,  21,  38,  63,  90, 108, 151, 176, 194, 245, 265, 283,
        287, 346, 406, 367, 418, 388, 409, 404, 389, 408, 411, 41

In [9]:
data["BRANCH-CODE"].astype("category")
np.unique(data["BRANCH-CODE"], return_counts = True)

(array([ 56,  65,  66,  67,  68,  69, 165, 166, 167, 168, 169, 265, 266,
        267, 268, 269, 365, 366, 367, 368, 369, 465, 466, 467, 468, 469,
        565, 566, 567, 568, 569, 665, 667, 668, 669, 756, 765, 766, 767,
        768, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876],
       dtype=int64),
 array([  1, 260, 290, 139,   2, 263, 196, 132, 151, 490, 285, 358, 189,
        157, 203, 446, 464, 146, 120, 133,  74, 157, 256, 194,  87, 157,
        110, 604, 203, 494, 592, 328, 306, 105, 182,   1, 169,  99,  92,
        175, 329, 194, 257, 277, 220, 124, 258, 212, 174, 126, 116, 232],
       dtype=int64))

In [10]:
# cleaning SPENDING LIMIT and converting to integer
data["SPENDING-LIMIT"] = data[" SPENDING-LIMIT "].str.replace(",","")
data["SPENDING-LIMIT"] = data["SPENDING-LIMIT"].str.replace(" ","")
data["SPENDING-LIMIT"] = data["SPENDING-LIMIT"].str.replace("-","0")
data["SPENDING-LIMIT"] = data["SPENDING-LIMIT"].astype(str).astype(int)
np.unique(data["SPENDING-LIMIT"], return_counts = True)

(array([      0,       1,  100000,  400000,  480000,  500000,  530000,
         550000,  600000,  650000,  680000,  700000,  720000,  750000,
         780000,  800000,  850000,  880000,  900000,  940000,  950000,
        1000000]),
 array([    6,     1,     1,     1,     1,   489,     1,     2,    82,
            6,     3,    94,     1,    10,     1,    72,    22,     1,
           35,     2,    17, 10481], dtype=int64))

In [11]:
# cleaning CURRENT BALANCE and converting to integer
data["CURRENT-BALANCE"] = data[" CURRENT-BALANCE "].str.strip(" ")
data["CURRENT-BALANCE"] = data["CURRENT-BALANCE"].apply(lambda x: 0 if len(x) > 9 else x)
data["CURRENT-BALANCE"] = data["CURRENT-BALANCE"].str.replace("-","0")
data["CURRENT-BALANCE"] = data["CURRENT-BALANCE"].astype(str)
data["CURRENT-BALANCE"] = data["CURRENT-BALANCE"].str.replace(",","")
data["CURRENT-BALANCE"] = data["CURRENT-BALANCE"].apply(lambda x: 0 if x == "nan" else x)
data["CURRENT-BALANCE"] = data["CURRENT-BALANCE"].astype(str).astype(int)
np.unique(data["CURRENT-BALANCE"])

array([      0,      11,      25, ..., 2019973, 2041930, 2082069])

In [12]:
# cleaning ONE YEAR CASH AMOUNT and converting to integer
data["ONE-YEAR-CASH"] = data[' OY-AMT-CASH '].str.strip(" ")
data["ONE-YEAR-CASH"] = data["ONE-YEAR-CASH"].str.replace("-","0")
data["ONE-YEAR-CASH"] = data["ONE-YEAR-CASH"].str.replace(",","")
data["ONE-YEAR-CASH"] = data["ONE-YEAR-CASH"].astype(str).astype(int)
np.unique(data["ONE-YEAR-CASH"], return_counts = True)

(array([      0,  100000,  200000,  320000,  330000,  498000,  500000,
         535000,  600000,  700000,  800000,  850000,  900000,  930000,
         950000,  969100,  970000,  998000, 1000000, 1020000, 1580000]),
 array([10880,     1,     1,     1,     1,     1,   160,     1,     8,
            7,     4,     1,     3,     1,     2,     1,     1,     1,
          252,     1,     1], dtype=int64))

In [13]:
# cleaning PRESENT YEAR CASH AMOUNT and converting to integer
data["PRESENT-YEAR-CASH"] = data[' PY-AMT-CASH '].str.strip(" ")
data["PRESENT-YEAR-CASH"] = data["PRESENT-YEAR-CASH"].str.replace("-","0")
data["PRESENT-YEAR-CASH"] = data["PRESENT-YEAR-CASH"].str.replace(",","")
data["PRESENT-YEAR-CASH"] = data["PRESENT-YEAR-CASH"].astype(str).astype(int)
np.unique(data["PRESENT-YEAR-CASH"], return_counts = True)

(array([      0,   83002,  100000,  115000,  120000,  125000,  130000,
         140000,  150000,  160000,  170000,  180000,  200000,  205000,
         220000,  230000,  250000,  298000,  300000,  330000,  330900,
         350000,  360000,  370000,  400000,  450000,  460000,  471000,
         480000,  485000,  490000,  492000,  493933,  495000,  496000,
         497000,  499000,  500000,  520000,  530900,  540000,  550000,
         565000,  590000,  593933,  596000,  597000,  600000,  605000,
         610000,  617000,  640000,  650000,  665000,  670000,  680000,
         685000,  700000,  720000,  750000,  780000,  790000,  795000,
         800000,  845000,  847092,  850000,  860000,  880000,  890000,
         900000,  917000,  920000,  930000,  930900,  931000,  940000,
         950000,  960000,  969000,  969100,  970000,  970900,  975000,
         976000,  978000,  980000,  980900,  981000,  985000,  988000,
         989000,  990000,  990900,  991000,  994751,  995000,  995500,
      

In [14]:
data["ACCT-STATUS"] = data["ACCOUNT-STATUS"].str.replace("0S","75")
data["ACCT-STATUS"] = data["ACCT-STATUS"].str.replace("NE","85")
data["ACCT-STATUS"] = data["ACCT-STATUS"].str.replace("ME","95")
data["ACCT-STATUS"] = data["ACCT-STATUS"].astype(int)
np.unique(data["ACCT-STATUS"], return_counts = True)

(array([ 0, 12, 21, 33, 44, 55, 65, 75, 85, 95]),
 array([1067, 1842, 1407,  676, 1299,  938,  712,    2, 3385,    1],
       dtype=int64))

In [15]:
for i in range(len(data["BLACK-LIST-DATE"])):
    if data["BLACK-LIST-DATE"][i] != 0 and data["BLACK-LIST-DATE"][i] > 1:
        data["BLACK-LIST-DATE"][i] = 20031231 - data["BLACK-LIST-DATE"][i]
        pass
max_num = max(data["BLACK-LIST-DATE"])
data["BLACK-LIST-DATE"] = data[["BLACK-LIST-DATE"]].applymap(lambda x: x /(max_num))
np.unique(data["BLACK-LIST-DATE"], return_counts = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["BLACK-LIST-DATE"][i] = 20031231 - data["BLACK-LIST-DATE"][i]


(array([0.        , 0.50629346, 0.50631839, 0.50639316, 0.50641809,
        0.50644301, 0.50649286, 0.50659256, 0.50661748, 0.50664241,
        0.50666733, 0.5084619 , 0.50848683, 0.50851175, 0.5086613 ,
        0.50871115, 0.50873607, 0.50881085, 0.50888562, 0.50898532,
        0.50901024, 0.50903517, 0.50908502, 0.50915979, 0.50920964,
        0.51095436, 0.51105406, 0.51107899, 0.51110391, 0.51112884,
        0.51115376, 0.51125346, 0.51127838, 0.51130331, 0.51132823,
        0.51140301, 0.51142793, 0.51145286, 0.51147778, 0.51157748,
        0.5116024 , 0.51165225, 0.51349667, 0.5135216 , 0.51354652,
        0.51357145, 0.51359637, 0.51367114, 0.51369607, 0.51372099,
        0.51384562, 0.51387054, 0.51389547, 0.51394532, 0.51402009,
        0.51404501, 0.51406994, 0.51419456, 0.51593928, 0.51596421,
        0.51598913, 0.51601406, 0.51608883, 0.51611376, 0.51613868,
        0.51616361, 0.51633808, 0.516363  , 0.51643778, 0.5164627 ,
        0.51651255, 0.51661225, 0.51663717, 0.51

### Independent Variables

In [16]:
# BUCKET: 1 = GOOD customer; 0 = BAD customer
data["ZERO-ONE-BUCKET"] = data[["BUCKET"]].applymap(lambda x: 1 if x == 1 or x == 0 else 0)
data["ZERO-ONE-BUCKET"].astype("category")
np.unique(data["ZERO-ONE-BUCKET"], return_counts = True)

(array([0, 1], dtype=int64), array([ 1206, 10123], dtype=int64))

In [17]:
data["ALL-MAX-BUCKET"] = data[["ALL-MAX-BUCKET"]].applymap(lambda x: -x)
np.unique(data["ALL-MAX-BUCKET"], return_counts = True)

(array([-7, -6, -5, -4, -3, -2,  0], dtype=int64),
 array([ 306,   57,  120,  267,  807, 1155, 8617], dtype=int64))

In [18]:
data = data.drop(columns = [' ACC-AQCCOUNT-NO ', 'MEMBER-SINCE',
                            ' AVG-PAYMENTS ',' AVG-BALANCES ','LAST-TRX-DATE',
                            ' OUT-BANK-ACCOUNT ',' OY-AMT-CASH ', ' PY-AMT-CASH ', 'AGE',
                            ' YTD-AMT-CASH ','PY-MAX-BUCKET',' CURRENT-BALANCE ',
                            'ACCOUNT-STATUS','BIRTH-DATE','B-DATE','BLACK-LIST-CODE',
                            'ATTRITION-REASON', ' SPENDING-LIMIT '])

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11329 entries, 0 to 11328
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SEX                11329 non-null  int64  
 1   OCCUPATION-CODE    11329 non-null  int64  
 2   BLACK-LIST-DATE    11329 non-null  float64
 3   BUCKET             11329 non-null  int64  
 4   YTD-MAX-BUCKET     11329 non-null  int64  
 5   ALL-MAX-BUCKET     11329 non-null  int64  
 6   BRANCH-CODE        11329 non-null  int64  
 7   AUTO-PAY           11329 non-null  int64  
 8   BLACK-LIST         11329 non-null  int64  
 9   SCALED-AGE         11329 non-null  float64
 10  SPENDING-LIMIT     11329 non-null  int32  
 11  CURRENT-BALANCE    11329 non-null  int32  
 12  ONE-YEAR-CASH      11329 non-null  int32  
 13  PRESENT-YEAR-CASH  11329 non-null  int32  
 14  ACCT-STATUS        11329 non-null  int32  
 15  ZERO-ONE-BUCKET    11329 non-null  int64  
dtypes: float64(2), int32(5

### Set up for modelling

In [20]:
X_APP = data[["AUTO-PAY", "OCCUPATION-CODE", "SCALED-AGE", "BLACK-LIST",
              "SEX", "BRANCH-CODE"]].values
X_BS = data[["YTD-MAX-BUCKET", "ONE-YEAR-CASH","SPENDING-LIMIT","PRESENT-YEAR-CASH",
             "BLACK-LIST-DATE", "CURRENT-BALANCE", "ACCT-STATUS"]].values

In [21]:
y_01 = data["ZERO-ONE-BUCKET"].values
y_BP = data["ALL-MAX-BUCKET"].values

## APPLICATION SCORING - ZERO ONE CLASSIFICATION

### Logistic Regression

In [22]:
accuracy = []
specificity = []
sensitivity = []
pnl = []
profit = []

sss = StratifiedShuffleSplit(n_splits = 10, test_size=0.3, random_state=42)
sss.get_n_splits(X_APP, y_01)

for train_index, test_index in sss.split(X_APP, y_01):
    """ Splitting data into 10 splits and applying Logistic Regression """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_01[train_index], y_01[test_index]
    lr = LogisticRegression(max_iter = 50).fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    pnl = []
    
    for i in range(len(y_pred)):
        """ To calculate average profit / loss             
            1 = Good Customer; 0 = Bad Customer"""
        if y_pred[i] == 1 and y_test[i] == 1:
            pnl.append(0.02),
        elif y_pred[i] == 1 and y_test[i] == 0:
            pnl.append(-0.45),
        pass
        
    """ To evaluate the model """
    profit.append(mean(pnl))
    accuracy.append(accuracy_score(y_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity.append(tn / (tn+fp))
    sensitivity.append(recall_score(y_test, y_pred))
    
ans = []
for i in range(len(sensitivity)):
    ans.append(math.sqrt(specificity[i]*sensitivity[i]))


print(classification_report(y_pred, y_test))
print("Accuracy Score: {:.4f}%".format(mean(accuracy)*100)),
print("Square root of Sensitivity and Specificity: {:.4f}%".format(mean(ans)*100)),
print("Average P/L: {:.4f}%".format(mean(profit)*100))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.89      0.94      3399

    accuracy                           0.89      3399
   macro avg       0.50      0.45      0.47      3399
weighted avg       1.00      0.89      0.94      3399

Accuracy Score: 89.3498%
Square root of Sensitivity and Specificity: 0.0000%
Average P/L: -3.0056%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree Classifier

In [23]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion = "entropy", max_depth = 20)

sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 42)
sss.get_n_splits(X_APP, y_01)

accuracy = []
specificity = []
sensitivity = []
pnl = []
profit = []

for train_index, test_index in sss.split(X_APP, y_01):
    """ Splitting data and loading Tree Classifier """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_01[train_index], y_01[test_index]
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    pnl = []
    
    for i in range(len(y_pred)):
        """ assigning profit or loss values to predictions """
        if y_pred[i] == 1 and y_test[i] == 1:
            pnl.append(0.02),
        elif y_pred[i] == 1 and y_test[i] == 0:
            pnl.append(-0.45),
        pass
    
    """ evaluating model """
    profit.append(mean(pnl))
    accuracy.append(accuracy_score(y_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity.append(tn / (tn+fp))
    sensitivity.append(recall_score(y_test, y_pred))
    
ans = []
for i in range(len(sensitivity)):
    ans.append(math.sqrt(specificity[i]*sensitivity[i]))

    
print(classification_report(y_pred, y_test))
print("Accuracy Score: {:.4f}%".format(mean(accuracy)*100)),
print("Square root of Sensitivity and Specificity: {:.4f}%".format(mean(ans)*100)),
print("Average P/L: {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

           0       0.07      0.09      0.08       297
           1       0.91      0.89      0.90      3102

    accuracy                           0.82      3399
   macro avg       0.49      0.49      0.49      3399
weighted avg       0.84      0.82      0.83      3399

Accuracy Score: 82.7626%
Square root of Sensitivity and Specificity: 27.0252%
Average P/L: -3.0211%


### Random Forest Classifier

In [24]:
from sklearn.ensemble import RandomForestClassifier

accuracy = []
specificity = []
sensitivity = []
pnl = []
profit = []

rf = RandomForestClassifier(n_estimators= 50, max_depth = 50)

sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 42)
sss.get_n_splits(X_APP, y_01)

for train_index, test_index in sss.split(X_APP, y_01):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_01[train_index], y_01[test_index]
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    pnl = []
    
    for i in range(len(y_pred)):
        """ loading profit or loss values to predictions """
        if y_pred[i] == 1 and y_test[i] == 1:
            pnl.append(0.02),
        elif y_pred[i] == 1 and y_test[i] == 0:
            pnl.append(-0.45),
        pass
    
    """ evaluating model """
    profit.append(mean(pnl))
    accuracy.append(accuracy_score(y_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity.append(tn / (tn+fp))
    sensitivity.append(recall_score(y_test, y_pred))

ans = []
for i in range(len(sensitivity)):
    ans.append(math.sqrt(specificity[i]*sensitivity[i]))


print(classification_report(y_pred, y_test))
print("Accuracy Score: {:.4f}%".format(mean(accuracy)*100)),
print("Square root of Sensitivity and Specificity: {:.4f}%".format(mean(ans)*100)),
print("Average P/L: {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

           0       0.03      0.08      0.04       132
           1       0.96      0.89      0.93      3267

    accuracy                           0.86      3399
   macro avg       0.50      0.49      0.48      3399
weighted avg       0.92      0.86      0.89      3399

Accuracy Score: 86.1548%
Square root of Sensitivity and Specificity: 18.9725%
Average P/L: -3.0169%


## APPLICATION SCORING - BUCKET PREDICTION

### Logistic Regression

In [25]:
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=4242)
sss.get_n_splits(X_APP, y_BP)

specificity = []
sensitivity = []
pnl = []
profit = []

for train_index, test_index in sss.split(X_APP, y_BP):
    """ Splitting data into 10 splits and applying Linear Regression """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    lr = LogisticRegression(C = 0.5, max_iter = 500, solver = "saga").fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    pred_proba = lr.predict_proba(X_test)
    recall = recall_score(y_pred, y_test, average = "weighted")
    
    """ Calculating sensitivity """
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >= -1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    """ Calculating Specificity"""
    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] < -1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pnl = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pnl.append(y_test[i])
        pass
    
    profit.append(sum(pnl)/len(pnl))         
    
print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L is {:.4f}".format(mean(profit)))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -7       0.00      0.00      0.00         0
          -6       0.00      0.00      0.00         0
          -5       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -3       0.00      0.00      0.00         0
          -2       0.00      0.00      0.00         0
           0       1.00      0.76      0.86      3399

    accuracy                           0.76      3399
   macro avg       0.14      0.11      0.12      3399
weighted avg       1.00      0.76      0.86      3399

The square root of Specificity and Sensitivity is 0.0000%
The average P/L is -0.7843


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree Classifier

In [26]:
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 4242)
sss.get_n_splits(X_APP, y_BP)

from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion = "entropy", max_depth = 25)

specificity = []
sensitivity = []
pnl = []
profit = []

for train_index, test_index in sss.split(X_APP, y_BP):
    """ Splitting data and loading Tree Classifier """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
        
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pnl = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pnl.append(y_test[i])
            pass
        
    profit.append(sum(pnl)/len(pnl))

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L is {:.4f}".format(mean(profit)))

              precision    recall  f1-score   support

          -7       0.16      0.16      0.16        96
          -6       0.00      0.00      0.00        22
          -5       0.00      0.00      0.00        34
          -4       0.03      0.03      0.03        71
          -3       0.06      0.06      0.06       246
          -2       0.10      0.12      0.10       282
           0       0.78      0.77      0.78      2648

    accuracy                           0.62      3399
   macro avg       0.16      0.16      0.16      3399
weighted avg       0.63      0.62      0.62      3399

The square root of Specificity and Sensitivity is 43.7086%
The average P/L is -0.7517


### Random Forest Classifier

In [47]:
sss_BP = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 4242)
sss_BP.get_n_splits(X_APP, y_BP)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100, max_depth= 100)

specificity = []
sensitivity = []
pnl = []
profit = []

for train_index, test_index in sss_BP.split(X_APP, y_BP):
    """ Splitting data and loading Random Forest Regressor """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pnl = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pnl.append(y_test[i])
            pass
        
    profit.append(sum(pnl)/len(pnl))

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L is {:.4f}".format(mean(profit)))

              precision    recall  f1-score   support

          -7       0.11      0.19      0.14        52
          -6       0.00      0.00      0.00         7
          -5       0.00      0.00      0.00         8
          -4       0.01      0.05      0.02        21
          -3       0.01      0.03      0.02       115
          -2       0.06      0.11      0.08       177
           0       0.89      0.76      0.82      3019

    accuracy                           0.69      3399
   macro avg       0.15      0.16      0.15      3399
weighted avg       0.80      0.69      0.74      3399

The square root of Specificity and Sensitivity is 32.4066%
The average P/L is -0.7649


## APPLICATION SCORING - P/L PREDICTION

### Logistic Regression

In [28]:
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=4242)
sss.get_n_splits(X_APP, y_BP)

specificity = []
sensitivity = []
pnl = []
profit = []

for train_index, test_index in sss.split(X_APP, y_BP):
    """ Splitting data into 10 splits and applying Linear Regression """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    lr = LogisticRegression(C = 0.01, max_iter = 500, solver = "saga").fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    pred_proba = lr.predict_proba(X_test)
    recall = recall_score(y_pred, y_test, average = "weighted")
    
    """ Calculating sensitivity """
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >= -1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    """ Calculating Specificity"""
    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] < -1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pred_bucket = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pred_bucket.append(y_test[i])
            pass
    
    pnl = []
    for i in range(len(pred_bucket)):
        if pred_bucket[i] == -7:
            pnl.append(-0.65),
        elif pred_bucket[i] == -6:
            pnl.append(-0.45),
        elif pred_bucket[i] == -5:
            pnl.append(-0.35),
        elif pred_bucket[i] == -4:
            pnl.append(-0.25),
        elif pred_bucket[i] == -3:
            pnl.append(-0.15),
        elif pred_bucket[i] == -2:
            pnl.append(0),
        elif pred_bucket[i] == -1:
            pnl.append(0.01),
        elif pred_bucket[i] == 0:
            pnl.append(0.03)
        pass
    profit.append(mean(pnl)/len(y_test)) 

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}%".format(mean(profit)*100))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -7       0.00      0.00      0.00         0
          -6       0.00      0.00      0.00         0
          -5       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -3       0.00      0.00      0.00         0
          -2       0.00      0.00      0.00         0
           0       1.00      0.76      0.86      3399

    accuracy                           0.76      3399
   macro avg       0.14      0.11      0.12      3399
weighted avg       1.00      0.76      0.86      3399

The square root of Specificity and Sensitivity is 0.0000%
The average P/L of this model is -0.0005%


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree Classifier

In [29]:
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 4242)
sss.get_n_splits(X_APP, y_BP)

from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion = "entropy", max_depth = 25)

specificity = []
sensitivity = []
pnl = []
profit = []

for train_index, test_index in sss.split(X_APP, y_BP):
    """ Splitting data and loading Tree Classifier """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
        
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    
    """ Calculating average profit / loss """
    pred_bucket = []
    
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pred_bucket.append(y_test[i])
            pass
    
    pnl = []
    for i in range(len(pred_bucket)):
        if pred_bucket[i] == -7:
            pnl.append(-0.65),
        elif pred_bucket[i] == -6:
            pnl.append(-0.45),
        elif pred_bucket[i] == -5:
            pnl.append(-0.35),
        elif pred_bucket[i] == -4:
            pnl.append(-0.25),
        elif pred_bucket[i] == -3:
            pnl.append(-0.15),
        elif pred_bucket[i] == -2:
            pnl.append(0),
        elif pred_bucket[i] == -1:
            pnl.append(0.01),
        elif pred_bucket[i] == 0:
            pnl.append(0.03)
        pass
    profit.append(mean(pnl)/len(y_test)) 

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

          -7       0.17      0.17      0.17        92
          -6       0.00      0.00      0.00        21
          -5       0.00      0.00      0.00        34
          -4       0.03      0.03      0.03        76
          -3       0.06      0.06      0.06       247
          -2       0.10      0.12      0.10       283
           0       0.78      0.77      0.78      2646

    accuracy                           0.62      3399
   macro avg       0.16      0.16      0.16      3399
weighted avg       0.63      0.62      0.62      3399

The square root of Specificity and Sensitivity is 43.7906%
The average P/L of this model is -0.0004%


### Random Forest Classifier

In [51]:
sss_BP = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 4242)
sss_BP.get_n_splits(X_APP, y_BP)

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 10, max_depth= 30)

specificity = []
sensitivity = []
pnl = []
profit = []

for train_index, test_index in sss_BP.split(X_APP, y_BP):
    """ Splitting data and loading Random Forest Regressor """
    X_train, X_test = X_APP[train_index], X_APP[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pred_bucket = []
    
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pred_bucket.append(y_test[i])
            pass
    
    pnl = []
    for i in range(len(pred_bucket)):
        if pred_bucket[i] == -7:
            pnl.append(-0.65),
        elif pred_bucket[i] == -6:
            pnl.append(-0.45),
        elif pred_bucket[i] == -5:
            pnl.append(-0.35),
        elif pred_bucket[i] == -4:
            pnl.append(-0.25),
        elif pred_bucket[i] == -3:
            pnl.append(-0.15),
        elif pred_bucket[i] == -2:
            pnl.append(0),
        elif pred_bucket[i] == -1:
            pnl.append(0.01),
        elif pred_bucket[i] == 0:
            pnl.append(0.03)
        pass
    profit.append(mean(pnl)/len(y_test)) 

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

          -7       0.14      0.19      0.16        68
          -6       0.00      0.00      0.00        19
          -5       0.00      0.00      0.00        14
          -4       0.01      0.04      0.02        23
          -3       0.02      0.04      0.03       156
          -2       0.07      0.12      0.09       196
           0       0.86      0.76      0.81      2923

    accuracy                           0.67      3399
   macro avg       0.16      0.17      0.16      3399
weighted avg       0.75      0.67      0.71      3399

The square root of Specificity and Sensitivity is 35.5506%
The average P/L of this model is -0.0004%


## BEAHVIOUR SCORING - ZERO ONE CLASSIFICATION

### Logistic Regerssion

In [31]:
accuracy = []
specificity = []
sensitivity = []
pnl = []
profit = []

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=4242)
sss.get_n_splits(X_BS, y_01)

for train_index, test_index in sss.split(X_BS, y_01):
    """ Splitting data into 10 splits and bslying Logistic Regression """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_01[train_index], y_01[test_index]
    lr= LogisticRegression(C = 0.1, max_iter = 300).fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    pnl = []
    
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 1 and y_test[i] == 1:
            pnl.append(0.02),
        elif y_pred[i] == 1 and y_test[i] == 0:
            pnl.append(-0.45),
        pass
    
    """ evaluating the model """
    profit.append(mean(pnl))
    accuracy.append(accuracy_score(y_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity.append(tn / (tn+fp))
    sensitivity.append(recall_score(y_test, y_pred))
    
ans = []
for i in range(len(sensitivity)):
    ans.append(math.sqrt(specificity[i]*sensitivity[i]))

print(classification_report(y_pred, y_test))
print("Accuracy Score: {:.4f}%".format(mean(accuracy)*100)),
print("Square root of Sensitivity and Specificity: {:.4f}%".format(mean(ans)*100)),
print("Average P/L: {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       1.00      0.89      0.94      3395

    accuracy                           0.89      3399
   macro avg       0.50      0.45      0.47      3399
weighted avg       1.00      0.89      0.94      3399

Accuracy Score: 89.3116%
Square root of Sensitivity and Specificity: 6.5345%
Average P/L: -2.9879%


### Decision Tree Classifier

In [32]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion="entropy", max_depth = 5)

accuracy = []
specificity = []
sensitivity = []
pnl = []
profit = []

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=4242)
sss.get_n_splits(X_BS, y_01)

for train_index, test_index in sss.split(X_BS, y_01):
    """ Splitting data and loading Tree Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_01[train_index], y_01[test_index]
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    pnl = []
    
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 1 and y_test[i] == 1:
            pnl.append(0.02),
        elif y_pred[i] == 1 and y_test[i] == 0:
            pnl.append(-0.45),
        pass
    
    """ evaluating the model """
    profit.append(mean(pnl))
    accuracy.append(accuracy_score(y_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity.append(tn / (tn+fp))
    sensitivity.append(recall_score(y_test, y_pred))
    
ans = []
for i in range(len(sensitivity)):
    ans.append(math.sqrt(specificity[i]*sensitivity[i]))

print(classification_report(y_pred, y_test))
print("Accuracy Score: {:.4f}%".format(mean(accuracy)*100)),
print("Square root of Sensitivity and Specificity: {:.4f}%".format(mean(ans)*100)),
print("Average P/L: {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       354
           1       0.98      0.98      0.98      3045

    accuracy                           0.96      3399
   macro avg       0.89      0.90      0.89      3399
weighted avg       0.96      0.96      0.96      3399

Accuracy Score: 95.7929%
Square root of Sensitivity and Specificity: 87.5356%
Average P/L: 0.7914%


### Random Forest Classifier

In [33]:
# the best 
from sklearn.ensemble import RandomForestClassifier

accuracy = []
specificity = []
sensitivity = []
pnl = []
profit = []

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=4242)
sss.get_n_splits(X_BS, y_01)

rf = RandomForestClassifier(n_estimators= 10, max_depth= 25)
for train_index, test_index in sss.split(X_BS, y_01):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_01[train_index], y_01[test_index]
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    pnl = []
    
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 1 and y_test[i] == 1:
            pnl.append(0.02),
        elif y_pred[i] == 1 and y_test[i] == 0:
            pnl.append(-0.45),
        pass
    
    """ evaluating the model """
    profit.append(mean(pnl))
    accuracy.append(accuracy_score(y_test, y_pred))
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity.append(tn / (tn+fp))
    sensitivity.append(recall_score(y_test, y_pred))
    
ans = []
for i in range(len(sensitivity)):
    ans.append(math.sqrt(specificity[i]*sensitivity[i]))


print(classification_report(y_pred, y_test))
print("Accuracy Score: {:.4f}%".format(mean(accuracy)*100)),
print("Square root of Sensitivity and Specificity: {:.4f}%".format(mean(ans)*100)),
print("Average P/L: {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

           0       0.78      0.79      0.79       357
           1       0.98      0.97      0.97      3042

    accuracy                           0.95      3399
   macro avg       0.88      0.88      0.88      3399
weighted avg       0.96      0.95      0.96      3399

Accuracy Score: 95.4604%
Square root of Sensitivity and Specificity: 87.4463%
Average P/L: 0.7961%


## BEHAVIOURAL SCORING - BUCKET PREDICTION

### Logistic Regression

In [34]:
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 42)
sss.get_n_splits(X_BS, y_BP)

specificity = []
sensitivity = []
pnl = []
profit = []
lr = LogisticRegression(C = 0.4,  max_iter = 200)

for train_index, test_index in sss.split(X_BS, y_BP):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    pred_proba = lr.predict_proba(X_test)
    
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pnl = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pnl.append(y_test[i])
        pass
    
    profit.append(sum(pnl)/len(pnl))

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}".format(mean(profit)))

              precision    recall  f1-score   support

          -7       0.01      0.08      0.02        13
          -6       0.00      0.00      0.00         0
          -5       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -3       0.00      0.00      0.00         3
          -2       0.00      0.00      0.00         0
           0       1.00      0.76      0.86      3383

    accuracy                           0.76      3399
   macro avg       0.14      0.12      0.13      3399
weighted avg       0.99      0.76      0.86      3399

The square root of Specificity and Sensitivity is 7.9025%
The average P/L of this model is -0.7822


  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree Classifier

In [35]:
from sklearn.tree import DecisionTreeClassifier
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.7, random_state = 42)
sss.get_n_splits(X_BS, y_BP)

specificity = []
sensitivity = []
profit = []

tree = DecisionTreeClassifier(max_depth = 10)

for train_index, test_index in sss.split(X_BS, y_BP):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    pred_proba = tree.predict_proba(X_test)
    
    """Calculating Sensitivity and Specitivity"""
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    

    """ Calculating average profit / loss """
    pnl = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pnl.append(y_test[i])
            pass
        
    profit.append(sum(pnl)/len(pnl))
    
print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}".format(mean(profit)))

              precision    recall  f1-score   support

          -7       1.00      1.00      1.00       213
          -6       0.97      0.83      0.90        47
          -5       0.94      0.89      0.91        89
          -4       0.95      0.92      0.94       193
          -3       0.90      0.97      0.93       529
          -2       0.92      0.92      0.92       810
           0       0.99      0.99      0.99      6050

    accuracy                           0.98      7931
   macro avg       0.95      0.93      0.94      7931
weighted avg       0.98      0.98      0.98      7931

The square root of Specificity and Sensitivity is 97.5789%
The average P/L of this model is -0.0317


### Random Forest Classifier

In [36]:
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 42)
sss.get_n_splits(X_BS, y_BP)

specificity = []
sensitivity = []
pnl = []
profit = []

rf = RandomForestClassifier(n_estimators= 10, criterion = "entropy", max_depth = 10)

for train_index, test_index in sss.split(X_BS, y_BP):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    pred_proba = rf.predict_proba(X_test)
    
    """Calculating Sensitivity and Specitivity"""
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pnl = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pnl.append(y_test[i])
            pass
        
    profit.append(sum(pnl)/len(pnl))

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}".format(mean(profit)))

              precision    recall  f1-score   support

          -7       0.97      0.95      0.96        94
          -6       0.53      0.90      0.67        10
          -5       0.72      0.87      0.79        30
          -4       0.93      0.89      0.91        83
          -3       0.91      0.98      0.94       225
          -2       0.95      0.96      0.96       342
           0       1.00      0.99      0.99      2615

    accuracy                           0.98      3399
   macro avg       0.86      0.93      0.89      3399
weighted avg       0.98      0.98      0.98      3399

The square root of Specificity and Sensitivity is 97.7515%
The average P/L of this model is -0.0368


## BEHAVIOURAL SCORING - P/L PREDICTION

### Logistic Regression

In [37]:
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 42)
sss.get_n_splits(X_BS, y_BP)

specificity = []
sensitivity = []
pnl = []
profit = []

lr = LogisticRegression(C = 0.4, max_iter = 200)

for train_index, test_index in sss.split(X_BS, y_BP):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    lr.fit(X_train,y_train)
    y_pred = lr.predict(X_test)
    pred_proba = lr.predict_proba(X_test)
    
    """ Calculating sensitivity and specitivity"""
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pred_bucket = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pred_bucket.append(y_test[i])
            pass
    
    pnl = []
    for i in range(len(pred_bucket)):
        if pred_bucket[i] == -7:
            pnl.append(-0.65),
        elif pred_bucket[i] == -6:
            pnl.append(-0.45),
        elif pred_bucket[i] == -5:
            pnl.append(-0.35),
        elif pred_bucket[i] == -4:
            pnl.append(-0.25),
        elif pred_bucket[i] == -3:
            pnl.append(-0.15),
        elif pred_bucket[i] == -2:
            pnl.append(0),
        elif pred_bucket[i] == -1:
            pnl.append(0.01),
        elif pred_bucket[i] == 0:
            pnl.append(0.03)
        pass
    profit.append(mean(pnl)/len(y_test)) 

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

          -7       0.01      0.08      0.02        13
          -6       0.00      0.00      0.00         0
          -5       0.00      0.00      0.00         0
          -4       0.00      0.00      0.00         0
          -3       0.00      0.00      0.00         3
          -2       0.00      0.00      0.00         0
           0       1.00      0.76      0.86      3383

    accuracy                           0.76      3399
   macro avg       0.14      0.12      0.13      3399
weighted avg       0.99      0.76      0.86      3399

The square root of Specificity and Sensitivity is 7.9025%
The average P/L of this model is -0.0005%


  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree Classifier

In [38]:
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 42)
sss.get_n_splits(X_BS, y_BP)

specificity = []
sensitivity = []
pnl = []
profit = []

tree = DecisionTreeClassifier(max_depth = 70)

for train_index, test_index in sss.split(X_BS, y_BP):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    
    """Calculating Sensitivity and Specitivity"""
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
    """ Calculating average profit / loss """
    pred_bucket = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pred_bucket.append(y_test[i])
            pass
    
    pnl = []
    for i in range(len(pred_bucket)):
        if pred_bucket[i] == -7:
            pnl.append(-0.65),
        elif pred_bucket[i] == -6:
            pnl.append(-0.45),
        elif pred_bucket[i] == -5:
            pnl.append(-0.35),
        elif pred_bucket[i] == -4:
            pnl.append(-0.25),
        elif pred_bucket[i] == -3:
            pnl.append(-0.15),
        elif pred_bucket[i] == -2:
            pnl.append(0),
        elif pred_bucket[i] == -1:
            pnl.append(0.01),
        elif pred_bucket[i] == 0:
            pnl.append(0.03)
        pass
    profit.append(mean(pnl)/len(y_test))

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

          -7       1.00      1.00      1.00        92
          -6       0.94      1.00      0.97        16
          -5       0.94      0.94      0.94        36
          -4       0.93      0.95      0.94        78
          -3       0.91      0.92      0.92       240
          -2       0.91      0.92      0.91       345
           0       0.99      0.99      0.99      2592

    accuracy                           0.98      3399
   macro avg       0.95      0.96      0.95      3399
weighted avg       0.98      0.98      0.98      3399

The square root of Specificity and Sensitivity is 97.6132%
The average P/L of this model is 0.0008%


### Random Forest Classifier

In [39]:
sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 42)
sss.get_n_splits(X_BS, y_BP)

specificity = []
sensitivity = []
pnl = []
profit = []

rf = RandomForestClassifier(n_estimators= 10, criterion = "entropy", max_depth = 10)

for train_index, test_index in sss.split(X_BS, y_BP):
    """ Splitting data and loading Random Forest Classifier """
    X_train, X_test = X_BS[train_index], X_BS[test_index]
    y_train, y_test = y_BP[train_index], y_BP[test_index]
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    pred_proba = rf.predict_proba(X_test)
    
    actually_good = 0
    for i in y_test:
        if i >= -1:
            actually_good = actually_good + 1
            pass
        
    predict_actual_good = 0
    for i in range(len(y_pred)):
        if y_pred[i] >= -1 and y_test[i] >=-1:
            predict_actual_good = predict_actual_good + 1
            pass
    goodness = predict_actual_good / actually_good
    sensitivity.append(goodness)

    actually_bad = 0
    for i in y_test:
        if i < -1:
            actually_bad = actually_bad + 1
            pass
        
    predict_actual_bad = 0
    for i in range(len(y_pred)):
        if y_pred[i] < -1 and y_test[i] <-1:
            predict_actual_bad = predict_actual_bad + 1
            pass
    badness = predict_actual_bad / actually_bad
    specificity.append(badness)
    
   
    pred_bucket = []
    for i in range(len(y_pred)):
        """ Assigning profit or loss values to predictions """
        if y_pred[i] == 0:
            pred_bucket.append(y_test[i])
            pass
    
    pnl = []
    for i in range(len(pred_bucket)):
        if pred_bucket[i] == -7:
            pnl.append(-0.65),
        elif pred_bucket[i] == -6:
            pnl.append(-0.45),
        elif pred_bucket[i] == -5:
            pnl.append(-0.35),
        elif pred_bucket[i] == -4:
            pnl.append(-0.25),
        elif pred_bucket[i] == -3:
            pnl.append(-0.15),
        elif pred_bucket[i] == -2:
            pnl.append(0),
        elif pred_bucket[i] == -1:
            pnl.append(0.01),
        elif pred_bucket[i] == 0:
            pnl.append(0.03)
        pass
    profit.append(mean(pnl)/len(y_test))
    

print(classification_report(y_pred, y_test))
print("The square root of Specificity and Sensitivity is {:.4f}%".format(
    math.sqrt(mean(specificity)*mean(sensitivity))*100)),
print("The average P/L of this model is {:.4f}%".format(mean(profit)*100))

              precision    recall  f1-score   support

          -7       0.97      0.96      0.96        93
          -6       0.24      0.57      0.33         7
          -5       0.67      0.67      0.67        36
          -4       0.89      0.88      0.88        81
          -3       0.90      0.95      0.93       231
          -2       0.93      0.96      0.95       336
           0       1.00      0.99      0.99      2615

    accuracy                           0.97      3399
   macro avg       0.80      0.85      0.82      3399
weighted avg       0.98      0.97      0.98      3399

The square root of Specificity and Sensitivity is 97.7081%
The average P/L of this model is 0.0008%
