## Importing libraries

In [222]:
import pandas as pd
import numpy as np
import copy
## Models
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
## Other stuff
from sklearn.metrics import roc_auc_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
## Samplers
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import ClusterCentroids 
from imblearn.combine import SMOTEENN

## Utility functions

In [223]:
# Utiliy Functions
def min_max_data_normalize(data_train,data_test):
    data_min=data_train.min()
    data_max=data_train.max()
    normal_data_train=(data_train-data_min)/(data_max-data_min)
    normal_data_test=(data_test-data_min)/(data_max-data_min)
    return normal_data_train,normal_data_test

In [224]:
def data_processed():
    '''
    Used to read & normalize processed test and train data
    '''
    data_train=pd.read_csv("data/2022-02-06_LOANS_TRAIN.csv")
    data_test=pd.read_csv("data/2022-02-06_LOANS_TEST.csv")
    y_train=data_train['loan_status']
    data_train.drop('loan_status',1,inplace=True)
    normal_data_train,normal_data_test=min_max_data_normalize(data_train,data_test)
    return normal_data_train,normal_data_test,y_train

def AUC_score(y_ground_truth,y_predicted_probability):
    return roc_auc_score(y_ground_truth, y_predicted_probability)

def to_submission(y_test_predicted_probability):
    y_test=pd.DataFrame(y_test_predicted_probability,columns=['loan_status'])
    y_test.index = np.arange(200000, 200000+len(y_test))
    y_test.to_csv('data/submission.csv', index_label='id')
    return

def feature_engineering(data_train,data_test,n):
    pca = PCA(n_components=n)
    new_data_train = pca.fit_transform(data_train)
    new_data_test = pca.transform(data_test)
    return new_data_train,new_data_test

def over_under_sampling(X_train,Y_train,method,fraction):
    if method=="randomU":
        rus = RandomUnderSampler(random_state=0,sampling_strategy=fraction)
    elif method=="randomO":
        rus = RandomOverSampler(random_state=0,sampling_strategy=fraction)
    elif method=="centroid":
        rus = ClusterCentroids(random_state=42,sampling_strategy=fraction)
    elif method=="SMOTENN":
        rus = SMOTEENN(random_state=42,sampling_strategy=fraction)
    X_resampled, y_resampled = rus.fit_resample(X_train, Y_train)
    return X_resampled, y_resampled

## Data Split and Dimensionality Reduction

In [225]:
X_train,X_test,Y_train=data_processed()
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

## Modeling

### AdaBoost (Normal)

In [226]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, Y_train)
Y_train_pred_ada=clf.predict(X_train)
Y_val_pred_ada=clf.predict(X_val)
Y_val_pred_prob_ada=clf.predict_proba(X_val)[:,1]
AUC_score(Y_val,Y_val_pred_prob_ada)

0.6853044522472596

### SVC

SVC Takes forver to run becuase of huge dataset.

In [None]:
clf = SVC(gamma='auto',probability=True)
clf.fit(X_train, Y_train)

### kNN

In [16]:
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=2)

### RF model (weighted)

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, criterion = 'gini', min_samples_leaf=30, max_depth=20)
clf.fit(X_train, Y_train, sample_weight=copy.deepcopy(Y_train)*4 + 1)
Y_train_pred_rf=clf.predict(X_train)
Y_val_pred_rf=clf.predict(X_val)
Y_val_pred_prob_rf=clf.predict_proba(X_val)[:,1]
AUC_score(Y_val,Y_val_pred_prob_rf)

### Logistic Regression (under_sampled)

In [None]:
X_train_under,Y_train_under=over_under_sampling(X_train,Y_train,method="randomU",fraction=1)
clf = LogisticRegression(random_state=0,max_iter=400).fit(X_train_under, Y_train_under)
Y_train_pred_log=clf.predict(X_train)
Y_val_pred_log=clf.predict(X_val)
Y_val_pred_prob_log=clf.predict_proba(X_val)[:,1]
print(AUC_score(Y_val,Y_val_pred_prob_log))

### Logistic Regression (over_sampled)

In [None]:
X_train_over,Y_train_over=over_under_sampling(X_train,Y_train,method="randomO",fraction=1)
clf = LogisticRegression(random_state=0,max_iter=400).fit(X_train_over, Y_train_over)
Y_train_pred_log=clf.predict(X_train)
Y_val_pred_log=clf.predict(X_val)
Y_val_pred_prob_log=clf.predict_proba(X_val)[:,1]
AUC_score(Y_val,Y_val_pred_prob_log)

New Log Regression

In [218]:
# X_train_under,Y_train_under=over_under_sampling(X_train,Y_train,method="randomU",fraction=i/10)
clf = LogisticRegression(random_state=0,max_iter=400).fit(X_train, Y_train)
Y_train_pred_log=clf.predict(X_train)
Y_val_pred_log=clf.predict(X_val)
Y_val_pred_prob_log=clf.predict_proba(X_val)[:,1]
print(AUC_score(Y_val,Y_val_pred_prob_log))
print(confusion_matrix(Y_val,Y_val_pred_log))

# for i in range(10):
#     new_weight=np.ones(len(Y_train))
#     loc1=(Y_train_pred_log==Y_train)&(Y_train==1)
#     loc2=(Y_train_pred_log==Y_train)&(Y_train==0)
#     new_weight[loc]=new_weight[loc]*np.exp(-len(loc1)*4/(sum(Y_train==1)))
#     new_weight[loc]=new_weight[loc]*np.exp(-len(loc2)*4/(sum(Y_train==0)))
#     clf = LogisticRegression(random_state=0,max_iter=400).fit(X_train, Y_train,sample_weight=new_weight)
#     Y_train_pred_log=clf.predict(X_train)
#     Y_val_pred_log=clf.predict(X_val)
#     Y_val_pred_prob_log=clf.predict_proba(X_val)[:,1]
#     print(AUC_score(Y_val,Y_val_pred_prob_log))
#     print(confusion_matrix(Y_val,Y_val_pred_log))

# new_weight=np.ones(len(Y_train))
# loc=(Y_train_pred_log!=Y_train)&(Y_train==0)
# new_weight[loc]=new_weight[loc]*1.5
# clf = LogisticRegression(random_state=0,max_iter=400).fit(X_train, Y_train,sample_weight=new_weight)
# Y_train_pred_log=clf.predict(X_train)
# Y_val_pred_log=clf.predict(X_val)
# Y_val_pred_prob_log=clf.predict_proba(X_val)[:,1]
# print(AUC_score(Y_val,Y_val_pred_prob_log))
# print(confusion_matrix(Y_val,Y_val_pred_log))

0.6785320229727705
[[55082    27]
 [ 9966    18]]
0.6784999642026455
[[55077    32]
 [ 9961    23]]
0.6784999642026455
[[55077    32]
 [ 9961    23]]
0.6784999642026455
[[55077    32]
 [ 9961    23]]
0.6784999642026455
[[55077    32]
 [ 9961    23]]


KeyboardInterrupt: 

## Testing

In [162]:
confusion_matrix(Y_val,Y_val_pred_rf)

array([[43872, 11237],
       [ 5599,  4385]])

In [163]:
confusion_matrix(Y_val,Y_val_pred_log)

array([[40135, 14974],
       [ 4822,  5162]])

In [164]:
confusion_matrix(Y_val,Y_val_pred_ada)

array([[55084,    25],
       [ 9967,    17]])

In [131]:
print(AUC_score(Y_val[(Y_val_pred_log!=Y_val_pred_rf)],Y_val_pred_log[(Y_val_pred_log!=Y_val_pred_rf)]))
confusion_matrix(Y_val[(Y_val_pred_log!=Y_val_pred_rf)],Y_val_pred_log[(Y_val_pred_log!=Y_val_pred_rf)])

0.48411789436080543


array([[1100, 4837],
       [ 298, 1075]])

In [132]:
sum(Y_val[Y_val_pred_log==Y_val_pred_ada]==1),sum(Y_val[Y_val_pred_log==Y_val_pred_ada]==0)

(4839, 40160)

## Blending
Linear combination of few models. ( Using Val set )
Under sampling of majority class in few models.

In [161]:
X_blend=np.vstack((Y_train_pred_ada,Y_train_pred_log,Y_train_pred_rf)).reshape(-1,3)
clf = LogisticRegression(random_state=0).fit(X_blend, Y_train)
X_blend=np.vstack((Y_val_pred_ada,Y_val_pred_log,Y_val_pred_rf)).reshape(-1,3)
Y_val_pred_prob=clf.predict_proba(X_blend)[:,1]
AUC_score(Y_val,Y_val_pred_prob)
# Y_average = np.mean(np.array([Y_val_pred_prob_log,Y_val_pred_prob_ada]), axis=0)
# Y_average[Y_average>=0.5] = 1
# Y_average[Y_average<0.5] = 0
# print(AUC_score(Y_val,Y_average))
# confusion_matrix(Y_val,Y_average)

0.6213852705256389


array([[40558, 14551],
       [ 4924,  5060]])

## Preping for submission

In [34]:
Y_test_pred_prob=clf.predict_proba(X_test)[:,1]
to_submission(Y_test_pred_prob)

In [35]:
np.mean(Y_test_pred_prob)

0.5003545792718493

In [23]:
X_train.shape

(132157, 29)