## Importing libraries

In [1]:
import pandas as pd
import numpy as np
## Models
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
## Other stuff
from sklearn.metrics import roc_auc_score,confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
## Samplers
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids 
from imblearn.combine import SMOTEENN 

## Utility functions

In [2]:
# Utiliy Functions
def min_max_data_normalize(data_train,data_test):
    data_min=data_train.min()
    data_max=data_train.max()
    normal_data_train=(data_train-data_min)/(data_max-data_min)
    normal_data_test=(data_test-data_min)/(data_max-data_min)
    return normal_data_train,normal_data_test

In [33]:
def data_processed():
    '''
    Used to read & normalize processed test and train data
    '''
    data_train=pd.read_csv("data/2022-02-06_LOANS_TRAIN.csv")
    data_test=pd.read_csv("data/2022-02-06_LOANS_TEST.csv")
    y_train=data_train['loan_status']
    data_train.drop('loan_status',1,inplace=True)
    normal_data_train,normal_data_test=min_max_data_normalize(data_train,data_test)
    return normal_data_train,normal_data_test,y_train

def AUC_score(y_ground_truth,y_predicted_probability):
    return roc_auc_score(y_ground_truth, y_predicted_probability)

def to_submission(y_test_predicted_probability):
    y_test=pd.DataFrame(y_test_predicted_probability,columns=['loan_status'])
    y_test.index = np.arange(200000, 200000+len(y_test))
    y_test.to_csv('data/submission.csv', index_label='id')
    return

def feature_engineering(data_train,data_test,n):
    pca = PCA(n_components=n)
    new_data_train = pca.fit_transform(data_train)
    new_data_test = pca.transform(data_test)
    return new_data_train,new_data_test

def over_under_sampling(X_train,Y_train,method,fraction):
    if method=="random":
        rus = RandomUnderSampler(random_state=0,sampling_strategy=fraction)
    elif method=="centroid":
        rus = ClusterCentroids(random_state=42,sampling_strategy=fraction)
    elif method=="SMOTENN":
        rus = SMOTEENN(random_state=42,sampling_strategy=fraction)
    X_resampled, y_resampled = rus.fit_resample(X_train, Y_train)
    return X_resampled, y_resampled

## Data Split and Dimensionality Reduction

In [12]:
X_train,X_test,Y_train=data_processed()
X_train,Y_train=over_under_sampling(X_train,Y_train,method="random",fraction=5/6)
X_train,X_test=feature_engineering(X_train,X_test,n=15)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

## Modeling

### AdaBoost

In [19]:
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, Y_train)
Y_train_pred_ada=clf.predict(X_train)
Y_val_pred_ada=clf.predict(X_val)
Y_val_pred_prob_ada=clf.predict_proba(X_val)[:,1]
AUC_score(Y_val,Y_val_pred_prob_ada)

0.6622763448287391

### SVC

SVC Takes forver to run becuase of huge dataset.

In [None]:
clf = SVC(gamma='auto',probability=True)
clf.fit(X_train, Y_train)

### kNN

In [16]:
clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=2)

### Logistic Regression

In [14]:
clf = LogisticRegression(random_state=0).fit(X_train, Y_train)
Y_train_pred_log=clf.predict(X_train)
Y_val_pred_log=clf.predict_proba(X_val)[:,1]
Y_val_pred_prob_log=clf.predict_proba(X_val)[:,1]
AUC_score(Y_val,Y_val_pred_prob_log)

0.6664574451973246

## Testing

In [9]:
confusion_matrix(Y_val,Y_val_pred)

array([[8976, 2980],
       [5283, 4704]])

## Blending
Linear combination of few models. ( Using Val set )
Under sampling of majority class in few models.

In [18]:
X_blend=np.vstack((Y_train_pred_ada,Y_train_pred_log)).reshape(-1,2)
clf = LogisticRegression(random_state=0).fit(X_blend, Y_train)
X_blend=np.vstack((Y_val_pred_ada,Y_val_pred_ada)).reshape(-1,2)
Y_val_pred_prob=clf.predict_proba(X_blend)[:,1]
AUC_score(Y_val,Y_val_pred_prob)

0.4937175688716509

## Preping for submission

In [34]:
Y_test_pred_prob=clf.predict_proba(X_test)[:,1]
to_submission(Y_test_pred_prob)

In [35]:
np.mean(Y_test_pred_prob)

0.5003545792718493