Comparing Boosting and Bagging Techniques With Noisy and Imbalanced Data
https://ieeexplore.ieee.org/document/5645694?arnumber=5645694


In [234]:
import numpy as np
from numpy.random import sample
import pandas as pd
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_score, KFold, cross_val_predict, StratifiedKFold
from sklearn.model_selection import StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from scipy import interp
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix, precision_recall_curve
from sklearn.metrics import classification_report, f1_score, auc
from sklearn.metrics import average_precision_score, roc_curve, auc, accuracy_score, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.utils import resample
import matplotlib.pyplot as plt
from scipy import interp
%matplotlib inline
from tpot import TPOTClassifier

In [235]:
df = pd.read_csv("SCI.csv")#, sep='COLUMN_SEPARATOR', dtype=np.float64)
df.set_index('StudyID', inplace=True)
y = df.FU_SA
X = df.drop('FU_SA', axis=1)
df.shape

(591, 50)

In [236]:
correlated_features = set()  
correlation_matrix = df.corr()

for i in range(len(correlation_matrix.columns)):  
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.8:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
            
X.drop(labels=correlated_features, axis=1, inplace=True)  

In [237]:
X.shape
df = pd.concat([X,y], axis=1, sort=False)
df.shape

(591, 48)

# LR

In [238]:
logreg = LogisticRegression()

logreg.fit(X,y)

preds = logreg.predict(X)

C_app = roc_auc_score(y, preds)

## Bootstrapping ##

B = 5001

Cb_boots = []
Cb_orig = []
opts = []



for i in range(1, B):
    boot = resample(df, replace=True, n_samples=len(X)) #Create bootstrapped dataset
    y_b = boot.FU_SA 
    X_b = boot.drop('FU_SA', axis=1)
    logreg.fit(X_b, y_b) #Fit LR using boostrapped data
    preds_b = logreg.predict(X_b) #Make predictions on bootstrapped data
    roc_b = roc_auc_score(y_b, preds_b) 
    C_boots.append(roc_b)
    preds_orig = logreg.predict(X) #Apply model from boostrapped data to original data
    roc_o = roc_auc_score(y, preds_orig)
    Cb_orig.append(roc_o)
    opts.append(roc_b - roc_o)
    
O = 1/B * np.sum(opts)
O_adj = C_app - O
print(C_app)
print(O)
print('Adj AUC', O_adj)

0.6491243432574431
0.11011475103028413
Adj AUC 0.5390095922271589


# RF

In [239]:
rf = RandomForestClassifier(n_estimators = 7)

rf.fit(X,y)

preds = rf.predict(X)

C_app = roc_auc_score(y, preds)

## Bootstrapping ##

B = 5001

Cb_boots = []
Cb_orig = []
opts = []

for i in range(1, B):
    boot = resample(df, replace=True, n_samples=len(df)) #Create bootstrapped dataset
    y_b = boot.FU_SA 
    X_b = boot.drop('FU_SA', axis=1)
    rf.fit(X_b, y_b) #Fit LR using boostrapped data
    preds_b = rf.predict(X_b) #Make predictions on bootstrapped data
    roc_b = roc_auc_score(y_b, preds_b) 
    C_boots.append(roc_b)
    preds_orig = rf.predict(X) #Apply model from boostrapped data to original data
    roc_o = roc_auc_score(y, preds_orig)
    Cb_orig.append(roc_o)
    opts.append(roc_b - roc_o)
    
O = 1/B * np.sum(opts)
O_adj = C_app - O
print(C_app)
print(O)
print('Adj AUC', O_adj)

0.925
0.18129092482085002
Adj AUC 0.74370907517915


# Stratified bootstrapping