In [4]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
from IPython.display import display
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate


from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from imblearn.metrics import classification_report_imbalanced
from imblearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from imblearn.under_sampling import NearMiss

### Read unbalanced data (Multiclass)

In [None]:
# Load the three-class (0 / 1 / 2) BRFSS 2015 diabetes indicators file.
file = 'Datasets/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(file)
# Preview only the first rows instead of dumping the whole frame.
df.head()

In [None]:
# Normalise the headers: lower-case them and strip the '_012' suffix so the
# target column is simply called 'diabetes'.
df.columns = [header.lower().replace('_012', '') for header in df.columns]

In [None]:
# Display the current column names of df.
df.columns

In [None]:
# Print pandas' per-column summary of df.
df.info()

### Multiclass Imbalanced data - Manually balanced

In [None]:
def getRandomDF(df, value, n, state):
    """Return a seeded random sample of the rows of one target class.

    Parameters
    ----------
    df : DataFrame with a ``diabetes`` column.
    value : class label to keep (rows where ``diabetes == value``).
    n : number of rows to draw (without replacement).
    state : seed passed to ``DataFrame.sample`` as ``random_state``.

    Returns
    -------
    DataFrame holding ``n`` rows of the requested class.
    """
    classRows = df.loc[df['diabetes'] == value]
    return classRows.sample(n=n, random_state=state)

In [None]:
# Build several balanced two-class datasets (class 0 vs class 1) by undersampling.
dfs = {}
# The number of class-1 rows sets how many rows are drawn from each class.
size = df.diabetes.value_counts()[1.0]
print(f'Size: {size}')

# Data for diabetes == 1: n equals the class size, so every class-1 row is kept
# and sampling only shuffles them. The fixed seed keeps that order reproducible.
df1 = getRandomDF(df, 1, size, 0)

# Get 10 random balanced dataframes, each with a different seeded draw of class 0.
# Class 2 is currently excluded (see the commented line), so each frame holds
# 2 * size rows.
for i in range(0,10):
    df0 = getRandomDF(df, 0, size, i)
    #df2 = getRandomDF(df, 2, size, i)
    dfs[i] = pd.concat([df0, df1])

#### List of models

In [None]:
# Candidate classifiers, both with default hyper-parameters.
models = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
]

In [None]:
# Fit a depth-limited decision tree on each balanced sample and report
# validation, test and train metrics for it.
for i,j in dfs.items():
    # Fixed seed so that re-running the cell rebuilds the same tree.
    model = make_pipeline(DecisionTreeClassifier(max_depth=10, random_state=1))

    target = j.diabetes
    data = j.drop(columns=['diabetes'])

    # Hold out 20% for testing, then split the REMAINING 80% into train and
    # validation (0.25 of it = 20% of the sample). Splitting the whole sample a
    # second time would let test rows leak into the train/validation parts.
    dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=i)
    dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=i)
    model.fit(dfTrain, yTrain)

    # Hard labels feed RMSE, the confusion matrix and the reports. AUC is a
    # ranking metric, so it is computed from the predicted class-1 probability.
    # RMSE is taken as sqrt(MSE) because `squared=False` is no longer accepted
    # by recent scikit-learn releases.
    yValPred = model.predict(dfVal)
    print(f'Val prediction: {yValPred}')
    print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
    print(f'Val AUC: {roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])}')

    yTestPred = model.predict(dfTest)
    print(f'Test prediction: {yTestPred}')
    print(f'Test RMSE: {np.sqrt(mean_squared_error(yTest, yTestPred))}')
    print(f'Test AUC: {roc_auc_score(yTest, model.predict_proba(dfTest)[:, 1])}')

    yTrainPred = model.predict(dfTrain)
    print(f'Train prediction: {yTrainPred}')
    print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')
    print(f'Train AUC: {roc_auc_score(yTrain, model.predict_proba(dfTrain)[:, 1])}')

    # Validation confusion matrix, as raw counts and as % of validation rows.
    cm = confusion_matrix(yVal, yValPred)
    cmprob = np.round(100 * cm / len(yVal), 1)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'Decision tree, sample {i}: validation counts')
    cm_display = ConfusionMatrixDisplay(cmprob, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'Decision tree, sample {i}: % of validation rows')

    print(classification_report(yVal, yValPred))
    print(classification_report_imbalanced(yVal, yValPred))

In [None]:
# Score the most recently fitted `model` on the original, unbalanced data.
# The model can only predict the classes it was trained on, so rows of any
# other class are left out; a target with extra classes would also make the
# binary roc_auc_score call fail.
# NOTE(review): these rows include the ones the balanced samples were drawn
# from, so the scores below are optimistic - confirm this is acceptable.
evalDf = df[df.diabetes.isin(model.classes_)]
target = evalDf.diabetes
data = evalDf.drop(columns=['diabetes'])

# 20% test, then validation is carved out of the remaining 80% (not out of the
# whole data again, which would mix test rows into validation).
dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=11)
dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=11)

# Hard labels for RMSE / reports, class-1 probability for AUC.
# RMSE is sqrt(MSE): `squared=False` is rejected by recent scikit-learn.
yValPred = model.predict(dfVal)
print(f'Val prediction: {yValPred}')
print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
print(f'Val AUC: {roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])}')

yTestPred = model.predict(dfTest)
print(f'Test prediction: {yTestPred}')
print(f'Test RMSE: {np.sqrt(mean_squared_error(yTest, yTestPred))}')
print(f'Test AUC: {roc_auc_score(yTest, model.predict_proba(dfTest)[:, 1])}')
print(classification_report(yVal, yValPred))
print(classification_report_imbalanced(yVal, yValPred))

In [None]:
# Fit a small random forest on each balanced sample and report validation and
# train metrics for it.
for i,j in dfs.items():
    model = make_pipeline(RandomForestClassifier(n_estimators=10, random_state=1, max_depth=7))

    target = j.diabetes
    data = j.drop(columns=['diabetes'])

    # Hold out 20% for testing, then split the REMAINING 80% into train and
    # validation (0.25 of it = 20% of the sample). Splitting the whole sample a
    # second time would let test rows leak into the train/validation parts.
    dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=i)
    dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=i)
    model.fit(dfTrain, yTrain)

    # Hard labels feed RMSE, the confusion matrix and the reports. AUC is a
    # ranking metric, so it is computed from the predicted class-1 probability.
    # RMSE is taken as sqrt(MSE) because `squared=False` is no longer accepted
    # by recent scikit-learn releases.
    yValPred = model.predict(dfVal)
    print(f'Val prediction: {yValPred}')
    print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
    print(f'Val AUC: {roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])}')

    yTrainPred = model.predict(dfTrain)
    print(f'Train prediction: {yTrainPred}')
    print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')
    print(f'Train AUC: {roc_auc_score(yTrain, model.predict_proba(dfTrain)[:, 1])}')

    # Validation confusion matrix, as raw counts and as % of validation rows.
    cm = confusion_matrix(yVal, yValPred)
    cmprob = np.round(100 * cm / len(yVal), 1)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'Random forest, sample {i}: validation counts')
    cm_display = ConfusionMatrixDisplay(cmprob, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'Random forest, sample {i}: % of validation rows')

    print(classification_report(yVal, yValPred))
    print(classification_report_imbalanced(yVal, yValPred))

In [None]:
# Score the most recently fitted `model` on the original, unbalanced data.
# The model can only predict the classes it was trained on, so rows of any
# other class are left out; a target with extra classes would also make the
# binary roc_auc_score call fail.
# NOTE(review): these rows include the ones the balanced samples were drawn
# from, so the scores below are optimistic - confirm this is acceptable.
evalDf = df[df.diabetes.isin(model.classes_)]
target = evalDf.diabetes
data = evalDf.drop(columns=['diabetes'])

# 20% test, then validation is carved out of the remaining 80% (not out of the
# whole data again, which would mix test rows into validation).
dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=11)
dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=11)

# Hard labels for RMSE / reports, class-1 probability for AUC.
# RMSE is sqrt(MSE): `squared=False` is rejected by recent scikit-learn.
yValPred = model.predict(dfVal)
print(f'Val prediction: {yValPred}')
print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
print(f'Val AUC: {roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])}')

yTestPred = model.predict(dfTest)
print(f'Test prediction: {yTestPred}')
print(f'Test RMSE: {np.sqrt(mean_squared_error(yTest, yTestPred))}')
print(f'Test AUC: {roc_auc_score(yTest, model.predict_proba(dfTest)[:, 1])}')
print(classification_report(yVal, yValPred))
print(classification_report_imbalanced(yVal, yValPred))

In [None]:
# Fit a logistic regression on each balanced sample and report validation and
# train metrics for it.
for i,j in dfs.items():
    model = make_pipeline(LogisticRegression(max_iter=1000))

    target = j.diabetes
    data = j.drop(columns=['diabetes'])

    # Hold out 20% for testing, then split the REMAINING 80% into train and
    # validation (0.25 of it = 20% of the sample). Splitting the whole sample a
    # second time would let test rows leak into the train/validation parts.
    dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=i)
    dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=i)
    model.fit(dfTrain, yTrain)

    # Hard labels feed RMSE, the confusion matrix and the reports. AUC is a
    # ranking metric, so it is computed from the predicted class-1 probability.
    # RMSE is taken as sqrt(MSE) because `squared=False` is no longer accepted
    # by recent scikit-learn releases.
    yValPred = model.predict(dfVal)
    print(f'Val prediction: {yValPred}')
    print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
    print(f'Val AUC: {roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])}')

    yTrainPred = model.predict(dfTrain)
    print(f'Train prediction: {yTrainPred}')
    print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')
    print(f'Train AUC: {roc_auc_score(yTrain, model.predict_proba(dfTrain)[:, 1])}')

    # Validation confusion matrix, as raw counts and as % of validation rows.
    cm = confusion_matrix(yVal, yValPred)
    cmprob = np.round(100 * cm / len(yVal), 1)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'Logistic regression, sample {i}: validation counts')
    cm_display = ConfusionMatrixDisplay(cmprob, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'Logistic regression, sample {i}: % of validation rows')

    print(classification_report(yVal, yValPred))
    print(classification_report_imbalanced(yVal, yValPred))

In [None]:
# Predict the test part of the original, unbalanced data with the most
# recently fitted `model`. Only rows whose class the model was trained on are
# kept, so that binary metrics (e.g. ROC AUC) can be computed on yTest.
# NOTE(review): these rows include the ones the balanced samples were drawn
# from, so downstream scores are optimistic - confirm this is acceptable.
evalDf = df[df.diabetes.isin(model.classes_)]
target = evalDf.diabetes
data = evalDf.drop(columns=['diabetes'])
# 20% test, then validation is carved out of the remaining 80% (not out of the
# whole data again, which would mix test rows into validation).
dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=11)
dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=11)

yTestPred = model.predict(dfTest)
yTestPred

In [None]:
# Metrics on the TEST split (yTest / yTestPred), so they are labelled as such.
# RMSE is sqrt(MSE): `squared=False` is rejected by recent scikit-learn.
# AUC is a ranking metric and is computed from the class-1 probability rather
# than from the hard labels.
print(f'Test prediction: {yTestPred}')
print(f'Test RMSE: {np.sqrt(mean_squared_error(yTest, yTestPred))}')
print(f'Test AUC: {roc_auc_score(yTest, model.predict_proba(dfTest)[:, 1])}')
print(classification_report(yTest, yTestPred))
print(classification_report_imbalanced(yTest, yTestPred))

In [None]:
# Booster settings shared by every run. Defined once, outside the loop, since
# they do not depend on the sample.
xgbParams = {
    'eta': 0.1,
    'max_depth': 10,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    # The XGBoost key is `nthread`; `nthreads` is not a recognised parameter.
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

# Train a gradient-boosted model on each balanced sample and report validation
# and train metrics for it.
for i,j in dfs.items():

    target = j.diabetes
    data = j.drop(columns=['diabetes'])

    # Hold out 20% for testing, then split the REMAINING 80% into train and
    # validation (0.25 of it = 20% of the sample). Splitting the whole sample a
    # second time would let test rows leak into the train/validation parts.
    dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=i)
    dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=i)

    dTrain = xgb.DMatrix(dfTrain, yTrain)
    dVal = xgb.DMatrix(dfVal, yVal)

    model = xgb.train(xgbParams, dTrain, num_boost_round=10)

    # With the binary:logistic objective, predict() returns probabilities.
    # RMSE is sqrt(MSE): `squared=False` is rejected by recent scikit-learn.
    yValPred = model.predict(dVal)
    print(f'Val prediction: {yValPred}')
    print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
    print(f'Val AUC: {roc_auc_score(yVal,yValPred)}')

    yTrainPred = model.predict(dTrain)
    print(f'Train prediction: {yTrainPred}')
    print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')
    print(f'Train AUC: {roc_auc_score(yTrain,yTrainPred)}')

    # Threshold the probabilities once and reuse the labels below.
    yValLabel = (yValPred > 0.5).astype(int)

    # Validation confusion matrix, as raw counts and as % of validation rows.
    cm = confusion_matrix(yVal, yValLabel)
    cmprob = np.round(100 * cm / len(yVal), 1)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'XGBoost, sample {i}: validation counts')
    cm_display = ConfusionMatrixDisplay(cmprob, display_labels=['0', '1']).plot()
    cm_display.ax_.set_title(f'XGBoost, sample {i}: % of validation rows')

    print(classification_report(yVal, yValLabel))
    print(classification_report_imbalanced(yVal, yValLabel))


In [None]:
# Turn the predicted probabilities into 0.0 / 1.0 labels with a 0.5 cut-off.
(yValPred > 0.5).astype(float)

In [None]:
# Train and evaluate XGBoost on the original, unbalanced data.
# `xgbParams` uses the binary:logistic objective, which only accepts labels in
# [0, 1]; rows of any other class are therefore left out before splitting.
evalDf = df[df.diabetes.isin([0, 1])]
target = evalDf.diabetes
data = evalDf.drop(columns=['diabetes'])

# 20% test, then validation is carved out of the remaining 80% (not out of the
# whole data again, which would mix test rows into train/validation).
dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=11)
dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=11)
dTrain = xgb.DMatrix(dfTrain, yTrain)
dVal = xgb.DMatrix(dfVal, yVal)
dTest = xgb.DMatrix(dfTest, yTest)

model = xgb.train(xgbParams, dTrain, num_boost_round=10)

# predict() returns probabilities for the binary:logistic objective.
# RMSE is sqrt(MSE): `squared=False` is rejected by recent scikit-learn.
yValPred = model.predict(dVal)

print(f'Val prediction: {yValPred}')
print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
print(f'Val AUC: {roc_auc_score(yVal,yValPred)}')

yTestPred = model.predict(dTest)
print(f'Test prediction: {yTestPred}')
print(f'Test RMSE: {np.sqrt(mean_squared_error(yTest, yTestPred))}')
print(f'Test AUC: {roc_auc_score(yTest, yTestPred)}')

# The reports need hard labels: threshold the validation probabilities at 0.5.
yValLabel = (yValPred > 0.5).astype(int)
print(classification_report(yVal, yValLabel))
print(classification_report_imbalanced(yVal, yValLabel))

### Binary Unbalanced Data

In [2]:
def binaryBalancedData():
    """Load the binary diabetes dataset and split it into train / validation / test.

    Reads the CSV, drops duplicated rows, lower-cases the column names and
    renames ``diabetes_binary`` to ``diabetes``. No resampling is performed
    here: the class proportions are whatever the file contains.

    The data is split 80% / 20% into train-full / test, and the train-full
    part is then split 75% / 25% into train / validation, so the three final
    parts are disjoint (60% / 20% / 20% of the rows).

    Returns
    -------
    tuple
        ``(df, dfTrainFull, yTrainFull, dfTrain, yTrain, dfVal, yVal, dfTest, yTest)``
        where ``df`` is the cleaned frame including the target column, the
        ``df*`` splits hold the features and the ``y*`` splits the target.
    """
    file = 'Datasets/diabetes_binary_health_indicators_BRFSS2015.csv'
    df = pd.read_csv(file).drop_duplicates()

    df.columns = df.columns.str.lower().str.replace('diabetes_binary','diabetes')

    target = df.diabetes
    data = df.drop(columns=['diabetes'])

    dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=11)
    # Split the train-full part, not the whole data: re-splitting `data` would
    # put test rows into the train/validation parts.
    dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=11)
    return df, dfTrainFull, yTrainFull, dfTrain, yTrain, dfVal,yVal, dfTest, yTest


In [3]:
# Load the binary dataset and unpack the cleaned frame plus every split.
(df,
 dfTrainFull, yTrainFull,
 dfTrain, yTrain,
 dfVal, yVal,
 dfTest, yTest) = binaryBalancedData()
df.head()

Unnamed: 0,diabetes,highbp,highchol,cholcheck,bmi,smoker,stroke,heartdiseaseorattack,physactivity,fruits,...,anyhealthcare,nodocbccost,genhlth,menthlth,physhlth,diffwalk,sex,age,education,income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [5]:
from pandas_profiling import ProfileReport

In [6]:
# Build a profiling report object for df, titled "Profiling Report".
profile = ProfileReport(df, title="Profiling Report")

In [7]:
# Show the report as the cell output.
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



### EDA

### Preprocessing

In [None]:
# Feature columns grouped by how they are meant to be preprocessed.
# NOTE(review): 'age' shows up in the dataframe preview but is in neither
# list - confirm whether that is intentional.
categoricalCols = [
    'highbp',
    'highchol',
    'cholcheck',
    'smoker',
    'stroke',
    'heartdiseaseorattack',
    'physactivity',
    'fruits',
    'veggies',
    'hvyalcoholconsump',
    'anyhealthcare',
    'nodocbccost',
    'genhlth',
    'diffwalk',
    'sex',
    'education',
]

numericalCols = [
    'bmi',
    'menthlth',
    'physhlth',
    'income',
]

In [None]:
#dfTrain.income = np.log1p(dfTrain.income)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Per-type preprocessors: standard scaling for numerical columns and one-hot
# encoding (categories unseen at fit time are ignored) for categorical ones.
numPreprocessor = StandardScaler()
catPreprocessor = OneHotEncoder(handle_unknown="ignore")

# No transformer is registered at the moment, so every column is passed
# through unchanged. Variants tried earlier, kept for reference:
#   ('one-hot-encoder', catPreprocessor, categoricalCols)  with remainder="passthrough"
#   ('one-hot-encoder', catPreprocessor, categoricalCols), ('standard_scaler', numPreprocessor, numericalCols)
preprocessor = ColumnTransformer(transformers=[], remainder="passthrough")
