In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier



In [None]:
# Read the `diabetes_012` BRFSS 2015 health-indicators CSV and echo the frame.
file = 'Datasets/diabetes_BRFSS2015/diabetes_012_health_indicators_BRFSS2015.csv'
df = pd.read_csv(filepath_or_buffer=file)
df

In [None]:
# Lower-case every column name, then strip the '_012' suffix so the target
# column is addressable as `df.diabetes`.
lowerCols = df.columns.str.lower()
df.columns = lowerCols.str.replace('_012', '')

In [None]:
# Echo the column index to check the renamed column names.
df.columns

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()

### Tree

In [None]:
def getRandomDF(df, value, n, state):
    """Draw a seeded random sample of rows belonging to one diabetes class.

    Parameters
    ----------
    df : DataFrame with a `diabetes` column.
    value : class label to keep (rows where `df.diabetes == value`).
    n : number of rows to draw; `sample` raises ValueError if fewer exist.
    state : seed forwarded to `DataFrame.sample(random_state=...)`.

    Returns
    -------
    DataFrame holding `n` of the matching rows, with their original index.
    """
    classMask = df.diabetes == value
    classRows = df.loc[classMask]
    return classRows.sample(n=n, random_state=state)

In [None]:
# Build 10 class-balanced frames keyed 0..9.
# The class-1 row count sets the per-class sample size (class 0 must have at
# least that many rows, otherwise `sample` raises). Class 2 is not included,
# so every frame holds 2 * size rows.
size = df.diabetes.value_counts()[1.0]

# Data for diabetes == 1. `size` equals the number of class-1 rows, so this
# keeps all of them in shuffled order; the fixed seed makes that order, and
# therefore every downstream train/val/test split, reproducible across runs.
df1 = getRandomDF(df, 1, size, 0)

# Each frame pairs a differently seeded draw of class-0 rows (seed == key)
# with the same class-1 rows.
dfs = {i: pd.concat([getRandomDF(df, 0, size, i), df1]) for i in range(10)}

In [None]:
# Fit one depth-limited decision tree per balanced frame and report validation
# and training metrics for each. The dict key doubles as the random seed.
for seed, dfBalanced in dfs.items():
    # Seed the tree as well: scikit-learn permutes features at every split, so
    # ties between equally good splits are otherwise broken differently per run.
    model = make_pipeline(DecisionTreeClassifier(max_depth=10, random_state=seed))

    target = dfBalanced.diabetes
    data = dfBalanced.drop(columns=['diabetes'])

    # 60/20/20 split: hold out 20% for test, then carve 25% of the remaining
    # 80% out as validation. Splitting the *remaining* data (not `data` again)
    # keeps validation rows disjoint from the test hold-out.
    dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=seed)
    dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=seed)
    model.fit(dfTrain, yTrain)

    # argmax yields column positions in the probability matrix; map them
    # through classes_ so the predictions are actual class labels.
    resdfVal = model.predict_proba(dfVal)
    yValPred = model.classes_[np.argmax(resdfVal, axis=1)]
    print(f'Val prediction: {yValPred}')
    # np.sqrt(MSE) instead of squared=False, which newer scikit-learn removed.
    print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')

    # Same metrics on the training rows (variable name kept from the original
    # notebook; it holds the training-set probabilities).
    resdfValPred = model.predict_proba(dfTrain)
    yTrainPred = model.classes_[np.argmax(resdfValPred, axis=1)]
    print(f'Train prediction: {yTrainPred}')
    print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')

    # Confusion matrix as raw counts and as a percentage of all validation rows.
    cm = confusion_matrix(yVal, yValPred)
    cmprob = np.round(100 * cm / len(yVal), 1)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=['0', '1']).plot()
    cm_display = ConfusionMatrixDisplay(cmprob, display_labels=['0', '1']).plot()

    print(classification_report(yVal, yValPred))

In [None]:
# List the names of every tunable parameter exposed by the pipeline.
for paramName in model.get_params().keys():
    print(paramName)

In [None]:

# Pipeline wrapping a decision tree with default hyper-parameters.
defaultTree = DecisionTreeClassifier()
model = make_pipeline(defaultTree)
    

In [None]:
# `data` and `target` are whatever the preceding per-frame loop left bound,
# i.e. the last balanced frame it processed.
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# as validation. The second call must split the remaining data, not `data`
# again, or validation rows would overlap the test hold-out.
dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=11)
dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=11)

In [None]:
# Columns that get one-hot encoded by the preprocessing step.
categoricalCols = [
    'highbp',
    'highchol',
    'cholcheck',
    'smoker',
    'stroke',
    'heartdiseaseorattack',
    'physactivity',
    'fruits',
    'veggies',
    'hvyalcoholconsump',
    'anyhealthcare',
    'nodocbccost',
    'genhlth',
    'diffwalk',
    'sex',
    'education',
]

# Columns treated as numeric.
numericalCols = [
    'bmi',
    'menthlth',
    'physhlth',
    'income',
]

In [None]:
# Creating preprocessors (OneHotEncoder, StandardScaler and ColumnTransformer
# are imported in the notebook's top import cell).
# Categories unseen during fit are encoded as all-zero rows instead of raising.
catPreprocessor = OneHotEncoder(handle_unknown="ignore")
# Currently not wired into `preprocessor` below.
numPreprocessor = StandardScaler()

# Transforming the data: one-hot encode the categorical columns and pass every
# other column through unchanged.
preprocessor = ColumnTransformer(
    [('one-hot-encoder', catPreprocessor, categoricalCols)],
    remainder="passthrough",
)
    


In [None]:
# Creating the model: preprocessing followed by a size-limited decision tree.
# random_state is fixed because scikit-learn permutes features at every split,
# so ties between equally good splits would otherwise vary between runs.
model = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(max_depth=10, max_leaf_nodes=30, random_state=11),
)

model.fit(dfTrain, yTrain)

In [None]:


# Validation-set predictions and error.
# argmax yields column positions in the probability matrix; map them through
# classes_ so the predictions are actual class labels.
resdfVal = model.predict_proba(dfVal)
yValPred = model.classes_[np.argmax(resdfVal, axis=1)]
print(f'Val prediction: {yValPred}')
# np.sqrt(MSE) instead of squared=False, which newer scikit-learn removed.
print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')

# Training-set predictions and error (variable name kept from the original
# notebook; it holds the training-set probabilities).
resdfValPred = model.predict_proba(dfTrain)
yTrainPred = model.classes_[np.argmax(resdfValPred, axis=1)]
print(f'Train prediction: {yTrainPred}')
print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')

In [None]:
# Fraction of validation rows whose predicted label equals the true label.
(yValPred == yVal).mean()

In [None]:
# Confusion matrix of validation labels vs. predictions, plotted twice: raw
# counts, then each cell as a percentage of all validation rows (1 decimal).
cm = confusion_matrix(yVal, yValPred)
cmprob = (100 * cm / len(yVal)).round(1)
for matrix in (cm, cmprob):
    cm_display = ConfusionMatrixDisplay(matrix, display_labels=['0', '1']).plot()

In [None]:
# Per-class precision / recall / F1 on the validation split.
valReport = classification_report(y_true=yVal, y_pred=yValPred)
print(valReport)

In [None]:
# Pairwise correlation between every column of `df` (pandas' default method),
# drawn as an annotated heatmap. seaborn is already imported in the top cell.
corr = df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, cmap="Blues", annot=True, ax=ax)
ax.set_title("Correlation matrix: all columns");

In [None]:
# Correlation restricted to the numeric columns plus the target column.
numericWithTarget = numericalCols + ['diabetes']
corr = df[numericWithTarget].corr()
plt.figure(figsize=(20, 20))
sns.heatmap(data=corr, annot=True, cmap="Blues")

### Binary

In [None]:
# Read the `diabetes_binary` BRFSS 2015 health-indicators CSV (rebinding `df`)
# and echo the frame.
# NOTE(review): unlike the first dataset's path, this one has no
# `diabetes_BRFSS2015/` sub-folder - confirm the file location.
file = 'Datasets/diabetes_binary_health_indicators_BRFSS2015.csv'
df = pd.read_csv(filepath_or_buffer=file)
df

In [None]:
# Lower-case the column names and shorten the target column to 'diabetes',
# then echo the result.
lowerCols = df.columns.str.lower()
df.columns = lowerCols.str.replace('diabetes_binary', 'diabetes')
df.columns

In [None]:
# Separate the label column from the feature columns.
target = df['diabetes']
data = df.drop(columns='diabetes')

In [None]:
# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# as validation. The second call must split the remaining data, not `data`
# again, or validation rows would overlap the test hold-out.
dfTrainFull, dfTest, yTrainFull, yTest = train_test_split(data, target, test_size=0.2, random_state=11)
dfTrain, dfVal, yTrain, yVal = train_test_split(dfTrainFull, yTrainFull, test_size=0.25, random_state=11)

In [None]:
# Creating the model: a size-limited decision tree on the raw feature columns.
# random_state is fixed because scikit-learn permutes features at every split,
# so ties between equally good splits would otherwise vary between runs.
model = make_pipeline(
    DecisionTreeClassifier(max_depth=10, max_leaf_nodes=30, random_state=11)
)

model.fit(dfTrain, yTrain)

In [None]:

# Validation metrics. RMSE is computed on hard labels; ROC AUC is computed on
# the positive-class probability, because ranking by a 0/1 label collapses the
# ROC curve to a single operating point.
yValPred = model.predict(dfVal)
print(f'Val prediction: {yValPred}')
# np.sqrt(MSE) instead of squared=False, which newer scikit-learn removed.
print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
auc = roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])
print(f'Val AUC: {auc}')

# Same metrics on the training split.
yTrainPred = model.predict(dfTrain)
print(f'Train prediction: {yTrainPred}')
print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')
auc = roc_auc_score(yTrain, model.predict_proba(dfTrain)[:, 1])
print(f'Train AUC: {auc}')


In [None]:
# Echo the validation predictions currently bound to `yValPred`.
yValPred

In [None]:
# Per-class precision / recall / F1 on the validation split.
valReport = classification_report(y_true=yVal, y_pred=yValPred)
print(valReport)

In [None]:
# Class-probability predictions for the validation split, echoed below.
# NOTE(review): this rebinds `yValPred`, which earlier cells bound to the
# output of `model.predict`; cells that expect hard labels will see a
# probability matrix if they are re-run after this one.
yValPred = model.predict_proba(dfVal)#[:,1]
yValPred

In [None]:
# ROC AUC on the validation split, scored with column 1 of the probability matrix.
roc_auc_score(y_true=yVal, y_score=yValPred[:, 1])

In [None]:
# Replace `model` with a logistic-regression pipeline (iteration cap raised to 1000).
logReg = LogisticRegression(max_iter=1000)
model = make_pipeline(logReg)

In [None]:
# Fit the current `model` pipeline on the training split.
model.fit(dfTrain, yTrain)

In [None]:
# Hard-label predictions for the validation split (rebinds `yValPred`).
yValPred = model.predict(dfVal)

In [None]:
# Validation metrics. RMSE is computed on the hard labels in `yValPred`;
# ROC AUC is computed on the positive-class probability, because ranking by a
# 0/1 label collapses the ROC curve to a single operating point.
print(f'Val prediction: {yValPred}')
# np.sqrt(MSE) instead of squared=False, which newer scikit-learn removed.
print(f'Val RMSE: {np.sqrt(mean_squared_error(yVal, yValPred))}')
auc = roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])
print(f'Val AUC: {auc}')

# Same metrics on the training split.
yTrainPred = model.predict(dfTrain)
print(f'Train prediction: {yTrainPred}')
print(f'Train RMSE: {np.sqrt(mean_squared_error(yTrain, yTrainPred))}')
auc = roc_auc_score(yTrain, model.predict_proba(dfTrain)[:, 1])
print(f'Train AUC: {auc}')

In [None]:
# Per-class precision / recall / F1 on the validation split.
valReport = classification_report(y_true=yVal, y_pred=yValPred)
print(valReport)

In [None]:
# ROC AUC on the validation split, scored with the positive-class probability
# rather than hard 0/1 labels (labels give only one point on the ROC curve).
roc_auc_score(yVal, model.predict_proba(dfVal)[:, 1])

In [None]:
# Creating fresh preprocessors for the binary dataset (OneHotEncoder,
# StandardScaler and ColumnTransformer are imported in the top import cell).
# Categories unseen during fit are encoded as all-zero rows instead of raising.
catPreprocessor = OneHotEncoder(handle_unknown="ignore")
# Currently not wired into `preprocessor` below.
numPreprocessor = StandardScaler()

# Transforming the data: one-hot encode the categorical columns and pass every
# other column through unchanged.
preprocessor = ColumnTransformer(
    [('one-hot-encoder', catPreprocessor, categoricalCols)],
    remainder="passthrough",
)
    


In [None]:
# Creating the model: preprocessing followed by a size-limited decision tree.
# random_state is fixed because scikit-learn permutes features at every split,
# so ties between equally good splits would otherwise vary between runs.
model = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(max_depth=10, max_leaf_nodes=30, random_state=11),
)

model.fit(dfTrain, yTrain)