# **Model Training**
This notebook focuses on model training.

---

## Accuracy-trained models

In [4]:
from ucimlrepo import fetch_ucirepo

cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

x = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

### 3. Naive Bayes

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
import numpy as np

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']

nb_model = GaussianNB()

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    # Split data into train and test folds
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    nb_model.fit(X_train_scaled, y_train)
    score = nb_model.score(X_test_scaled, y_test)
    cv_scores.append(score)

cv_scores = np.array(cv_scores)
mean_accuracy = cv_scores.mean()
std_dev = cv_scores.std()
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, stratify=y_balanced,
                                                    random_state=42)

# Normalization
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

nb_model.fit(X_train_scaled, y_train)
y_pred = nb_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Cross-Validation Accuracy Scores: [0.71313412 0.71994441 0.72077832 0.71480195 0.71629135 0.72212955
 0.71670837 0.7206005  0.72421462 0.7117042 ]
Mean Accuracy: 0.72
Standard Deviation: 0.00
Confidence Interval: [0.71, 0.72]
Test Set Accuracy: 0.72
Confusion Matrix:
[[6528 2206]
 [1822 3833]]
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.76      8734
           1       0.63      0.68      0.66      5655

    accuracy                           0.72     14389
   macro avg       0.71      0.71      0.71     14389
weighted avg       0.72      0.72      0.72     14389



<br><br>
<br><br>
### 4. Bayesian network

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']


def fit_bayesian_network():
    model = BayesianNetwork([('HighBP', 'Diabetes_binary'),
                             ('HighChol', 'Diabetes_binary'),
                             ('BMI', 'Diabetes_binary'),
                             ('Smoker', 'Diabetes_binary'),
                             ('Diabetes_binary', 'BMI')])

    cpd_highbp = TabularCPD(variable='HighBP', variable_card=2, values=[[0.8], [0.2]])
    cpd_highchol = TabularCPD(variable='HighChol', variable_card=2, values=[[0.7], [0.3]])
    cpd_bmi = TabularCPD(variable='BMI', variable_card=2, values=[[0.5], [0.5]])
    cpd_smoker = TabularCPD(variable='Smoker', variable_card=2, values=[[0.4], [0.6]])
    cpd_diabetes = TabularCPD(variable='Diabetes_binary', variable_card=2,
                              values=[[0.7, 0.3], [0.3, 0.7]],
                              evidence=['HighBP', 'HighChol', 'BMI', 'Smoker'],
                              evidence_card=[2, 2, 2, 2])

    model.add_cpds(cpd_highbp, cpd_highchol, cpd_bmi, cpd_smoker, cpd_diabetes)
    model.check_model()
    inference = VariableElimination(model)

    return model, inference


cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    model, inference = fit_bayesian_network(y_train)

    prediction = inference.predict(X_test_scaled)
    predicted_labels = prediction['Diabetes_binary']

    accuracy = accuracy_score(y_test, predicted_labels)
    cv_scores.append(accuracy)

cv_scores = np.array(cv_scores)
mean_accuracy = cv_scores.mean()
std_dev = cv_scores.std()
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores (Bayesian Network): {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42
)

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

model, inference = fit_bayesian_network(y_train)

prediction = inference.predict(X_test_scaled)
predicted_labels = prediction['Diabetes_binary']

test_accuracy = accuracy_score(y_test, predicted_labels)
print(f"Test Set Accuracy (Bayesian Network): {test_accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_labels))
print("\nClassification Report:")
print(classification_report(y_test, predicted_labels))

ModuleNotFoundError: No module named 'google.generativeai'

<br><br>
<br><br>
### 5. Neural network (automatic)

In [22]:
'''
Unfortunately this code gives an error, most likely due to a "circular import" as reported in the error, but we cannot find any files that would cause this overlap. Furthermore, it could be caused by an issue with the TensorFlow and Keras versions, but we have also revised these and cannot find any immediate problems. 
'''

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import keras_tuner as kt
from ucimlrepo import fetch_ucirepo

cdc_diabetes_health_indicators = fetch_ucirepo(id=891)
x = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_input', min_value=16, max_value=128, step=16),
                    activation='relu', input_dim=X_balanced.shape[1]))

    for i in range(hp.Int('num_layers', 1, 3)): # Automatically choose the number of hidden layers (1-3)
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=16, max_value=128, step=16),
                        activation='relu'))
        model.add(Dropout(rate=hp.Choice('dropout_rate', [0.2, 0.3, 0.4])))

    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='my_tuning_dir',
    project_name='binary_classification'
)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    tuner.search(X_train_scaled, y_train, epochs=20, validation_split=0.2, verbose=0)
    best_model = tuner.get_best_models(num_models=1)[0]

    scores = best_model.evaluate(X_test_scaled, y_test, verbose=0)
    cv_scores.append(scores[1])

cv_scores = np.array(cv_scores)
mean_accuracy = cv_scores.mean()
std_dev = cv_scores.std()
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42
)

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

best_model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=1)

y_pred = (best_model.predict(X_test_scaled) > 0.5).astype("int32")
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {test_accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

AttributeError: partially initialized module 'keras_tuner.src.engine.base_tuner' has no attribute 'BaseTuner' (most likely due to a circular import)

<br><br>
<br><br>
### 6. Neural network (not automatic)

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

num_hidden_layers = 2  # 1 or 2

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    model = Sequential()
    model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
    model.add(Dropout(0.3))

    if num_hidden_layers == 2:
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.3))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    scores = model.evaluate(X_test_scaled, y_test, verbose=0)
    cv_scores.append(scores[1])

cv_scores = np.array(cv_scores)
mean_accuracy = cv_scores.mean()
std_dev = cv_scores.std()
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42
)

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

model = Sequential()
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dropout(0.3))

if num_hidden_layers == 2:
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=1)

y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32")
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {test_accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Cross-Validation Accuracy Scores: [0.75066018 0.7449618  0.76066715 0.75149411 0.74534333 0.74589938
 0.75229359 0.75090355 0.75771475 0.74617738]
Mean Accuracy: 0.75
Standard Deviation: 0.01
Confidence Interval: [0.75, 0.76]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Set Accuracy: 0.75
Confusion Matrix:
[[7027 1707]
 [1901 3754]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.80  

<br><br>
<br><br>
### 7. Deep learning

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    model = Sequential([
        Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')  # Output layer with sigmoid activation
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    scores = model.evaluate(X_test_scaled, y_test, verbose=0)
    cv_scores.append(scores[1])

cv_scores = np.array(cv_scores)
mean_accuracy = cv_scores.mean()
std_dev = cv_scores.std()
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42
)

# Normalization
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

model = Sequential([
    Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Sigmoid activation
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=1)

y_pred = (model.predict(X_test_scaled) > 0.5).astype("int32")
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {test_accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Cross-Validation Accuracy Scores: [0.74829745 0.7466296  0.75649756 0.7551077  0.74603838 0.74589938
 0.7496525  0.75159854 0.75618571 0.74756742]
Mean Accuracy: 0.75
Standard Deviation: 0.00
Confidence Interval: [0.75, 0.75]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Set Accuracy: 0.75
Confusion Matrix:
[[6941 1793]
 [1804 3851]]

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.79  

<br><br>
<br><br>
### 8. SVM

In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']

svm_clf = LinearSVC(dual="auto", random_state=42)

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = []
y_pred_cv = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    svm_clf.fit(X_train_scaled, y_train)
    score = svm_clf.score(X_test_scaled, y_test)
    cv_scores.append(score)

    y_pred_fold = svm_clf.predict(X_test_scaled)
    y_pred_cv.extend(y_pred_fold)

y_pred_cv = np.array(y_pred_cv)

cv_scores = np.array(cv_scores)
mean_accuracy = cv_scores.mean()
std_dev = cv_scores.std()
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, stratify=y_balanced,
                                                    random_state=42)

scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

svm_clf.fit(X_train_scaled, y_train)

y_test_pred = svm_clf.predict(X_test_scaled)

print("\nConfusion Matrix:")
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
print(test_conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))

InvalidParameterError: The 'dual' parameter of LinearSVC must be an instance of 'bool', an instance of 'numpy.bool_' or an instance of 'int'. Got 'auto' instead.