# **Model Training**
This notebook focuses on model training.

---

## Accuracy-trained models

<br><br>
<br><br>
### 3. Naive Bayes

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pandas as pd
import numpy as np

# Put features and target side by side so rows can be subsampled together.
df = pd.concat([x, y], axis=1)

# One frame per class label of 'Diabetes_binary'.
healthy_df = df.loc[df['Diabetes_binary'] == 0]
diabetic_df = df.loc[df['Diabetes_binary'] == 1]

# Keep 20% of the label-0 rows and 80% of the label-1 rows (fixed seed).
# NOTE(review): whether this actually balances the classes depends on the
# class counts in x/y, which are not visible in this cell.
healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

# Separate the resampled frame back into features and target.
X_balanced = balanced_df.drop('Diabetes_binary', axis=1)
y_balanced = balanced_df['Diabetes_binary']

# Columns that get standardised before model fitting.
columns_to_normalize = [
    'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
    'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
    'Income',
]

# Gaussian Naive Bayes evaluated with stratified 10-fold cross-validation.
nb_model = GaussianNB()
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    # Positional row selection for this fold.
    X_train = X_balanced.iloc[train_idx]
    X_test = X_balanced.iloc[test_idx]
    y_train = y_balanced.iloc[train_idx]
    y_test = y_balanced.iloc[test_idx]

    # Scaling statistics come from the training fold only and are then
    # re-used unchanged on the test fold.
    scaler = StandardScaler().fit(X_train[columns_to_normalize])
    X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    nb_model.fit(X_train_scaled, y_train)
    score = nb_model.score(X_test_scaled, y_test)
    cv_scores.append(score)

# Summary statistics over the per-fold accuracies.
cv_scores = np.array(cv_scores)
mean_accuracy = np.mean(cv_scores)
std_dev = np.std(cv_scores)
# The interval reported below is simply mean +/- one standard deviation.
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

# Stratified 80/20 hold-out split of the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    stratify=y_balanced,
    random_state=42,
)

# Normalization: statistics are learned on the training part only.
scaler = StandardScaler().fit(X_train[columns_to_normalize])
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

# Refit the classifier on the training part and predict the held-out rows.
nb_model.fit(X_train_scaled, y_train)
y_pred = nb_model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Test Set Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report(y_test, y_pred))

<br><br>
<br><br>
### 4. Bayesian network

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from pgmpy.models import BayesianNetwork
from pgmpy.factors.discrete import TabularCPD
from pgmpy.inference import VariableElimination

# Features and target in a single frame so both are resampled together.
df = pd.concat([x, y], axis=1)

# Rows per class label of 'Diabetes_binary'.
healthy_df = df[df['Diabetes_binary'].eq(0)]
diabetic_df = df[df['Diabetes_binary'].eq(1)]

# Seeded subsampling: 20% of label 0, 80% of label 1.
healthy_sampled = healthy_df.sample(random_state=42, frac=0.2)
diabetic_sampled = diabetic_df.sample(random_state=42, frac=0.8)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

# Feature matrix and target vector of the resampled data.
y_balanced = balanced_df['Diabetes_binary']
X_balanced = balanced_df.drop(columns='Diabetes_binary')

# Columns selected for standardisation.
columns_to_normalize = [
    'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
    'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
    'Income',
]


# Structure of the discrete Bayesian network: each feature below is a direct
# parent of the target. The reverse edge Diabetes_binary -> BMI was dropped
# because, together with BMI -> Diabetes_binary, it forms a cycle, which a
# Bayesian network (a DAG) cannot contain.
BN_FEATURES = ['HighBP', 'HighChol', 'BMI', 'Smoker']
BN_TARGET = 'Diabetes_binary'


def discretize_bn_features(X, bmi_threshold):
    """Return the network's feature columns with BMI reduced to a 0/1 flag.

    A tabular (discrete) network needs a small, fixed set of states per
    variable, so the continuous BMI column is replaced by an indicator.

    Args:
        X: DataFrame containing at least the columns in ``BN_FEATURES``.
        bmi_threshold: Cut-off; rows with ``BMI`` strictly above it become 1,
            all others 0.

    Returns:
        A new DataFrame restricted to ``BN_FEATURES``; ``X`` is not modified.
    """
    discrete = X[BN_FEATURES].copy()
    discrete['BMI'] = (discrete['BMI'] > bmi_threshold).astype(int)
    return discrete


def fit_bayesian_network(X_train, y_train):
    """Build the network and learn its CPDs from the training data.

    The conditional probability tables are estimated from ``X_train`` and
    ``y_train`` (pgmpy's default estimator) instead of being hard-coded, so
    the table of the target automatically has one column per combination of
    parent states.

    Args:
        X_train: DataFrame with the columns in ``BN_FEATURES``; they must
            already be discrete (see ``discretize_bn_features``).
        y_train: Target labels aligned row-by-row with ``X_train``.

    Returns:
        Tuple ``(model, inference)``: the fitted ``BayesianNetwork`` and a
        ``VariableElimination`` engine built on it.
    """
    model = BayesianNetwork([(feature, BN_TARGET) for feature in BN_FEATURES])

    # Attach the labels positionally so a differing index cannot misalign rows.
    train_df = X_train[BN_FEATURES].copy()
    train_df[BN_TARGET] = np.asarray(y_train)

    model.fit(train_df)
    model.check_model()

    inference = VariableElimination(model)

    return model, inference


cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    # No standardisation here: the network works on discrete states. The BMI
    # cut-off is taken from the training fold only and re-used on the test fold.
    bmi_threshold = X_train['BMI'].median()
    X_train_discrete = discretize_bn_features(X_train, bmi_threshold)
    X_test_discrete = discretize_bn_features(X_test, bmi_threshold)

    model, inference = fit_bayesian_network(X_train_discrete, y_train)

    # Prediction is a method of the model (not of VariableElimination) and the
    # input may only contain the model's own, non-target variables.
    prediction = model.predict(X_test_discrete)
    predicted_labels = prediction['Diabetes_binary'].to_numpy().astype(y_test.dtype)

    accuracy = accuracy_score(y_test, predicted_labels)
    cv_scores.append(accuracy)

cv_scores = np.array(cv_scores)
mean_accuracy = cv_scores.mean()
std_dev = cv_scores.std()
# The interval reported below is mean +/- one standard deviation.
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores (Bayesian Network): {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

# Stratified 80/20 hold-out evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced, y_balanced, test_size=0.2, stratify=y_balanced, random_state=42
)

bmi_threshold = X_train['BMI'].median()
X_train_discrete = discretize_bn_features(X_train, bmi_threshold)
X_test_discrete = discretize_bn_features(X_test, bmi_threshold)

model, inference = fit_bayesian_network(X_train_discrete, y_train)

prediction = model.predict(X_test_discrete)
predicted_labels = prediction['Diabetes_binary'].to_numpy().astype(y_test.dtype)

test_accuracy = accuracy_score(y_test, predicted_labels)
print(f"Test Set Accuracy (Bayesian Network): {test_accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, predicted_labels))
print("\nClassification Report:")
print(classification_report(y_test, predicted_labels))

<br><br>
<br><br>
### 5. Neural network (automatic)

In [None]:
# TODO: implement the automatic neural-network model (this cell is still a placeholder).

<br><br>
<br><br>
### 6. Neural network (not automatic)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Number of hidden Dense layers in the network below: 1 or 2.
num_hidden_layers = 2  # 1 or 2

# Features and target joined column-wise for joint resampling.
df = pd.concat([x, y], axis=1)

# Partition the rows by the value of the target column.
healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

# Seeded subsample of each class: 20% of label 0, 80% of label 1.
healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

# Stack the two subsamples (label-0 rows first).
balanced_df = pd.concat([healthy_sampled, diabetic_sampled], axis=0)

# Everything except the target is a feature.
X_balanced = balanced_df.drop(labels='Diabetes_binary', axis=1)
y_balanced = balanced_df['Diabetes_binary']

# Columns to standardise before training.
columns_to_normalize = [
    'HighBP', 'HighChol', 'CholCheck', 'BMI',
    'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
    'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth',
    'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
]

# Stratified 10-fold cross-validation; a fresh network is trained per fold.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train = X_balanced.iloc[train_idx]
    X_test = X_balanced.iloc[test_idx]
    y_train = y_balanced.iloc[train_idx]
    y_test = y_balanced.iloc[test_idx]

    # Standardise with statistics learned on the training fold only.
    scaler = StandardScaler().fit(X_train[columns_to_normalize])
    X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    # Assemble the layer stack first, then wrap it in a Sequential model.
    layer_stack = [
        Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
        Dropout(0.3),
    ]
    if num_hidden_layers == 2:
        # Optional second hidden layer.
        layer_stack.append(Dense(32, activation='relu'))
        layer_stack.append(Dropout(0.3))
    layer_stack.append(Dense(1, activation='sigmoid'))
    model = Sequential(layer_stack)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    # evaluate() yields the loss followed by the compiled metrics; index 1 is
    # the single metric requested above ('accuracy').
    scores = model.evaluate(X_test_scaled, y_test, verbose=0)
    cv_scores.append(scores[1])

# Summary over the per-fold accuracies.
cv_scores = np.array(cv_scores)
mean_accuracy = np.mean(cv_scores)
std_dev = np.std(cv_scores)
# The interval reported below is mean +/- one standard deviation.
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

# Stratified 80/20 hold-out split of the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    stratify=y_balanced,
    random_state=42,
)

# Standardise using training-set statistics only.
scaler = StandardScaler().fit(X_train[columns_to_normalize])
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

# Same architecture as in cross-validation, built as a list of layers.
layer_stack = [
    Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
    Dropout(0.3),
]
if num_hidden_layers == 2:
    # Optional second hidden layer.
    layer_stack.append(Dense(32, activation='relu'))
    layer_stack.append(Dropout(0.3))
layer_stack.append(Dense(1, activation='sigmoid'))
model = Sequential(layer_stack)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=1)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
probabilities = model.predict(X_test_scaled)
y_pred = (probabilities > 0.5).astype("int32")
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {test_accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

<br><br>
<br><br>
### 7. Deep learning

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Column-wise join of features and target.
df = pd.concat((x, y), axis=1)

# Class-specific subsets, selected on the target column.
healthy_df = df.loc[df['Diabetes_binary'] == 0, :]
diabetic_df = df.loc[df['Diabetes_binary'] == 1, :]

# Seeded subsampling: 20% of label 0, 80% of label 1.
healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat((healthy_sampled, diabetic_sampled))

# Split the resampled frame into target and features.
y_balanced = balanced_df['Diabetes_binary']
X_balanced = balanced_df.drop(columns=['Diabetes_binary'])

# Columns that are standardised before training.
columns_to_normalize = [
    'HighBP',
    'HighChol',
    'CholCheck',
    'BMI',
    'Smoker',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'Fruits',
    'Veggies',
    'HvyAlcoholConsump',
    'AnyHealthcare',
    'NoDocbcCost',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'DiffWalk',
    'Sex',
    'Age',
    'Education',
    'Income',
]

# Stratified 10-fold cross-validation with a new network per fold.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train = X_balanced.iloc[train_idx]
    X_test = X_balanced.iloc[test_idx]
    y_train = y_balanced.iloc[train_idx]
    y_test = y_balanced.iloc[test_idx]

    # Scaler is fitted on the training fold and applied to both folds.
    scaler = StandardScaler().fit(X_train[columns_to_normalize])
    X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    # Two hidden layers (64 and 32 units), each followed by dropout.
    model = Sequential()
    model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.3))
    # Output layer with sigmoid activation
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    # Index 1 of evaluate()'s result is the 'accuracy' metric compiled above.
    scores = model.evaluate(X_test_scaled, y_test, verbose=0)
    cv_scores.append(scores[1])

# Summary over the per-fold accuracies.
cv_scores = np.array(cv_scores)
mean_accuracy = np.mean(cv_scores)
std_dev = np.std(cv_scores)
# The interval reported below is mean +/- one standard deviation.
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

# Stratified 80/20 hold-out split of the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    stratify=y_balanced,
    random_state=42,
)

# Normalization, with statistics taken from the training part only.
scaler = StandardScaler().fit(X_train[columns_to_normalize])
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

# Same architecture as in cross-validation, added layer by layer.
model = Sequential()
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
# Sigmoid activation
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=1)

# Threshold the sigmoid outputs at 0.5 to obtain 0/1 class labels.
probabilities = model.predict(X_test_scaled)
y_pred = (probabilities > 0.5).astype("int32")
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Set Accuracy: {test_accuracy:.2f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

2025-01-06 12:49:41.851679: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


KeyboardInterrupt: 

<br><br>
<br><br>
### 8. SVM

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report

# Features and target in one frame so rows are resampled consistently.
df = pd.concat([x, y], axis=1)

# Class-specific subsets of the data.
healthy_df = df.loc[df['Diabetes_binary'].eq(0)]
diabetic_df = df.loc[df['Diabetes_binary'].eq(1)]

# Seeded subsampling; adjust the fractions as needed.
healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

# Feature matrix / target vector of the resampled data.
X_balanced = balanced_df.drop('Diabetes_binary', axis='columns')
y_balanced = balanced_df['Diabetes_binary']

# Columns to standardise before fitting the SVM.
columns_to_normalize = [
    'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth',
    'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
    'Income',
]

# Linear SVM evaluated with stratified 10-fold cross-validation.
svm_clf = LinearSVC(dual="auto", random_state=42)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = []
# Fold predictions are appended in fold order, not in the row order of
# X_balanced.
y_pred_cv = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train = X_balanced.iloc[train_idx]
    X_test = X_balanced.iloc[test_idx]
    y_train = y_balanced.iloc[train_idx]
    y_test = y_balanced.iloc[test_idx]

    # Standardise with statistics learned on the training fold only.
    scaler = StandardScaler().fit(X_train[columns_to_normalize])
    X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    svm_clf.fit(X_train_scaled, y_train)
    score = svm_clf.score(X_test_scaled, y_test)
    cv_scores.append(score)

    y_pred_fold = svm_clf.predict(X_test_scaled)
    y_pred_cv.extend(y_pred_fold)

y_pred_cv = np.array(y_pred_cv)

# Summary over the per-fold accuracies.
cv_scores = np.array(cv_scores)
mean_accuracy = np.mean(cv_scores)
std_dev = np.std(cv_scores)
# The interval reported below is mean +/- one standard deviation.
conf_interval_lower = mean_accuracy - std_dev
conf_interval_upper = mean_accuracy + std_dev

print(f"Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"Confidence Interval: [{conf_interval_lower:.2f}, {conf_interval_upper:.2f}]")

# Stratified 80/20 hold-out split of the resampled data.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    stratify=y_balanced,
    random_state=42,
)

# Standardise using training-set statistics only.
scaler = StandardScaler().fit(X_train[columns_to_normalize])
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[columns_to_normalize] = scaler.transform(X_train[columns_to_normalize])
X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

# Refit on the training part, then label the held-out rows.
svm_clf.fit(X_train_scaled, y_train)
y_test_pred = svm_clf.predict(X_test_scaled)

test_conf_matrix = confusion_matrix(y_test, y_test_pred)

print("\nConfusion Matrix:")
print(test_conf_matrix)
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))