# **Validation of Models**
This notebook focuses on the validation of models by analyzing both their accuracy and cost-based performance.

---

## Accuracy-based performance

In [1]:
from ucimlrepo import fetch_ucirepo

cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

x = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

### 3. Naive Bayes

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np
import pandas as pd

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

cost_matrix = np.array([[0, 10], [1, 0]])

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

accuracy_scores = []
cost_scores = []

nb_model = GaussianNB()

for train_index, test_index in skf.split(X_balanced, y_balanced):
    X_train_fold, X_val_fold = X_balanced.iloc[train_index], X_balanced.iloc[test_index]
    y_train_fold, y_val_fold = y_balanced.iloc[train_index], y_balanced.iloc[test_index]

    nb_model.fit(X_train_fold, y_train_fold)

    y_pred_fold = nb_model.predict(X_val_fold)

    accuracy_fold = accuracy_score(y_val_fold, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_val_fold, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len (y_val_fold)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost:", np.mean(cost_scores_per_instance))
print("Standard deviation:", np.std(cost_scores_per_instance))

Mean accuracy: 0.7180307386879229
Standard deviation: 0.003899447448175181
Mean cost: 10399.5
Standard deviation: 176.26868695261788
Mean cost: 1.44557964970809
Standard deviation: 0.024502180560552953


### 5. Neural network with number of hidden layers automatically chosen

In [22]:
from kerastuner.tuners import RandomSearch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_input', min_value=32, max_value=128, step=32),
                    input_dim=X_train_scaled.shape[1],
                    activation='relu'))
    
    for i in range(hp.Int('num_hidden_layers', 1, 3)):  # 1 to 3 hidden layers
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=128, step=32),
                        activation='relu'))
        model.add(Dropout(hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.1)))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 1e-4, 1e-5])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_directory',
    project_name='accuracy_based_nn_tuning'
)

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
accuracy_scores = []
cost_scores = []

for train_idx, test_idx in skf.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    tuner.search(X_train_scaled, y_train, epochs=10, validation_split=0.2, verbose=0)

    best_model = tuner.get_best_models(num_models=1)[0]
    best_model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=0)

    y_pred_fold = (best_model.predict(X_test_scaled) > 0.5).astype("int32")

    accuracy_fold = accuracy_score(y_test, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len(y_test)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost per instance:", np.mean(cost_scores_per_instance))
print("Standard deviation cost per instance:", np.std(cost_scores_per_instance))

Reloading Tuner from tuner_directory/accuracy_based_nn_tuning/tuner0.json
Mean accuracy: 0.7461358811286449
Standard deviation: 0.004299364434950486
Mean cost: 10414.2
Standard deviation: 464.7538703442931
Mean cost per instance: 1.4476230191826522
Standard deviation cost per instance: 0.06460298447932904


### 6. Neural network with chosen number of hidden layers (1 and 2)

In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

def create_static_model(input_dim):
    model = Sequential([
        Dense(64, input_dim=input_dim, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

accuracy_scores = []
cost_scores = []

for train_idx, test_idx in skf.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    model = create_static_model(X_train_scaled.shape[1])
    model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=0)

    y_pred_fold = (model.predict(X_test_scaled) > 0.5).astype("int32")

    accuracy_fold = accuracy_score(y_test, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len(y_test)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost per instance:", np.mean(cost_scores_per_instance))
print("Standard deviation cost per instance:", np.std(cost_scores_per_instance))

Mean accuracy: 0.751779094346053
Standard deviation: 0.0032230170288836844
Mean cost: 10296.2
Standard deviation: 382.28465833721344
Mean cost per instance: 1.431220461495691
Standard deviation cost per instance: 0.05313937424759704


### 7. Deep learning

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from keras.models import Sequential
from keras.layers import Dense, Dropout

cost_matrix = np.array([[0, 10], [1, 0]])

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']

# Cross-Validation (Accuracy)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
cost_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    model = Sequential([
        Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    scores = model.evaluate(X_test_scaled, y_test, verbose=0)
    accuracy_scores.append(scores[1])

    y_pred_fold = (model.predict(X_test_scaled) > 0.5).astype("int32")
    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len (y_val_fold)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost:", np.mean(cost_scores_per_instance))
print("Standard deviation:", np.std(cost_scores_per_instance))

Mean accuracy: 0.7505976021289825
Standard deviation: 0.005213053811480142
Mean cost: 10561.2
Standard deviation: 510.5698385137924
Mean cost: 1.4680567139282734
Standard deviation: 0.07097162058851716


### 8. SVM

In [8]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

cost_matrix = np.array([[0, 10], [1, 0]])

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']

accuracy_scores = []
cost_scores = []

svm_clf = LinearSVC(dual=False, random_state=42)

for train_index, test_index in skf.split(X_balanced, y_balanced):
    X_train_fold, X_val_fold = X_balanced.iloc[train_index], X_balanced.iloc[test_index]
    y_train_fold, y_val_fold = y_balanced.iloc[train_index], y_balanced.iloc[test_index]

    scaler = StandardScaler()
    X_train_scaled = X_train_fold.copy()
    X_val_scaled = X_val_fold.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train_fold[columns_to_normalize])
    X_val_scaled[columns_to_normalize] = scaler.transform(X_val_fold[columns_to_normalize])

    svm_clf.fit(X_train_scaled, y_train_fold)

    y_pred_fold = svm_clf.predict(X_val_scaled)

    accuracy_fold = accuracy_score(y_val_fold, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_val_fold, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len (y_val_fold)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost:", np.mean(cost_scores_per_instance))
print("Standard deviation:", np.std(cost_scores_per_instance))

Mean accuracy: 0.7448709613041367
Standard deviation: 0.005344461520723734
Mean cost: 10914.7
Standard deviation: 248.00687490470904
Mean cost: 1.5171948846260772
Standard deviation: 0.03447412773209746


<br><br>

---

## Cost-based performance

### 3. Naive Bayes

In [3]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

total = np.sum(cost_matrix)
class_prior = [np.sum(cost_matrix[:, 0]) / total, np.sum(cost_matrix[:, 1]) / total]

columns_to_normalize = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
                        'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
                        'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
                        'Income']

accuracy_scores = []
cost_scores = []

nb_model = GaussianNB(priors=class_prior)

for train_idx, test_idx in skf.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    nb_model.fit(X_train_scaled, y_train)

    y_pred_fold = nb_model.predict(X_test_scaled)

    accuracy_fold = accuracy_score(y_test, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len (y_val_fold)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost:", np.mean(cost_scores_per_instance))
print("Standard deviation:", np.std(cost_scores_per_instance))

Mean accuracy: 0.6829200845504216
Standard deviation: 0.006260065663740211
Mean cost: 5515.8
Standard deviation: 184.18349546036964
Mean cost: 0.7667222685571309
Standard deviation: 0.025602376349787254


### 5. Neural network (automatic)

In [18]:
from keras_tuner.tuners import RandomSearch
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

def build_model(hp):
    model = Sequential()
    model.add(Dense(units=hp.Int('units_input', min_value=32, max_value=128, step=32),
                    input_dim=X_train_scaled.shape[1],
                    activation='relu'))
    
    for i in range(hp.Int('num_hidden_layers', 1, 3)):
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=128, step=32),
                        activation='relu'))
        model.add(Dropout(hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.1)))
    
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 1e-4, 1e-5])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='tuner_directory',
    project_name='cost_based_nn_tuning'
)

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
accuracy_scores = []
cost_scores = []

for train_idx, test_idx in skf.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    tuner.search(X_train_scaled, y_train, epochs=10, validation_split=0.2, verbose=0)

    best_model = tuner.get_best_models(num_models=1)[0]
    best_model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=0)

    y_pred_fold = (best_model.predict(X_test_scaled) > 0.5).astype("int32")

    accuracy_fold = accuracy_score(y_test, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len(y_test)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost per instance:", np.mean(cost_scores_per_instance))
print("Standard deviation cost per instance:", np.std(cost_scores_per_instance))

  from kerastuner.tuners import RandomSearch


Mean accuracy: 0.7469002583613903
Standard deviation: 0.005928988980732573
Mean cost: 10055.0
Standard deviation: 488.8218489388542
Mean cost per instance: 1.3976925215457325
Standard deviation cost per instance: 0.06794854725310737


### 6. Neural network (not automatic)

In [19]:
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np
from tensorflow.keras.layers import Dense, Dropout

def create_static_model(input_dim):
    model = Sequential([
        Dense(64, input_dim=input_dim, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

accuracy_scores = []
cost_scores = []

for train_idx, test_idx in skf.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    model = create_static_model(X_train_scaled.shape[1])
    model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, verbose=0)

    y_pred_fold = (model.predict(X_test_scaled) > 0.5).astype("int32")

    accuracy_fold = accuracy_score(y_test, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len(y_test)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost per instance:", np.mean(cost_scores_per_instance))
print("Standard deviation cost per instance:", np.std(cost_scores_per_instance))

Mean accuracy: 0.7511118851842213
Standard deviation: 0.005019265558982593
Mean cost: 10590.8
Standard deviation: 516.6005807197665
Mean cost per instance: 1.4721712538226301
Standard deviation cost per instance: 0.07180992225740426


### 7. Deep learning

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout

cost_matrix = np.array([[0, 10], [1, 0]])

df = pd.concat([x, y], axis=1)

healthy_df = df[df['Diabetes_binary'] == 0]
diabetic_df = df[df['Diabetes_binary'] == 1]

healthy_sampled = healthy_df.sample(frac=0.2, random_state=42)
diabetic_sampled = diabetic_df.sample(frac=0.8, random_state=42)

balanced_df = pd.concat([healthy_sampled, diabetic_sampled])

X_balanced = balanced_df.drop(columns=['Diabetes_binary'])
y_balanced = balanced_df['Diabetes_binary']

columns_to_normalize = [
    'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
    'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 
    'Education', 'Income'
]

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
cost_scores = []
accuracy_scores = []

for train_idx, test_idx in cv.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    model = Sequential([
        Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy')

    model.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

    y_pred_fold = (model.predict(X_test_scaled) > 0.5).astype("int32")
    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

    accuracy = accuracy_score(y_test, y_pred_fold)
    accuracy_scores.append(accuracy)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation of accuracy:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation of cost:", np.std(cost_scores))
total_instances = len(y_balanced)
cost_scores_per_instance = [x / total_instances for x in cost_scores]
print("Mean cost per instance:", np.mean(cost_scores_per_instance))
print("Standard deviation of cost per instance:", np.std(cost_scores_per_instance))

Mean accuracy: 0.7497218881536483
Standard deviation of accuracy: 0.005162848290429181
Mean cost: 10689.9
Standard deviation of cost: 352.56331346298634
Mean cost per instance: 0.14858640053374844
Standard deviation of cost per instance: 0.004900524205812667


### 8. SVM

In [6]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

class_weights = {0: 1, 1: 10}

n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

accuracy_scores = []
cost_scores = []

svm_clf = LinearSVC(dual=False, random_state=42, class_weight=class_weights)

for train_idx, test_idx in skf.split(X_balanced, y_balanced):
    X_train, X_test = X_balanced.iloc[train_idx], X_balanced.iloc[test_idx]
    y_train, y_test = y_balanced.iloc[train_idx], y_balanced.iloc[test_idx]

    scaler = StandardScaler()
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[columns_to_normalize] = scaler.fit_transform(X_train[columns_to_normalize])
    X_test_scaled[columns_to_normalize] = scaler.transform(X_test[columns_to_normalize])

    svm_clf.fit(X_train_scaled, y_train)

    y_pred_fold = svm_clf.predict(X_test_scaled)

    accuracy_fold = accuracy_score(y_test, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    confusion_mat = confusion_matrix(y_test, y_pred_fold)
    cost = np.sum(confusion_mat * np.transpose(cost_matrix))
    cost_scores.append(cost)

print("Mean accuracy:", np.mean(accuracy_scores))
print("Standard deviation:", np.std(accuracy_scores))
print("Mean cost:", np.mean(cost_scores))
print("Standard deviation:", np.std(cost_scores))
divisor = len (y_val_fold)
cost_scores_per_instance = [x / divisor for x in cost_scores]
print("Mean cost:", np.mean(cost_scores_per_instance))
print("Standard deviation:", np.std(cost_scores_per_instance))

Mean accuracy: 0.5237684461396774
Standard deviation: 0.004372030459279511
Mean cost: 3647.6
Standard deviation: 40.7852914664098
Mean cost: 0.5070336391437309
Standard deviation: 0.00566934827167219
