In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score
from scipy.stats import chi2_contingency
from scipy import stats
from statsmodels.stats.contingency_tables import mcnemar

# 1. Read the file
df = pd.read_csv('patients1.csv')

# 2. Preprocess the data
le = LabelEncoder()
categorical_columns = ['Gender', 'ProgramType', 'PhysicalActivityLevel', 'SupportSystem']
for col in categorical_columns:
    df[col] = le.fit_transform(df[col])

df['StrictDiet'] = (df['BMI'] >= 30).astype(int)

features = ['Gender', 'Age', 'BMI', 'ProgramType', 'PreviousDiets', 'PhysicalActivityLevel', 'SupportSystem']
X = df[features]
y = df['StrictDiet']

# 3. Split data into testing and training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train the naïve Bayes model
model = GaussianNB()
model.fit(X_train, y_train)

# 5. Make predictions
y_pred = model.predict(X_test)

# 6. Evaluate the model
def print_extended_confusion_matrix_stats(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    # Basic statistics
    total = tn + fp + fn + tp
    accuracy = (tp + tn) / total
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    ppv = tp / (tp + fp)
    npv = tn / (tn + fn)
    
    # Kappa statistic
    kappa = cohen_kappa_score(y_true, y_pred)
    
    # 95% CI for accuracy
    z = 1.96  # for 95% CI
    ci_lower = accuracy - z * np.sqrt((accuracy * (1 - accuracy)) / total)
    ci_upper = accuracy + z * np.sqrt((accuracy * (1 - accuracy)) / total)
    
    # P-value (chi-square test of independence)
    chi2, p_value, _, _ = chi2_contingency(cm)
    
    # McNemar's test
    mcnemar_result = mcnemar(cm, exact=False, correction=True)
    mcnemar_p_value = mcnemar_result.pvalue
    
    # No Information Rate (NIR) and its p-value
    nir = max(np.sum(cm, axis=1)) / total
    nir_p_value = 1 - stats.binom.cdf(tp + tn - 1, total, nir)
    
    print("Confusion Matrix:")
    print(cm)
    print("\nStatistics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"95% CI: ({ci_lower:.4f}, {ci_upper:.4f})")
    print(f"No Information Rate: {nir:.4f}")
    print(f"P-Value [Acc > NIR]: {nir_p_value:.4f}")
    print(f"Kappa: {kappa:.4f}")
    print(f"Sensitivity: {sensitivity:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"Pos Pred Value: {ppv:.4f}")
    print(f"Neg Pred Value: {npv:.4f}")
    print(f"P-Value [Chi-square]: {p_value:.4f}")
    print(f"McNemar's Test P-Value: {mcnemar_p_value:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# Print extended confusion matrix and statistics
print_extended_confusion_matrix_stats(y_test, y_pred)

Confusion Matrix:
[[7 0]
 [0 3]]

Statistics:
Accuracy: 1.0000
95% CI: (1.0000, 1.0000)
No Information Rate: 0.7000
P-Value [Acc > NIR]: 0.0282
Kappa: 1.0000
Sensitivity: 1.0000
Specificity: 1.0000
Pos Pred Value: 1.0000
Neg Pred Value: 1.0000
P-Value [Chi-square]: 0.0160
McNemar's Test P-Value: 0.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         3

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10



  statistic = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
