In [4]:
import os
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

# Ensure the output directories for pickled artifacts and charts exist.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test.
models_dir = './models_dir/'
charts_dir = './charts/'
for output_dir in (models_dir, charts_dir):
    os.makedirs(output_dir, exist_ok=True)

# Read the dataset from the current working directory.
df = pd.read_csv('diabetes.csv')

# Separate the 'Outcome' target column from the remaining feature columns,
# then hold out 20% of the rows for testing (seeded so the split repeats).
y = df['Outcome']
X = df.drop('Outcome', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Train models on the standardized training data.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Seed the tree so repeated runs produce the same model and metrics; the
# train/test split above is already seeded with the same value.
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)

# Save the fitted models together with the scaler, one pickle file per
# object, inside models_dir.
artifacts = {
    'logistic_regression.pkl': log_reg,
    'decision_tree.pkl': dec_tree,
    'scaler.pkl': scaler,
}
for filename, artifact in artifacts.items():
    with open(os.path.join(models_dir, filename), 'wb') as f:
        pickle.dump(artifact, f)

# Predict once per model and reuse the predictions for both the accuracy
# score and the classification report (previously each model ran predict()
# twice on the same test set).
log_reg_pred = log_reg.predict(X_test)
dec_tree_pred = dec_tree.predict(X_test)

# Compute accuracy
log_reg_accuracy = accuracy_score(y_test, log_reg_pred)
dec_tree_accuracy = accuracy_score(y_test, dec_tree_pred)

# Save metrics: one classification report (as a nested dict) per model, plus
# the accuracy of each model under the "accuracy" key.
metrics = {
    "logistic_regression": classification_report(y_test, log_reg_pred, output_dict=True),
    "decision_tree": classification_report(y_test, dec_tree_pred, output_dict=True),
    "accuracy": {
        "logistic_regression": log_reg_accuracy,
        "decision_tree": dec_tree_accuracy,
    },
}

with open(os.path.join(models_dir, 'metrics.pkl'), 'wb') as f:
    pickle.dump(metrics, f)

# Visualization - Feature Distribution
# Histograms (with a KDE overlay) of the first six feature columns, laid out
# on a 2x3 grid and written to the charts directory.
dist_fig = plt.figure(figsize=(10, 6))
for slot, feature in enumerate(X.columns[:6], start=1):
    axis = dist_fig.add_subplot(2, 3, slot)
    sns.histplot(df[feature], kde=True, bins=30, ax=axis)
    axis.set_title(feature)
dist_fig.tight_layout()
dist_fig.savefig(os.path.join(charts_dir, 'feature_distributions.png'))
plt.close(dist_fig)

# Correlation Heatmap
# Annotated heatmap of the pairwise correlations between all columns of df
# (target column included), written to the charts directory.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
correlations = df.corr()
sns.heatmap(
    correlations,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Feature Correlation Heatmap")
heatmap_fig.savefig(os.path.join(charts_dir, 'correlation_heatmap.png'))
plt.close(heatmap_fig)

# Model Performance Bar Chart (including Accuracy)
# Precision, recall and f1-score are read from each model's classification
# report entry for class '1'; accuracy is appended as the last bar.
performance_metrics = ['precision', 'recall', 'f1-score', 'accuracy']
report_keys = performance_metrics[:-1]

log_reg_report = metrics['logistic_regression']['1']
dec_tree_report = metrics['decision_tree']['1']
log_reg_scores = [log_reg_report[key] for key in report_keys]
log_reg_scores.append(log_reg_accuracy)
dec_tree_scores = [dec_tree_report[key] for key in report_keys]
dec_tree_scores.append(dec_tree_accuracy)

# Two bars per metric, side by side; the tick label sits between each pair.
bar_width = 0.4
x = range(len(performance_metrics))
shifted_positions = [pos + bar_width for pos in x]
tick_positions = [pos + 0.2 for pos in x]

perf_fig, perf_ax = plt.subplots(figsize=(8, 5))
perf_ax.bar(x, log_reg_scores, width=bar_width, label="Logistic Regression", alpha=0.7)
perf_ax.bar(shifted_positions, dec_tree_scores, width=bar_width, label="Decision Tree", alpha=0.7)
perf_ax.set_xticks(tick_positions)
perf_ax.set_xticklabels(performance_metrics)
perf_ax.set_ylabel("Score")
perf_ax.set_title("Model Performance Comparison")
perf_ax.legend()
perf_fig.savefig(os.path.join(charts_dir, 'model_performance.png'))
plt.close(perf_fig)

print("✅ Models trained, metrics saved, and visualizations (including accuracy) generated successfully!")

✅ Models trained, metrics saved, and visualizations (including accuracy) generated successfully!
