In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Set random seed for reproducibility
np.random.seed(42)

# Load the dataset
df = pd.read_csv('data/pima-indians-diabetes.csv')

# Display basic information about the dataset
#print(df.info())
#print(df.describe())
#print(df.head())

# Separate features and target variable.
# The target is taken as-is: it is not mean-imputed, because an averaged
# class label would not be a valid class.
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets.
# This happens BEFORE any imputation or scaling so that no statistic of the
# held-out rows can influence the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle missing values: column means are learned from the training rows only
# and then applied to both splits.
# NOTE(review): fillna only touches NaN cells. If this CSV encodes missing
# measurements some other way (e.g. as 0), they are not imputed here - confirm
# against the data file.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

# Keep `df` itself free of NaN for the plotting code that reads it further
# down. Rebinding (instead of inplace=True) leaves X and y untouched.
df = df.fillna(df.mean())

# Standardize the features (scaler statistics also come from the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Exploratory Data Analysis (EDA)
sns.set(style="whitegrid")

# Columns drawn in the per-feature figures below, in panel order.
# Each figure uses a 3x3 subplot grid, so the last two slots stay unused.
continuous_vars = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'age']

# Figure 1: histogram (30 bins) with a KDE overlay for every listed column
plt.figure(figsize=(16, 12))
for panel, feature in enumerate(continuous_vars, start=1):
    plt.subplot(3, 3, panel)
    sns.histplot(df[feature], kde=True, bins=30)
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

# Figure 2: one boxplot per listed column, grouped by the 'class' column
plt.figure(figsize=(16, 12))
for panel, feature in enumerate(continuous_vars, start=1):
    plt.subplot(3, 3, panel)
    sns.boxplot(x='class', y=feature, data=df)
    plt.title(f'{feature} vs Diabetes')
plt.tight_layout()
plt.show()

# Figure 3: annotated heatmap of the pairwise correlations of all columns in df
plt.figure(figsize=(10, 8))
correlations = df.corr()
sns.heatmap(correlations, annot=True, cmap='coolwarm', linewidths=0.2)
plt.title('Correlation Matrix')
plt.show()

# Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    """Print classification metrics for a fitted model and plot its confusion matrix.

    Accuracy, precision, recall and F1 are computed from the hard predictions
    of ``model.predict``. ROC AUC is computed from continuous scores so that
    it reflects the model's ranking quality rather than a single threshold.

    Args:
        model: Fitted classifier exposing ``predict``. ``predict_proba`` or
            ``decision_function`` is used for the ROC AUC when available.
        X_test: Feature matrix to predict on.
        y_test: True labels aligned with ``X_test``. Binary labels are
            assumed: the positive-class score is read from column 1 of
            ``predict_proba``.

    Returns:
        None. Results are printed and the confusion matrix is shown with
        matplotlib.
    """
    y_pred = model.predict(X_test)

    # Feeding 0/1 predictions to roc_auc_score collapses the ROC curve to one
    # operating point; prefer a continuous score for the positive class.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        # No continuous score available: fall back to the hard labels.
        y_score = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_score)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(classification_report(y_test, y_pred))

    # Rows are actual classes, columns are predicted classes.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='coolwarm', cbar=False)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Baseline classifiers, all with default hyperparameters.
# The tree-based models get a fixed random_state so repeated runs match.
log_reg = LogisticRegression()
dt_clf = DecisionTreeClassifier(random_state=42)
rf_clf = RandomForestClassifier(random_state=42)
gb_clf = GradientBoostingClassifier(random_state=42)

# Fit each model on the training split, then report on the test split,
# in this order: Logistic Regression, Decision Tree, Random Forest,
# Gradient Boosting.
baseline_runs = (
    ("Logistic Regression:", log_reg),
    ("Decision Tree:", dt_clf),
    ("Random Forest:", rf_clf),
    ("Gradient Boosting:", gb_clf),
)
for heading, classifier in baseline_runs:
    classifier.fit(X_train, y_train)
    print(heading)
    evaluate_model(classifier, X_test, y_test)

# Hyperparameter tuning for Random Forest using GridSearchCV.
# The grid has 3 * 4 * 3 * 3 = 108 combinations; with cv=5 that is 540 fits
# before the final refit on the full training split.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

base_forest = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Estimator refit with the best-scoring parameter combination
best_rf = grid_search.best_estimator_

print("Best Random Forest Model:")
evaluate_model(best_rf, X_test, y_test)


1     0
2     1
3     0
4     1
5     0
6     1
7     0
8     1
9     1
10    0
11    1
12    0
13    1
14    1
15    1
16    1
17    1
18    0
19    1
20    0
21    0
22    1
23    1
24    1
25    1
26    1
27    0
28    0
29    0
30    0
31    1
32    0
33    0
34    0
35    0
36    0
37    1
38    1
39    1
40    0
41    0
42    0
43    1
44    0
45    1
46    0
47    0
48    1
49    0
Name: class, dtype: int64


'\n# Standardize the features\nscaler = StandardScaler()\nX_train = scaler.fit_transform(X_train)\nX_test = scaler.transform(X_test)\n\n# Exploratory Data Analysis (EDA)\nsns.set(style="whitegrid")\n\n# Plot distributions for continuous variables\nplt.figure(figsize=(16, 12))\ncontinuous_vars = [\'preg\', \'plas\', \'pres\', \'skin\', \'test\', \'mass\', \'age\']\nfor i, column in enumerate(continuous_vars, 1):\n    plt.subplot(3, 3, i)\n    sns.histplot(df[column], kde=True, bins=30)\n    plt.title(f\'Distribution of {column}\')\nplt.tight_layout()\nplt.show()\n\n# Plot boxplots for continuous variables\nplt.figure(figsize=(16, 12))\nfor i, column in enumerate(continuous_vars, 1):\n    plt.subplot(3, 3, i)\n    sns.boxplot(x=\'class\', y=column, data=df)\n    plt.title(f\'{column} vs Diabetes\')\nplt.tight_layout()\nplt.show()\n\n# Plot heatmap for correlation matrix\nplt.figure(figsize=(10, 8))\nsns.heatmap(df.corr(), annot=True, cmap=\'coolwarm\', linewidths=0.2)\nplt.title(\'Correl