In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [27]:
# Load your dataset from the local file path
file_path = 'diabetes.csv'
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

df = pd.read_csv(file_path, names=columns)

In [28]:
# Check the types of the columns to identify any non-numeric values
print(df.dtypes)

# Replace zero values with NaN for columns that can't have zeros
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.nan)

# Convert columns to numeric if they are not already
df = df.apply(pd.to_numeric, errors='coerce')

# Fill missing values with the median of each column
df.fillna(df.median(), inplace=True)

Pregnancies                 object
Glucose                     object
BloodPressure               object
SkinThickness               object
Insulin                     object
BMI                         object
DiabetesPedigreeFunction    object
Age                         object
Outcome                     object
dtype: object


In [29]:
# Feature scaling (normalization/standardization) - Optional but recommended for certain models
X = df.drop('Outcome', axis=1)  # Features
y = df['Outcome']  # Target

print(df.head())  # Check the dataframe after processing


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Function to train and evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on test set
    y_pred = model.predict(X_test)
    
    # Evaluate model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    # Cross-validation for more robust evaluation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print("="*40)

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          3.0    117.0           72.0           23.0     30.5  32.0   
1          6.0    148.0           72.0           35.0      0.0  33.6   
2          1.0     85.0           66.0           29.0      0.0  26.6   
3          8.0    183.0           64.0            0.0      0.0  23.3   
4          1.0     89.0           66.0           23.0     94.0  28.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                    0.3725  29.0      0.0  
1                    0.6270  50.0      1.0  
2                    0.3510  31.0      0.0  
3                    0.6720  32.0      1.0  
4                    0.1670  21.0      0.0  


In [30]:
# Logistic Regression
print("Logistic Regression Results:")
logreg = LogisticRegression()
evaluate_model(logreg, X_train, X_test, y_train, y_test)

Logistic Regression Results:
Accuracy: 0.7727
Precision: 0.7250
Recall: 0.5472
F1 Score: 0.6237
Cross-Validation Accuracy: 0.7756 ± 0.0122


In [31]:
# Decision Tree Classifier
print("Decision Tree Results:")
tree = DecisionTreeClassifier(random_state=42)
evaluate_model(tree, X_train, X_test, y_train, y_test)

Decision Tree Results:
Accuracy: 0.6429
Precision: 0.4808
Recall: 0.4717
F1 Score: 0.4762
Cross-Validation Accuracy: 0.6976 ± 0.0331


In [32]:
# Neural Network (MLP)
print("Neural Network (MLP) Results:")
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
evaluate_model(mlp, X_train, X_test, y_train, y_test)

Neural Network (MLP) Results:
Accuracy: 0.6688
Precision: 0.5192
Recall: 0.5094
F1 Score: 0.5143
Cross-Validation Accuracy: 0.7366 ± 0.0265
