In [47]:
import pandas as pd 
import sklearn

In [48]:
# Read the CSV file
file_path = 'Data/diabetes_data.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Reviewing Datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   70692 non-null  float64
 1   Sex                   70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64
 4   BMI                   70692 non-null  float64
 5   Smoker                70692 non-null  float64
 6   HeartDiseaseorAttack  70692 non-null  float64
 7   PhysActivity          70692 non-null  float64
 8   Fruits                70692 non-null  float64
 9   Veggies               70692 non-null  float64
 10  HvyAlcoholConsump     70692 non-null  float64
 11  GenHlth               70692 non-null  float64
 12  MentHlth              70692 non-null  float64
 13  PhysHlth              70692 non-null  float64
 14  DiffWalk              70692 non-null  float64
 15  Stroke             

In [50]:
import os
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report



# Load the dataset
diabetes_data = pd.read_csv(file_path)

# Rename column to match schema if necessary
if 'HvyAlcoholConsump' in diabetes_data.columns:
    diabetes_data.rename(columns={'HvyAlcoholConsump': 'hvyalcohol'}, inplace=True)

# Create a SQLite database and load the data
db_connection = sqlite3.connect('diabetes_project.db')
cursor = db_connection.cursor()

schema_sql = """
DROP TABLE IF EXISTS diabetes;

CREATE TABLE diabetes (
    age FLOAT,
    sex FLOAT,
    highchol FLOAT,
    cholcheck FLOAT,
    bmi FLOAT,
    smoker FLOAT,
    heartdiseaseorattack FLOAT,
    physactivity FLOAT,
    fruits FLOAT,
    veggies FLOAT,
    hvyalcohol FLOAT,
    genhlth FLOAT,
    menthlth FLOAT,
    physhlth FLOAT,
    diffwalk FLOAT,
    stroke FLOAT,
    highbp FLOAT,
    diabetes FLOAT
);
"""

cursor.executescript(schema_sql)
db_connection.commit()

# Load data into the SQLite database
diabetes_data.to_sql('diabetes', db_connection, if_exists='append', index=False)

# Fetch the data back from SQLite
query = "SELECT * FROM diabetes"
diabetes_df = pd.read_sql_query(query, db_connection)
db_connection.close()

# Separate features (X) and target variable (y)
X = diabetes_df.drop(columns=['diabetes'])
y = diabetes_df['diabetes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Simplified hyperparameter grid
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

# Initialize and fit GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Save optimization results to a CSV file
results_df = pd.DataFrame(grid_search.cv_results_)
results_path = 'model_optimization_results.csv'
results_df.to_csv(results_path, index=False)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)


# Print final results
print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)
print("Final Test Set Accuracy:", final_accuracy)
print("\nClassification Report:\n", classification_report_output)
print("\nResults saved to:", results_path)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Best Cross-Validation Score: 0.7472718289028135
Final Test Set Accuracy: 0.7522161448509996

Classification Report:
               precision    recall  f1-score   support

         0.0       0.78      0.71      0.74     10601
         1.0       0.73      0.79      0.76     10607

    accuracy                           0.75     21208
   macro avg       0.75      0.75      0.75     21208
weighted avg       0.75      0.75      0.75     21208


Results saved to: model_optimization_results.csv


In [51]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)


In [52]:

# Step 1: Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid={
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Step 2: Fit the model
grid_search.fit(X_train, y_train)

# Step 3: Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

# Step 4: Print results
print("Final Test Set Accuracy:", final_accuracy)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)
print("\nClassification Report:\n", classification_report_output)


Fitting 3 folds for each of 216 candidates, totalling 648 fits


Final Test Set Accuracy: 0.7528291210863824
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Best Cross-Validation Score: 0.7482822630509061

Classification Report:
               precision    recall  f1-score   support

         0.0       0.77      0.71      0.74     10601
         1.0       0.73      0.79      0.76     10607

    accuracy                           0.75     21208
   macro avg       0.75      0.75      0.75     21208
weighted avg       0.75      0.75      0.75     21208



In [53]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
                                            

In [54]:
# Load the dataset
file_path = 'Data/diabetes_data.csv' 
diabetes_data = pd.read_csv(file_path)

# Rename column to match schema if necessary
if 'HvyAlcoholConsump' in diabetes_data.columns:
    diabetes_data.rename(columns={'HvyAlcoholConsump': 'hvyalcohol'}, inplace=True)


In [55]:

# Split features and target
X = diabetes_data.drop(columns=['diabetes'])
y = diabetes_data['diabetes']


KeyError: "['diabetes'] not found in axis"

In [None]:
## Debug

# Step 1: Check and Debug Column Names
print("Available Columns:", diabetes_data.columns)

# Step 2: Rename Target Column if Necessary
if 'Diabetes' in diabetes_data.columns:
    diabetes_data.rename(columns={'Diabetes': 'diabetes'}, inplace=True)

# Step 3: Inspect the First Few Rows
print("First Few Rows:\n", diabetes_data.head())

# Step 4: Split Features and Target
X = diabetes_data.drop(columns=['diabetes'])
y = diabetes_data['diabetes']

# Step 5: Scale Numerical Features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: Proceed with Cross-Validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=42)
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')

# Step 7: Print Results
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())




Available Columns: Index(['Age', 'Sex', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'hvyalcohol', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Stroke',
       'HighBP', 'Diabetes'],
      dtype='object')
First Few Rows:
     Age  Sex  HighChol  CholCheck   BMI  Smoker  HeartDiseaseorAttack  \
0   4.0  1.0       0.0        1.0  26.0     0.0                   0.0   
1  12.0  1.0       1.0        1.0  26.0     1.0                   0.0   
2  13.0  1.0       0.0        1.0  26.0     0.0                   0.0   
3  11.0  1.0       1.0        1.0  28.0     1.0                   0.0   
4   8.0  0.0       0.0        1.0  29.0     1.0                   0.0   

   PhysActivity  Fruits  Veggies  hvyalcohol  GenHlth  MentHlth  PhysHlth  \
0           1.0     0.0      1.0         0.0      3.0       5.0      30.0   
1           0.0     1.0      0.0         0.0      3.0       0.0       0.0   
2           1.0     1.0      1.0 

In [None]:

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Initialize the Random Forest Classifier
model = RandomForestClassifier(random_state=42)


In [None]:
# Perform 5-Fold Cross-Validation
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())

Cross-Validation Scores: [0.72027725 0.72522809 0.72443061 0.72853303 0.73221106]
Mean Accuracy: 0.726136009261001


In [None]:
# Debug

# Step 1: Debug column names
print("Available Columns:", diabetes_data.columns)

# Step 2: Rename target column if necessary
if 'Diabetes' in diabetes_data.columns:
    diabetes_data.rename(columns={'Diabetes': 'diabetes'}, inplace=True)

# Step 3: Inspect the first few rows
print("First Few Rows:\n", diabetes_data.head())

# Step 4: Split features and target
X = diabetes_data.drop(columns=['diabetes'])
y = diabetes_data['diabetes']

# Step 5: Scale numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 7: Proceed with modeling
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


In [None]:
# Define a detailed hyperparameter grid
param_grid = {
    'n_estimators': [200, 300, 400],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [None]:
# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Calculate accuracy and classification report
final_accuracy = accuracy_score(y_test, y_pred)
classification_report_output = classification_report(y_test, y_pred)

# Print results
print("Final Test Set Accuracy:", final_accuracy)
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)
print("\nClassification Report:\n", classification_report_output)

Final Test Set Accuracy: 0.7527819690682761
Best Parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 400}
Best Cross-Validation Score: 0.7490299764208371

Classification Report:
               precision    recall  f1-score   support

         0.0       0.78      0.71      0.74     10601
         1.0       0.73      0.80      0.76     10607

    accuracy                           0.75     21208
   macro avg       0.75      0.75      0.75     21208
weighted avg       0.75      0.75      0.75     21208



XGBoost for Higher Accuracy


In [None]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.3-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 330.3 kB/s eta 0:06:19
   ---------------------------------------- 0.0/124.9 MB 326.8 kB/s eta 0:06:23
   ---------------------------------------- 0.1/124.9 MB 944.1 kB/s eta 0:02:13
   ---------------------------------------- 0.4/124.9 MB 2.2 MB/s eta 0:00:57
   ---------------------------------------- 0.4/124.9 MB 2.2 MB/s eta 0:00:57
   ---------------------------------------- 0.9/124.9 MB 3.4 MB/s eta 0:00:37
   ---------------------------------------- 1.1/124.9 MB 3.7 MB/s eta 0:00:34
   ---------------------------------------- 1.5/124.9 MB 4.7 MB/s eta 0:00:27
    -----

In [None]:
import xgboost
print("XGBoost version:", xgboost.__version__) 

XGBoost version: 2.1.3


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [3]:
# Load the dataset
file_path = 'Data/diabetes_data.csv'  # Adjust to your file path
diabetes_data = pd.read_csv(file_path)

In [4]:
# Rename column if necessary
if 'HvyAlcoholConsump' in diabetes_data.columns:
    diabetes_data.rename(columns={'HvyAlcoholConsump': 'hvyalcohol'}, inplace=True)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'Data/diabetes_data.csv'  # Update the file path
diabetes_data = pd.read_csv(file_path)

# Debug Step 1: Check Column Names
print("Available Columns in the Dataset:", diabetes_data.columns)

# Debug Step 2: Rename Target Column if Necessary
if 'Diabetes' in diabetes_data.columns:
    diabetes_data.rename(columns={'Diabetes': 'diabetes'}, inplace=True)
elif ' diabetes ' in diabetes_data.columns:
    diabetes_data.rename(columns={' diabetes ': 'diabetes'}, inplace=True)

# Debug Step 3: Inspect the Dataset
print("First Few Rows of the Dataset:\n", diabetes_data.head())

# Step 4: Split Features and Target
X = diabetes_data.drop(columns=['diabetes'])
y = diabetes_data['diabetes']

# Step 5: Scale Numerical Features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Step 7: Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                             

Available Columns in the Dataset: Index(['Age', 'Sex', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk',
       'Stroke', 'HighBP', 'Diabetes'],
      dtype='object')
First Few Rows of the Dataset:
     Age  Sex  HighChol  CholCheck   BMI  Smoker  HeartDiseaseorAttack  \
0   4.0  1.0       0.0        1.0  26.0     0.0                   0.0   
1  12.0  1.0       1.0        1.0  26.0     1.0                   0.0   
2  13.0  1.0       0.0        1.0  26.0     0.0                   0.0   
3  11.0  1.0       1.0        1.0  28.0     1.0                   0.0   
4   8.0  0.0       0.0        1.0  29.0     1.0                   0.0   

   PhysActivity  Fruits  Veggies  HvyAlcoholConsump  GenHlth  MentHlth  \
0           1.0     0.0      1.0                0.0      3.0       5.0   
1           0.0     1.0      0.0                0.0      3.0       0.0   
2    