1. Load datasets

In [21]:
import pandas as pd
# Read the diabetes dataset (CSV in the working directory) and preview the first rows
df = pd.read_csv("diabetes.csv")
print(df.head())

# Per-column count of NaN entries.
# Note: this counts NaN/None only; zero-valued readings are not counted as missing here.
print("\nMissing Values:", df.isna().sum(), sep="\n")

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Missing Values:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


2. Regression Task:

Predict the Blood Pressure of the patients based on other features.

Use Linear Regression model from Scikit-learn.

In [22]:
#importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from math import sqrt

In [26]:
# Regression task: model BloodPressure from the remaining measurements.
# Keep only the rows whose BloodPressure reading is strictly positive.
data = df.loc[df['BloodPressure'].gt(0)]

# Target is BloodPressure; predictors are every other column except Outcome
# (original column order is preserved).
y_reg = data['BloodPressure']
X_reg = data[[col for col in data.columns if col not in ('BloodPressure', 'Outcome')]]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

# Fit the linear model on the training rows, then predict the held-out rows.
regressor = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_reg = regressor.predict(X_test_reg)

# Error metrics on the held-out rows (RMSE is the square root of MSE).
reg_mae = mean_absolute_error(y_test_reg, y_pred_reg)
reg_mse = mean_squared_error(y_test_reg, y_pred_reg)
reg_rmse = sqrt(reg_mse)
reg_r2 = r2_score(y_test_reg, y_pred_reg)

# Single report, one metric per line, two decimals each.
print(
    "Regression Task: Predict Blood Pressure",
    f"Mean Absolute Error (MAE): {reg_mae:.2f}",
    f"Mean Squared Error (MSE): {reg_mse:.2f}",
    f"Root Mean Squared Error (RMSE): {reg_rmse:.2f}",
    f"R-squared (R²): {reg_r2:.2f}",
    sep="\n",
)


Regression Task: Predict Blood Pressure
Mean Absolute Error (MAE): 8.47
Mean Squared Error (MSE): 116.47
Root Mean Squared Error (RMSE): 10.79
R-squared (R²): 0.21


3. Classification Task:

Predict whether the patient has diabetes (target column: Outcome).

Use Logistic Regression or K-Nearest Neighbors (KNN) model.

In [24]:
# Classification Task: Predict Outcome (diabetes)
#
# Build the working frame directly from the raw `df` so this cell is idempotent
# and does not depend on another cell having filtered `data` first.
# Rows are kept only when every listed measurement is strictly positive.
# BloodPressure is included explicitly: the previous version of this cell
# inherited that filter implicitly by re-filtering the regression cell's `data`.
critical_features = ['Glucose', 'BMI']
required_positive = critical_features + ['BloodPressure']
data = df[(df[required_positive] > 0).all(axis=1)]

# Features and target for classification (X_clf holds the raw, unscaled features)
X_clf = data.drop(columns=['Outcome'])
y_clf = data['Outcome']

# Train-test split FIRST, so that no statistic of the test rows can influence
# anything that is fitted below.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42
)

# Standardize the features (KNN is distance-based, so feature scale matters).
# The scaler is fitted on the training rows only and then applied to the test
# rows; fitting it on the full dataset would leak test-set mean/variance into
# training and bias the evaluation.
scaler = StandardScaler()
X_train_clf = scaler.fit_transform(X_train_clf)
X_test_clf = scaler.transform(X_test_clf)

# K-Nearest Neighbors is the model used for this task.
# (Logistic Regression is the permitted alternative; it would additionally
# require importing sklearn.linear_model.LogisticRegression.)
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_clf, y_train_clf)

# Predictions and evaluation on the held-out rows
y_pred_knn = knn_model.predict(X_test_clf)
knn_acc = accuracy_score(y_test_clf, y_pred_knn)
print("\nClassification Task: K-Nearest Neighbors")
print(f"Accuracy: {knn_acc:.2f}")
print(classification_report(y_test_clf, y_pred_knn))


Classification Task: K-Nearest Neighbors
Accuracy: 0.72
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       102
           1       0.53      0.58      0.56        43

    accuracy                           0.72       145
   macro avg       0.67      0.68      0.68       145
weighted avg       0.73      0.72      0.73       145

