In [1]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


In [2]:
data = pd.read_csv('../data/diabetes.csv')

In [3]:
# Check for missing values
if data.isnull().values.any():
    print("Data contains missing values. Please handle them before proceeding.")
    sys.exit(1)
else:
    print("No missing values found in the dataset.")

No missing values found in the dataset.


In [4]:
column_names = data.columns
print("Column names in the dataset:")
for name in column_names:
    print(name)

Column names in the dataset:
Pregnancies
Glucose
BloodPressure
SkinThickness
Insulin
BMI
DiabetesPedigreeFunction
Age
Outcome


In [12]:
#check for types
print("Data types of each column:")
print(data.dtypes)

Data types of each column:
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


In [5]:
# Check for duplicate rows
duplicate_rows = data.duplicated().sum()
if duplicate_rows > 0:
    print(f"Data contains {duplicate_rows} duplicate rows. Please handle them before proceeding.")
    sys.exit(1)
else:
    print("No duplicate rows found in the dataset.")

No duplicate rows found in the dataset.


In [6]:
#prepare for linear regression
X = data.drop('Outcome', axis=1)
y = data['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# Train the model
model = LinearRegression()
model.fit(X_train_scaled, y_train)
# Make predictions
y_pred = model.predict(X_test_scaled)
# Evaluate the model
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 0.171045272808501
R-squared: 0.2550028117674177


In [8]:
#the label outcome is binary, so we can use classification metrics, maybe use logistic regression
from sklearn.linear_model import LogisticRegression

In [11]:
#train logistic regression
log_model = LogisticRegression()
log_model.fit(X_train_scaled, y_train)
# Make predictions
y_pred_log = log_model.predict(X_test_scaled)
# Evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
#report accuracy in percentage
print("Logistic Regression Model:")
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred_log) * 100))
#report confusion matrix with percentage
conf_matrix = confusion_matrix(y_test, y_pred_log)
conf_matrix_percentage = conf_matrix.astype('float') / conf_matrix.sum(axis=1)[:, np.newaxis]
print("Confusion Matrix (in percentage):")
print(conf_matrix_percentage)
accuracy = accuracy_score(y_test, y_pred_log)
conf_matrix = confusion_matrix(y_test, y_pred_log)
class_report = classification_report(y_test, y_pred_log)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Logistic Regression Model:
Accuracy: 75.32%
Confusion Matrix (in percentage):
[[0.7979798  0.2020202 ]
 [0.32727273 0.67272727]]
Accuracy: 0.7532467532467533
Confusion Matrix:
[[79 20]
 [18 37]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154

