In [1]:
# Step 1: Data Loading
import pandas as pd

# Load the dataset
file_path = "C:\\Users\\mavin\\Downloads\\Healthcare-Diabetes.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
data.head()


Unnamed: 0,Id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [2]:
# Step 2: Data Preprocessing
# Replace 0 values in certain columns with the median value of that column
columns_to_replace = ['Insulin', 'SkinThickness', 'BloodPressure', 'BMI']

# For each column in the list, replace 0s with the median value of that column
for column in columns_to_replace:
    median_value = data[column].median()
    data[column] = data[column].replace(0, median_value)

# Display the dataset after preprocessing
data.head()


Unnamed: 0,Id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,6,148,72,35,37,33.6,0.627,50,1
1,2,1,85,66,29,37,26.6,0.351,31,0
2,3,8,183,64,23,37,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Step 3: Splitting Data into Features and Target Variables
# Drop 'Id', 'Outcome', and 'Glucose' columns from the feature set
X = data.drop(['Id', 'Outcome', 'Glucose'], axis=1)

# Target variable (Glucose)
y = data['Glucose']

# Display feature set and target variable
X.head(), y.head()


(   Pregnancies  BloodPressure  SkinThickness  Insulin   BMI  \
 0            6             72             35       37  33.6   
 1            1             66             29       37  26.6   
 2            8             64             23       37  23.3   
 3            1             66             23       94  28.1   
 4            0             40             35      168  43.1   
 
    DiabetesPedigreeFunction  Age  
 0                     0.627   50  
 1                     0.351   31  
 2                     0.672   32  
 3                     0.167   21  
 4                     2.288   33  ,
 0    148
 1     85
 2    183
 3     89
 4    137
 Name: Glucose, dtype: int64)

In [4]:
# Step 4: Train-Test Split
from sklearn.model_selection import train_test_split

# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the sizes of the training and testing sets
X_train.shape, X_test.shape


((2214, 7), (554, 7))

In [5]:
# Step 5: Model Training
from sklearn.linear_model import LinearRegression

# Initialize the linear regression model
model = LinearRegression()

# Train the model on the training set
model.fit(X_train, y_train)

# Display the model coefficients
print("Model Coefficients:", model.coef_)


Model Coefficients: [ 0.02649862  0.30780676 -0.11452361  0.11247878  0.51435239  4.53824217
  0.6392661 ]


In [6]:
# Step 6: Model Evaluation
from sklearn.metrics import mean_squared_error

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Display the MSE
print(f"Mean Squared Error (MSE): {mse}")


Mean Squared Error (MSE): 833.5194855859364
