In [None]:
# Switch the kernel's working directory to the project folder.
import os

os.chdir(
    r"C:\Users\User\OneDrive\Documents\Data Science 2025.2026"
)
# Last expression of the cell: echoes the working directory now in effect.
os.getcwd()

In [None]:
#!pip install pandas  (run this once if pandas is not already installed)
# All packages must be installed before they can be imported.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the insulin-dosage CSV (a dataset obtained from Kaggle).
# `insulin_data` and `df` are two names bound to the same DataFrame object;
# no copy is made.
df = insulin_data = pd.read_csv("insulin_dosage_prediction.csv")

In [None]:
# Preview the first 5 rows (the default for head()) to confirm the file
# loaded with the expected columns.
df.head()

In [None]:
# Check column dtypes and non-null counts (i.e. look for missing values).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for spotting
# outliers. Left as the cell's last expression so the notebook renders the
# result as a formatted table instead of print()'s plain-text dump.
df.describe()

In [None]:
# Correlation heat map. Look for squares that are dark red (strong positive
# correlation) or dark blue (strong negative correlation).
# Correlation is only defined for numeric values, so non-numeric columns are
# dropped first. include="number" keeps every numeric dtype (not just
# float64/int64), so columns stored as e.g. int32 or float32 are not lost.
numerical_data = df.select_dtypes(include="number")

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(numerical_data.corr(), annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap: How variables relate to each other")
plt.show()


These correlations suggest that the data is either random or synthetic. Studies have shown a strong correlation between BMI and weight, and there should also be a high correlation between HbA1c and glucose level. Our data, however, shows a correlation coefficient of about zero, which indicates that the values are not genuine measurements (they look randomly generated). To check this visually, let us draw scatter plots and see whether the points look random. 

In [None]:
# Two side-by-side scatter plots of pairs that should be strongly correlated
# in genuine clinical data. The figure is wide enough for two panels and
# tight_layout() keeps the second panel's y-label off the first panel.
fig, (ax_weight, ax_glucose) = plt.subplots(1, 2, figsize=(12, 5))

# Plot 1: Weight vs BMI (should be strongly correlated)
sns.scatterplot(data=df, x='weight', y='BMI', ax=ax_weight)
ax_weight.set_title("Weight vs. BMI")

# Plot 2: Glucose vs HbA1c (should be strongly correlated)
sns.scatterplot(data=df, x='glucose_level', y='HbA1c', ax=ax_glucose)
ax_glucose.set_title("Glucose vs. HbA1c")

fig.tight_layout()
plt.show()

What you are seeing in these plots is the **"Square of Doom"** in data analysis, which confirms that the dataset is indeed a fake one. We therefore need a different dataset to continue with our data science (machine learning) project. Let's load a new one from GitHub: the Pima Indians Diabetes dataset. See below. 

In [None]:
# 1. Load the REAL Pima Indians Diabetes dataset directly from a URL.
url = (
    "https://raw.githubusercontent.com/jbrownlee/Datasets/master/"
    "pima-indians-diabetes.data.csv"
)
# Column labels are supplied here and passed to read_csv through `names`.
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
new_data = pd.read_csv(url, names=column_names)
# Preview the first rows as the cell output.
new_data.head()

In [None]:
# 2. Correlation heat map again, this time for the Pima dataset.
pima_corr = new_data.corr()

plt.figure(figsize=(6, 6))
sns.heatmap(pima_corr, fmt=".2f", annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap: How variables relate to each other")
plt.show()


In [None]:
# 3. The "eye test": a scatter plot of Glucose (x) against Insulin (y).
plt.figure(figsize=(6, 4))
sns.scatterplot(data=new_data, x='Glucose', y='Insulin')
plt.title("Glucose vs. Insulin")
plt.show()

Now, looking at the two plots (the correlation heat map and the scatter plot), we can clearly see that we are working with a real dataset. We can use this dataset to continue with our project. But first, let us do some data cleaning. Let's use the **info()** and **describe()** functions to quickly assess the distribution and check for outliers.  

In [None]:
# Report each column's dtype and non-null count for the Pima data.
new_data.info()

We can see that the **info()** shows no missing values. Now what about the **describe()** function? Let us check it out. 

In [None]:
# Summary statistics per column; the min row is where impossible values show up.
new_data.describe()

Looking at the summary statistics of our dataset, we can see that we have anomalies. In the minimum values, several columns, including blood pressure and BMI, are actually 0, which is not possible. Nobody can have a blood pressure of zero and still be alive. We can therefore say that the ***0*** was most likely used as a placeholder for a missing value. Calculating averages with such placeholders will give us wrong results. We therefore need to fix these issues. 

### Let us now remove the "fake zeros" and generate our first professional health dashboard.

In [None]:
import numpy as np

# DATA CLEANING
# In these columns a value of 0 is not physically possible, so it is treated
# as a placeholder for "missing" and converted to NaN.
columns_to_fix = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
new_data[columns_to_fix] = new_data[columns_to_fix].replace(0, np.nan)

# Report how many values were actually missing in each column.
missing_counts = new_data.isnull().sum()
print("Missing values per column:\n", missing_counts)

# Fill every gap with its own column's median. The median is used rather than
# the mean because it is not pulled around by outliers. fillna() with a Series
# matches each median to its column by label.
column_medians = new_data[columns_to_fix].median()
new_data[columns_to_fix] = new_data[columns_to_fix].fillna(column_medians)


We can see that we had 5 missing values on glucose, 35 on BP, etc. 

In [None]:
# Two-panel dashboard. An explicit, wide figure is created so the panels are
# not squeezed into matplotlib's default-size implicit figure.
fig, (ax_glucose, ax_bmi) = plt.subplots(1, 2, figsize=(12, 5))

# Chart 1: Glucose distribution (Healthy vs Diabetic)
sns.histplot(data=new_data, x='Glucose', hue='Outcome', kde=True, element="step", palette='seismic', ax=ax_glucose)
ax_glucose.set_title("Glucose Levels: Healthy (0) vs Diabetic (1)", fontsize=10)
ax_glucose.set_xlabel("Glucose Level")

# Chart 2: BMI vs Age (who is most at risk?)
sns.scatterplot(data=new_data, x='Age', y='BMI', hue='Outcome', palette='seismic', alpha=0.7, ax=ax_bmi)
ax_bmi.set_title("BMI vs Age colored by Diabetes Outcome", fontsize=10)

fig.tight_layout()
plt.show()

• **data=new_data** tells seaborn to use the dataframe named `new_data`.\
• **x='Glucose'** selects the Glucose column for the horizontal axis.\
• **hue='Outcome'** splits the plot by the Outcome column so each class has a different color.\
• **kde=True** adds a smooth density curve on top of the histogram.\
• **element="step"** draws the histogram with outlined steps instead of filled bars.\
• **palette='seismic'** uses the seismic color palette to color the different Outcome groups.
1. You can see that we have "mountains" on the first chart. The red mountain (diabetic) should be shifted to the right (higher glucose) compared to the blue mountain (healthy).
2. On the second chart, look for patterns: the red dots (diabetics) should sit mostly at the top (high BMI). 

# Building Predictive Models
Now lets dive deep into building some models. Yaaaaaaay! Are you ready for this? I am all ready!!
## 1. Insulin Predictor
Even in the real world, insulin treatment costs a lot of money. Glucose and BMI measurement is cheap and easy. From this dataset, can we build a model that predicts a patient's insulin level by just looking at the BMI, glucose, and age? \
From our data cleaning exercise, you will remember that we replaced the missing values with the median. In **Machine Learning (ML)**, that's actually cheating when the filled-in column is the one we want to predict (funny, right? 😀), because the model would be trained and scored on insulin values that we made up. We therefore need to go back to the original dataset (the raw data) and use only the rows in which we know the insulin level.\
But first, let us import some libraries.\
**We will be using the scikit-learn library.
We will do the following:**\
i. Filter out the bad data (rows with placeholder zeros)\
ii. Split the data: 80% (training data) and 20% (test data)\
iii. Train a linear regression model\
iv. Test how accurate the model is. 

In [None]:
#!pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

In [None]:
# 1. LOAD & FILTER
# Re-read the raw Pima CSV so the modelling data is built from the file as
# published, not from a frame edited in memory.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
col_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DPF', 'Age', 'Outcome']
df = pd.read_csv(url, names=col_names)

# Keep only the rows in which Insulin, BMI and Glucose are all non-zero
# (a 0 in these columns is treated as "not measured").
must_be_nonzero = ['Insulin', 'BMI', 'Glucose']
row_is_usable = (df[must_be_nonzero] != 0).all(axis=1)
clean_data = df[row_is_usable]

# 2. DEFINE X (the inputs) AND y (the target)
# Independent variables: Glucose, BMI and Age. Dependent variable: Insulin.
X = clean_data[['Glucose', 'BMI', 'Age']]
y = clean_data['Insulin']

Since we have loaded our data successfully and identified the inputs and outputs for our model, we can now go ahead and split the data into two parts: training data and test data (the 80/20 split). 

In [None]:
# 3. SPLIT DATA (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # fraction of rows held out for testing
    random_state=42,  # fixed seed so the same split is produced on every run
)

In [None]:
# 4. TRAIN THE MODEL
# Fit a linear regression on the training split only.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# 5. EVALUATE the model on the held-out test rows.
predictions = model.predict(X_test)

# Compute both metrics first, then report them.
linear_r2 = r2_score(y_test, predictions)
linear_mae = mean_absolute_error(y_test, predictions)

print("--- Model Performance ---")
print(f"R-Squared Score: {linear_r2:.2f} (Closer to 1.0 is better)")
print(f"Mean Absolute Error: {linear_mae:.2f}")

In [None]:
# 6. Visualize the predictions against the values we actually know.
fig, ax = plt.subplots(figsize=(8, 4))
sns.scatterplot(x=y_test, y=predictions, alpha=0.6, ax=ax)

# Perfect-prediction line (predicted == actual). Its end points are taken from
# the plotted values rather than a fixed [0, 600], so the line always spans
# the data, including very large or negative predictions.
line_low = min(0, y_test.min(), predictions.min())
line_high = max(y_test.max(), predictions.max())
ax.plot([line_low, line_high], [line_low, line_high], color='red', linestyle='--')

ax.set_xlabel("Actual Insulin Levels")
ax.set_ylabel("Predicted Insulin Levels")
ax.set_title("Actual vs Predicted Insulin (Linear Regression)")
plt.show()

**Interpretation:**
An R-squared of **0.41** means the model explains about 41% of the variation in insulin levels. \
The MAE of **60.85** is high. Practically, this means that if a patient's real insulin is 100, the model might guess about 160 or 40. That gap is big enough to be dangerous in a medical setting, so the linear regression model has effectively failed. We therefore need to try a random forest to see whether it does better. 

## Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# 1. SETUP THE NEW MODEL
# n_estimators=100 means "build 100 decision trees"; random_state fixes the
# seed so the forest is the same on every run.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# 2. TRAIN
rf_model.fit(X_train, y_train)

# 3. PREDICT
rf_predictions = rf_model.predict(X_test)

# 4. COMPARE RESULTS
# The linear-regression score is recomputed from `predictions` (the linear
# model's test-set predictions) instead of being typed in by hand, so the
# comparison stays correct if the data or the split ever changes.
print("--- Model Battle: Linear vs. Random Forest ---")
print(f"Linear Regression R2:  {r2_score(y_test, predictions):.2f}")
print(f"Random Forest R2:      {r2_score(y_test, rf_predictions):.2f}")
print(f"Random Forest Error:   {mean_absolute_error(y_test, rf_predictions):.2f}")

In [None]:
# 5. VISUALIZE THE COMPARISON
# Side-by-side actual-vs-predicted plots for both models.
# Note: `predictions` comes from the linear regression cell; re-run that cell
# first if the variable is missing.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Both panels share one perfect-prediction line whose end points come from the
# plotted values rather than a fixed [0, 600], so it spans the data in each
# panel.
line_low = min(0, y_test.min(), predictions.min(), rf_predictions.min())
line_high = max(y_test.max(), predictions.max(), rf_predictions.max())

# One entry per panel: (axes, predicted values, point color, title).
panels = [
    (axes[0], predictions, 'blue', "Linear Regression (Old)"),
    (axes[1], rf_predictions, 'green', "Random Forest (New)"),
]
for panel_ax, panel_predictions, panel_color, panel_title in panels:
    sns.scatterplot(x=y_test, y=panel_predictions, alpha=0.4, color=panel_color, ax=panel_ax)
    panel_ax.plot([line_low, line_high], [line_low, line_high], '--r')  # perfect prediction line
    panel_ax.set_xlabel("Actual Insulin")
    panel_ax.set_ylabel("Predicted Insulin")
    panel_ax.set_title(panel_title)

fig.tight_layout()
plt.show()

**From the results, we can see that the linear regression model actually performed better than the random forest. This can happen when a flexible model such as a random forest overfits a small dataset like this one, memorizing the noise rather than learning the real pattern. Hence, linear regression is recommended.** 