In [1]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- Our ML tools ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# --- Our Evaluation tools (from Lesson 7!) ---
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# --- Load the Data ---
# The Pima Indians Diabetes dataset, read directly from a public GitHub mirror.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# The CSV has no header row, so the column names are supplied by hand.
# The last column, 'Outcome', is the 0/1 target used later in the notebook.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigree',
    'Age',
    'Outcome',
]
df = pd.read_csv(url, names=columns, header=None)

# Quick sanity check: show the first five rows.
print("--- Here's a peek at our data ---")
print(df.head())



--- Here's a peek at our data ---
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigree  Age  Outcome  
0             0.627   50        1  
1             0.351   31        0  
2             0.672   32        1  
3             0.167   21        0  
4             2.288   33        1  


In [3]:
print("\n--- Checking for missing values... ---")
print(df.info())

# Vijay: "Sweet! It says 0 nulls. We're done, right?"
# AK: "Wait! 🤨 Look at the data. Can you have a BloodPressure of 0? Or Glucose of 0?"
# Vijay: "No... that would mean you're... 🤯"
# AK: "Exactly. This is a 'hidden' missing value. 0 is just a placeholder.
# We need to replace these 0s with 'NaN' (Not a Number) so Pandas can see them."

# Let's replace 0s in these specific columns
# (the replaced columns become float, because NaN is a float value)
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

print("\n--- Now let's check .info() again... ---")
print(df.info()) # Ah! Now we see the missing values.

# --- Define X and y ---
X = df.drop('Outcome', axis=1) # X is all columns EXCEPT our target
y = df['Outcome']              # y is ONLY the 'Outcome' (0 or 1)

# --- Split the Data ---
# The GOLDEN RULE! We split BEFORE we compute anything from the data
# (medians, means, scales...), so the test set stays truly "unseen".
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Fill the missing values ---
# Let's fill the NaNs with the 'median' (the middle value) -- Lesson 4!
# The medians are learned from the TRAINING rows only and then re-used for
# the test rows. Computing them on the full dataset would leak test-set
# information into training (same reason we only .transform() the test set).
train_medians = X_train[cols_with_zeros].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Keep df and X free of NaNs as well (using the same training medians),
# so any later cell that looks at them still sees fully populated columns.
df[cols_with_zeros] = df[cols_with_zeros].fillna(train_medians)
X = X.fillna(train_medians)

print("\n--- Data is all clean! ---")

# --- Scale the Data ---
# Putting all features on the same scale helps the optimiser converge and
# keeps any single large-valued feature from dominating the others.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Remember: only .transform() on the test set!






--- Checking for missing values... ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Pregnancies       768 non-null    int64  
 1   Glucose           768 non-null    int64  
 2   BloodPressure     768 non-null    int64  
 3   SkinThickness     768 non-null    int64  
 4   Insulin           768 non-null    int64  
 5   BMI               768 non-null    float64
 6   DiabetesPedigree  768 non-null    float64
 7   Age               768 non-null    int64  
 8   Outcome           768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

--- Now let's check .info() again... ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Pregnancies       768 non-null    int6

In [4]:

# 1. Import -- already done in the first cell.

# 2 + 3. Build the model and TRAIN it in one go.
#    - LogisticRegression(random_state=42) creates the (untrained) model;
#      the fixed random_state keeps results repeatable from run to run.
#    - .fit(...) searches for the weights that give the lowest Log Loss on
#      the scaled training data, and hands back the same (now trained) model.
print("\n...Training the model...")
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
print("...Model is Trained!...")




...Training the model...
...Model is Trained!...


In [7]:
# 4. Predict
# Ask the trained model for its guesses on the 20% held-out test rows.
y_pred = model.predict(X_test_scaled)

# 5. Evaluate (Get the score!)
print("\n--- Model Evaluation ---")

# Plain accuracy first: the share of test rows predicted correctly.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc*100:.2f}%")

# Accuracy alone can mislead, so we also print:
#   - the Confusion Matrix, laid out as [[TN, FP], [FN, TP]]
#   - the Classification Report (Precision, Recall and F1-Score at once)
# Each heading is printed first, then its metric is computed and printed.
for heading, metric in (
    ("\n--- Confusion Matrix ---", confusion_matrix),
    ("\n--- Classification Report ---", classification_report),
):
    print(heading)
    print(metric(y_test, y_pred))




--- Model Evaluation ---
Accuracy: 75.32%

--- Confusion Matrix ---
[[82 17]
 [21 34]]

--- Classification Report ---
              precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154

