In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [4]:
df = pd.read_csv('/content/drive/MyDrive/ML/Logistic Regression/diabetes.csv')
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## Handling Missing Values

In [5]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


## Standardization Scaling

In [13]:
from sklearn.preprocessing import StandardScaler

# seperate features and target variable

X = df.drop(columns=['Outcome'])
y = df['Outcome']

# Apply Standard Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X) # fit_transfrom means Carry out

## Split Data into Training and Testing sets

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [14]:
X_test

array([[ 0.63994726, -0.71653347, -0.57412775, ...,  0.25478047,
        -0.1264714 ,  0.83038113],
       [-0.54791859, -0.27837344,  0.304734  , ...,  0.47054319,
        -0.97814487, -1.04154944],
       [-0.54791859, -0.40356202, -0.26394125, ..., -0.15136112,
        -0.94794368, -1.04154944],
       ...,
       [ 1.23388019, -0.81042491,  0.14964075, ...,  0.61015436,
         0.03963513,  2.02160968],
       [-0.54791859,  0.7857295 ,  0.04624525, ..., -0.506735  ,
        -0.40734244, -0.36084741],
       [ 1.23388019, -1.46766496,  0.04624525, ...,  0.41977549,
         0.70406123,  0.49003012]])

## Train a Logistic Regression Model

In [15]:
# Intialization the Model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

##  Predication on test data

In [19]:

# predict on test data
y_pred = model.predict(X_test)
print(y_pred,y_test)


[0 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 1 1 1 1 1 1 1
 0 1 1 0 1 1 0 0 1 1 0 0 1 0 1 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 0 0
 0 1 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 1 0 1 0 0 1 0 1 0
 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0] 668    0
324    0
624    0
690    0
473    0
      ..
355    1
534    0
344    0
296    1
462    0
Name: Outcome, Length: 154, dtype: int64


In [20]:
import numpy as np
results = np.column_stack((y_test,y_pred))

print("Actual Values | Predicted Values")
print("---------------------------------")
for actual, predicted in results:
    print(f"{actual:14.2f} | {predicted:12.2f} ")

Actual Values | Predicted Values
---------------------------------
          0.00 |         0.00 
          0.00 |         0.00 
          0.00 |         0.00 
          0.00 |         0.00 
          0.00 |         0.00 
          0.00 |         0.00 
          0.00 |         0.00 
          0.00 |         1.00 
          0.00 |         1.00 
          0.00 |         1.00 
          1.00 |         0.00 
          0.00 |         1.00 
          1.00 |         0.00 
          0.00 |         0.00 
          0.00 |         0.00 
          1.00 |         0.00 
          0.00 |         0.00 
          0.00 |         0.00 
          1.00 |         1.00 
          1.00 |         1.00 
          0.00 |         0.00 
          0.00 |         0.00 
          0.00 |         1.00 
          0.00 |         0.00 
          0.00 |         1.00 
          1.00 |         1.00 
          0.00 |         0.00 
          0.00 |         0.00 
          1.00 |         0.00 
          0.00 |         0.00 
   

## Model Evaluation

In [22]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Classfication Report
print("\n Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Model Accuracy: 0.75

 Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81        99
           1       0.65      0.67      0.66        55

    accuracy                           0.75       154
   macro avg       0.73      0.74      0.73       154
weighted avg       0.76      0.75      0.75       154


 Confusion Matrix:
[[79 20]
 [18 37]]
