<a href="https://colab.research.google.com/github/mdjabedmollah/ml-learning/blob/main/lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split


# Load the diabetes dataset straight from the course repository.
df = pd.read_csv("https://raw.githubusercontent.com/mdjabedmollah/ml-learning/refs/heads/main/diabetes.csv")


# Quick look at the raw data: column dtypes / non-null counts, then the first rows.
print("Original dataset info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None" line.
df.info()
print("\nFirst 5 rows:")
print(df.head())



# Columns in which a value of 0 is treated as "missing" and filled with the
# column median.
key_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for feature in key_features:
    # Take the median over the non-zero entries only: the zeros are the
    # placeholders being replaced, so including them would drag the fill
    # value towards zero.
    median_val = df.loc[df[feature] != 0, feature].median()
    if pd.isna(median_val):
        # No non-zero value exists in this column, so there is nothing
        # sensible to impute with; leave the column untouched.
        continue
    df[feature] = df[feature].replace(0, median_val)
# NOTE(review): the median is computed over every row, before the train/test
# split further down, so held-out rows influence the imputed value.

# NOTE(review): the two assignments below overwrite measured Glucose values.
# Presumably this is an intentional exercise step -- confirm before relying on
# the fitted model.

# The row labelled 0 receives the largest Glucose value in the frame.
max_glucose = df['Glucose'].max()
df.loc[0, 'Glucose'] = max_glucose

# Every row whose Age equals the minimum Age receives the smallest Glucose
# value. Both minima are read after the override above, as in statement order.
min_age, min_glucose = df['Age'].min(), df['Glucose'].min()
youngest_rows = df['Age'].eq(min_age)
df.loc[youngest_rows, 'Glucose'] = min_glucose

# Summary statistics of the frame after all edits so far.
print("\nAfter preprocessing:")
print(df.describe())

# Separate the predictors from the target column.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The target is scored below with classification metrics (accuracy, precision,
# recall, F1), so fit a classifier rather than an ordinary least-squares
# regressor whose unbounded output had to be rounded and clipped into {0, 1}.
# max_iter is raised above the library default because the features are not
# scaled and the solver may otherwise stop before converging.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Continuous score per test row: estimated probability of the second class in
# model.classes_ (the positive class when the labels are 0/1).
predictions = model.predict_proba(X_test)[:, 1]

# Hard class labels used by the evaluation code; the name is kept so that
# downstream code continues to work unchanged.
predictions_rounded = model.predict(X_test)

# Score the hard predictions against the held-out labels.
conf_matrix = confusion_matrix(y_test, predictions_rounded)
accuracy, precision, recall, f1 = (
    metric(y_test, predictions_rounded)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Emit the report one entry per print call, in a fixed order.
report_lines = [
    "\nModel Evaluation:",
    f"Accuracy: {accuracy:.4f}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
]
for report_line in report_lines:
    print(report_line)

# Side-by-side table of true vs. predicted labels, keyed by y_test's index.
comparison_columns = {'Actual': y_test, 'Predicted': predictions_rounded}
results = pd.DataFrame(comparison_columns)
print("\nSample predictions:")
print(results.head(10))

Original dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
None

First 5 rows:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29