In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# 1. Load the Iris dataset that ships with scikit-learn
data = load_iris()
X = data.data                        # feature matrix
y = data.target                      # class labels
feature_names = data.feature_names   # column labels for X

In [3]:
# 2. Artificially blank out ~10% of the entries to simulate missing data
rng = np.random.RandomState(42)          # fixed seed so the same cells are blanked on every run
mask = rng.rand(*X.shape) < 0.1          # True marks an entry to replace with NaN
X_missing = np.where(mask, np.nan, X)    # builds a new array; X itself is left untouched

missing_counts = pd.DataFrame(X_missing, columns=feature_names).isnull().sum()
print(f"Missing values per feature:\n{missing_counts}")

Missing values per feature:
sepal length (cm)    23
sepal width (cm)     15
petal length (cm)    18
petal width (cm)     16
dtype: int64


In [4]:
# 3. Hold out 30% of the rows for evaluation (seeded so the split is repeatable).
#    The NaNs are still present here; imputation happens after the split.
X_train, X_test, y_train, y_test = train_test_split(
    X_missing,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# 4. Try different imputation strategies
# For each strategy: learn the fill values from the training split, apply them
# to both splits, fit a logistic regression and score it on the test split.
strategies = ["mean", "median", "most_frequent"]
results = []  # (strategy, test accuracy) tuples, in the order of `strategies`

for strategy in strategies:
    imputer = SimpleImputer(strategy=strategy)

    # Fit on the training split only, then reuse the fitted imputer on the
    # test split, so the fill values are computed from training data alone.
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    model = LogisticRegression(max_iter=500)
    model.fit(X_train_imputed, y_train)
    y_pred = model.predict(X_test_imputed)

    acc = accuracy_score(y_test, y_pred)
    results.append((strategy, acc))

    print(f"\nStrategy: {strategy}")
    print("Accuracy:", acc)
    # Print the label and the report separately: passing the report as a second
    # argument to print() puts the separator space right after the newline,
    # which pushes the report's header row one column out of alignment.
    print("Classification Report:")
    print(classification_report(y_test, y_pred))



Strategy: mean
Accuracy: 0.8888888888888888
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.79      0.85      0.81        13
           2       0.83      0.77      0.80        13

    accuracy                           0.89        45
   macro avg       0.87      0.87      0.87        45
weighted avg       0.89      0.89      0.89        45


Strategy: median
Accuracy: 0.9111111111111111
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.80      0.92      0.86        13
           2       0.91      0.77      0.83        13

    accuracy                           0.91        45
   macro avg       0.90      0.90      0.90        45
weighted avg       0.92      0.91      0.91        45


Strategy: most_frequent
Accuracy: 0.8222222222222222
Classification Report:
               precis