### Handling Missing Values - Imputation within ML Pipelines
**Description**: Implement a machine learning pipeline that includes imputation and a classifier.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


# -----------------------
# Defensive Data Check
# -----------------------
def validate_input_data(df, features, target):
    """Check that ``df`` is non-empty and contains every required column.

    Args:
        df: DataFrame to validate.
        features: Iterable of feature column names (list, tuple, ...).
        target: Name of the target column.

    Returns:
        True when every check passes.

    Raises:
        ValueError: If ``df`` is empty, or if any feature/target column is
            absent from ``df.columns``.
    """
    if df.empty:
        raise ValueError("The dataset is empty.")

    # Unpack rather than `features + [target]` so tuples and other iterables
    # are accepted, not only lists. dict.fromkeys drops duplicates while
    # keeping declaration order.
    required = list(dict.fromkeys([*features, target]))

    # An ordered list (instead of a set difference) keeps the error message
    # deterministic from run to run.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")
    return True


# -----------------------
# Create Robust Pipeline
# -----------------------
def build_pipeline(numeric_features, categorical_features):
    """Assemble an unfitted preprocessing + logistic-regression pipeline.

    Numeric columns go through mean imputation followed by standard scaling.
    Categorical columns go through most-frequent imputation followed by
    one-hot encoding configured with ``handle_unknown='ignore'``.

    Args:
        numeric_features: Column selection routed to the numeric branch.
        categorical_features: Column selection routed to the categorical
            branch.

    Returns:
        A ``Pipeline`` with two steps: ``'preprocessor'`` (a
        ``ColumnTransformer`` with ``'num'`` and ``'cat'`` branches) and
        ``'classifier'`` (a default ``LogisticRegression``).
    """
    num_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Each branch is (name, transformer, columns it applies to).
    column_branches = [
        ('num', Pipeline(steps=num_steps), numeric_features),
        ('cat', Pipeline(steps=cat_steps), categorical_features),
    ]

    return Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=column_branches)),
        ('classifier', LogisticRegression()),
    ])


# -----------------------
# Sample Dataset
# -----------------------
data = {
    'age': [25, 30, np.nan, 45, 50, np.nan, 35],
    'salary': [50000, 60000, 55000, np.nan, 70000, 65000, 62000],
    'gender': ['Male', 'Female', 'Female', np.nan, 'Male', 'Female', 'Male'],
    'purchased': [0, 1, 0, 1, 1, 0, 1]
}
df = pd.DataFrame(data)

# Define feature columns and target.
# `features` is derived from the two typed lists so the columns that are
# validated are exactly the columns handed to the pipeline.
numeric_features = ['age', 'salary']
categorical_features = ['gender']
features = numeric_features + categorical_features
target = 'purchased'

# -----------------------
# Validate and Train Model
# -----------------------
# Only the validation call is inside `try`. scikit-learn also raises
# ValueError (e.g. from fit/predict), and those must not be reported as
# input-validation problems, so training lives in the `else` branch.
try:
    validate_input_data(df, features, target)
except ValueError as e:
    print(f"❌ Input Validation Error: {e}")
else:
    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    pipeline = build_pipeline(numeric_features=numeric_features,
                              categorical_features=categorical_features)

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
    # zero_division=0 reports 0.0 for a class that is never predicted (the
    # same value as before) without emitting UndefinedMetricWarning.
    print("✅ Classification Report:\n",
          classification_report(y_test, y_pred, zero_division=0))


✅ Accuracy: 0.3333333333333333
✅ Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [2]:
# write your code from here

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Sample dataset with missing values
data = {
    'age': [25, 30, np.nan, 45, 50, np.nan, 35],
    'salary': [50000, 60000, 55000, np.nan, 70000, 65000, 62000],
    'gender': ['Male', 'Female', 'Female', np.nan, 'Male', 'Female', 'Male'],
    'purchased': [0, 1, 0, 1, 1, 0, 1]  # Target
}
df = pd.DataFrame(data)

# Separate features and target
X = df.drop('purchased', axis=1)
y = df['purchased']

# Split into train/test (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Identify feature types
numeric_features = ['age', 'salary']
categorical_features = ['gender']

# Create transformers.
# Imputation is a pipeline step, so its statistics are computed when the
# pipeline is fitted (on X_train below) and then reused at predict time.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps: each branch is applied to its own columns
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# Create full pipeline
clf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression())
])

# Fit the pipeline
clf_pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf_pipeline.predict(X_test)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value as before) without emitting UndefinedMetricWarning.
print("✅ Classification Report:\n",
      classification_report(y_test, y_pred, zero_division=0))

✅ Accuracy: 0.3333333333333333
✅ Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.33      1.00      0.50         1

    accuracy                           0.33         3
   macro avg       0.17      0.50      0.25         3
weighted avg       0.11      0.33      0.17         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
