# Task: Complete Pipeline for a Dataset
1. Objective: Build a complex pipeline with multiple transformations.
2. Steps:
    - Load a sample dataset.
    - Define a transformation pipeline with both imputation and scaling.

In [1]:
# Write your code from here

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Step 1: Load a sample dataset
# A tiny in-memory frame with NaNs in every column, so the imputer has
# something to fill in each feature.
data = {
    'Age': [25, np.nan, 30, 40, 22],
    'Salary': [50000, 54000, np.nan, 65000, np.nan],
    'Experience': [1.0, 2.0, np.nan, 4.0, 5.0]
}
df = pd.DataFrame(data)
print("🔹 Original DataFrame:")
print(df)

# Step 2: Define numerical columns for transformation
# Select by dtype instead of taking every column: mean imputation and
# standard scaling only make sense for numeric data, so a categorical column
# added to `df` later must not be routed into the numeric pipeline.
num_features = df.select_dtypes(include='number').columns.tolist()

# Step 3: Create a preprocessing pipeline: Imputation + Scaling
# Order matters: missing values are filled with the column mean first, then
# the completed columns are standardised.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),       # Impute missing values
    ('scaler', StandardScaler())                       # Scale numerical features
])

# Step 4: Combine into a ColumnTransformer (useful if you later add categorical features)
# Only the columns listed in `num_features` are passed to the numeric pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, num_features)
    ]
)

# Step 5: Apply the pipeline
processed_data = preprocessor.fit_transform(df)

# Step 6: Convert back to DataFrame for readability
# Re-attach the original row index so processed rows stay aligned with `df`
# even when `df` does not use the default 0..n-1 index.
processed_df = pd.DataFrame(processed_data, columns=num_features, index=df.index)
print("\n✅ Processed DataFrame (Imputed + Scaled):")
print(processed_df)

🔹 Original DataFrame:
    Age   Salary  Experience
0  25.0  50000.0         1.0
1   NaN  54000.0         2.0
2  30.0      NaN         NaN
3  40.0  65000.0         4.0
4  22.0      NaN         5.0

✅ Processed DataFrame (Imputed + Scaled):
        Age    Salary  Experience
0 -0.695414 -1.289210   -1.414214
1  0.000000 -0.474972   -0.707107
2  0.122720  0.000000    0.000000
3  1.758989  1.764182    0.707107
4 -1.186295  0.000000    1.414214


In [None]:
# Task: Imputation Function (combined with scaling in a single pipeline below)

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# Step 1: Load Sample Dataset
data = load_diabetes()
# Work on a copy of the feature matrix: the NaN injection below assigns in
# place, and without the copy it would also overwrite `data.data`.
X, y = data.data.copy(), data.target

# Introduce some missing values for demonstration
rng = np.random.RandomState(42)  # fixed seed so the same cells are masked on every run
missing_mask = rng.rand(*X.shape) < 0.1  # 10% missing
X[missing_mask] = np.nan

# Step 2: Define Imputation Function (using SimpleImputer)
# Step 3: Define Scaling Function (using StandardScaler)
# Step 4: Combine Imputation + Scaling in a Pipeline

# Imputation runs before scaling so the scaler only ever sees complete columns.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Step 2: Imputation
    ('scaler', StandardScaler())                  # Step 3: Scaling
])

# Step 5: Apply the Combined Transformation
X_transformed = pipeline.fit_transform(X)

# Sanity check: every injected NaN must have been filled by the imputer.
assert not np.isnan(X_transformed).any(), "imputation left missing values behind"

# Optional: Print the shape and preview transformed data
print("Transformed shape:", X_transformed.shape)
print("First 5 rows:\n", X_transformed[:5])






# Scaling Function: covered by the 'scaler' step (StandardScaler) of the pipeline defined above.









# Combined Transformation Function: covered by `pipeline.fit_transform(X)` above, which runs imputation and then scaling.







