# In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def load_data(filepath):
    """Read a CSV source into a pandas DataFrame.

    Args:
        filepath: Location of the CSV data; passed unchanged to
            ``pd.read_csv`` with default parsing options.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    frame = pd.read_csv(filepath)
    return frame

def clean_data(df, target_col='target'):
    """Drop duplicate rows and rows whose target value is missing.

    Only these two steps are performed: missing values in other columns
    are left in place and no outlier handling is done.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.
        target_col: Name of the label column that must not contain
            missing values. Defaults to ``'target'``.

    Returns:
        A DataFrame without fully duplicated rows and without rows where
        ``target_col`` is missing. The original index labels are kept.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    deduplicated = df.drop_duplicates()
    # A row without a label cannot be used for supervised training.
    return deduplicated.dropna(subset=[target_col])

def validate_data(df):
    """Print a per-column report of dtypes and missing-value counts.

    This is a reporting step only: nothing is checked against constraints
    and the frame is not altered.

    Args:
        df: DataFrame to inspect.

    Returns:
        The same DataFrame object that was passed in.
    """
    column_types = df.dtypes
    null_counts = df.isnull().sum()
    print("Data Types:\n", column_types)
    print("Missing Values:\n", null_counts)
    return df

def feature_engineering(df, target_col):
    """Impute, scale and one-hot encode the feature columns of ``df``.

    int64/float64 feature columns are mean-imputed and standardized;
    object-dtype feature columns are imputed with their most frequent
    value and one-hot encoded. Feature columns of any other dtype are not
    listed in either transformer and are therefore dropped by the
    ``ColumnTransformer`` (its default ``remainder`` is ``'drop'``).

    NOTE: the preprocessor is fitted on every row of ``df`` and is not
    returned, so its statistics are learned from the full dataset and the
    fitted transformer cannot be reused on new data by the caller.

    Args:
        df: DataFrame containing the feature columns and the target column.
        target_col: Name of the label column. It may have any dtype; it is
            removed before the feature columns are selected.

    Returns:
        A ``(X_processed, y)`` pair: the output of
        ``ColumnTransformer.fit_transform`` on the feature columns, and the
        untransformed ``df[target_col]`` Series.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    # Split off the label first so that column selection below only ever
    # sees feature columns. Selecting from ``df`` instead would fail for a
    # non-numeric target and would list an object-typed target as a feature.
    X = df.drop(columns=[target_col])
    y = df[target_col]

    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Categories unseen at fit time are encoded as all zeros, not an error.
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    X_processed = preprocessor.fit_transform(X)
    return X_processed, y

def split_data(X, y, test_size=0.2, random_state=42):
    """Partition features and labels into train and test subsets.

    Args:
        X: Feature data to split.
        y: Labels to split alongside ``X``.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split``; defaults to 42.

    Returns:
        The result of ``train_test_split(X, y, ...)``, which the usage
        example unpacks as ``X_train, X_test, y_train, y_test``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    partitions = train_test_split(X, y, **split_options)
    return partitions

# Example Usage:
# df = load_data("data.csv")
# df = clean_data(df)
# df = validate_data(df)
# X, y = feature_engineering(df, target_col='target')
# X_train, X_test, y_train, y_test = split_data(X, y)
