In [4]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-win_amd64.whl (10.7 MB)
     ---------------------------------------- 0.0/10.7 MB ? eta -:--:--
     ---------------------------------------- 0.1/10.7 MB ? eta -:--:--
     ---------------------------------------- 0.1/10.7 MB ? eta -:--:--
      --------------------------------------- 0.2/10.7 MB 1.8 MB/s eta 0:00:06
      --------------------------------------- 0.3/10.7 MB 1.8 MB/s eta 0:00:06
     - -------------------------------------- 0.4/10.7 MB 2.0 MB/s eta 0:00:06
     - -------------------------------------- 0.5/10.7 MB 2.1 MB/s eta 0:00:05
     -- ------------------------------------- 0.6/10.7 MB 2.2 MB/s eta 0:00:05
     -- ------------------------------------- 0.7/10.7 MB 2.2 MB/s eta 0:00:05
     --- ------------------------------------ 0.9/10.7 MB 2.4 MB/s eta 0:00:05
     --- ------------------------------------ 1.0/10.7 MB 2.5 MB/s eta 0:00:04
     ---- ----------------------------------- 1.2/10.7 MB 2


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: C:\Users\chandru\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


     ------------------------ ------------- 26.8/41.3 MB 346.2 kB/s eta 0:00:42
     ------------------------ ------------- 26.8/41.3 MB 346.0 kB/s eta 0:00:42
     ------------------------ ------------- 26.8/41.3 MB 345.9 kB/s eta 0:00:42
     ------------------------ ------------- 26.8/41.3 MB 345.5 kB/s eta 0:00:42
     ------------------------ ------------- 26.9/41.3 MB 345.5 kB/s eta 0:00:42
     ------------------------ ------------- 26.9/41.3 MB 345.5 kB/s eta 0:00:42
     ------------------------ ------------- 26.9/41.3 MB 345.1 kB/s eta 0:00:42
     ------------------------ ------------- 27.0/41.3 MB 345.7 kB/s eta 0:00:42
     ------------------------ ------------- 27.0/41.3 MB 345.5 kB/s eta 0:00:42
     ------------------------ ------------- 27.0/41.3 MB 345.3 kB/s eta 0:00:42
     ------------------------ ------------- 27.0/41.3 MB 345.5 kB/s eta 0:00:42
     ------------------------ ------------- 27.1/41.3 MB 345.7 kB/s eta 0:00:42
     ------------------------ ----------

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def run_etl_pipeline(input_file_path, output_file_path):
    """
    Automates the ETL process for a given CSV dataset.

    Extract: the file is read with ``pd.read_csv``.
    Transform: ``int64``/``float64`` columns are mean-imputed and standardized;
    ``object`` columns are imputed with their most frequent value and one-hot
    encoded. Columns of any other dtype are not passed to either transformer
    and therefore do not appear in the output (ColumnTransformer's default
    ``remainder='drop'``).
    Load: the transformed table is written to ``output_file_path`` as CSV,
    with the generated feature names as the header row.

    Problems with the input (missing file, empty file, no data rows) and
    failures while writing the output are reported with ``print`` and the
    function returns early instead of raising.

    Args:
        input_file_path (str): Path to the raw input data file (CSV).
        output_file_path (str): Path to save the processed output data file.

    Returns:
        None
    """

    # 1. Extraction
    try:
        df = pd.read_csv(input_file_path)
    except FileNotFoundError:
        print(f"Error: Input file not found at {input_file_path}")
        return
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; report it like a missing file.
        print(f"Error: Input file at {input_file_path} is empty")
        return
    print(f"Data extracted successfully from {input_file_path}. Shape: {df.shape}")

    # Nothing can be fitted on zero rows; stop before the transformers raise.
    if df.empty:
        print(f"Error: Input file at {input_file_path} contains no data rows")
        return

    # Define categorical and numerical features
    numerical_features = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    # 2. Transformation Pipeline
    # Preprocessing for numerical features: Imputation and Scaling
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Preprocessing for categorical features: Imputation and One-Hot Encoding
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Create a preprocessor using ColumnTransformer.
    # sparse_threshold=0 forces a dense array: with the default threshold the
    # one-hot block can make the combined result a SciPy sparse matrix, which
    # pd.DataFrame(...) below would not turn into a proper table.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        sparse_threshold=0,
    )

    # Create the full transformation pipeline
    etl_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    # Apply the transformation
    print("Applying transformations...")
    transformed_data = etl_pipeline.fit_transform(df)

    # Recover the output column names from the fitted pipeline so the saved
    # CSV is self-describing (e.g. one column per one-hot category) instead
    # of carrying a positional 0..n-1 header.
    feature_names = etl_pipeline.get_feature_names_out()
    transformed_df = pd.DataFrame(transformed_data, columns=feature_names)

    # 3. Loading
    # Best effort: a failed write is reported, not raised.
    try:
        transformed_df.to_csv(output_file_path, index=False)
        print(f"Processed data loaded successfully to {output_file_path}.")
    except Exception as e:
        print(f"Error loading data to {output_file_path}: {e}")

if __name__ == "__main__":
    # Demonstration run: write a five-row sample CSV to disk, then send it
    # through the pipeline.
    dummy_input_path = 'raw_data.csv'

    # Three of the four columns contain a None so that both the numerical
    # and the categorical imputers have something to fill in.
    dummy_data = {
        'numerical_col_1': [10, 20, None, 40, 50],
        'numerical_col_2': [1.5, 2.3, 3.1, None, 5.0],
        'categorical_col_1': ['A', 'B', 'A', 'C', 'B'],
        'categorical_col_2': ['X', 'Y', 'X', None, 'Z'],
    }
    dummy_df = pd.DataFrame(dummy_data)
    dummy_df.to_csv(dummy_input_path, index=False)

    # Read the sample back, transform it, and save the result next to it.
    run_etl_pipeline(dummy_input_path, 'processed_data.csv')

Data extracted successfully from raw_data.csv. Shape: (5, 4)
Applying transformations...
Processed data loaded successfully to processed_data.csv.
