In [11]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import os

def run_etl(input_file, output_file):
    """Run a simple extract-transform-load pipeline on a CSV file.

    The input is read with pandas, numeric columns are standardized with
    ``StandardScaler``, object-dtype columns are one-hot encoded with
    ``OneHotEncoder``, and the result is written to ``output_file`` as CSV
    without the index. Columns that are neither numeric nor object dtype
    are not handed to any transformer and so do not appear in the output
    (``ColumnTransformer``'s default remainder handling). Progress messages
    are printed to stdout.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write. Its parent directory is
            created if the path has one and it does not exist yet.

    Returns:
        None. If the input cannot be loaded, the error is printed and the
        function returns early without writing anything.
    """
    print("⏳ Extracting data...")
    try:
        df = pd.read_csv(input_file)
        print(f"✅ Loaded data with {len(df)} rows")
    except Exception as e:
        print(f"❌ Error loading data: {e}")
        return

    print("🔄 Transforming data...")

    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ],
        # Always produce a dense array: with many one-hot columns the stacked
        # result would otherwise be a scipy sparse matrix, which cannot be
        # wrapped in a DataFrame with one column per feature below.
        sparse_threshold=0,
    )

    processed_data = preprocessor.fit_transform(df)

    # Output columns follow the transformer order: numeric first, then the
    # one-hot encoded categories.
    numeric_features = numeric_cols.tolist()
    if len(categorical_cols) > 0:
        encoder = preprocessor.named_transformers_['cat']
        categorical_features = list(encoder.get_feature_names_out(categorical_cols))
    else:
        # With no categorical columns the encoder is never fitted, so asking
        # it for feature names would raise.
        categorical_features = []
    all_features = numeric_features + categorical_features

    processed_df = pd.DataFrame(processed_data, columns=all_features)
    print(f"✨ Transformed data shape: {processed_df.shape}")

    print("💾 Saving processed data...")
    # dirname() is "" for a bare filename, and os.makedirs("") raises, so
    # only create the directory when the path actually has one.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    processed_df.to_csv(output_file, index=False)
    print(f"🎉 Success! Saved to {output_file}")

if __name__ == "__main__":
    # Script entry point: process the raw CSV in the working directory and
    # write the transformed table under data/.
    INPUT_PATH, OUTPUT_PATH = "raw_data.csv", "data/processed_data.csv"
    run_etl(input_file=INPUT_PATH, output_file=OUTPUT_PATH)

⏳ Extracting data...
✅ Loaded data with 5 rows
🔄 Transforming data...
✨ Transformed data shape: (5, 15)
💾 Saving processed data...
🎉 Success! Saved to data/processed_data.csv
