In [None]:
# Input parameters
#
# NOTE(review): the trailing JSON comment on each parameter looks like
# machine-readable metadata (description / input widget) -- presumably parsed
# by the notebook tooling, so keep it on the same line as its assignment;
# TODO confirm.
#
# input_file:  CSV read by the processing cell. If the path does not exist,
#              that cell first generates a 100-row sample dataset there.
# output_file: destination CSV for the processed DataFrame (written without
#              the index column).
input_file: str = "data/raw/sample.csv"  # {"description": "Path to input CSV file", "input_type": "file"}
output_file: str = "data/processed/sample_processed.csv"  # {"description": "Path to output CSV file"}

# Preprocessing options
#
# remove_duplicates: drop fully duplicated rows before any other step.
# fill_missing:      enable missing-value filling.
# fill_method:       strategy for NUMERIC columns only; non-numeric columns
#                    are always filled with their mode.
# normalize_columns: columns passed to sklearn's MinMaxScaler; an empty list
#                    skips normalization.
remove_duplicates: bool = True  # {"description": "Remove duplicate rows"}
fill_missing: bool = True  # {"description": "Fill missing values"}
fill_method: str = "mean"  # {"description": "Method to fill missing values", "input_type": "select", "options": ["mean", "median", "mode", "zero"]}
normalize_columns: list = []  # {"description": "Columns to normalize", "input_type": "multiselect"}

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

# Create a sample dataset if input file doesn't exist.
# The generated CSV has 100 rows (id, value1, value2, category) with a few
# missing values and duplicated rows injected so that every preprocessing
# step below has something to act on.
if not os.path.exists(input_file):
    print(f"Input file {input_file} not found. Creating sample data.")
    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    input_dir = os.path.dirname(input_file)
    if input_dir:
        os.makedirs(input_dir, exist_ok=True)

    # Fixed seed keeps the generated sample reproducible across runs.
    np.random.seed(42)
    data = {
        # Must be a mutable array: a range object does not support the item
        # assignment used by the duplicate-injection loop below.
        'id': np.arange(1, 101),
        'value1': np.random.normal(100, 15, 100),
        'value2': np.random.normal(50, 10, 100),
        'category': np.random.choice(['A', 'B', 'C'], 100),
    }

    # Add some missing values. Positions are drawn with replacement, so each
    # column ends up with at most 10 NaNs.
    data['value1'][np.random.choice(100, 10)] = np.nan
    data['value2'][np.random.choice(100, 10)] = np.nan

    # Add some duplicates: copy row idx over row idx+5 in every column.
    # randint's upper bound is exclusive, so idx+5 never exceeds index 99.
    for _ in range(5):
        idx = np.random.randint(0, 95)
        for column_values in data.values():
            column_values[idx + 5] = column_values[idx]

    # Create DataFrame and save to CSV
    df = pd.DataFrame(data)
    df.to_csv(input_file, index=False)
    print(f"Sample data saved to {input_file}")
else:
    print(f"Loading data from {input_file}")

# Read the input CSV into the working DataFrame
df = pd.read_csv(input_file)

# Print a short overview of the raw data: dimensions, column names,
# per-column missing-value counts and the number of fully duplicated rows.
print(
    "\nData Info:",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    f"Missing values:\n{df.isnull().sum()}",
    f"Duplicate rows: {df.duplicated().sum()}",
    sep="\n",
)

# Drop fully duplicated rows (keeping the first occurrence of each) when the
# option is enabled and at least one duplicate is present.
if remove_duplicates:
    if df.duplicated().any():
        print(f"\nRemoving {df.duplicated().sum()} duplicate rows")
        df = df.drop_duplicates(keep="first")

# Fill missing values if requested.
# Numeric columns use the strategy named by `fill_method` ('mean', 'median',
# 'mode' or 'zero'); every non-numeric column is filled with its mode.
# Raises ValueError if `fill_method` is not one of the supported names.
def _first_mode(series):
    """Return the most frequent non-missing value of *series*, or NaN if none exists."""
    modes = series.mode()
    # mode() is empty when every entry is missing; indexing it would raise.
    return modes.iloc[0] if not modes.empty else np.nan


if fill_missing and df.isnull().sum().sum() > 0:
    # Each supported method maps to a function deriving the fill value
    # from the column itself.
    numeric_fill_value = {
        'mean': pd.Series.mean,
        'median': pd.Series.median,
        'mode': _first_mode,
        'zero': lambda _series: 0,
    }
    # Fail loudly instead of announcing a fill and then silently leaving
    # numeric columns untouched.
    if fill_method not in numeric_fill_value:
        raise ValueError(
            f"Unknown fill_method {fill_method!r}; "
            f"expected one of {sorted(numeric_fill_value)}"
        )
    print(f"\nFilling missing values using method: {fill_method}")

    numeric_cols = df.select_dtypes(include=[np.number]).columns
    categorical_cols = df.select_dtypes(exclude=[np.number]).columns

    # Numeric columns first, then the rest -- the same order as before.
    fill_plan = [(col, numeric_fill_value[fill_method]) for col in numeric_cols]
    fill_plan += [(col, _first_mode) for col in categorical_cols]

    for col, compute_fill_value in fill_plan:
        if not df[col].isnull().any():
            continue
        fill_value = compute_fill_value(df[col])
        # A column with no observed values has no mean/median/mode to use.
        if pd.isna(fill_value):
            print(f"Column {col!r} has no non-missing values; leaving it unfilled")
            continue
        df[col] = df[col].fillna(fill_value)

# Rescale the requested columns with scikit-learn's MinMaxScaler; an empty
# selection skips this step entirely.
if normalize_columns:
    print(f"\nNormalizing columns: {normalize_columns}")
    columns_to_scale = df[normalize_columns]
    scaler = MinMaxScaler()
    df[normalize_columns] = scaler.fit_transform(columns_to_scale)

# Create output directory if it doesn't exist.
# os.path.dirname() returns '' for a bare filename and os.makedirs('') raises
# FileNotFoundError, so only create a directory when the path names one.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save processed data (index omitted so the CSV holds only the data columns)
df.to_csv(output_file, index=False)
print(f"\nProcessed data saved to {output_file}")
print(f"Final shape: {df.shape}")

# Display sample of processed data; the trailing expression is what the
# notebook renders as the cell output.
print("\nSample of processed data:")
df.head()