## Best Practices for Data Preprocessing

#### Always Explore & Visualize Data First

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import json
from sklearn.datasets import load_diabetes
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, RobustScaler, MaxAbsScaler

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

# ----------------------------
# Step 1: Load Dataset
# ----------------------------
try:
    data = load_diabetes()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    logging.info("Loaded diabetes dataset.")
except Exception as e:
    logging.error(f"Dataset loading failed: {e}")
    raise

# ----------------------------
# Step 2: Simulate Missing and Infinite Values
# ----------------------------
rng = np.random.RandomState(42)
X[rng.rand(*X.shape) < 0.1] = np.nan
X.iloc[0, 0] = np.inf
X.iloc[1, 1] = -np.inf

logging.info("Simulated missing and infinite values.")

# ----------------------------
# Step 3: Clean Anomalies
# ----------------------------

# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)
logging.info("Replaced infinite values with NaN.")

# Drop columns with all missing values
X = X.dropna(axis=1, how='all')
logging.info("Dropped columns with all missing values.")

# Check for numeric columns
if not all(np.issubdtype(dtype, np.number) for dtype in X.dtypes):
    raise TypeError("Non-numeric column detected. Please convert all data to numeric before preprocessing.")

# ----------------------------
# Step 4: Imputation
# ----------------------------
try:
    imputer = SimpleImputer(strategy='mean')
    X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    logging.info("Applied mean imputation.")
except Exception as e:
    logging.error(f"Imputation failed: {e}")
    raise

# ----------------------------
# Step 5: Outlier Capping
# ----------------------------
def cap_outliers_vectorized(df, lower_q=0.01, upper_q=0.99):
    lower = df.quantile(lower_q)
    upper = df.quantile(upper_q)
    return df.clip(lower=lower, upper=upper, axis=1)

try:
    X_capped = cap_outliers_vectorized(X_imputed)
    logging.info("Applied outlier capping at 1% and 99% percentiles.")
except Exception as e:
    logging.error(f"Outlier capping failed: {e}")
    raise

# ----------------------------
# Step 6: Scaling
# ----------------------------
scalers = {
    'minmax': MinMaxScaler(),
    'robust': RobustScaler(),
    'maxabs': MaxAbsScaler()
}
scaled_outputs = {}
scaler_params = {}

for name, scaler in scalers.items():
    try:
        X_scaled = pd.DataFrame(scaler.fit_transform(X_capped), columns=X.columns)
        scaled_outputs[name] = X_scaled
        logging.info(f"{name.title()} scaling applied successfully.")
        
        # Save params
        if name == 'minmax':
            scaler_params['minmax_min'] = scaler.data_min_.tolist()
            scaler_params['minmax_max'] = scaler.data_max_.tolist()
        elif name == 'robust':
            scaler_params['robust_center'] = scaler.center_.tolist()
            scaler_params['robust_scale'] = scaler.scale_.tolist()
        elif name == 'maxabs':
            scaler_params['maxabs_max'] = scaler.max_abs_.tolist()

    except Exception as e:
        logging.error(f"{name.title()} scaling failed: {e}")

# ----------------------------
# Step 7: Save Transformation Parameters
# ----------------------------
scaler_params['imputer_statistics'] = imputer.statistics_.tolist()

with open('transformation_parameters.json', 'w') as f:
    json.dump(scaler_params, f)

logging.info("Saved transformation parameters to 'transformation_parameters.json'.")

INFO: Loaded diabetes dataset.
INFO: Simulated missing and infinite values.
INFO: Replaced infinite values with NaN.
INFO: Dropped columns with all missing values.
INFO: Applied mean imputation.
INFO: Applied outlier capping at 1% and 99% percentiles.
INFO: Minmax scaling applied successfully.
INFO: Robust scaling applied successfully.
INFO: Maxabs scaling applied successfully.
INFO: Saved transformation parameters to 'transformation_parameters.json'.


## Handle Missing & Inconsistent Data Before Applying ML Models

In [10]:
import unittest
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Include function directly if no import
def cap_outliers_vectorized(df, lower_q=0.01, upper_q=0.99):
    lower = df.quantile(lower_q)
    upper = df.quantile(upper_q)
    return df.clip(lower=lower, upper=upper, axis=1)

class TestPreprocessing(unittest.TestCase):

    def test_imputation(self):
        df = pd.DataFrame({'A': [1, np.nan, 3], 'B': [4, 5, np.nan]})
        imputer = SimpleImputer(strategy='mean')
        X_imp = imputer.fit_transform(df)
        self.assertFalse(np.isnan(X_imp).any())

    def test_outlier_capping(self):
        df = pd.DataFrame({'A': [1, 100, 3, 4], 'B': [4, 5, 1000, 6]})
        capped = cap_outliers_vectorized(df)
        q_high = df.quantile(0.99)
        self.assertTrue((capped <= q_high).all().all())

    def test_scaling_minmax(self):
        df = pd.DataFrame({'A': [1, 2, 3], 'B': [10, 20, 30]})
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(df)
        self.assertTrue(((scaled >= 0) & (scaled <= 1)).all())

    def test_handle_infinite(self):
        df = pd.DataFrame({'A': [1, np.inf, 2], 'B': [4, -np.inf, 6]})
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        imputer = SimpleImputer(strategy='mean')
        result = imputer.fit_transform(df)
        self.assertFalse(np.isnan(result).any())

    def test_all_missing_column(self):
        df = pd.DataFrame({'A': [np.nan, np.nan], 'B': [1, 2]})
        df_clean = df.dropna(axis=1, how='all')
        imputer = SimpleImputer(strategy='mean')
        result = imputer.fit_transform(df_clean)
        self.assertFalse(np.isnan(result).any())

if __name__ == '__main__':
    unittest.main()

usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/vscode/.local/share/jupyter/runtime/kernel-v383b53d707e08e3c1984ada641f7c6a2f1931e876.json'


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [8]:
# Task 4: Drop Missing Values

import unittest
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from main import cap_outliers_vectorized # type: ignore

class TestPreprocessing(unittest.TestCase):
    def setUp(self):
        self.X = pd.DataFrame({
            'A': [1, 2, 3, np.nan, 5, 100],
            'B': [1, 2, 3, 4, np.nan, 6]
        })

    def test_imputer(self):
        imputer = SimpleImputer(strategy='mean')
        X_imp = imputer.fit_transform(self.X)
        self.assertFalse(np.isnan(X_imp).any())

    def test_scaler(self):
        imputer = SimpleImputer(strategy='mean')
        X_imp = pd.DataFrame(imputer.fit_transform(self.X), columns=self.X.columns)
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(X_imp)
        self.assertTrue((scaled >= 0).all() and (scaled <= 1).all())

    def test_outlier_capping(self):
        capped = cap_outliers_vectorized(self.X.fillna(0))
        self.assertFalse((capped > self.X.quantile(0.99)).any().any())

if __name__ == '__main__':
    unittest.main()



# Task 5: Fill Missing Values




# Task 6: Handling Outliers with Capping





ModuleNotFoundError: No module named 'main'

## Choose the Right Scaling Method

In [None]:
# Task 7: Min-Max Scaling







# Task 8: Robust Scaling






# Task 9: MaxAbs Scaling






## Keep Track of Data Transformations for Reproducibility

In [None]:
# Task 10: Log Data Preprocessing Steps






# Task 11: Store Transformation Parameters




