# Outlier Detection Techniques and Their Impact on RandomForestRegressor Performance
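This notebook compares how different outlier-handling techniques, applied to the `residual_sugar` feature of the training data, affect the test performance of a RandomForestRegressor model on the red Wine Quality dataset (with quality binned into three ordinal classes).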

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from scipy.stats import zscore
from scipy.stats.mstats import winsorize
from sklearn.datasets import fetch_openml

In [None]:
# Load the red wine quality dataset from OpenML as pandas objects.
wine = fetch_openml(name="wine-quality-red", version=1, as_frame=True)
X = wine.data
y = wine.target.astype(int)

# Collapse the quality score into 3 ordinal classes:
#   0 = low (score <= 4), 1 = medium (5-6), 2 = high (>= 7).
# The top bin edge is 10 (not 8) and the lowest edge is inclusive, so a score
# outside the 1-8 range is still binned instead of turning into NaN and
# making the integer cast below fail. Scores 1-8 get the same labels as before.
# NOTE(review): assumes quality is scored on a 0-10 scale -- confirm.
y = pd.cut(y, bins=[0, 4, 6, 10], labels=[0, 1, 2], include_lowest=True)
y = y.astype(int)

In [None]:
# Feature whose values are screened for outliers in the cells below.
column = "residual_sugar"

# Split first (20% held out, fixed seed) so that outlier handling is only
# ever applied to the training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train a RandomForestRegressor on standardized features and score it.
def evaluate_model(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training data and score it on the test data.

    Features are standardized with statistics learned from ``X_train`` only;
    the same transformation is then applied to ``X_test``.

    Returns a dict with the keys ``"MAE"``, ``"MSE"`` and ``"R2"`` computed
    from ``y_test`` and the model's predictions.
    """
    # Learn the scaling on the training split, then reuse it for both splits.
    standardizer = StandardScaler().fit(X_train)
    train_features = standardizer.transform(X_train)
    test_features = standardizer.transform(X_test)

    forest = RandomForestRegressor(
        n_estimators=500,
        max_depth=None,
        max_features='log2',
        random_state=42,
    )
    forest.fit(train_features, y_train)
    predictions = forest.predict(test_features)

    # Metric label -> scoring function, all called as scorer(y_true, y_pred).
    scorers = {
        "MAE": mean_absolute_error,
        "MSE": mean_squared_error,
        "R2": r2_score,
    }
    return {label: scorer(y_test, predictions) for label, scorer in scorers.items()}

In [None]:
# 1. Baseline: train on the untouched training split
original_metrics = evaluate_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

In [None]:
# 2. Percentile trimming: drop training rows whose value falls below the 5th
#    or above the 95th percentile of `column` (both bounds are kept).
lower_quantile, upper_quantile = X_train[column].quantile([0.05, 0.95])
within_quantiles = X_train[column].between(lower_quantile, upper_quantile)
# The same boolean mask selects matching rows from features and target.
X_train_filtered = X_train.loc[within_quantiles]
y_train_filtered = y_train.loc[within_quantiles]
filtered_metrics = evaluate_model(
    X_train_filtered, X_test, y_train_filtered, y_test
)

In [None]:
# 3. IQR rule: keep training rows within 1.5 * IQR of the quartiles of `column`
Q1, Q3 = X_train[column].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
inside_fences = X_train[column].between(lower_bound, upper_bound)
# The same boolean mask selects matching rows from features and target.
X_train_iqr = X_train.loc[inside_fences]
y_train_iqr = y_train.loc[inside_fences]
iqr_metrics = evaluate_model(X_train_iqr, X_test, y_train_iqr, y_test)

In [None]:
# 4. Z-score rule: keep training rows whose `column` value lies within
#    3 standard deviations of the training mean
z_scores = zscore(X_train[column])
within_three_sigma = np.abs(z_scores) <= 3
# The same boolean mask selects matching rows from features and target.
X_train_zscore = X_train.loc[within_three_sigma]
y_train_zscore = y_train.loc[within_three_sigma]
zscore_metrics = evaluate_model(X_train_zscore, X_test, y_train_zscore, y_test)

In [None]:
# 5. Isolation Forest fitted on `column` alone; rows predicted as -1 are
#    treated as outliers and dropped from the training split
iso = IsolationForest(contamination=0.05, random_state=42)
outlier_preds = iso.fit_predict(X_train[[column]])
is_inlier = outlier_preds != -1
# The same boolean mask selects matching rows from features and target.
X_train_iso = X_train.loc[is_inlier]
y_train_iso = y_train.loc[is_inlier]
iso_metrics = evaluate_model(X_train_iso, X_test, y_train_iso, y_test)

In [None]:
# 6. Winsorization: instead of dropping rows, cap the lowest and highest 5%
#    of `column` in the training split; all rows (and y_train) are kept.
winsorized_values = winsorize(X_train[column], limits=(0.05, 0.05))
# Work on a copy so the original training frame stays available unchanged.
X_train_winsorized = X_train.copy()
X_train_winsorized[column] = winsorized_values
winsorized_metrics = evaluate_model(
    X_train_winsorized, X_test, y_train, y_test
)

In [None]:
# Gather every run's scores into one table: one row per method,
# one column per metric.
results_by_method = {
    "Original": original_metrics,
    "5% Trimmed": filtered_metrics,
    "IQR": iqr_metrics,
    "Z-score": zscore_metrics,
    "Isolation Forest": iso_metrics,
    "Winsorized": winsorized_metrics,
}
metric_names = ["MAE", "MSE", "R2"]
metrics_df = pd.DataFrame(
    [
        {"Method": method, **{name: scores[name] for name in metric_names}}
        for method, scores in results_by_method.items()
    ],
    columns=["Method", *metric_names],
)
metrics_df