In [3]:
# Setup
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
np.random.seed(17)

In [4]:
data_path = Path('data/raw/outliers_homework.csv')
if data_path.exists():
    df = pd.read_csv(data_path)
else:
    # Synthetic fallback. It carries the same columns the rest of the notebook
    # reads (date, daily_return, daily_return_2) so every later cell still runs
    # when the CSV is absent.
    n_days = 115
    # Local fixed-seed generator: re-running this cell rebuilds the same frame
    # regardless of how much global random state has been consumed.
    rng = np.random.default_rng(17)
    daily_return = rng.normal(0, 0.01, size=n_days)
    # Inject a few extreme days so the outlier detectors have something to flag.
    daily_return[10] += 0.20
    daily_return[60] -= 0.19
    daily_return[100] += 0.21
    # Second series follows the first closely, plus a little independent noise.
    daily_return_2 = 0.9 * daily_return + rng.normal(0, 0.005, size=n_days)
    df = pd.DataFrame({
        # Dates kept as strings, matching what read_csv yields without parse_dates.
        'date': pd.bdate_range('2022-01-03', periods=n_days).strftime('%Y-%m-%d'),
        'daily_return': daily_return,
        'daily_return_2': daily_return_2,
    })
df.head()

Unnamed: 0,date,daily_return,daily_return_2
0,2022-01-03,0.001263,0.003834
1,2022-01-04,-0.020046,-0.009506
2,2022-01-05,0.004739,-0.000535
3,2022-01-06,0.009953,0.012539
4,2022-01-07,0.008872,0.00984


In [5]:
def detect_outliers_iqr(series: pd.Series, k: float = 1.5) -> pd.Series:
    """Flag values that fall outside the IQR fences of ``series``.

    The fences sit ``k`` interquartile ranges below the first quartile and
    above the third quartile. A value is flagged only when it is strictly
    outside them.

    Assumptions: the quartiles are a reasonable summary of the distribution;
    a larger ``k`` flags fewer points.

    Args:
        series: Numeric values to screen.
        k: Fence multiplier applied to the interquartile range.

    Returns:
        Boolean Series aligned with ``series``; True marks an outlier.
    """
    first_q, third_q = series.quantile(0.25), series.quantile(0.75)
    margin = k * (third_q - first_q)
    below_fence = series.lt(first_q - margin)
    above_fence = series.gt(third_q + margin)
    return below_fence | above_fence

def detect_outliers_zscore(series: pd.Series, threshold: float = 3.0) -> pd.Series:
    """Flag values whose absolute Z-score is strictly greater than ``threshold``.

    Z-scores use the mean and the population standard deviation (``ddof=0``)
    of ``series`` itself.

    Assumptions: the data are roughly normal; heavy tails distort the scores.

    Args:
        series: Numeric values to screen.
        threshold: Cut-off applied to ``|z|``.

    Returns:
        Boolean Series aligned with ``series``; True marks an outlier.
    """
    centered = series - series.mean()
    scale = series.std(ddof=0)
    if scale == 0:
        # Constant series: divide by 1 so every score is 0 instead of NaN/inf.
        scale = 1.0
    return (centered / scale).abs().gt(threshold)

def winsorize_series(series: pd.Series, lower: float = 0.05, upper: float = 0.95) -> pd.Series:
    """Cap ``series`` at its own ``lower`` and ``upper`` quantiles.

    Values beyond either quantile are replaced by that quantile's value, so
    the result has the same length and index as the input.

    Args:
        series: Numeric values to cap.
        lower: Quantile (0-1) used as the floor.
        upper: Quantile (0-1) used as the ceiling.

    Returns:
        A clipped copy of ``series``.
    """
    caps = {"lower": series.quantile(lower), "upper": series.quantile(upper)}
    return series.clip(**caps)

# Applying functions to data

In [8]:
# All three treatments read the same raw return column.
returns = df["daily_return"]

# Boolean outlier flags: IQR fences (default k) and |z| > 3.0
df["outlier_iqr"] = detect_outliers_iqr(returns)
df["outlier_zscore"] = detect_outliers_zscore(returns, threshold=3.0)

# Optional: cap the returns at their 5th/95th percentiles instead of dropping rows
df["daily_return_winsorized"] = winsorize_series(returns, lower=0.05, upper=0.95)


# Sensitivity analysis 

## A) Compare summary statistics

In [9]:
# Each heading is paired with the slice of returns it describes. Headings after
# the first start with a newline so consecutive reports are separated by a
# blank line.
summary_views = [
    ("Original Data:", df["daily_return"]),
    ("\nWithout Outliers (IQR):", df.loc[~df["outlier_iqr"], "daily_return"]),
    ("\nWithout Outliers (Z-score):", df.loc[~df["outlier_zscore"], "daily_return"]),
    ("\nWinsorized:", df["daily_return_winsorized"]),
]
for heading, values in summary_views:
    print(heading)
    print(values.describe())


Original Data:
count    115.000000
mean      -0.001434
std        0.040579
min       -0.196672
25%       -0.008525
50%       -0.000187
75%        0.006368
max        0.212402
Name: daily_return, dtype: float64

Without Outliers (IQR):
count    106.000000
mean      -0.000039
std        0.009443
min       -0.021860
25%       -0.007213
50%       -0.000100
75%        0.006018
max        0.025708
Name: daily_return, dtype: float64

Without Outliers (Z-score):
count    110.000000
mean      -0.000078
std        0.011059
min       -0.033999
25%       -0.007529
50%       -0.000100
75%        0.006212
max        0.031952
Name: daily_return, dtype: float64

Winsorized:
count    115.000000
mean      -0.000251
std        0.010623
min       -0.020590
25%       -0.008525
50%       -0.000187
75%        0.006368
max        0.020797
Name: daily_return_winsorized, dtype: float64


## B) Compare Simple Regression

In [12]:

# Original regression: daily_return_2 on daily_return, with an intercept term
X = sm.add_constant(df["daily_return"])
y = df["daily_return_2"]
model_full = sm.OLS(y, X).fit()

# Same regression restricted to rows the IQR rule did not flag
df_no_outliers = df.loc[~df["outlier_iqr"]]
X_no = sm.add_constant(df_no_outliers["daily_return"])
y_no = df_no_outliers["daily_return_2"]
model_no = sm.OLS(y_no, X_no).fit()

# Print each label and table separately: print(label, table) inserts a space
# after the label's newline, which shifts the first line of the summary table.
print("With Outliers:")
print(model_full.summary())
print("\nWithout Outliers:")
print(model_no.summary())


With Outliers:
                             OLS Regression Results                            
Dep. Variable:         daily_return_2   R-squared:                       0.962
Model:                            OLS   Adj. R-squared:                  0.962
Method:                 Least Squares   F-statistic:                     2850.
Date:                Sun, 24 Aug 2025   Prob (F-statistic):           5.39e-82
Time:                        13:52:26   Log-Likelihood:                 449.05
No. Observations:                 115   AIC:                            -894.1
Df Residuals:                     113   BIC:                            -888.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0002      0.0

### Reflection

For outlier detection, I applied both the IQR method (k=1.5) and the Z-score method (threshold=3.0). I chose these thresholds because they are commonly used in practice and balance sensitivity vs. robustness. I also experimented with winsorizing (5th–95th percentile) to reduce the influence of extreme values rather than removing them completely.

**Assumptions:**  
- IQR assumes that quartiles summarize the distribution well, which may not hold for skewed data.  
- Z-score assumes approximate normality, which can be misleading with heavy-tailed returns.  
- Winsorizing assumes that extreme values are mostly noise; capping them reduces their influence but introduces some bias.  

**Observed Impacts:**  
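- Outliers inflated the standard deviation (about 0.041 with them vs. roughly 0.009–0.011 without) and pulled the mean slightly negative (−0.0014 vs. ≈ 0). Removing them reduced volatility estimates.  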
- In regression, coefficients and R² changed slightly once outliers were excluded, showing that a few extreme points were influencing the fit disproportionately.  
- Winsorizing gave summary statistics similar to IQR removal (std ≈ 0.011 vs. ≈ 0.009) but retained all 115 observations.  

**Risks if Wrong:**  
- If outliers are actually valid market shocks rather than noise, removing them could underestimate risk.  
- Conversely, keeping true anomalies could bias model parameters and risk forecasts.  

Overall, the choice of method should depend on the context: whether the goal is risk modeling (keep tail events) or data cleaning for explanatory models (remove/winsorize).
