# Setup and leakage review

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf

ticker = "MSFT"
start_date = "2018-01-01"
end_date = "2024-01-01"

# Load necessary raw data.
# auto_adjust is passed explicitly rather than left to the library default: the
# recorded output of this cell shows a warning pointing at the bare download call,
# and pinning the flag keeps the price basis (split/dividend-adjusted) visible and
# independent of the installed yfinance version.
# NOTE(review): confirm that adjusted prices are what the downstream features expect.
data = yf.download(ticker, start=start_date, end=end_date, auto_adjust=True)

# Fail loudly on an empty download instead of silently building an empty frame
# that every later cell would turn into all-NaN features.
if data is None or data.empty:
    raise RuntimeError(
        f"No price data returned for {ticker} between {start_date} and {end_date}"
    )

# Keep only the fields used later; forward-fill gaps so a missing quote carries
# the last known value (never a future one).
df = data[['Close', 'High', 'Low', 'Volume']].ffill()

# Calculate the base return: log of today's close over the previous close.
df['Log_Return'] = np.log(df['Close'] / df['Close'].shift(1))

# Drop rows containing NaN (the first row has no previous close to compare with).
df = df.dropna()

print("Base DataFrame loaded. Rows:", df.shape[0])

  data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed

Base DataFrame loaded. Rows: 1508





Lookahead bias principle: All features must be constructed using data that was known before the prediction time t.

# Feature scaling and normalization
Scaling features ensures that no single feature dominates the model's loss function merely because of its large absolute magnitude (e.g., Volume vs. Daily Return). This is crucial for gradient-based and distance-based models (such as neural networks or support vector machines).

## Z-score normalization (standardization)
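Transforms the feature to have a mean of 0 and a standard deviation of 1: $z_t = (x_t - \mu) / \sigma$. To avoid lookahead, the code below estimates $\mu$ and $\sigma$ from an expanding window of past observations only, so the result is only approximately standardized.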

In [3]:
# Example rolling feature: 21-day simple moving average of the close.
window = 21
df['SMA_21'] = (
    df['Close']
    .rolling(window=window)
    .mean()
    .shift(1)  # lag one row so the value on day t only uses closes through day t-1
)

# Z-score normalization against an expanding (growing, not fixed-width) history.
# min_periods=window: no statistic is produced until 21 non-NaN SMA values exist.
sma_history = df['SMA_21'].expanding(min_periods=window)

# 1. expanding mean, lagged one row so the current value is excluded
rolling_mean = sma_history.mean().shift(1)

# 2. expanding standard deviation, lagged the same way
rolling_std = sma_history.std().shift(1)

# 3. z-score: distance from the historical mean in historical standard deviations
df['SMA_21_Zscore'] = df['SMA_21'].sub(rolling_mean).div(rolling_std)

In [5]:
# EXAMPLE: z-score the daily log return against its own expanding history.
# The warm-up length is pinned locally instead of being read from the notebook-level
# `window`: a later cell rebinds `window` to 63, so re-running this cell out of
# order would otherwise silently change how many leading rows come out as NaN.
return_z_min_periods = 21  # same 21-observation warm-up as the SMA z-score

# Mean and std are lagged one row so each day is scaled only by earlier returns.
return_mean = df['Log_Return'].expanding(min_periods=return_z_min_periods).mean().shift(1)
return_std = df['Log_Return'].expanding(min_periods=return_z_min_periods).std().shift(1)

# NOTE(review): the numerator is the same-day return (not lagged), unlike SMA_21 and
# Return_Rank_63d. Lag this column before using it to predict that day's return --
# confirm the intended use.
df['Log_Return_Zscore'] = (df['Log_Return'] - return_mean) / return_std

df['Log_Return_Zscore'].tail()

Date
2023-12-22    0.092417
2023-12-26   -0.042680
2023-12-27   -0.136839
2023-12-28    0.116203
2023-12-29    0.052640
Name: Log_Return_Zscore, dtype: float64

## Min-max
Scales the feature to a fixed range, typically [0, 1]. <br>
NOTE: When using Min-Max in a real model, the Min and Max values must be computed only from the training dataset and then applied to the validation and test sets. You cannot use the overall minimum/maximum from the entire dataset, as this would incorporate future information.

# Feature transforms and rank
## Percentage rank features
The percentile rank tells you where the current value stands relative to its past N values (e.g., 95% rank means the current return is higher than 95% of the returns in the last N days). This transforms the raw value into a scale-independent position within its recent history.

In [6]:
window = 63 # Quarterly rank
def rolling_percentile_rank(series, window):
    """Return the lagged rolling percentile rank of ``series``.

    For each row, the last value of the trailing ``window`` observations is
    ranked against that window (ties share their average rank) and expressed
    as a fraction in (0, 1]. The result is then shifted down one row, so the
    value stored at row t describes the window that ended at row t-1 and
    therefore contains no same-row information.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rank.
    window : int
        Number of observations in each ranking window.

    Returns
    -------
    pandas.Series
        Same index as ``series``; the first ``window`` rows are NaN
        (``window - 1`` incomplete windows plus the one-row lag).
    """
    # Built-in Rolling.rank ranks the newest observation of every window in
    # compiled code, replacing a per-window Python callback through apply().
    window_rank = series.rolling(window=window).rank(pct=True)
    # Lag by one row to prevent lookahead.
    return window_rank.shift(1)

# Percentile rank of the daily log return within the trailing quarter; the helper
# applies the one-row lag itself. `window` is the 63 set at the top of this cell.
df['Return_Rank_63d'] = rolling_percentile_rank(series=df['Log_Return'], window=window)
df['Return_Rank_63d'].tail()

Date
2023-12-22    0.650794
2023-12-26    0.476190
2023-12-27    0.365079
2023-12-28    0.317460
2023-12-29    0.492063
Name: Return_Rank_63d, dtype: float64

## Reducing heavy tails 
Heavy tails (outliers/extreme returns) can disproportionately influence models. 
<br>
<br>
Winsorization: setting returns that fall outside a certain percentile (e.g., the 99th) equal to the value of that percentile. This caps the extremes. <br>
Signed log transform: a smooth way to compress large values while preserving the sign (direction): $\operatorname{Transform}(x) = \operatorname{sign}(x) \cdot \ln(1 + |x|)$

In [7]:
# Signed log transform of the raw log returns: sign(x) * ln(1 + |x|).
# The magnitude is compressed with log1p and the original sign is re-applied.
df['Log_Return_SignedLog'] = df['Log_Return'].pipe(
    lambda ret: np.sign(ret) * np.log1p(np.abs(ret))
)