# Pairs Trading Strategy

In [371]:
# Install necessary packages
%pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [372]:
# Importing of libraries
from statsmodels.api import OLS, add_constant
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import yfinance as yf

## 🧺 Asset Universe Selection/Data preprocessing + cleaning


In [373]:
# Data preprocessing

def normalize_data(ticker1, ticker2, startdate, enddate):
    """
    Download the closing prices of two tickers and put them on a log scale.

    Prices are fetched with yfinance (auto-adjusted), every row that contains a
    missing value is dropped, and np.log1p is applied to each ticker's "Close"
    column. Working on a log scale stabilizes variance and makes the relationship
    between the two assets closer to linear, which keeps the later regression and
    cointegration tests from being dominated by raw price levels.

    Arguments:
    ticker1, ticker2 -- the two tickers we want to trade
    startdate, enddate -- start and end of the timeframe, forwarded to
                          yf.download as `start` and `end`

    Returns:
    log_data1 -- log1p-transformed closing prices of ticker1
    log_data2 -- log1p-transformed closing prices of ticker2
    data -- the downloaded price DataFrame after rows with missing values were dropped

    No z-scored series is returned; callers that need one can standardize the
    returned log series themselves.
    """
    tickers = [ticker1, ticker2]
    data = yf.download(tickers, start=startdate, end=enddate, auto_adjust=True)
    data = data.dropna()

    # NOTE(review): np.log1p computes log(1 + price), not log(price). It is kept
    # unchanged so existing results do not move -- confirm this is intended.
    close = data["Close"]
    log_data1 = np.log1p(close[ticker1])
    log_data2 = np.log1p(close[ticker2])

    return log_data1, log_data2, data


# Example usage: log prices for Apple and Microsoft, 2020 through 2024
d1, d2, d3 = normalize_data(
    ticker1='AAPL',
    ticker2='MSFT',
    startdate='2020-01-01',
    enddate='2024-12-31',
)
print(d1.head())

[*********************100%***********************]  2 of 2 completed

Date
2020-01-02    4.298928
2020-01-03    4.289292
2020-01-06    4.297120
2020-01-07    4.292470
2020-01-08    4.308212
Name: AAPL, dtype: float64





## 🔍 Cointegration Testing (Engle-Granger)


In [374]:
# Cointegration Test
def test_cointegration(series_y, series_x, significance=0.05):
    """
    To test for cointegration, we first need to ensure the two price series are non stationary. This is done by ADF test.

    ADF Test:
    - Null hypothesis: Non Stationarity exists in the series.
    - Alternative Hypothesis: Stationarity exists in the series.

    Therefore to indicate non-stationarity in both time series, we seek a p-value > signficance.

    The Engle-Granger test then provides a p-value indicating whether the pair is likely cointegrated. 
    We look for a p-value below a chosen significance threshold (commonly 5%).

    Engle-Granger test:
    - Regress one series on the other
    - Find the residuals
    - Test residuals for stationarity
    
    Arguments:
    - series_y: Dependent asset (e.g., AAPL)
    - series_x: Independent asset (e.g., MSFT)
    - significance: p-value threshold for ADF test
    
    Returns:
    - is_cointegrated (bool)
    - beta_hat (float)
    - adf_pval (float)
    - spread (Series): residual spread if cointegrated, else None
    
    """

    # Test if both time series are non stationary
    pval1 = adfuller(series_x)[1]
    pval2 = adfuller(series_y)[1]
    if pval1 < significance  or pval2 < significance:
        return False, None, None, None

    #Fitting model
    model = OLS(series_y, add_constant(series_x)).fit()

    # Obtaining Residuals + beta (hedge ratio)
    resid = model.resid
    beta = model.params[1]

    # Testing residuals for stationarity
    adf_pval = adfuller(resid)[1]

    # Boolean, which indicates if the pair is cointegrated or not
    is_cointegrated = adf_pval < significance

    return is_cointegrated, beta, adf_pval, resid if is_cointegrated else None

# Example usage: check the AAPL / MSFT log-price series loaded above
print(test_cointegration(series_y=d1, series_x=d2))

(np.False_, np.float64(0.9950820119431412), np.float64(0.2523715702742759), None)


  beta = model.params[1]


In [375]:
def find_cointegrated_pairs(log_price_df, significance=0.05):
    """
    Screen every unordered pair of columns for cointegration.

    Each pair is visited once, oriented so that the first name compares lower
    than the second; the first column is passed to test_cointegration as the
    dependent series and the second as the independent one.

    Arguments:
    - log_price_df: DataFrame of log-transformed prices, one column per asset
    - significance: p-value threshold forwarded to test_cointegration (default = 0.05)

    Returns:
    - valid_pairs: list of (asset1, asset2) tuples that tested as cointegrated
    - hedge_ratios: dict of (asset1, asset2) : beta, cointegrated pairs only
    - adf_results: dict of (asset1, asset2) : pval, for every pair tested
    - spreads: dict of (asset1, asset2) : residual spread, cointegrated pairs only
    """
    symbols = log_price_df.columns.tolist()

    valid_pairs = []
    hedge_ratios, adf_results, spreads = {}, {}, {}

    for dependent in symbols:
        for independent in symbols:
            # Skip self-pairs and the mirrored orientation of a pair
            if not dependent < independent:
                continue

            key = (dependent, independent)
            outcome = test_cointegration(
                log_price_df[dependent], log_price_df[independent], significance
            )
            is_cointegrated, beta, pval, spread = outcome

            # The p-value is recorded for every pair, cointegrated or not
            adf_results[key] = pval
            if not is_cointegrated:
                continue

            valid_pairs.append(key)
            hedge_ratios[key] = beta
            spreads[key] = spread

    return valid_pairs, hedge_ratios, adf_results, spreads


## 📐 Spread & Z-score Calculation

In [376]:
def calculate_values(tickers, startdate, enddate, threshold=2, window=20):
    '''
    Build a DataFrame of prices, spread and Bollinger Bands for a pair of tickers.

    Arguments:
    - tickers: pair of ticker symbols (e.g. one entry of the valid pairs found by
      find_cointegrated_pairs); tickers[0] and tickers[1] are used
    - startdate and enddate: time period of interest, forwarded to yf.download
    - threshold: number of rolling standard deviations between the middle band
      and the top / bottom band (default 2)
    - window: length, in rows, of the rolling window used for the standard
      deviation and the moving average (default 20)

    Returns:
    - A DataFrame with the following columns:
    - Price_ticker1: Closing prices of the first ticker
    - Price_ticker2: Closing prices of the second ticker
    - Spread: Absolute difference between the two prices
    - StdDev: Rolling standard deviation of the spread over `window` rows
    - Middle_Band: Rolling mean of the spread over `window` rows
    - Top_Band: Middle_Band + threshold * StdDev
    - Bottom_Band: Middle_Band - threshold * StdDev

    Rows with a missing price, and the leading rows for which the rolling window
    is not yet full, are dropped.
    '''

    # Storing the closing prices of both tickers in a DataFrame
    data = yf.download(tickers, start=startdate, end=enddate, auto_adjust=True)
    close = data['Close']
    df = pd.DataFrame({
        'Price_ticker1': close[tickers[0]],
        'Price_ticker2': close[tickers[1]]
    }).dropna()

    # Spread is the unsigned price gap between the two tickers
    df['Spread'] = (df['Price_ticker1'] - df['Price_ticker2']).abs()

    # One rolling window feeds both the standard deviation and the middle band
    rolling_spread = df['Spread'].rolling(window=window)
    df['StdDev'] = rolling_spread.std()

    # Top, middle and bottom Bollinger Bands
    df['Middle_Band'] = rolling_spread.mean()
    band_width = threshold * df['StdDev']
    df['Top_Band'] = df['Middle_Band'] + band_width
    df['Bottom_Band'] = df['Middle_Band'] - band_width

    # Drops the first (window - 1) rows, which are NaN until the window fills
    return df.dropna()

# Example: build the band DataFrame for the Apple / Microsoft pair
df = calculate_values(
    tickers=('AAPL', 'MSFT'),
    startdate='2024-01-01',
    enddate='2024-03-24',
)

df.tail(5)

[*********************100%***********************]  2 of 2 completed


Unnamed: 0_level_0,Price_ticker1,Price_ticker2,Spread,StdDev,Middle_Band,Top_Band,Bottom_Band
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2024-03-18,172.677139,413.399078,240.721939,8.571582,231.24339,248.386555,214.100225
2024-03-19,175.02298,417.450684,242.427704,8.369831,232.437996,249.177658,215.698334
2024-03-20,177.597427,421.234772,243.637344,7.909053,233.76107,249.579177,217.942964
2024-03-21,170.341248,425.335907,254.994659,8.90749,235.284845,253.099824,217.469866
2024-03-22,171.245773,424.711853,253.46608,9.440108,236.705131,255.585347,217.824915


# Generation of Signals

In [None]:
import numpy as np

def calculate_hurst_exponent(ts, start=2, end=20):
    """
    Estimates the Hurst exponent of a time series from the scaling of its
    lagged differences.

    For every lag in range(start, end) the standard deviation of
    ts[t + lag] - ts[t] is computed, and a straight line is fitted to
    log(std) against log(lag). The standard deviation of the lagged differences
    scales as lag ** H, so the fitted slope is the Hurst exponent itself
    (about 0.5 for a random walk, below 0.5 for a mean-reverting series).

    Parameters:
        ts: Time series data (any 1-D sequence of numbers).
        start: Minimum lag (must be >= 2).
        end: Maximum lag, exclusive (must be > start and <= len(ts)).

    Returns:
        hurst_exponent: Calculated Hurst exponent, or NaN when any lag has a
        zero or undefined standard deviation (e.g. a constant series), because
        the log-log fit is not defined in that case.

    Raises:
        ValueError: if the series is shorter than `end`, if `start` < 2, or if
        `end` <= `start`.
    """
    ts = np.asarray(ts, dtype=float)

    # Validation
    if len(ts) < end:
        raise ValueError("Time series is too short for the given lag range.")
    if start < 2:
        raise ValueError("Start lag must be >= 2.")
    if end <= start:
        raise ValueError("End lag must be greater than start lag.")

    lags = range(start, end)

    # Dispersion of the lag-step differences, one value per lag
    tau = np.array([np.std(ts[lag:] - ts[:-lag]) for lag in lags])

    # log(0) or log(NaN) would poison the regression; report "undefined" instead
    if not np.all(tau > 0):
        return np.nan

    # log-log regression: log(tau) = H * log(lag) + c
    slope, _ = np.polyfit(np.log(list(lags)), np.log(tau), 1)

    # tau is a standard deviation (not its square root), so the slope already
    # equals H; multiplying it by 2 would report 2H.
    return slope


In [None]:
def generate_signals(df, threshold=1):
    '''
    Generates entry and exit signals from the Bollinger Bands, gated by the
    Hurst exponent of the spread.

    An entry signal is generated when the spread moves outside the top or bottom
    Bollinger Band, and an exit signal is generated when the spread comes back
    to within `threshold` standard deviations of the middle band.
    - Entry signals are only generated when there is no active position and the
      Hurst exponent of the whole Spread column is below 0.5 (mean-reverting).
    - Exit signals are only generated for a position opened on an earlier row,
      so a single row never carries both an entry and an exit.

    The Hurst exponent is computed once over the entire Spread column, so the
    gate is the same for every row.

    Arguments:
    - df: DataFrame containing the columns Spread, Top_Band, Bottom_Band,
      Middle_Band and StdDev
    - threshold: number of standard deviations around the middle band at which
      an open position is closed (default 1)

    Returns:
    - df: the same DataFrame object, modified in place, with two additional
      columns: Entry_Signal ('Wide_Entry', 'Tight_Entry' or None) and
      Exit_Signal ('Wide_Exit', 'Tight_Exit' or None)

    Raises:
    - Whatever calculate_hurst_exponent raises, e.g. ValueError when the Spread
      column is shorter than its maximum lag.
    '''
    # < 0.5 indicates mean-reverting behavior; evaluated once, not per row
    mean_reverting = calculate_hurst_exponent(df['Spread']) < 0.5

    # None means flat; otherwise the kind of position currently open
    entry_type = None

    entry_signals = []
    exit_signals = []

    # Plain column iteration avoids building a Series per row as iterrows does
    columns = (df['Spread'], df['Top_Band'], df['Bottom_Band'],
               df['Middle_Band'], df['StdDev'])
    for spread, top, bottom, mid, std in zip(*columns):
        entry_signal = None
        exit_signal = None

        if entry_type is None:
            # Flat: look for an entry
            if mean_reverting:
                if spread > top:
                    entry_signal = 'Wide_Entry'
                elif spread < bottom:
                    entry_signal = 'Tight_Entry'
            entry_type = entry_signal

        elif entry_type == 'Wide_Entry' and spread <= mid + (threshold * std):
            exit_signal = 'Wide_Exit'
            entry_type = None

        elif entry_type == 'Tight_Entry' and spread >= mid - (threshold * std):
            exit_signal = 'Tight_Exit'
            entry_type = None

        entry_signals.append(entry_signal)
        exit_signals.append(exit_signal)

    df['Entry_Signal'] = entry_signals
    df['Exit_Signal'] = exit_signals

    return df

# Add the Entry_Signal / Exit_Signal columns to the example frame
# (generate_signals writes them onto df itself and returns the same frame).
df = generate_signals(df)

In [382]:
# Inspect the last five rows, now including the two signal columns
df.tail(5)


Unnamed: 0_level_0,Price_ticker1,Price_ticker2,Spread,StdDev,Middle_Band,Top_Band,Bottom_Band,Entry_Signal,Exit_Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-03-18,172.677139,413.399078,240.721939,8.571582,231.24339,248.386555,214.100225,,
2024-03-19,175.02298,417.450684,242.427704,8.369831,232.437996,249.177658,215.698334,,
2024-03-20,177.597427,421.234772,243.637344,7.909053,233.76107,249.579177,217.942964,,
2024-03-21,170.341248,425.335907,254.994659,8.90749,235.284845,253.099824,217.469866,,
2024-03-22,171.245773,424.711853,253.46608,9.440108,236.705131,255.585347,217.824915,,


In [388]:
# Keep only the rows on which an entry or an exit signal fired
filtered = df[df['Entry_Signal'].notna() | df['Exit_Signal'].notna()]
filtered


Unnamed: 0_level_0,Price_ticker1,Price_ticker2,Spread,StdDev,Middle_Band,Top_Band,Bottom_Band,Entry_Signal,Exit_Signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-02-09,187.716324,415.829803,228.11348,8.558704,210.602886,227.720295,193.485478,Wide_Entry,
2024-02-13,183.929184,401.759552,217.830368,8.555092,212.566235,229.676419,195.456051,,Wide_Exit
2024-03-01,178.581482,411.596222,233.01474,4.867871,222.181953,231.917695,212.446212,Wide_Entry,
2024-03-11,171.712952,400.71936,229.006409,6.18973,225.88532,238.264779,213.50586,,Wide_Exit
2024-03-14,171.961456,421.224884,249.263428,8.575111,229.112704,246.262927,211.962482,Wide_Entry,
