# Pairs Trading Strategy

In [None]:
# Install necessary packages
%pip install statsmodels

In [19]:
# Importing of libraries
from statsmodels.api import OLS, add_constant
from statsmodels.tsa.stattools import adfuller
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import yfinance as yf

## 🧺 Asset Universe Selection/Data preprocessing + cleaning


In [15]:
# Data preprocessing

def normalize_data(ticker1, ticker2, startdate, enddate):
    """
    Download two tickers and return their log-transformed closing prices.

    Prices are fetched with yfinance (auto-adjusted), rows with missing values
    are dropped, and each closing-price series is transformed with np.log1p,
    i.e. log(1 + price). Working on a log scale stabilizes variance and makes
    the relationship between the two assets closer to linear, which helps the
    regression and cointegration tests that consume these series.

    No z-score standardization is applied here; only the log-transformed
    series and the cleaned price table are returned.

    Arguments:
    ticker1, ticker2 -- the two tickers we want to trade
    startdate, enddate -- start and end of the timeframe, passed to
                          yf.download as `start` and `end`

    Returns:
    log_data1 -- log1p-transformed closing prices of ticker1
    log_data2 -- log1p-transformed closing prices of ticker2
    data -- the downloaded price DataFrame after dropping incomplete rows

    """
    tickers = [ticker1, ticker2]
    data = yf.download(tickers, start=startdate, end=enddate, auto_adjust=True)

    # Drop every row in which any downloaded field is missing.
    data = data.dropna()

    # Log transformation on closing prices (log1p == log(1 + price)).
    close = data["Close"]
    log_data1 = np.log1p(close[ticker1])
    log_data2 = np.log1p(close[ticker2])

    return log_data1, log_data2, data


# Example usage: AAPL vs MSFT over 2020-2024; d1/d2 are the log series, d3 the cleaned price table
d1, d2, d3 = normalize_data(
    ticker1='AAPL',
    ticker2='MSFT',
    startdate='2020-01-01',
    enddate='2024-12-31',
)
print(d1.head())

[*********************100%***********************]  2 of 2 completed

Date
2020-01-02    4.298928
2020-01-03    4.289292
2020-01-06    4.297120
2020-01-07    4.292470
2020-01-08    4.308212
Name: AAPL, dtype: float64





## 🔍 Cointegration Testing (Engle-Granger)


In [12]:
# Cointegration Test
def test_cointegration(series_y, series_x, significance=0.05):
    """
    Run a two-step Engle-Granger style cointegration check on two series.

    Step 1 - non-stationarity pre-check (ADF test on each input):
    - Null hypothesis: the series is non-stationary.
    - Alternative hypothesis: the series is stationary.
    Both inputs must look non-stationary (p-value >= significance); if either
    one is already stationary the function stops early.

    Step 2 - Engle-Granger procedure:
    - Regress series_y on series_x (with an intercept) using OLS.
    - Take the regression residuals as the spread.
    - Run an ADF test on the residuals; a p-value below `significance`
      is treated as evidence that the pair is cointegrated.

    NOTE(review): the residual test uses the plain ADF p-value. Dedicated
    Engle-Granger critical values (e.g. statsmodels.tsa.stattools.coint) are
    presumably stricter - confirm whether this approximation is acceptable.

    Arguments:
    - series_y: Dependent asset (e.g., AAPL)
    - series_x: Independent asset (e.g., MSFT)
    - significance: p-value threshold used for every ADF test

    Returns:
    - is_cointegrated (bool)
    - beta (float): OLS slope of series_y on series_x (hedge ratio)
    - adf_pval (float): ADF p-value of the regression residuals
    - spread (Series): residual spread if cointegrated, else None

    If the pre-check fails, (False, None, None, None) is returned.

    """

    # Test if both time series are non-stationary; bail out if either is stationary.
    pval_x = adfuller(series_x)[1]
    pval_y = adfuller(series_y)[1]
    if pval_x < significance or pval_y < significance:
        return False, None, None, None

    # Fit y = const + beta * x
    model = OLS(series_y, add_constant(series_x)).fit()

    # Residuals are the candidate spread.
    resid = model.resid

    # The slope follows the constant in the parameter vector. Look it up by
    # position explicitly: integer keys on a label-indexed Series are
    # deprecated in pandas, and this also works when params is an ndarray.
    beta = float(np.asarray(model.params)[1])

    # Testing residuals for stationarity
    adf_pval = float(adfuller(resid)[1])

    # Plain bool indicating whether the pair is cointegrated
    is_cointegrated = bool(adf_pval < significance)

    # Only expose the spread when the pair passed the test.
    spread = resid if is_cointegrated else None

    return is_cointegrated, beta, adf_pval, spread

# Example usage: d1 is the dependent series, d2 the independent one
print(test_cointegration(series_y=d1, series_x=d2))

(False, 0.9950820197762814, 0.2523713821320346, None)


  beta = model.params[1]


In [18]:
def find_cointegrated_pairs(log_price_df, significance=0.05):
    """
    Test every unique pair of columns in the DataFrame for cointegration.

    A pair (a, b) is considered once, with a < b under the column labels'
    own ordering; column a is used as the dependent series and column b as
    the independent series when calling test_cointegration.

    Arguments:
    - log_price_df: DataFrame of log-transformed prices with each column as an asset
    - significance: p-value threshold to confirm cointegration (default = 0.05)

    Returns:
    - valid_pairs: list of cointegrated asset name pairs (tuple)
    - hedge_ratios: dict of (asset1, asset2) : beta, cointegrated pairs only
    - adf_results: dict of (asset1, asset2) : pval, recorded for every pair tested
    - spreads: dict of (asset1, asset2) : residual spread, cointegrated pairs only
    """
    columns = log_price_df.columns.tolist()

    # Enumerate candidate pairs up front, keeping only the a < b orientation.
    candidates = [
        (first, second)
        for first in columns
        for second in columns
        if first < second
    ]

    valid_pairs, hedge_ratios, adf_results, spreads = [], {}, {}, {}

    for pair in candidates:
        dependent, independent = pair
        cointegrated, hedge, p_value, residual = test_cointegration(
            log_price_df[dependent], log_price_df[independent], significance
        )

        # The p-value is stored whether or not the pair passes.
        adf_results[pair] = p_value
        if not cointegrated:
            continue

        valid_pairs.append(pair)
        hedge_ratios[pair] = hedge
        spreads[pair] = residual

    return valid_pairs, hedge_ratios, adf_results, spreads


## 📐 Spread & Z-score Calculation