In [48]:
### Library Imports
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import numpy as np
import warnings
warnings.filterwarnings("ignore")

### Notes
'''In a pairs trading strategy, the positions (long and short) should ideally be taken at the beginning 
when a trading signal is generated and should remain constant until an exit signal is triggered. The 
position sizes should not change over time.'''

### Resources
# https://www.cmcmarkets.com/en/trading-guides/pairs-trading -> Overview of Pairs Trading
# https://hudsonthames.org/an-introduction-to-cointegration/ -> Cointegration for Pairs Trading


'In a pairs trading strategy, the positions (long and short) should ideally be taken at the beginning \nwhen a trading signal is generated and should remain constant until an exit signal is triggered. The \nposition sizes should not change over time.'

In [49]:
### Function to Commodities Data
# List of all commodities available in yfinance
commodities_list = [
    'ES=F', 'YM=F', 'NQ=F', 'RTY=F', 'ZB=F', 'ZN=F', 'ZF=F', 'ZT=F',
    'GC=F', 'MGC=F', 'SI=F', 'SIL=F', 'PL=F', 'HG=F', 'PA=F', 'CL=F',
    'HO=F', 'NG=F', 'RB=F', 'BZ=F', 'B0=F', 'ZC=F', 'ZO=F', 'KE=F',
    'ZR=F', 'ZM=F', 'ZL=F', 'ZS=F', 'GF=F', 'HE=F', 'LE=F', 'CC=F',
    'KC=F', 'CT=F', 'LBS=F', 'OJ=F', 'SB=F'
]
metals = ['GC=F', 'MGC=F', 'SI=F', 'SIL=F', 'PL=F', 'HG=F', 'PA=F']
metals_test = ['PL=F', 'PA=F']
energy = ['CL=F', 'HO=F', 'NG=F', 'RB=F', 'BZ=F', 'B0=F']
agriculture = ['ZC=F', 'ZO=F', 'KE=F', 'ZR=F', 'ZM=F', 'ZL=F', 'ZS=F', 'GF=F', 'HE=F', 'LE=F', 'CC=F', 'KC=F', 'CT=F', 'LBS=F', 'OJ=F', 'SB=F']

def import_commod_data(tickers, start_date):
    data = pd.DataFrame()
    if len(tickers) == 1:
        data[tickers[0]] = yf.download(tickers[0], start_date)['Adj Close']
    else:
        for t in tickers:
            data[t] = yf.download(t, start_date)['Adj Close']

    # Drop rows with NaN or Inf values
    data = data.replace([np.inf, -np.inf], np.nan).dropna()
    
    return data

start_date = '2023-07-01'
end_date = '2024-07-01'
commod_data = import_commod_data(metals_test, start_date)
commod_data = commod_data.reset_index() # Turn Multi-Index into Column in Pandas 
commod_data.tail()


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Unnamed: 0,Date,PL=F,PA=F
253,2024-07-05,1035.300049,1035.800049
254,2024-07-08,1003.700012,1011.5
255,2024-07-09,989.700012,982.5
256,2024-07-10,997.5,987.599976
257,2024-07-11,1015.400024,988.5


In [50]:
### Function to Compute Spread and Z-Score
def spread_and_zscore(series1, series2):
    spread = series1 - series2
    z_score = (spread - spread.mean()) / spread.std()
    return spread, z_score

### Function Call to Compute Spread and Z-Score
# Define Metals
metal1_data = commod_data[metals_test[0]]
metal2_data = commod_data[metals_test[1]]
print(metal1_data)
# Function Call
spread, z_score = spread_and_zscore(metal1_data, metal2_data)
print('Most recent computed spread is: ', round(spread.iloc[-1], 4))
print('Most recent computed z-score is: ', round(z_score.iloc[-1], 4))


0       909.299988
1       918.099976
2       904.500000
3       910.099976
4       929.200012
          ...     
253    1035.300049
254    1003.700012
255     989.700012
256     997.500000
257    1015.400024
Name: PL=F, Length: 258, dtype: float64
Most recent computed spread is:  26.9
Most recent computed z-score is:  1.1823


In [51]:
### Function to check for cointegration
# The cointegration test checks whether there is a long-term equilibrium relationship between the two time series
def check_cointegration(series1, series2):
    result = sm.tsa.stattools.coint(series1, series2)
    # The second element of the result is the p-value of the cointegration test
    p_val = result[1] 
    return p_val

### Function Call to Compute P Value
# If the p-value is less than a significance level (commonly 0.05), the null hypothesis of no cointegration is rejected, 
# indicating that the series are cointegrated
p_value = check_cointegration(metal1_data, metal2_data)

if p_value > 0.05:
    print(
        'The computed p-value is:', round(p_value, 4), 
        'and thus the null hypothesis of no cointegration cannot be rejected, \n'
        'so the assumption of mean reversion may not hold and these pairs should NOT be considered'
    )
elif p_value < 0.05:
    print(
        'The computed p-value is:', round(p_value, 4), 
        'and thus the null hypothesis of no cointegration is rejected, \n'
        'so the assumption of mean reversion holds and these pairs should be considered'
    )


The computed p-value is: 0.0493 and thus the null hypothesis of no cointegration is rejected, 
so the assumption of mean reversion holds and these pairs should be considered


In [52]:
### Calculate Trend Signals Based on Z-Score
''' 
The z-score is used to generate trading signals based on predefined thresholds:
  - When the z-score is below -1, it indicates that the spread is significantly below its mean (i.e., one asset 
    is undervalued compared to the other). 
    This means commodity 1 (series1) is cheaper than normal and commodity 2 (series2) is more expensive than normal
    Hence, the strategy goes long on commodity 1 (buying 1000 units) and short on commodity 2 (selling 1000 units)

  - When the z-score is above 1, it indicates that the spread is significantly above its mean (i.e., one asset 
    is overvalued compared to the other). 
    This means commodity 1 (series1) is more expensive than normal and commodity 2 (series2) is cheaper than normal
    Hence, the strategy goes short on commodity 1 (selling 1000 units) and long on commodity 2 (buying 1000 units)
    
  - Positions are exited when the z-score reverts to a value between -0.5 and 0.5, indicating that the spread 
    has reverted to its mean.
'''
# Function to generate trading signals based on z-score
def generate_trading_signals(z_score):
    longs = z_score < -1
    shorts = z_score > 1
    exits = abs(z_score) < 0.5
    return longs, shorts, exits


In [53]:
### Function to Calculate the position
def calculate_position(commod_data, metal1, metal2):
    # Initialize a DataFrame to store positions for metal1 and metal2, with the same index as commod_data
    positions = pd.DataFrame(index = commod_data.index, columns = [metal1, metal2])
    positions[metal1] = 0  # Initialize all positions for metal1 to 0
    positions[metal2] = 0  # Initialize all positions for metal2 to 0
    
    # Extract the price series for metal1 and metal2 from the commod_data DataFrame
    series1 = commod_data[metal1]
    series2 = commod_data[metal2]
    
    # Check for cointegration between the two series and get the p-value
    p_value = check_cointegration(series1, series2)
    
    # If the p-value is less than 0.05, the series are cointegrated
    if p_value < 0.05:
        print(
            f'The computed p-value is: {round(p_value, 4)}, '
            'and thus the null hypothesis of no cointegration is rejected, \n'
            'so the assumption of mean reversion holds and these pairs should be considered'
        )
        
        # Calculate the spread and z-score between the two series
        spread, z_score = spread_and_zscore(series1, series2)
        print('Most recent computed spread is:', round(spread.iloc[-1], 4))
        print('Most recent computed z-score is:', round(z_score.iloc[-1], 4))

        # Generate trading signals based on the z-score
        longs, shorts, exits = generate_trading_signals(z_score)
        
        # Initialize variables to keep track of the current position
        position_long = 0
        position_short = 0
        
        # Iterate over the z-score to determine the trading positions
        for k in range(len(z_score)):
            # Long signal is generated
            if longs[k]:  
                position_long = 1000  # Set the long position to 1000 units
                position_short = -1000  # Set the short position to -1000 units
            # Short signal is generated
            elif shorts[k]:
                position_long = -1000  # Set the long position to -1000 units
                position_short = 1000  # Set the short position to 1000 units
            # Exit signal is generated - spread has reverted to the mean
            elif exits[k]:
                position_long = 0  # Set the long position to 0 units
                position_short = 0  # Set the short position to 0 units
            
            # Assign the current positions to the positions DataFrame
            positions.at[commod_data.index[k], metal1] = position_long
            positions.at[commod_data.index[k], metal2] = position_short
    else:
        print(f'{metal1} and {metal2} are NOT cointegrated with p-value:', round(p_value, 5), 'which produces an exit position of 0')
    
    return positions

# Example usage (not part of the function to be commented)
# Get the name of each metal
metal1 = metals_test[0]
metal2 = metals_test[1]
# Return position DataFrame
position_df = calculate_position(commod_data, metal1, metal2)
position_df.tail()


The computed p-value is: 0.0493, and thus the null hypothesis of no cointegration is rejected, 
so the assumption of mean reversion holds and these pairs should be considered
Most recent computed spread is: 26.9
Most recent computed z-score is: 1.1823


Unnamed: 0,PL=F,PA=F
253,-1000,1000
254,-1000,1000
255,-1000,1000
256,-1000,1000
257,-1000,1000
