In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

from scipy.optimize import brute
from decimal import Decimal, getcontext
from statsmodels.tsa.stattools import coint
import os

In [2]:

directory_path = '/Users/manasmaskar/Rutgers/Fall24/Algo Trading/HomeWork/HW1/DataFiles'

# List all files in the directory (assuming they are CSV files)
files = [f for f in os.listdir(directory_path) if f.endswith('.csv')]

# Initialize a dictionary to store each DataFrame, with filenames as keys
data_dict = {}

# Loop over each file and read it into a DataFrame
for file in files:
    file_path = os.path.join(directory_path, file)
    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Use the filename (without extension) as the key in the dictionary
    data_dict[file] = df

# Check if all files have been loaded successfully
print(f"Loaded {len(data_dict)} files.")

# Example: Access one of the DataFrames (e.g., for the first file)
print(data_dict[files[0]].head())  # Display the first few rows of the first DataFrame


Loaded 29 files.
                         Local time     Open     High      Low    Close  \
0  01.01.2020 00:00:00.000 GMT-0500  1.29853  1.29853  1.29853  1.29853   
1  01.01.2020 01:00:00.000 GMT-0500  1.29853  1.29853  1.29853  1.29853   
2  01.01.2020 02:00:00.000 GMT-0500  1.29853  1.29853  1.29853  1.29853   
3  01.01.2020 03:00:00.000 GMT-0500  1.29853  1.29853  1.29853  1.29853   
4  01.01.2020 04:00:00.000 GMT-0500  1.29853  1.29853  1.29853  1.29853   

   Volume  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  


In [3]:
#Clean the Local time by removing the GMT - 5:00
import re

# Loop through all dataframes in the dictionary and clean the 'Local time' column
for file, df in data_dict.items():
    # Use regex to remove any GMT offset (e.g., GMT-0500, GMT-0400, etc.)
    df['Time'] = df['Local time'].str.replace(r' GMT[+-]\d{4}', '', regex=True)
    
    # Convert the cleaned datetime string to a proper pandas datetime object
    df['Time'] = pd.to_datetime(df['Time'], format='%d.%m.%Y %H:%M:%S.%f')
    
    # Drop the old 'Local time' column if you no longer need it
    df.drop(columns=['Local time'], inplace=True)

# Check the cleaned data
print(data_dict[files[0]].head())  # Display the first few rows of the first DataFrame after cleaning



      Open     High      Low    Close  Volume                Time
0  1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 00:00:00
1  1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 01:00:00
2  1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 02:00:00
3  1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 03:00:00
4  1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 04:00:00


In [4]:
# Extracting the close prices in each df

for file, df in data_dict.items():
    # Extract the file name without the extension to use as the new DataFrame name
    df_name = file.split('.')[0] + '_df'
    
    # Select only the 'Cleaned Local Time' and 'Close' columns
    filtered_df = df[['Time', 'Close']].copy()
    
    # Store the filtered DataFrame in a variable with the dynamic name
    globals()[df_name] = filtered_df
    
    # Optionally, print the first few rows to confirm
    print(f"{df_name}:")
    print(globals()[df_name].head(), "\n")


USDCAD_Candlestick_1_Hour_BID_01_df:
                 Time    Close
0 2020-01-01 00:00:00  1.29853
1 2020-01-01 01:00:00  1.29853
2 2020-01-01 02:00:00  1.29853
3 2020-01-01 03:00:00  1.29853
4 2020-01-01 04:00:00  1.29853 

UKGILT_df:
                 Time    Close
0 2020-01-01 00:00:00  131.274
1 2020-01-01 01:00:00  131.274
2 2020-01-01 02:00:00  131.274
3 2020-01-01 03:00:00  131.274
4 2020-01-01 04:00:00  131.274 

AUDUSD_Candlestick_1_Hour_BID_01_df:
                 Time    Close
0 2020-01-01 00:00:00  0.70115
1 2020-01-01 01:00:00  0.70115
2 2020-01-01 02:00:00  0.70115
3 2020-01-01 03:00:00  0.70115
4 2020-01-01 04:00:00  0.70115 

BRENT_df:
                 Time   Close
0 2020-01-01 00:00:00  65.995
1 2020-01-01 01:00:00  65.995
2 2020-01-01 02:00:00  65.995
3 2020-01-01 03:00:00  65.995
4 2020-01-01 04:00:00  65.995 

DOLLAR_df:
                 Time   Close
0 2020-01-01 00:00:00  96.134
1 2020-01-01 01:00:00  96.134
2 2020-01-01 02:00:00  96.134
3 2020-01-01 03:00:00  96.13

In [5]:
close_prices_df = pd.DataFrame()

# Loop through each filtered DataFrame in globals() (which contains all individual DataFrames)
for file, df in data_dict.items():
    # Extract the file name without extension to use as the column label
    df_name = file.split('.')[0] + '_df'
    
    # Set the 'Time' column as the index and add the 'Close' column to the close_prices_df
    filtered_df = globals()[df_name]
    filtered_df.set_index('Time', inplace=True)
    
    # Add the 'Close' column with the file name as the column label
    close_prices_df[df_name] = filtered_df['Close']

# Calculate the pairwise correlation matrix for the close prices
correlation_matrix = close_prices_df.corr()

# Display the correlation matrix
print("Pairwise Correlation Matrix:")
print(correlation_matrix)

Pairwise Correlation Matrix:
                                     USDCAD_Candlestick_1_Hour_BID_01_df  \
USDCAD_Candlestick_1_Hour_BID_01_df                             1.000000   
UKGILT_df                                                      -0.449022   
AUDUSD_Candlestick_1_Hour_BID_01_df                            -0.891733   
BRENT_df                                                       -0.200071   
DOLLAR_df                                                       0.645871   
XAUUSD_Candlestick_1_Hour_BID_01_df                             0.280521   
DIESEL_df                                                      -0.032825   
COCOA_df                                                        0.403156   
BTCUSD_Candlestick_1_Hour_BID_01_df                            -0.265420   
NZDUSD_Candlestick_1_Hour_BID_01_df                            -0.860515   
SUGAR_df                                                        0.221588   
USA500_df                                                  

In [6]:
import plotly.express as px

# Calculate the pairwise correlation matrix for the close prices
correlation_matrix = close_prices_df.corr()

# Create a heatmap using Plotly
fig = px.imshow(correlation_matrix,
                labels=dict(color="Correlation"),
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale="Viridis")

# Update layout for better readability
fig.update_layout(title="Pairwise Correlation Matrix of Close Prices",
                  xaxis_title="Instruments",
                  yaxis_title="Instruments",
                  width=800, height=800)

# Show the plot
fig.show()


In [7]:
import plotly.graph_objects as go
import numpy as np
import networkx as nx

# Set a threshold for the minimum correlation value to visualize the relationships
threshold = 0.5

# Create a NetworkX graph to better handle node placement and edge generation
G = nx.Graph()

# Add nodes (instruments) to the graph
node_names = correlation_matrix.columns
for node in node_names:
    G.add_node(node)

# Add edges based on correlation threshold
for i in range(len(correlation_matrix)):
    for j in range(i + 1, len(correlation_matrix)):
        correlation_value = correlation_matrix.iloc[i, j]
        if abs(correlation_value) > threshold:
            # Add edge with weight as the correlation value
            G.add_edge(node_names[i], node_names[j], weight=correlation_value)

# Get node positions using a layout for better legibility (e.g., spring layout for spread out nodes)
pos = nx.spring_layout(G, k=0.5)  # k adjusts the distance between nodes

# Prepare the edge and node traces for Plotly
edge_x = []
edge_y = []
edge_texts = []
for edge in G.edges(data=True):
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.append(x0)
    edge_x.append(x1)
    edge_x.append(None)
    edge_y.append(y0)
    edge_y.append(y1)
    edge_y.append(None)
    # Prepare text for hover (correlation value)
    edge_texts.append(f"{edge[0]} vs {edge[1]}: {edge[2]['weight']:.2f}")

# Create edge traces (lines between nodes)
edge_trace = go.Scatter(
    x=edge_x, y=edge_y,
    line=dict(width=2, color='gray'),
    hoverinfo='text',
    mode='lines',
    text=edge_texts
)

# Create node traces (circles for each instrument)
node_x = []
node_y = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=list(G.nodes()),
    textposition="bottom center",
    hoverinfo='text',
    marker=dict(
        size=20,
        color='lightblue',
        line_width=2
    )
)

# Create the plot with better layout and spacing
fig = go.Figure(data=[edge_trace, node_trace])

# Update layout for better legibility and spacing
fig.update_layout(
    title="Neat Correlation Network Graph",
    title_x=0.5,
    showlegend=False,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    height=700, width=700,
    margin=dict(l=40, r=40, t=40, b=40)
)

# Show the plot
fig.show()


In [8]:
import plotly.graph_objects as go

# Create headers and rows for the table
header_values = ["Instruments"] + list(correlation_matrix.columns)
row_values = [correlation_matrix.index] + [correlation_matrix[col].round(2).tolist() for col in correlation_matrix.columns]

# Create the table
fig = go.Figure(data=[go.Table(
    header=dict(values=header_values,
                fill_color='lightblue',
                align='center',
                font=dict(size=10, color='black')),
    cells=dict(values=row_values,
               fill_color='lightgray',
               align='center',
               font=dict(size=8))
)])

# Update layout
fig.update_layout(
    title="Pairwise Correlation Matrix in Table Format",
    title_x=0.5,
    width=1200,
    height=900
)

# Show the plot
fig.show()


In [9]:
import plotly.graph_objects as go

# Create headers and rows for the table
header_values = ["Instruments"] + list(correlation_matrix.columns)
row_values = [correlation_matrix.index] + [correlation_matrix[col].round(2).tolist() for col in correlation_matrix.columns]

# Create the table with adjustments
fig = go.Figure(data=[go.Table(
    header=dict(values=header_values,
                fill_color='lightblue',
                align='center',
                font=dict(size=10, color='black'),
                height=40),  # Set header height for better readability
    cells=dict(values=row_values,
               fill_color='lightgray',
               align='center',
               font=dict(size=10),
               height=30))  # Set cell height for more space
])

# Update layout for more space and better readability
fig.update_layout(
    title="Pairwise Correlation Matrix in Table Format",
    title_x=0.5,
    width=1200,  # Increase width to accommodate more columns
    height=800,  # Increase height for better row spacing
    margin=dict(l=10, r=10, t=20, b=10)
)

# Show the plot
fig.show()


<h1> Cointegration Analysis

In [10]:
'''
instruments = list(correlation_matrix.columns)

# Initialize an empty list to store the results
cointegration_results = []

# Loop through all pairs of instruments
for i in range(len(instruments)):
    for j in range(i + 1, len(instruments)):
        # Skip the pair AUDUSD vs NZDUSD
        if (instruments[i] == "AUDUSD" and instruments[j] == "NZDUSD") or (instruments[i] == "NZDUSD" and instruments[j] == "AUDUSD"):
            continue
        
        # Perform the Engle-Granger Cointegration Test
        score, p_value, _ = coint(close_prices_df[instruments[i]], close_prices_df[instruments[j]])
        
        # Store the result if the p-value is less than 0.05
        cointegration_results.append((instruments[i], instruments[j], p_value))

# Sort the pairs by p-value (lowest p-value means stronger cointegration)
cointegration_results_sorted = sorted(cointegration_results, key=lambda x: x[2])

# Display the top 2 most cointegrated pairs (excluding AUDUSD vs NZDUSD)
print("Top 2 Most Cointegrated Pairs (excluding AUDUSD vs NZDUSD):")
for pair in cointegration_results_sorted[:2]:
    print(f"{pair[0]} vs {pair[1]}: p-value = {pair[2]:.4f}")
    '''


'\ninstruments = list(correlation_matrix.columns)\n\n# Initialize an empty list to store the results\ncointegration_results = []\n\n# Loop through all pairs of instruments\nfor i in range(len(instruments)):\n    for j in range(i + 1, len(instruments)):\n        # Skip the pair AUDUSD vs NZDUSD\n        if (instruments[i] == "AUDUSD" and instruments[j] == "NZDUSD") or (instruments[i] == "NZDUSD" and instruments[j] == "AUDUSD"):\n            continue\n        \n        # Perform the Engle-Granger Cointegration Test\n        score, p_value, _ = coint(close_prices_df[instruments[i]], close_prices_df[instruments[j]])\n        \n        # Store the result if the p-value is less than 0.05\n        cointegration_results.append((instruments[i], instruments[j], p_value))\n\n# Sort the pairs by p-value (lowest p-value means stronger cointegration)\ncointegration_results_sorted = sorted(cointegration_results, key=lambda x: x[2])\n\n# Display the top 2 most cointegrated pairs (excluding AUDUSD vs N

In [11]:
# Calculate the correlation between AUDUSD and NZDUSD
correlation_audusd_nzdusd = close_prices_df['AUDUSD_Candlestick_1_Hour_BID_01_df'].corr(close_prices_df['NZDUSD_Candlestick_1_Hour_BID_01_df'])

# Print the correlation value
print(f"Correlation between AUDUSD and NZDUSD: {correlation_audusd_nzdusd:.4f}")


Correlation between AUDUSD and NZDUSD: 0.9575


In [12]:
# Set the correlation threshold range
lower_threshold = 0.7
upper_threshold = 0.9575

# List of all instruments
instruments = list(correlation_matrix.columns)

# Initialize an empty list to store the filtered results
filtered_correlations = []

# Loop through all pairs of instruments and apply the filter
for i in range(len(instruments)):
    for j in range(i + 1, len(instruments)):
        # Skip the pair AUDUSD vs NZDUSD
        if (instruments[i] == "AUDUSD" and instruments[j] == "NZDUSD") or (instruments[i] == "NZDUSD" and instruments[j] == "AUDUSD"):
            continue
        
        # Get the correlation value between the two instruments
        correlation_value = correlation_matrix.iloc[i, j]
        
        # Apply the filter: only keep correlations between 0.7 and 0.9575
        if lower_threshold <= abs(correlation_value) <= upper_threshold:
            filtered_correlations.append((instruments[i], instruments[j], correlation_value))

# Sort the filtered correlations in descending order by the correlation value
filtered_correlations_sorted = sorted(filtered_correlations, key=lambda x: abs(x[2]), reverse=True)

# Display the filtered results
print("Filtered Correlations between 0.7 and 0.9575 (excluding AUDUSD vs NZDUSD) in Descending Order:")
for pair in filtered_correlations_sorted:
    print(f"{pair[0]} vs {pair[1]}: correlation = {pair[2]:.4f}")


Filtered Correlations between 0.7 and 0.9575 (excluding AUDUSD vs NZDUSD) in Descending Order:
AUDUSD_Candlestick_1_Hour_BID_01_df vs NZDUSD_Candlestick_1_Hour_BID_01_df: correlation = 0.9575
UKGILT_df vs USDJPY_Candlestick_1_Hour_BID_01_df: correlation = -0.9550
DIESEL_df vs LIGHT_df: correlation = 0.9457
USA30_df vs USATECH_df: correlation = 0.9232
USDJPY_Candlestick_1_Hour_BID_01_df vs OJUICE_df: correlation = 0.9003
EURUSD_Candlestick_1_Hour_BID_01_df vs GBPUSD_Candlestick_1_Hour_BID_01_df: correlation = 0.8969
BTCUSD_Candlestick_1_Hour_BID_01_df vs ETHUSD_Candlestick_1_Hour_BID_01_df: correlation = 0.8959
DOLLAR_df vs NZDUSD_Candlestick_1_Hour_BID_01_df: correlation = -0.8956
UKGILT_df vs SUGAR_df: correlation = -0.8929
USDCAD_Candlestick_1_Hour_BID_01_df vs AUDUSD_Candlestick_1_Hour_BID_01_df: correlation = -0.8917
DOLLAR_df vs GBPUSD_Candlestick_1_Hour_BID_01_df: correlation = -0.8901
SUGAR_df vs USTBOND_df: correlation = -0.8779
BTCUSD_Candlestick_1_Hour_BID_01_df vs USATECH_df

In [13]:
# Set the correlation threshold range
lower_threshold = 0.7
upper_threshold = 0.9575

# List of all instruments
instruments = list(correlation_matrix.columns)

# Initialize an empty list to store the filtered results
filtered_correlations = []

# Loop through all pairs of instruments and apply the filter
for i in range(len(instruments)):
    for j in range(i + 1, len(instruments)):
        # Skip the pair AUDUSD vs NZDUSD
        if (instruments[i] == "AUDUSD_Candlestick_1_Hour_BID_01_df" and instruments[j] == "NZDUSD_Candlestick_1_Hour_BID_01_df") or (instruments[i] == "NZDUSD_Candlestick_1_Hour_BID_01_df" and instruments[j] == "AUDUSD_Candlestick_1_Hour_BID_01_df"):
            continue
        
        # Get the correlation value between the two instruments
        correlation_value = correlation_matrix.iloc[i, j]
        
        # Apply the filter: only keep correlations between 0.7 and 0.9575
        if lower_threshold <= abs(correlation_value) <= upper_threshold:
            filtered_correlations.append((instruments[i], instruments[j], correlation_value))

# Sort the filtered correlations in descending order by the correlation value
filtered_correlations_sorted = sorted(filtered_correlations, key=lambda x: abs(x[2]), reverse=True)

# Display the next two pairs after AUDUSD vs NZDUSD
print("The next two pairs after AUDUSD vs NZDUSD based on correlation:")
for pair in filtered_correlations_sorted[:2]:
    print(f"{pair[0]} vs {pair[1]}: correlation = {pair[2]:.4f}")


The next two pairs after AUDUSD vs NZDUSD based on correlation:
UKGILT_df vs USDJPY_Candlestick_1_Hour_BID_01_df: correlation = -0.9550
DIESEL_df vs LIGHT_df: correlation = 0.9457


In [14]:

# Define the top 2 pairs
top_pairs = [
    ('UKGILT_df', 'USDJPY_Candlestick_1_Hour_BID_01_df'),
    ('DIESEL_df', 'LIGHT_df')
]

# Perform the Engle-Granger Cointegration Test for each pair
for pair in top_pairs:
    instrument1, instrument2 = pair
    
    # Perform the cointegration test
    score, p_value, _ = coint(close_prices_df[instrument1], close_prices_df[instrument2])
    
    # Print the results
    print(f"Cointegration test result for {instrument1} vs {instrument2}:")
    print(f"p-value = {p_value:.4f}")
    if p_value < 0.05:
        print("The pair is likely cointegrated (p < 0.05).")
    else:
        print("The pair is not likely cointegrated (p >= 0.05).")
    print("-" * 50)


Cointegration test result for UKGILT_df vs USDJPY_Candlestick_1_Hour_BID_01_df:
p-value = 0.1208
The pair is not likely cointegrated (p >= 0.05).
--------------------------------------------------
Cointegration test result for DIESEL_df vs LIGHT_df:
p-value = 0.0584
The pair is not likely cointegrated (p >= 0.05).
--------------------------------------------------


In [15]:
'''
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint
from multiprocessing import Pool

# Set a correlation threshold to filter pairs before running the cointegration test
correlation_threshold = 0.8

# List of all instruments
instruments = list(close_prices_df.columns)

# Function to perform Engle-Granger Cointegration Test on a pair
def cointegration_test(pair):
    instrument1, instrument2 = pair
    score, p_value, _ = coint(close_prices_df[instrument1], close_prices_df[instrument2])
    return (instrument1, instrument2, p_value)

# Pre-filter pairs based on correlation (no exclusion of AUDUSD vs NZDUSD yet)
filtered_pairs = []
for i in range(len(instruments)):
    for j in range(i + 1, len(instruments)):
        # Get the correlation value between the two instruments
        correlation_value = correlation_matrix.iloc[i, j]
        
        # Only include pairs with correlation above the threshold
        if abs(correlation_value) >= correlation_threshold:
            filtered_pairs.append((instruments[i], instruments[j]))

# Use multiprocessing to run the cointegration tests in parallel
with Pool() as pool:
    results = pool.map(cointegration_test, filtered_pairs)

# Sort by p-value to find the top cointegrated pairs
cointegration_results_sorted = sorted(results, key=lambda x: x[2])

# Display the top 10 most cointegrated pairs
print("Top 10 Most Cointegrated Pairs (including AUDUSD vs NZDUSD):")
for pair in cointegration_results_sorted[:10]:
    print(f"{pair[0]} vs {pair[1]}: p-value = {pair[2]:.4f}")

'''

'\nimport statsmodels.api as sm\nfrom statsmodels.tsa.stattools import coint\nfrom multiprocessing import Pool\n\n# Set a correlation threshold to filter pairs before running the cointegration test\ncorrelation_threshold = 0.8\n\n# List of all instruments\ninstruments = list(close_prices_df.columns)\n\n# Function to perform Engle-Granger Cointegration Test on a pair\ndef cointegration_test(pair):\n    instrument1, instrument2 = pair\n    score, p_value, _ = coint(close_prices_df[instrument1], close_prices_df[instrument2])\n    return (instrument1, instrument2, p_value)\n\n# Pre-filter pairs based on correlation (no exclusion of AUDUSD vs NZDUSD yet)\nfiltered_pairs = []\nfor i in range(len(instruments)):\n    for j in range(i + 1, len(instruments)):\n        # Get the correlation value between the two instruments\n        correlation_value = correlation_matrix.iloc[i, j]\n        \n        # Only include pairs with correlation above the threshold\n        if abs(correlation_value) >= c

In [16]:
'''

import statsmodels.api as sm
from statsmodels.tsa.stattools import coint
from concurrent.futures import ProcessPoolExecutor

# Set a correlation threshold to filter pairs before running the cointegration test
correlation_threshold = 0.8

# List of all instruments
instruments = list(close_prices_df.columns)

# Function to perform Engle-Granger Cointegration Test on a pair
def cointegration_test(pair):
    instrument1, instrument2 = pair
    score, p_value, _ = coint(close_prices_df[instrument1], close_prices_df[instrument2])
    return (instrument1, instrument2, p_value)

# Pre-filter pairs based on correlation (no exclusion of AUDUSD vs NZDUSD yet)
filtered_pairs = []
for i in range(len(instruments)):
    for j in range(i + 1, len(instruments)):
        # Get the correlation value between the two instruments
        correlation_value = correlation_matrix.iloc[i, j]
        
        # Only include pairs with correlation above the threshold
        if abs(correlation_value) >= correlation_threshold:
            filtered_pairs.append((instruments[i], instruments[j]))

# Use concurrent.futures to run the cointegration tests in parallel
with ProcessPoolExecutor() as executor:
    results = list(executor.map(cointegration_test, filtered_pairs))

# Sort by p-value to find the top cointegrated pairs
cointegration_results_sorted = sorted(results, key=lambda x: x[2])

# Display the top 10 most cointegrated pairs
print("Top 10 Most Cointegrated Pairs (including AUDUSD vs NZDUSD):")
for pair in cointegration_results_sorted[:10]:
    print(f"{pair[0]} vs {pair[1]}: p-value = {pair[2]:.4f}")
'''

'\n\nimport statsmodels.api as sm\nfrom statsmodels.tsa.stattools import coint\nfrom concurrent.futures import ProcessPoolExecutor\n\n# Set a correlation threshold to filter pairs before running the cointegration test\ncorrelation_threshold = 0.8\n\n# List of all instruments\ninstruments = list(close_prices_df.columns)\n\n# Function to perform Engle-Granger Cointegration Test on a pair\ndef cointegration_test(pair):\n    instrument1, instrument2 = pair\n    score, p_value, _ = coint(close_prices_df[instrument1], close_prices_df[instrument2])\n    return (instrument1, instrument2, p_value)\n\n# Pre-filter pairs based on correlation (no exclusion of AUDUSD vs NZDUSD yet)\nfiltered_pairs = []\nfor i in range(len(instruments)):\n    for j in range(i + 1, len(instruments)):\n        # Get the correlation value between the two instruments\n        correlation_value = correlation_matrix.iloc[i, j]\n        \n        # Only include pairs with correlation above the threshold\n        if abs(cor

In [18]:
from statsmodels.tsa.stattools import coint
from joblib import Parallel, delayed

# Function to perform Engle-Granger Cointegration Test on a pair
def cointegration_test(pair):
    instrument1, instrument2 = pair
    score, p_value, _ = coint(close_prices_df[instrument1], close_prices_df[instrument2])
    return (instrument1, instrument2, p_value)

# Pre-filter pairs based on correlation (no exclusion of AUDUSD vs NZDUSD yet)
filtered_pairs = []
correlation_threshold = 0.7
for i in range(len(instruments)):
    for j in range(i + 1, len(instruments)):
        correlation_value = correlation_matrix.iloc[i, j]
        if abs(correlation_value) >= correlation_threshold:
            filtered_pairs.append((instruments[i], instruments[j]))

# Use joblib for parallel execution
results = Parallel(n_jobs=-1)(delayed(cointegration_test)(pair) for pair in filtered_pairs)

# Sort by p-value to find the top cointegrated pairs
cointegration_results_sorted = sorted(results, key=lambda x: x[2])

# Display the top 10 most cointegrated pairs
print("Top 10 Most Cointegrated Pairs (including AUDUSD vs NZDUSD):")
for pair in cointegration_results_sorted[:10]:
    print(f"{pair[0]} vs {pair[1]}: p-value = {pair[2]:.4f}")



Top 10 Most Cointegrated Pairs (including AUDUSD vs NZDUSD):
BRENT_df vs LIGHT_df: p-value = 0.0001
XAUUSD_Candlestick_1_Hour_BID_01_df vs COCOA_df: p-value = 0.0110
ETHUSD_Candlestick_1_Hour_BID_01_df vs COPPER_df: p-value = 0.0158
BRENT_df vs DIESEL_df: p-value = 0.0213
AUDUSD_Candlestick_1_Hour_BID_01_df vs NZDUSD_Candlestick_1_Hour_BID_01_df: p-value = 0.0470
USSC2000_df vs COPPER_df: p-value = 0.0490
DIESEL_df vs LIGHT_df: p-value = 0.0584
SUGAR_df vs USTBOND_df: p-value = 0.0755
UKGILT_df vs USTBOND_df: p-value = 0.0812
USA500_df vs USA30_df: p-value = 0.0852


<h2> Selecting <br>
<h6>BRENT_df vs LIGHT_df: p-value = 0.0001 <br>
<h6>XAUUSD_Candlestick_1_Hour_BID_01_df vs COCOA_df: p-value = 0.0110


In [20]:

# Create a folder to store the CSV files (if it doesn't already exist)
folder_name = "cointegrated_pairs_data"
os.makedirs(folder_name, exist_ok=True)

# Save the DataFrames for the top 2 most cointegrated pairs
BRENT_df.to_csv(os.path.join(folder_name, 'BRENT_df.csv'), index=False)
LIGHT_df.to_csv(os.path.join(folder_name, 'LIGHT_df.csv'), index=False)
XAUUSD_Candlestick_1_Hour_BID_01_df.to_csv(os.path.join(folder_name, 'XAUUSD_Candlestick_1_Hour_BID_01_df.csv'), index=False)
COCOA_df.to_csv(os.path.join(folder_name, 'COCOA_df.csv'), index=False)

print("DataFrames saved successfully in the folder:", folder_name)


DataFrames saved successfully in the folder: cointegrated_pairs_data


In [25]:
import os

# Example fetching for each DataFrame (adjust the names as needed)
BRENT_df = data_dict['BRENT.CMDUSD_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.csv'][['Time', 'Close']]
LIGHT_df = data_dict['LIGHT.CMDUSD_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.csv'][['Time', 'Close']]
XAUUSD_Candlestick_1_Hour_BID_01_df = data_dict['XAUUSD_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.csv'][['Time', 'Close']]
COCOA_df = data_dict['COCOA.CMDUSD_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.csv'][['Time', 'Close']]

# Create a folder to store the CSV files (if it doesn't already exist)
folder_name = "cointegrated_pairs_data"
os.makedirs(folder_name, exist_ok=True)

# Save the DataFrames with the Time column
BRENT_df.to_csv(os.path.join(folder_name, 'BRENT_df.csv'), index=False)
LIGHT_df.to_csv(os.path.join(folder_name, 'LIGHT_df.csv'), index=False)
XAUUSD_Candlestick_1_Hour_BID_01_df.to_csv(os.path.join(folder_name, 'XAUUSD_Candlestick_1_Hour_BID_01_df.csv'), index=False)
COCOA_df.to_csv(os.path.join(folder_name, 'COCOA_df.csv'), index=False)

print("DataFrames with Time column saved successfully in the folder:", folder_name)


DataFrames with Time column saved successfully in the folder: cointegrated_pairs_data


In [23]:
data_dict

{'USDCAD_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.csv':           Open     High      Low    Close  Volume                Time
 0      1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 00:00:00
 1      1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 01:00:00
 2      1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 02:00:00
 3      1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 03:00:00
 4      1.29853  1.29853  1.29853  1.29853     0.0 2020-01-01 04:00:00
 ...        ...      ...      ...      ...     ...                 ...
 40914  1.34923  1.34923  1.34923  1.34923     0.0 2024-08-31 19:00:00
 40915  1.34923  1.34923  1.34923  1.34923     0.0 2024-08-31 20:00:00
 40916  1.34923  1.34923  1.34923  1.34923     0.0 2024-08-31 21:00:00
 40917  1.34923  1.34923  1.34923  1.34923     0.0 2024-08-31 22:00:00
 40918  1.34923  1.34923  1.34923  1.34923     0.0 2024-08-31 23:00:00
 
 [40919 rows x 6 columns],
 'UKGILT.TRGBP_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.

In [27]:
AUDUSD_Candlestick_1_Hour_BID_01_df = data_dict['AUDUSD_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.csv'][['Time', 'Close']]
NZDUSD_Candlestick_1_Hour_BID_01_df = data_dict['NZDUSD_Candlestick_1_Hour_BID_01.01.2020-31.08.2024.csv'][['Time', 'Close']]


# Create a folder to store the CSV files (if it doesn't already exist)
folder_name = "cointegrated_pairs_data"
os.makedirs(folder_name, exist_ok=True)

# Save the DataFrames with the Time column
AUDUSD_Candlestick_1_Hour_BID_01_df.to_csv(os.path.join(folder_name, 'AUDUSD_df.csv'), index=False)
NZDUSD_Candlestick_1_Hour_BID_01_df.to_csv(os.path.join(folder_name, 'NZDUSD_df.csv'), index=False)