In [1]:
import numpy as np
import pandas as pd

DATA = '/kaggle/input/2019-2024-us-stock-market-data/Stock Market Dataset.csv'
# The file stores dates day-first (e.g. 04-02-2019 is 4 February 2019).
DATE_FORMAT = '%d-%m-%Y'

# The first CSV column is a saved row index; it is read as the index here and
# discarded by reset_index(drop=True) below. thousands=',' lets values such as
# "1,319.3" parse as numbers instead of strings.
df = pd.read_csv(filepath_or_buffer=DATA, index_col=[0], parse_dates=['Date'], thousands=',', date_format=DATE_FORMAT)
# No-op when read_csv already produced datetimes. If it left the column as
# strings, the explicit format stops pandas from guessing month-first and makes
# a malformed value fail loudly instead of being silently reinterpreted.
df['Date'] = pd.to_datetime(df['Date'], format=DATE_FORMAT)
# Put the rows in chronological order with a fresh 0..n-1 index.
df = df.sort_values(by='Date', ascending=True).reset_index(drop=True)

df.head()

Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,Platinum_Price,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,2019-02-04,2.66,116490.0,54.56,622470.0,2.7975,490.0,3462.8,503920.0,822.5,...,312000,310.0,351.34,9050000.0,81.67,98580000.0,169.25,20040000.0,1319.3,159560.0
1,2019-02-05,2.662,82250.0,53.66,609760.0,2.8205,90.0,3468.4,460950.0,821.35,...,310700,360.0,355.81,9050000.0,82.94,89060000.0,171.16,22560000.0,1319.2,129010.0
2,2019-02-06,2.662,98330.0,54.01,606720.0,2.84,100.0,3404.3,514210.0,807.1,...,308810,120.0,352.19,6720000.0,82.01,78800000.0,170.49,13280000.0,1314.4,137250.0
3,2019-02-07,2.551,211790.0,52.64,749010.0,2.832,320.0,3397.7,471360.0,800.8,...,302813,240.0,344.71,7860000.0,80.72,92530000.0,166.38,17520000.0,1314.2,166760.0
4,2019-02-08,2.583,147880.0,52.72,621000.0,2.814,270.0,3661.7,699230.0,802.2,...,300771,240.0,347.57,7560000.0,79.41,113150000.0,167.33,12560000.0,1318.5,150610.0


In [2]:
# Per-column summary of the loaded frame: dtype and non-null count for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 38 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1243 non-null   datetime64[ns]
 1   Natural_Gas_Price  1243 non-null   float64       
 2   Natural_Gas_Vol.   1239 non-null   float64       
 3   Crude_oil_Price    1243 non-null   float64       
 4   Crude_oil_Vol.     1220 non-null   float64       
 5   Copper_Price       1243 non-null   float64       
 6   Copper_Vol.        1206 non-null   float64       
 7   Bitcoin_Price      1243 non-null   float64       
 8   Bitcoin_Vol.       1243 non-null   float64       
 9   Platinum_Price     1243 non-null   float64       
 10  Platinum_Vol.      636 non-null    float64       
 11  Ethereum_Price     1243 non-null   float64       
 12  Ethereum_Vol.      1243 non-null   float64       
 13  S&P_500_Price      1243 non-null   float64       
 14  Nasdaq_1

In [3]:
# Keep every column whose name does not end in 'Vol.' -- that leaves Date plus
# the price series and drops the traded-volume columns.
price_columns = list(df.columns[~df.columns.str.endswith('Vol.')])
price_df = df.loc[:, price_columns]
price_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243 entries, 0 to 1242
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               1243 non-null   datetime64[ns]
 1   Natural_Gas_Price  1243 non-null   float64       
 2   Crude_oil_Price    1243 non-null   float64       
 3   Copper_Price       1243 non-null   float64       
 4   Bitcoin_Price      1243 non-null   float64       
 5   Platinum_Price     1243 non-null   float64       
 6   Ethereum_Price     1243 non-null   float64       
 7   S&P_500_Price      1243 non-null   float64       
 8   Nasdaq_100_Price   1243 non-null   float64       
 9   Apple_Price        1243 non-null   float64       
 10  Tesla_Price        1243 non-null   float64       
 11  Microsoft_Price    1243 non-null   float64       
 12  Silver_Price       1243 non-null   float64       
 13  Google_Price       1243 non-null   float64       
 14  Nvidia_P

Let's look at some price correlations; we sort of expect these groups to move together:
* Commodities - gas, oil, and metals
* Stocks including stock indices
* Cryptocurrencies

To a lesser degree we expect the cryptocurrencies to move with the stocks and against the commodities. 

In [4]:
from plotly.express import imshow

# Pairwise correlation between the numeric price series (numeric_only skips the
# Date column), rendered as a heatmap.
price_correlations = price_df.corr(numeric_only=True)
imshow(img=price_correlations)

This is neat and everything but hard to read; let's try grouping similar things together.

In [5]:
# Column order for the heatmap, assembled one asset class at a time so related
# series sit next to each other on both axes.
commodity_columns = [
    'Natural_Gas_Price', 'Crude_oil_Price', 'Platinum_Price',
    'Silver_Price', 'Copper_Price', 'Gold_Price',
]
single_stock_columns = [
    'Apple_Price', 'Tesla_Price', 'Microsoft_Price', 'Google_Price', 'Nvidia_Price',
    'Netflix_Price', 'Amazon_Price', 'Meta_Price', 'Berkshire_Price',
]
stock_index_columns = ['S&P_500_Price', 'Nasdaq_100_Price']
crypto_columns = ['Bitcoin_Price', 'Ethereum_Price']

sorted_columns = [
    *commodity_columns,
    *single_stock_columns,
    *stock_index_columns,
    *crypto_columns,
]

grouped_prices = price_df[sorted_columns]
imshow(img=grouped_prices.corr())

Is this any better? We have the commodities first, then the single issues, then the stock indices, then the cryptocurrencies. We do tend to see some in-group correlations that are higher than most out-of-group correlations.

In [6]:
from plotly.express import histogram

correlations_df = price_df[sorted_columns].corr()
# Blank out only the diagonal (each series against itself, always 1.0).
# Masking by position rather than by a "< 0.999" threshold keeps any genuinely
# near-perfect correlation between two *different* series in the running.
on_diagonal = np.eye(len(correlations_df), dtype=bool)
correlations_df = correlations_df.mask(on_diagonal)

# For every series: which other series it correlates with most strongly, and
# how strongly. Both reductions skip the NaN diagonal.
best_partner = correlations_df.idxmax()
best_correlation = correlations_df.max()
# Maps 'Series/BestPartner' -> correlation, in sorted_columns order.
most_correlated = {
    '{}/{}'.format(column, best_partner[column]): best_correlation[column]
    for column in correlations_df.columns
}

most_correlated_df = pd.DataFrame(
    data=list(most_correlated.items()), columns=['Pair', 'Correlation']
).sort_values(by='Correlation')
# px.histogram sums y per x value; every Pair label is unique, so each bar is
# simply that pair's correlation.
histogram(data_frame=most_correlated_df, x='Pair', y='Correlation')


Our intuition about groups has mostly been borne out; notable exceptions are Oil x Berkshire Hathaway, Copper x Ethereum, Gold x Apple, and maybe Berkshire x Apple. Let's plot some of these interesting pairs and see what these correlations look like.

In [7]:
from plotly.express import line
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every price series so two assets with very different price
# levels can be compared on one shared y-axis.
value_columns = price_df.columns.drop('Date')  # select by name, not by position
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(
    data=scaler.fit_transform(X=price_df[value_columns].values),
    columns=value_columns,
    index=price_df.index,  # keep rows aligned with price_df for the Date assignment
)
scaled_df['Date'] = price_df['Date']


def _distinct_pairs(ranked_pairs):
    """Drop mirrored duplicates from (label, correlation) tuples.

    most_correlated holds one entry per series, so a mutually best pair shows
    up twice ('A/B' and 'B/A'). The strongest pair overall is always mutual,
    which would otherwise put the same chart on screen twice. The first
    occurrence of each unordered pair is kept and input order is preserved.
    """
    seen = set()
    distinct = []
    for label, correlation in ranked_pairs:
        members = frozenset(label.split('/'))
        if members not in seen:
            seen.add(members)
            distinct.append((label, correlation))
    return distinct


def _plot_pairs(pairs):
    """Show one line chart of the two scaled price series for each pair."""
    for label, correlation in pairs:
        line(
            data_frame=scaled_df,
            x='Date',
            y=label.split('/'),
            height=600,
            title='{} (correlation {:.3f})'.format(label, correlation),
        ).show()


# Rank once, weakest best-match first.
ranked_pairs = _distinct_pairs(sorted(most_correlated.items(), key=lambda item: item[1]))

# the four cases with the lowest high correlations (the low end of the histogram above)
_plot_pairs(ranked_pairs[:4])

# the three highest correlations
_plot_pairs(ranked_pairs[-3:])