In [1]:
import numpy as np
import pandas as pd

pd.options.plotting.backend = 'plotly'

DATA = '/kaggle/input/2019-2024-us-stock-market-data/Stock Market Dataset.csv'

df = pd.read_csv(filepath_or_buffer=DATA, index_col=[0], parse_dates=['Date'], thousands=',')

df.head()

Unnamed: 0,Date,Natural_Gas_Price,Natural_Gas_Vol.,Crude_oil_Price,Crude_oil_Vol.,Copper_Price,Copper_Vol.,Bitcoin_Price,Bitcoin_Vol.,Platinum_Price,...,Berkshire_Price,Berkshire_Vol.,Netflix_Price,Netflix_Vol.,Amazon_Price,Amazon_Vol.,Meta_Price,Meta_Vol.,Gold_Price,Gold_Vol.
0,02-02-2024,2.079,,72.28,,3.8215,,43194.7,42650.0,901.6,...,589498,10580.0,564.64,4030000.0,171.81,117220000.0,474.99,84710000.0,2053.7,
1,01-02-2024,2.05,161340.0,73.82,577940.0,3.8535,,43081.4,47690.0,922.3,...,581600,9780.0,567.51,3150000.0,159.28,66360000.0,394.78,25140000.0,2071.1,260920.0
2,31-01-2024,2.1,142860.0,75.85,344490.0,3.906,,42580.5,56480.0,932.6,...,578020,9720.0,564.11,4830000.0,155.2,49690000.0,390.14,20010000.0,2067.4,238370.0
3,30-01-2024,2.077,139750.0,77.82,347240.0,3.911,,42946.2,55130.0,931.7,...,584680,9750.0,562.85,6120000.0,159.0,42290000.0,400.06,18610000.0,2050.9,214590.0
4,29-01-2024,2.49,3590.0,76.78,331930.0,3.879,,43299.8,45230.0,938.3,...,578800,13850.0,575.79,6880000.0,161.26,42840000.0,401.02,17790000.0,2034.9,1780.0


In [2]:
price_columns = [column for column in df.columns if not column.endswith('Vol.')]
price_df = df[price_columns]
price_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1243 entries, 0 to 1242
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               1243 non-null   object 
 1   Natural_Gas_Price  1243 non-null   float64
 2   Crude_oil_Price    1243 non-null   float64
 3   Copper_Price       1243 non-null   float64
 4   Bitcoin_Price      1243 non-null   float64
 5   Platinum_Price     1243 non-null   float64
 6   Ethereum_Price     1243 non-null   float64
 7   S&P_500_Price      1243 non-null   float64
 8   Nasdaq_100_Price   1243 non-null   float64
 9   Apple_Price        1243 non-null   float64
 10  Tesla_Price        1243 non-null   float64
 11  Microsoft_Price    1243 non-null   float64
 12  Silver_Price       1243 non-null   float64
 13  Google_Price       1243 non-null   float64
 14  Nvidia_Price       1243 non-null   float64
 15  Berkshire_Price    1243 non-null   int64  
 16  Netflix_Price      1243 non-n

Let's look at some price correlations; we sort of expect these groups to move together:
* Commodities - gas, oil, and metals
* Stocks including stock indices
* Cryptocurrencies

To a lesser degree we expect the cryptocurrencies to move with the stocks and against the commodities. 

In [3]:
from plotly.express import imshow
imshow(img=price_df.corr(numeric_only=True))

This is neat and everything but hard to read; let's try grouping similar things together.

In [4]:
sorted_columns = ['Natural_Gas_Price', 'Crude_oil_Price', 'Platinum_Price', 'Silver_Price', 'Copper_Price', 'Gold_Price', 
                  'Apple_Price', 'Tesla_Price', 'Microsoft_Price', 'Google_Price', 'Nvidia_Price', 'Netflix_Price',
                  'Amazon_Price', 'Meta_Price', 'Berkshire_Price', 'S&P_500_Price', 'Nasdaq_100_Price', 'Bitcoin_Price', 'Ethereum_Price',]
imshow(img=price_df[sorted_columns].corr())

Is this any better? We have the commodities first, then the single issues, then the stock indices, then the cryptocurrencies. We do tend to see some in-group correlations that are higher than most out-of-group correlations.

In [5]:
from plotly.express import histogram
correlations_df = price_df[sorted_columns].corr()
correlations_df = correlations_df[ correlations_df < 0.999] # remove autocorrelations, which are always 1.0
most_correlated = {'{}/{}'.format(column, correlations_df.columns[correlations_df[column].argmax()]) : 
    correlations_df[column].iloc[correlations_df[column].argmax()] for column in correlations_df.columns}
histogram(data_frame=pd.DataFrame(data=most_correlated.items(), columns=['Pair', 'Correlation']).sort_values(by='Correlation'), x='Pair', y='Correlation')


Our intuition about groups has mostly been borne out; notable exceptions are Oil x Berkshire Hathaway, Copper x Etherium, Gold x Apple, and maybe Berkshire x Apple. Let's plot some of these interesting pairs and see what these correlations look like.

In [6]:
from plotly.express import line
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_df = pd.DataFrame(data=scaler.fit_transform(X=price_df[price_df.columns[1:]].values), columns=price_df.columns[1:])
scaled_df['Date'] = price_df['Date']

line(data_frame=scaled_df, x='Date', y=[])