In [76]:
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt

In [92]:
def get_cleaned_log_returns(data, start_year, end_year):
  period = (data.index <= str(end_year)) & (data.index >= str(start_year))
  sliced_data = data['Close'][period]
  clean_data = sliced_data.dropna(axis=1)
  log_returns = np.log(clean_data / clean_data.shift(1))
  return log_returns.dropna()

In [98]:
def make_shape_table(data, start_year_range, end_year_range):
  shape_table = pd.DataFrame(index=start_year_range, columns=end_year_range)
  for start_year in start_year_range:
    for end_year in end_year_range:
      clean_data = get_cleaned_log_returns(data, start_year, end_year)
      shape_table.loc[start_year, end_year] = clean_data.shape[1]
  return shape_table

# Stocks

In [79]:
stocks_tickers = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    )[0]['Symbol'].tolist()

stocks = yf.download(stocks_tickers, start='1900-01-01', end='2024-02-16')

[*********************100%%**********************]  503 of 503 completed
ERROR:yfinance:
2 Failed downloads:
ERROR:yfinance:['BRK.B']: Exception('%ticker%: No timezone found, symbol may be delisted')
ERROR:yfinance:['BF.B']: Exception('%ticker%: 1d data not available for startTime=-2208971040 and endTime=1708059600. Only 100 years worth of day granularity data are allowed to be fetched per request.')


In [99]:
make_shape_table(stocks, range(1990, 2005), range(2015, 2024))

Unnamed: 0,2015,2016,2017,2018,2019,2020,2021,2022,2023
1990,247,247,247,247,247,247,247,247,247
1991,252,252,252,252,252,252,252,252,252
1992,263,263,263,263,263,263,263,263,263
1993,273,273,273,273,273,273,273,273,273
1994,290,290,290,290,290,290,290,290,290
1995,302,302,302,302,302,302,302,302,302
1996,318,318,318,318,318,318,318,318,318
1997,323,323,323,323,323,323,323,323,323
1998,338,338,338,338,338,338,338,338,338
1999,348,348,348,348,348,348,348,348,348


In [100]:
stocks_log_returns = get_cleaned_log_returns(stocks, 2001, 2024)
stocks_log_returns.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5784 entries, 2001-01-03 to 2023-12-29
Columns: 369 entries, A to ZION
dtypes: float64(369)
memory usage: 16.3 MB


# Commodities

In [96]:
commodities_tickers = ['GC=F', 'SI=F', 'CL=F', 'C=F', 'S=F', 'HG=F', 'PL=F',
                       'PA=F', 'NG=F', 'HE=F', 'CC=F', 'CT=F', 'SB=F', 'ALI=F',
                       'NI=F', 'PB=F', 'HG=F', 'SN=F', 'LCO=F', 'OP=F', 'WO=F',
                       'KC=F', 'C=F', 'ZR=F', 'CO=F', 'LIT-USD', 'UX-U21.NYMEX',
                       'HRC=F', 'DC=F', 'MTF=F']
commodities = yf.download(commodities_tickers,
                          start='1900-01-01',
                          end='2024-02-16')

[*********************100%%**********************]  27 of 28 completed
ERROR:yfinance:
10 Failed downloads:
[*********************100%%**********************]  27 of 28 completedERROR:yfinance:['CO=F', 'WO=F', 'PB=F', 'NI=F', 'LCO=F', 'SN=F', 'UX-U21.NYMEX', 'OP=F']: Exception('%ticker%: No timezone found, symbol may be delisted')
ERROR:yfinance:['S=F', 'C=F']: Exception('%ticker%: No price data found, symbol may be delisted (1d 1900-01-01 -> 2024-02-16)')


In [119]:
make_shape_table(commodities, range(2005, 2014), range(2015, 2024))

Unnamed: 0,2015,2016,2017,2018,2019,2020,2021,2022,2023
2005,0,0,0,0,0,0,0,0,0
2006,0,0,0,0,0,0,0,0,0
2007,1,1,1,1,0,0,0,0,0
2008,6,5,5,5,2,0,0,0,0
2009,6,5,5,5,2,0,0,0,0
2010,8,7,6,6,3,0,0,0,0
2011,10,9,7,7,4,0,0,0,0
2012,12,11,9,9,6,0,0,0,0
2013,12,11,9,9,6,0,0,0,0


In [105]:
commodities_log_returns = get_cleaned_log_returns(commodities, 2008, 2018)
commodities_log_returns.head()

Ticker,CL=F,GC=F,HG=F,NG=F,SI=F
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-01-03,-0.004427,0.010909,0.039372,-0.022676,0.014076
2008-01-04,-0.012888,-0.003816,-0.009977,0.021528,-0.002343
2008-01-07,-0.029225,-0.004063,-0.005266,0.004835,-0.010876
2008-01-08,0.012956,0.021179,0.046425,0.011107,0.034128
2008-01-09,-0.006875,0.001707,-0.002753,0.016433,0.001718


Это золото, серебро, медь, нефть и природный газ

# Cryptos

In [109]:
cryptos_tickers = ['BTC-USD', 'ETH-USD', 'USDT-USD', 'BNB-USD', 'XRP-USD',
                   'ADA-USD', 'SOL-USD', 'AVAX-USD', 'DOGE-USD', 'LUNA1-USD',
                   'DOT-USD', 'SHIB-USD', 'LINK-USD', 'CRO-USD', 'LTC-USD',
                   'ALGO-USD', 'WBTC-USD', 'MATIC-USD', 'ICP-USD', 'ATOM1-USD',
                   'VET-USD', 'FIL-USD', 'TRX-USD', 'THETA-USD', 'XLM-USD',
                   'FTT-USD', 'UST-USD', 'HBAR-USD', 'XTZ-USD']
cryptos = yf.download(cryptos_tickers, start='1900-01-01', end='2024-02-16')

[*********************100%%**********************]  29 of 29 completed
ERROR:yfinance:
1 Failed download:
ERROR:yfinance:['ATOM1-USD']: Exception('%ticker%: No price data found, symbol may be delisted (1d 1900-01-01 -> 2024-02-16)')


In [112]:
make_shape_table(cryptos, range(2015, 2021), range(2020, 2025))

Unnamed: 0,2020,2021,2022,2023,2024
2015,2,2,2,2,2
2016,2,2,2,2,2
2017,2,2,2,2,2
2018,13,13,13,13,13
2019,16,16,16,16,16
2020,22,22,22,21,21


In [114]:
cryptos_log_returns = get_cleaned_log_returns(cryptos, 2018, 2024)
cryptos_log_returns.head()

Ticker,ADA-USD,BNB-USD,BTC-USD,DOGE-USD,ETH-USD,FIL-USD,LINK-USD,LTC-USD,TRX-USD,USDT-USD,XLM-USD,XRP-USD,XTZ-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-02,0.071402,0.049065,0.092589,0.026145,0.135145,0.031848,-0.085111,0.110076,0.420053,-0.002366,0.162609,0.036897,0.202254
2018-01-03,0.321796,0.076027,0.014505,0.018955,0.084803,-0.014177,0.011005,-0.041183,0.185331,0.008462,0.461782,0.224511,0.246077
2018-01-04,0.031419,-0.034339,0.025858,0.034173,0.01873,0.029337,0.368192,-0.016428,0.786667,-0.010824,-0.213333,0.028964,-0.141638
2018-01-05,-0.108506,0.481792,0.110945,0.232391,0.01698,0.166555,-0.081322,0.03221,0.058734,-0.003894,-0.08852,-0.047379,-0.11492
2018-01-06,0.027229,0.422481,0.005578,0.200148,0.043117,0.014473,0.158708,0.173351,-0.232522,0.010317,0.068847,0.014736,-0.006403


# Save to CSV

In [115]:
stocks_log_returns.head()

Ticker,A,AAPL,ABT,ACGL,ADBE,ADI,ADM,ADP,ADSK,AEE,...,WMT,WRB,WST,WY,XEL,XOM,XRAY,YUM,ZBRA,ZION
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-01-03,0.09821,0.096075,-0.022325,0.0,0.214885,0.148053,-0.043675,-0.021375,0.06134,-0.043823,...,0.081291,-0.012154,0.040005,0.024632,-0.035959,-0.044452,-0.052644,0.0552,0.102258,0.032823
2001-01-04,0.037163,0.041128,-0.096107,0.0,-0.075021,-0.044556,0.004454,-0.069213,0.07067,-0.076556,...,-0.039263,-0.144461,-0.002454,0.02996,-0.07358,-0.028255,-0.066342,0.00185,-0.004003,-0.020388
2001-01-05,-0.056275,-0.041128,0.005831,0.0,-0.042762,-0.090287,-0.017938,-0.032496,-0.010718,-0.001561,...,-0.040868,0.017121,-0.007398,-0.003548,0.00246,0.004515,0.010772,-0.031928,-0.029853,-0.011393
2001-01-08,-0.033471,0.011385,-0.010227,0.021323,-0.038347,-0.034716,0.061423,0.070359,-0.006487,0.029255,...,0.0,0.034878,-0.00995,-0.020348,0.01221,-0.004515,0.029905,-0.005742,-0.015268,0.002081
2001-01-09,-0.004706,0.037042,0.021787,0.004211,0.005031,-0.002721,0.0,0.030321,0.141364,-0.001519,...,-0.022262,-0.036423,0.004988,-0.033193,0.007255,-0.010614,-0.029905,-0.005775,0.04512,-0.010449


In [116]:
commodities_log_returns.head()

Ticker,CL=F,GC=F,HG=F,NG=F,SI=F
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-01-03,-0.004427,0.010909,0.039372,-0.022676,0.014076
2008-01-04,-0.012888,-0.003816,-0.009977,0.021528,-0.002343
2008-01-07,-0.029225,-0.004063,-0.005266,0.004835,-0.010876
2008-01-08,0.012956,0.021179,0.046425,0.011107,0.034128
2008-01-09,-0.006875,0.001707,-0.002753,0.016433,0.001718


In [117]:
cryptos_log_returns.head()

Ticker,ADA-USD,BNB-USD,BTC-USD,DOGE-USD,ETH-USD,FIL-USD,LINK-USD,LTC-USD,TRX-USD,USDT-USD,XLM-USD,XRP-USD,XTZ-USD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2018-01-02,0.071402,0.049065,0.092589,0.026145,0.135145,0.031848,-0.085111,0.110076,0.420053,-0.002366,0.162609,0.036897,0.202254
2018-01-03,0.321796,0.076027,0.014505,0.018955,0.084803,-0.014177,0.011005,-0.041183,0.185331,0.008462,0.461782,0.224511,0.246077
2018-01-04,0.031419,-0.034339,0.025858,0.034173,0.01873,0.029337,0.368192,-0.016428,0.786667,-0.010824,-0.213333,0.028964,-0.141638
2018-01-05,-0.108506,0.481792,0.110945,0.232391,0.01698,0.166555,-0.081322,0.03221,0.058734,-0.003894,-0.08852,-0.047379,-0.11492
2018-01-06,0.027229,0.422481,0.005578,0.200148,0.043117,0.014473,0.158708,0.173351,-0.232522,0.010317,0.068847,0.014736,-0.006403


In [118]:
stocks_log_returns.to_csv('stocks.csv')
commodities_log_returns.to_csv('commodities.csv')
cryptos_log_returns.to_csv('cryptos.csv')

Получилось три разных датасета: большой размерности (d = 369) и с большим количеством данных (14 лет) для стоков, маленькой размерности (d = 4) и с большим количеством данных (11 лет) для коммодити и средней размерности (d = 10) и с небольшим количеством данных (6 лет) для крипты.