In [1]:
# ───────────────────────────────────────────────
# 1. 패키지 준비
# ───────────────────────────────────────────────
import datetime as dt
import os

import numpy as np
import pandas as pd
import pandas_datareader.data as web
import yfinance as yf                # 시가총액 조회용

# Sample windows: each index has its own start date; both share the same end date.
dj_START = dt.datetime(year=2010, month=1, day=1)
snp_START = dt.datetime(year=2005, month=1, day=1)
END = dt.datetime(year=2024, month=12, day=31)

# ───────────────────────────────────────────────
# 2. 지수 구성 종목 크롤링
# ───────────────────────────────────────────────
def get_ticker_list(url, symbol_col='Symbol'):
    """Scrape ticker symbols from the HTML tables found at ``url``.

    The tables are scanned in page order and the first one that has a column
    named ``symbol_col`` is used.

    Parameters
    ----------
    url : str
        Location handed to ``pandas.read_html`` (parsed with the "bs4" flavor).
    symbol_col : str, default 'Symbol'
        Name of the column that holds the ticker symbols.

    Returns
    -------
    list
        The symbols of the first matching table, with every '.' replaced
        by '-' (e.g. 'BRK.B' becomes 'BRK-B').

    Raises
    ------
    ValueError
        If none of the parsed tables has a ``symbol_col`` column. Previously the
        function fell off the end and returned ``None``, which only surfaced
        later as an unrelated error in the caller.
    """
    tables = pd.read_html(url, flavor="bs4")
    # Use the first table that actually carries the symbol column.
    for tbl in tables:
        if symbol_col in tbl.columns:
            # Literal (non-regex) replacement of the dot separator.
            return tbl[symbol_col].str.replace('.', '-', regex=False).tolist()
    raise ValueError(f"no table with a {symbol_col!r} column found at {url!r}")

# Hard-coded Dow universe: 18 tickers, labelled "2010 to 2024" by the author
# (presumably the names that stayed in the index over that window — verify).
dj_tickers = [
    "MMM", "AXP", "BA", "CAT", "CVX", "CSCO",
    "KO", "DIS", "HD", "IBM", "JNJ", "MCD",
    "MRK", "MSFT", "PG", "TRV", "VZ", "WMT",
]

# Hard-coded S&P universe: 30 tickers, labelled "2005 to 2024 (top 30)" by the author.
snp_tickers = [
    "MSFT", "NVDA", "AAPL", "WMT", "JPM",
    "LLY", "COST", "XOM", "ORCL", "PG",
    "JNJ", "HD", "BAC", "KO", "UNH",
    "CSCO", "GE", "CVX", "WFC", "IBM",
    "ABT", "MCD", "LIN", "MS", "INTU",
    "AXP", "T", "DIS", "MRK", "ACN",
]


# ───────────────────────────────────────────────
# 4. Download price data (adjusted close)
# ───────────────────────────────────────────────
# Using yfinance instead of pandas_datareader due to API changes.
#
# auto_adjust=True is passed explicitly so that "Close" is the adjusted close this
# section promises; the library default for this flag has changed between yfinance
# releases (it prints a notice when the flag is omitted), so relying on it is fragile.
#
# yfinance treats `end` as exclusive, so one day is added to keep the END date
# itself (2024-12-31) in the sample.
dj_prices = yf.download(
    dj_tickers,
    start=dj_START,
    end=END + dt.timedelta(days=1),
    auto_adjust=True,
)["Close"]
snp_prices = yf.download(
    snp_tickers,
    start=snp_START,
    end=END + dt.timedelta(days=1),
    auto_adjust=True,
)["Close"]


# Missing-value check: number of NaN prices per ticker.
# NOTE(review): the printed labels say "DOW 30" / "S&P 100 Top 50" while the ticker
# lists hold 18 and 30 names — confirm the intended labels.
print("DOW 30 종목 결측치:\n", dj_prices.isnull().sum())
print("S&P 100 Top 50 종목 결측치:\n", snp_prices.isnull().sum())

# ───────────────────────────────────────────────
# 6. Simple (arithmetic) returns
# ───────────────────────────────────────────────
# fill_method=None: do not forward-fill gaps before differencing. The implicit
# default ('pad') is deprecated in pandas and would turn a missing price into a
# fabricated 0% return; with None, a gap yields NaN and the row is dropped below.
# dropna() removes every date on which at least one ticker has no return
# (always including the first date, which has no previous price).
dj_ret = dj_prices.pct_change(fill_method=None).dropna()
snp_ret = snp_prices.pct_change(fill_method=None).dropna()

# Summary of the universe sizes.
# NOTE(review): dropna() above drops rows (dates), not columns, so the "retained
# tickers" counts printed below always equal the number of downloaded columns;
# no ticker is actually excluded here. The two "retained" lines are printed in
# the order Dow, then S&P.
print(f"DOW 30\t: {len(dj_tickers)} tickers")
print(f"S&P100 Top 50\t: {len(snp_tickers)} tickers")
print(f"최종 유지된 종목 수\t: {dj_ret.shape[1]} (누락·상장일 부족 종목 제외)")
print(f"최종 유지된 종목 수\t: {snp_ret.shape[1]} (누락·상장일 부족 종목 제외)")


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  18 of 18 completed
[*********************100%***********************]  30 of 30 completed

DOW 30 종목 결측치:
 Ticker
AXP     0
BA      0
CAT     0
CSCO    0
CVX     0
DIS     0
HD      0
IBM     0
JNJ     0
KO      0
MCD     0
MMM     0
MRK     0
MSFT    0
PG      0
TRV     0
VZ      0
WMT     0
dtype: int64
S&P 100 Top 50 종목 결측치:
 Ticker
AAPL    0
ABT     0
ACN     0
AXP     0
BAC     0
COST    0
CSCO    0
CVX     0
DIS     0
GE      0
HD      0
IBM     0
INTU    0
JNJ     0
JPM     0
KO      0
LIN     0
LLY     0
MCD     0
MRK     0
MS      0
MSFT    0
NVDA    0
ORCL    0
PG      0
T       0
UNH     0
WFC     0
WMT     0
XOM     0
dtype: int64
DOW 30	: 18 tickers
S&P100 Top 50	: 30 tickers
최종 유지된 종목 수	: 18 (누락·상장일 부족 종목 제외)
최종 유지된 종목 수	: 30 (누락·상장일 부족 종목 제외)





In [2]:
# Preview the first 15 rows of the Dow price panel.
dj_prices.head(n=15)

Ticker,AXP,BA,CAT,CSCO,CVX,DIS,HD,IBM,JNJ,KO,MCD,MMM,MRK,MSFT,PG,TRV,VZ,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-01-04,32.720375,43.777557,39.547813,16.255333,42.603546,27.47588,19.810135,73.778816,40.97364,17.692522,41.03273,43.356606,20.993826,23.211441,38.950218,34.578194,14.213722,13.017121
2010-01-05,32.648418,45.211338,40.020622,16.182915,42.905327,27.407341,19.95524,72.887573,40.498512,17.478495,40.718994,43.085033,21.078924,23.218939,38.962986,33.759029,14.23935,12.887506
2010-01-06,33.176144,46.582798,40.142208,16.077576,42.910725,27.261692,19.886143,72.414101,40.827919,17.472296,40.163433,43.696068,21.362549,23.076448,38.77816,33.280029,13.829906,12.858699
2010-01-07,33.714233,48.468548,40.304317,16.149996,42.749062,27.270262,20.121069,72.163429,40.53653,17.428862,40.457558,43.727398,21.396576,22.836456,38.56786,33.759029,13.747586,12.8659
2010-01-08,33.690147,48.001011,40.756859,16.235582,42.824482,27.313097,20.024338,72.887573,40.675896,17.106283,40.418343,44.035519,21.385235,22.99395,38.51688,33.710449,13.756249,12.801088
2010-01-11,33.304638,47.432163,43.316849,16.189491,43.584301,26.867586,19.457741,72.124413,40.682213,17.453678,40.732059,43.857948,21.470327,22.701462,38.363937,33.696556,13.81258,13.012322
2010-01-12,33.746357,47.089302,42.040241,15.93273,43.331051,26.404942,19.33337,72.698151,40.89761,17.642889,40.954288,43.894524,21.300148,22.551472,38.803661,33.911758,13.825578,13.137136
2010-01-13,33.850765,47.658157,42.101025,16.222414,43.002331,26.807617,19.437014,72.542221,41.157341,17.689413,40.90855,43.753502,22.082951,22.761457,39.205139,34.029781,13.808244,13.204349
2010-01-14,34.276394,47.969833,41.864624,16.426517,42.867596,26.576294,19.554472,73.700829,41.239693,17.720438,40.947754,43.607288,22.684238,23.218939,39.24337,34.210262,13.526623,13.012322
2010-01-15,34.043499,47.393204,40.885323,16.064407,42.69516,26.216461,19.741035,73.405586,40.89761,17.459881,40.705925,43.539371,22.389259,23.143944,38.759048,33.870113,13.249331,12.885101


In [3]:
# Preview the first 15 rows of the Dow return panel.
dj_ret.head(n=15)

Ticker,AXP,BA,CAT,CSCO,CVX,DIS,HD,IBM,JNJ,KO,MCD,MMM,MRK,MSFT,PG,TRV,VZ,WMT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-01-05,-0.002199,0.032751,0.011955,-0.004455,0.007083,-0.002495,0.007325,-0.01208,-0.011596,-0.012097,-0.007646,-0.006264,0.004053,0.000323,0.000328,-0.02369,0.001803,-0.009957
2010-01-06,0.016164,0.030334,0.003038,-0.006509,0.000126,-0.005314,-0.003463,-0.006496,0.008134,-0.000355,-0.013644,0.014182,0.013455,-0.006137,-0.004744,-0.014189,-0.028754,-0.002235
2010-01-07,0.016219,0.040482,0.004038,0.004504,-0.003767,0.000314,0.011814,-0.003462,-0.007137,-0.002486,0.007323,0.000717,0.001593,-0.0104,-0.005423,0.014393,-0.005952,0.00056
2010-01-08,-0.000714,-0.009646,0.011228,0.005299,0.001764,0.001571,-0.004807,0.010035,0.003438,-0.018508,-0.000969,0.007046,-0.00053,0.006897,-0.001322,-0.001439,0.00063,-0.005037
2010-01-11,-0.011443,-0.011851,0.062811,-0.002839,0.017743,-0.016311,-0.028295,-0.01047,0.000155,0.020308,0.007762,-0.004032,0.003979,-0.01272,-0.003971,-0.000412,0.004095,0.016501
2010-01-12,0.013263,-0.007228,-0.029471,-0.01586,-0.005811,-0.017219,-0.006392,0.007955,0.005295,0.010841,0.005456,0.000834,-0.007926,-0.006607,0.011462,0.006386,0.000941,0.009592
2010-01-13,0.003094,0.01208,0.001446,0.018182,-0.007586,0.01525,0.005361,-0.002145,0.006351,0.002637,-0.001117,-0.003213,0.036751,0.009311,0.010346,0.00348,-0.001254,0.005116
2010-01-14,0.012574,0.00654,-0.005615,0.012582,-0.003133,-0.008629,0.006043,0.015971,0.002001,0.001754,0.000958,-0.003342,0.027229,0.020099,0.000975,0.005304,-0.020395,-0.014543
2010-01-15,-0.006795,-0.012021,-0.023392,-0.022044,-0.004023,-0.01354,0.009541,-0.004006,-0.008295,-0.014704,-0.005906,-0.001557,-0.013004,-0.00323,-0.012341,-0.009943,-0.0205,-0.009777
2010-01-19,0.013446,-0.002795,0.013473,0.018443,0.00568,0.013399,0.010851,0.017909,0.012237,0.002309,0.019268,0.020991,0.029136,0.007777,0.014798,0.013117,0.021256,0.00652


In [4]:
# Write the price panels to CSV.
# to_csv does not create missing parent folders, so make sure 'dataset/' exists
# first; otherwise this cell fails on a fresh checkout.
os.makedirs('dataset', exist_ok=True)
dj_prices.to_csv('dataset/dow_30.csv')
snp_prices.to_csv('dataset/snp_500.csv')

In [6]:
# Write the return panels to CSV.
# to_csv does not create missing parent folders, so make sure 'dataset/' exists
# first; otherwise this cell fails on a fresh checkout.
# NOTE(review): these names differ from the price files ('dow_30.csv',
# 'snp_500.csv') only by an underscore — easy to mix up when loading.
os.makedirs('dataset', exist_ok=True)
dj_ret.to_csv('dataset/dow30.csv')
snp_ret.to_csv('dataset/snp500.csv')