In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import yfinance as yf

# Data Collection
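We collected daily historical data for 8 tickers (six stocks, one market index and one interest-rate series) from Yahoo Finance using the `yfinance` library, covering 1 September 2017 up to, but not including, 1 October 2024 (the end date passed to `yfinance` is exclusive).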
- AAPL
- MSFT
- GOOGL
- AMZN
- TSLA
- META
- ^GSPC: S&P 500 Index (market index)
- ^TNX: U.S. 10-Year Treasury yield (interest rate)

In [2]:
def download_stock_data(ticker, start_date, end_date):
    """
    Download historical price data for one ticker from Yahoo Finance.

    The result always has a flat column header that includes 'Adj Close',
    so it can be written with ``to_csv`` and read back with a plain
    ``index_col='Date'``.

    :param ticker: Ticker symbol (e.g., AAPL for Apple, ^GSPC for the S&P 500).
    :param start_date: Start date for the data (YYYY-MM-DD).
    :param end_date: End date for the data (YYYY-MM-DD).
    :return: DataFrame of price data as returned by ``yf.download``, with a
        single-ticker two-level column header collapsed to its first level.
    """
    # Pin auto_adjust explicitly: yfinance releases differ in the default, and
    # with auto_adjust=True the 'Adj Close' column is not returned at all.
    stock_data = yf.download(
        ticker, start=start_date, end=end_date, auto_adjust=False
    )

    # Some yfinance releases return a two-level (field, ticker) column header
    # even for a single ticker. Keep only the field names in that case; frames
    # covering several tickers are left untouched to avoid duplicate labels.
    columns = stock_data.columns
    if (
        isinstance(columns, pd.MultiIndex)
        and columns.nlevels == 2
        and columns.get_level_values(1).nunique() == 1
    ):
        stock_data.columns = columns.get_level_values(0)

    return stock_data

In [15]:
# Collect raw data
if __name__ == "__main__":

    # Tickers to download: six stocks, the S&P 500 index and the 10-year Treasury yield
    stock_tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'TSLA', 'META', '^GSPC', '^TNX']

    # Start and end dates for data collection
    start_date = '2017-09-01'
    end_date = '2024-10-01'

    # DataFrame.to_csv does not create missing parent directories,
    # so make sure the output directory exists before the first write.
    output_dir = Path("../data")
    output_dir.mkdir(parents=True, exist_ok=True)

    for ticker in stock_tickers:
        # Download stock data
        data = download_stock_data(ticker, start_date, end_date)

        # Fail fast on an empty download instead of writing an empty CSV
        if data.empty:
            raise ValueError(
                f"No data returned for {ticker} between {start_date} and {end_date}"
            )

        # Save the raw (not yet preprocessed) data, one CSV per lower-cased ticker
        data.to_csv(output_dir / f"{ticker.lower()}.csv")

        print(f"Raw data for {ticker} saved to data/{ticker.lower()}.csv")

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Raw data for AAPL saved to data/aapl.csv
Raw data for MSFT saved to data/msft.csv
Raw data for GOOGL saved to data/googl.csv
Raw data for AMZN saved to data/amzn.csv
Raw data for TSLA saved to data/tsla.csv
Raw data for META saved to data/meta.csv
Raw data for ^GSPC saved to data/^gspc.csv


[*********************100%***********************]  1 of 1 completed


Raw data for ^TNX saved to data/^tnx.csv


# Data Preprocessing
We combine the Adjusted Close price of each ticker into a single dataframe aligned by date, then engineer two technical indicators from the **AAPL** price: a 10-day Simple Moving Average (SMA) and a 20-day Exponential Moving Average (EMA). Rows with missing values are then dropped.

In [3]:
# Load raw data
# Lower-cased ticker symbols; each one names a CSV file under ../data/
stocks = ['aapl', 'msft', 'googl', 'amzn', 'tsla', 'meta', '^gspc', '^tnx']

# Read every CSV with a parsed 'Date' index and keep only its 'Adj Close' column
dfs = {
    symbol: pd.read_csv(
        f'../data/{symbol}.csv', index_col='Date', parse_dates=True
    )[['Adj Close']]
    for symbol in stocks
}

# Put the per-ticker frames side by side (aligned on the date index)
# and label each column after its ticker
data = pd.concat(list(dfs.values()), axis=1)
data.columns = [f'{symbol}_adj_close' for symbol in stocks]

# Give the index and the yield series readable names
data = data.rename(columns={
    '^gspc_adj_close': 'sp500_adj_close',
    '^tnx_adj_close': '10y_treasury_yield',
})

# Feature engineering: 10-day simple and 20-day exponential moving averages of AAPL
data = data.assign(
    aapl_sma_10=data['aapl_adj_close'].rolling(window=10).mean(),
    aapl_ema_20=data['aapl_adj_close'].ewm(span=20, adjust=False).mean(),
)

# Drop rows with missing values
data = data.dropna()

We define our Y (target to predict) and X (features) series:

Y: **AAPL** Daily Adjusted Close Price

X: 
- 1-Day Lags of MSFT, GOOGL, AMZN, TSLA, META, SP500 Adjusted Close Price
- 1-Day Lag of U.S. 10-Year Treasury yield
- 1-Day Lags of technical indicators: SMA, EMA of AAPL's Stock Price

In [4]:
# This cell rebinds `data` to its own output, so running it a second time
# would shift the already-lagged columns again and produce '*_lag_lag'
# features. Stop explicitly instead of silently corrupting the dataset.
if any(str(col).endswith('_lag') for col in data.columns):
    raise RuntimeError(
        "`data` already contains lagged columns; rebuild `data` from the raw "
        "CSV files before running this cell again."
    )

# Target: AAPL adjusted close. The first row is dropped because it has no
# previous-day feature values to pair with.
y = data['aapl_adj_close'].iloc[1:]

# Features: every other column shifted down one row, so each date carries the
# previous row's values; the first row becomes all-NaN and is dropped.
X = data.drop(columns=['aapl_adj_close']).shift(1).dropna()
X.columns = [f'{col}_lag' for col in X.columns]

# Target first, then lagged features, aligned on the date index
data = pd.concat([y, X], axis=1)
data.head()

Unnamed: 0_level_0,aapl_adj_close,msft_adj_close_lag,googl_adj_close_lag,amzn_adj_close_lag,tsla_adj_close_lag,meta_adj_close_lag,sp500_adj_close_lag,10y_treasury_yield_lag,aapl_sma_10_lag,aapl_ema_20_lag
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-09-18,37.234501,69.565742,46.648991,49.3395,25.320667,171.124756,2500.22998,2.202,37.736682,37.964495
2017-09-19,37.248585,69.427216,46.372673,48.709499,25.666668,169.499649,2503.870117,2.229,37.610431,37.894972
2017-09-20,36.624371,69.685829,46.727295,48.493,25.006666,172.002121,2506.649902,2.243,37.531819,37.833411
2017-09-21,35.995461,69.223961,47.259975,48.6605,24.927334,171.653168,2508.23999,2.277,37.394775,37.718264
2017-09-22,35.643456,68.549644,47.260471,48.232498,24.431999,170.596359,2500.600098,2.278,37.210093,37.554188


In [5]:
# Write the combined target-and-features frame to disk as a CSV file
clean_csv_path = "../data/stocks_clean.csv"
data.to_csv(clean_csv_path)