In [1]:
import pandas as pd
import yfinance as yf
from bs4 import BeautifulSoup
import requests

def get_sp500_tickers():
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.find_all('tr')[1:]:
        ticker = row.find_all('td')[0].text.strip()
        tickers.append(ticker)
    return tickers

def load_data(tickers, start_date, end_date):
    data = pd.DataFrame()
    for ticker in tickers:
        stock_data = yf.download(ticker, start=start_date, end=end_date)
        stock_data['Ticker'] = ticker
        data = pd.concat([data, stock_data], ignore_index=True)
    return data

In [2]:
import pandas as pd
import numpy as np
import talib
from sklearn.preprocessing import LabelEncoder

def calculate_rolling_features(df):
    df['30D_MA'] = df.groupby('Ticker')['Close'].rolling(window=30).mean().reset_index(0, drop=True)
    df['60D_MA'] = df.groupby('Ticker')['Close'].rolling(window=60).mean().reset_index(0, drop=True)
    df['30D_STD'] = df.groupby('Ticker')['Close'].rolling(window=30).std().reset_index(0, drop=True)
    return df

def calculate_volatility(df):
    df['Log_Return'] = np.log(df.groupby('Ticker')['Close'].pct_change() + 1)
    df['Volatility'] = df.groupby('Ticker')['Log_Return'].rolling(window=20).std().reset_index(0, drop=True)
    return df

def calculate_talib_features(df):
    df['RSI'] = talib.RSI(df['Close'], timeperiod=14)
    df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = talib.MACD(df['Close'], fastperiod=12, slowperiod=26, signalperiod=9)
    df['ATR'] = talib.ATR(df['High'], df['Low'], df['Close'])
    df['OBV'] = talib.OBV(df['Close'], df['Volume'])
    return df

def calculate_lagged_features(df):
    for i in range(1, 6):
        df[f'Close_Lag_{i}'] = df.groupby('Ticker')['Close'].shift(i)
        df[f'Volume_Lag_{i}'] = df.groupby('Ticker')['Volume'].shift(i)
    return df

def calculate_rsi_features(df):
    df['RSI_14'] = talib.RSI(df['Close'], timeperiod=14)
    df['RSI_28'] = talib.RSI(df['Close'], timeperiod=28)
    return df

def calculate_bollinger_bands(df):
    df['BB_upper'], df['BB_middle'], df['BB_lower'] = talib.BBANDS(df['Close'], timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
    df['BB_Pct'] = (df['Close'] - df['BB_lower']) / (df['BB_upper'] - df['BB_lower'])
    return df

def calculate_sma_ema_features(df):
    df['SMA_50'] = df.groupby('Ticker')['Close'].rolling(window=50).mean().reset_index(0, drop=True)
    df['EMA_20'] = df.groupby('Ticker')['Close'].ewm(span=20, adjust=False).mean().reset_index(0, drop=True)
    return df

def calculate_obv_sma_ratio(df):
    df['OBV_SMA_ratio'] = df['OBV'] / df['SMA_50']
    return df

def calculate_high_low_range(df):
    df['High_Low_Range'] = df['High'] - df['Low']
    return df

def engineer_features(df):
    df = calculate_rolling_features(df)
    df = calculate_volatility(df)
    df = calculate_talib_features(df)
    df = calculate_lagged_features(df)
    df = calculate_rsi_features(df)
    df = calculate_bollinger_bands(df)
    df = calculate_sma_ema_features(df)
    df = calculate_obv_sma_ratio(df)
    df = calculate_high_low_range(df)
    return df




In [3]:
import pandas as pd
from data_loader import get_sp500_tickers, load_data
from feature_engineering import engineer_features

if __name__ == '__main__':
    tickers = get_sp500_tickers()
    data = load_data(tickers, '2022-09-25', '2024-09-30')
    data = engineer_features(data)
    data.to_csv('sp500_stock_data.csv')
    print(data.head())

ModuleNotFoundError: No module named 'data_loader'