# S&P 500 Signal Classifier

In [None]:
Approach:
1. Create features
2. Winsorize data
3. Split data into train and test sets
4. Scale the train and test data
5. Build original, oversampled, and undersampled versions of the data
6. Apply RandomForestClassifier, XGBoost, and LightGBM to predict

## Install Required Packages

In [None]:
# Install Yahoo Finance
!pip install yfinance --upgrade --no-cache-dir

In [None]:
# Install SciKit Learn
!pip install scikit-learn

In [None]:
# Install XGBoost
!pip install xgboost

In [None]:
# Install LightBoost
!pip install lightgbm

## Import Required Dependencies

In [1]:
# Import the required libraries and dependencies
import yfinance as yf
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import train_test_split
import pandas as pd
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from xgboost import XGBClassifier
from xgboost import plot_importance
import lightgbm as lgb

In [None]:
import warnings
# Ignore warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

## Retrieve and Load Data

In [None]:
# Retrieve data: full available history (period='max') for the ^GSPC ticker
sp500 = yf.Ticker('^GSPC').history(period='max')

In [None]:
# Drop timezone from datetime:
# move the Date index into a column, strip its timezone, then restore it as the index
sp500 = (
    sp500.reset_index()
    .assign(Date=lambda frame: frame['Date'].dt.tz_localize(None))
    .set_index('Date')
)

In [None]:
# Keep the most recent 24252 rows of the DataFrame
# (when fewer rows exist, the slice simply returns all of them)
sp500 = sp500.iloc[-24252:]

In [None]:
# Display data (the bare last expression is rendered as this cell's output)
sp500

In [None]:
# Create stock df: the S&P 500 frame with every row containing a missing value removed
stock_df = sp500.dropna()
stock_df

In [None]:
# Drop the Volume, Dividends and Stock Splits columns.
# The drop is not done in place and uses errors='ignore' so that this cell can be
# re-run: an in-place drop raises KeyError on a second run because the columns
# are already gone.
stock_df = stock_df.drop(columns=['Volume', 'Dividends', 'Stock Splits'], errors='ignore')

# Sort by ascending date
stock_df = stock_df.sort_values(by="Date", ascending=True)

# Review the first and last five rows of the DataFrame
display(stock_df.head())
display(stock_df.tail())

In [None]:
# Rename dataframe to data.
# NOTE: this binds a second name to the same DataFrame object; it is not a copy,
# so columns later assigned on `data` also appear on `stock_df`.
data = stock_df

## Perform Feature Engineering

In [None]:
# Feature Engineering
# Moving Averages
# Each SMA<n> column is the rolling mean of Close over the last n rows.
# The column name and the window size are derived from the same number so the
# two cannot drift apart (SMA30 must use a 30-row window, not 20).
for sma_window in (10, 20, 30, 50, 100, 200):
    data[f'SMA{sma_window}'] = data['Close'].rolling(window=sma_window).mean()

# RSI
def rsi(data, window=14):
    """Compute a Relative Strength Index series from the 'Close' column.

    Gains and losses are averaged with a simple rolling mean over `window`
    rows (not exponential smoothing).

    Parameters
    ----------
    data : DataFrame-like object with a 'Close' column.
    window : int, number of rows in each rolling average (default 14).

    Returns
    -------
    Series equal to 100 - 100 / (1 + avg_gain / avg_loss). Entries are NaN
    until the rolling window is full.
    """
    change = data['Close'].diff()

    # Split the row-to-row change into its positive and negative parts;
    # anything that is not a strict gain (or loss) is replaced by 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)

    avg_gain = up_moves.rolling(window=window).mean()
    avg_loss = down_moves.rolling(window=window).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

data['RSI'] = rsi(data)

# MACD: 12-span EMA minus 26-span EMA of Close; SL is the 9-span EMA of MACD
close_prices = data['Close']
data['EMA12'] = close_prices.ewm(span=12, adjust=False).mean()
data['EMA26'] = close_prices.ewm(span=26, adjust=False).mean()
data['MACD'] = data['EMA12'] - data['EMA26']
data['SL'] = data['MACD'].ewm(span=9, adjust=False).mean()

# Bollinger Bands: 20-row rolling mean of Close, plus/minus two rolling standard deviations
close_roll20 = close_prices.rolling(window=20)
band_offset = 2 * close_roll20.std()
data['BollM'] = close_roll20.mean()
data['BollU'] = data['BollM'] + band_offset
data['BollL'] = data['BollM'] - band_offset

# Identify crossover signal for Target Column
# Signal is 1 only when all three conditions hold, otherwise 0
below_mid_band = data['Close'] < data['BollM']
rsi_below_50 = data['RSI'] < 50
macd_negative = data['MACD'] < 0
data['Signal'] = np.where(below_mid_band & rsi_below_50 & macd_negative, 1, 0)

# Calculate return: percentage change of Close shifted back one row,
# so each row holds the next row's return
data['Return'] = data['Close'].pct_change().shift(-1)

# Label Data
# Target is 1 only when the signal is set AND the next return is above 0%
signal_on = data['Signal'] > 0
next_return_positive = data['Return'] > 0.0
data['Target'] = np.where(signal_on & next_return_positive, 1, 0)

# Prepare dataset: drop rows with any missing value, keep the feature columns and Target
features = [
    'SMA10', 'SMA20', 'SMA30', 'SMA50', 'SMA100', 'SMA200',
    'Signal', 'RSI', 'MACD', 'SL',
    'BollM', 'BollU', 'BollL',
]
dataset = data.dropna().loc[:, features + ['Target']]

## Perform Feature Engineering

In [None]:
# Feature Engineering
# Moving Averages
# Each SMA<n> column is the rolling mean of Close over the last n rows.
# The column name and the window size are derived from the same number so the
# two cannot drift apart (SMA30 must use a 30-row window, not 20).
for sma_window in (10, 20, 30, 50, 100, 200):
    data[f'SMA{sma_window}'] = data['Close'].rolling(window=sma_window).mean()

# RSI
def rsi(data, window=14):
    """Compute a Relative Strength Index series from the 'Close' column.

    Gains and losses are averaged with a simple rolling mean over `window`
    rows (not exponential smoothing).

    Parameters
    ----------
    data : DataFrame-like object with a 'Close' column.
    window : int, number of rows in each rolling average (default 14).

    Returns
    -------
    Series equal to 100 - 100 / (1 + avg_gain / avg_loss). Entries are NaN
    until the rolling window is full.
    """
    change = data['Close'].diff()

    # Split the row-to-row change into its positive and negative parts;
    # anything that is not a strict gain (or loss) is replaced by 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)

    avg_gain = up_moves.rolling(window=window).mean()
    avg_loss = down_moves.rolling(window=window).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

data['RSI'] = rsi(data)

# MACD: 12-span EMA minus 26-span EMA of Close; SL is the 9-span EMA of MACD
close_prices = data['Close']
data['EMA12'] = close_prices.ewm(span=12, adjust=False).mean()
data['EMA26'] = close_prices.ewm(span=26, adjust=False).mean()
data['MACD'] = data['EMA12'] - data['EMA26']
data['SL'] = data['MACD'].ewm(span=9, adjust=False).mean()

# Bollinger Bands: 20-row rolling mean of Close, plus/minus two rolling standard deviations
close_roll20 = close_prices.rolling(window=20)
band_offset = 2 * close_roll20.std()
data['BollM'] = close_roll20.mean()
data['BollU'] = data['BollM'] + band_offset
data['BollL'] = data['BollM'] - band_offset

# Identify crossover signal for Target Column
# Signal is 1 only when all three conditions hold, otherwise 0
below_mid_band = data['Close'] < data['BollM']
rsi_below_50 = data['RSI'] < 50
macd_negative = data['MACD'] < 0
data['Signal'] = np.where(below_mid_band & rsi_below_50 & macd_negative, 1, 0)

# Calculate return: percentage change of Close shifted back one row,
# so each row holds the next row's return
data['Return'] = data['Close'].pct_change().shift(-1)

# Label Data
# Target is 1 only when the signal is set AND the next return is above 0%
signal_on = data['Signal'] > 0
next_return_positive = data['Return'] > 0.0
data['Target'] = np.where(signal_on & next_return_positive, 1, 0)

# Prepare dataset: drop rows with any missing value, keep the feature columns and Target
features = [
    'SMA10', 'SMA20', 'SMA30', 'SMA50', 'SMA100', 'SMA200',
    'Signal', 'RSI', 'MACD', 'SL',
    'BollM', 'BollU', 'BollL',
]
dataset = data.dropna().loc[:, features + ['Target']]

In [2]:
# Stock list: tickers whose CSV files are loaded below
stock_list = [
    'XLK', 'XLV', 'XLE', 'VNQ', 'XLF', 'XLB',
    'XLU', 'XLI', 'XLP', 'XLY', 'XTL',
]

# Ticker -> DataFrame mapping, filled in by the loop
stock_df = {}

for stock in stock_list:
    # Load this ticker's CSV from the Resources folder
    df = pd.read_csv(f'Resources/{stock}.csv')

    # Keep only the last 2520 rows (the "last 10 years" trim)
    stock_df[stock] = df.tail(2520)

    # Confirm shape
    print(f'{stock}', stock_df[stock].shape)

XLK (2520, 7)
XLV (2520, 7)
XLE (2520, 7)
VNQ (2520, 7)
XLF (2520, 7)
XLB (2520, 7)
XLU (2520, 7)
XLI (2520, 7)
XLP (2520, 7)
XLY (2520, 7)
XTL (2520, 7)


In [3]:
# Access a specific DataFrame: the 'XLK' entry of the stock_df dictionary
# (the bare last expression is rendered as this cell's output)
stock_df['XLK']

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
3897,2014-06-20,38.200001,38.220001,38.009998,38.090000,33.428440,5427200
3898,2014-06-23,38.049999,38.200001,38.020000,38.200001,33.524986,3767100
3899,2014-06-24,38.180000,38.380001,37.910000,38.000000,33.349468,8471300
3900,2014-06-25,37.919998,38.209999,37.880001,38.150002,33.481102,5111900
3901,2014-06-26,38.180000,38.200001,37.889999,38.110001,33.446007,11352300
...,...,...,...,...,...,...,...
6412,2024-06-18,231.350006,232.169998,230.490005,231.410004,231.004745,4338900
6413,2024-06-20,232.449997,232.589996,228.039993,228.809998,228.409302,6621500
6414,2024-06-21,228.860001,229.759995,227.369995,228.410004,228.009995,6780400
6415,2024-06-24,225.639999,226.660004,222.360001,222.419998,222.419998,6874300
