In [3]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os

In [4]:
def initialize():
    sp500_stock_data = pd.read_csv(
        "./Data/YahooFinanceHistoricalPriceData/sp500_stocks.csv"
    )
    sp500_stock_data["Date"] = sp500_stock_data["Date"].astype("datetime64[ns]")
    sp500_stock_names = pd.read_csv(
        "./Data/YahooFinanceHistoricalPriceData/sp500_companies.csv"
    )
    sp500_stock_names.loc[len(sp500_stock_names.index)] = [
        "SPY",
        "SPY",
        "Index",
    ]
    sp500_stock_names = sp500_stock_names.set_index("Symbol")

    spy_data = pd.read_csv(
        "./Data/YahooFinanceHistoricalPriceData/spy_daily.csv"
    )
    spy_data["Date"] = spy_data["Date"].astype("datetime64[ns]")
    spy_data["Open"] = spy_data["Open"].astype("float")
    spy_data["High"] = spy_data["High"].astype("float")
    spy_data["Low"] = spy_data["Low"].astype("float")
    spy_data["Close"] = spy_data["Close"].astype("float")
    spy_data["Volume"] = spy_data["Volume"].astype("float")
    spy_data["Symbol"] = "SPY"

    # combine all three dataframes
    sp500_stock_data = pd.concat([sp500_stock_data, spy_data])
    sp500_stock_data = sp500_stock_data.reset_index()
    sp500_stock_data = sp500_stock_data.join(sp500_stock_names, on="Symbol", how="left")
    # reindex with index as stock symbols
    sp500_stock_data = sp500_stock_data.set_index(["Symbol", "Sector", "Date"])

    return sp500_stock_data, sp500_stock_names

In [5]:
sp500_stock_symbols, names= initialize()

# Separate company names and symbols and make tupel list
company_names = names["Name"].tolist()
stock_symbols = names.index.tolist()
companies = list(zip(company_names, stock_symbols))
display(companies)

[('3M Co.', 'MMM'),
 ('ACE Limited', 'ACE'),
 ('Abbott Laboratories', 'ABT'),
 ('Abercrombie & Fitch Company A', 'ANF'),
 ('Accenture', 'ACN'),
 ('Adobe Systems Inc', 'ADBE'),
 ('Advanced Micro Devices', 'AMD'),
 ('AES Corp', 'AES'),
 ('Aetna Inc', 'AET'),
 ('AFLAC Inc', 'AFL'),
 ('Agilent Technologies Inc', 'A'),
 ('AGL Resources Inc.', 'GAS'),
 ('Air Products & Chemicals Inc', 'APD'),
 ('Airgas Inc', 'ARG'),
 ('Akamai Technologies Inc', 'AKAM'),
 ('Alcoa Inc', 'AA'),
 ('Alexion Pharmaceuticals', 'ALXN'),
 ('Allegheny Technologies Inc', 'ATI'),
 ('Allergan Inc', 'AGN'),
 ('Allstate Corp', 'ALL'),
 ('Alpha Natural Resources', 'ANR'),
 ('Altera Corp', 'ALTR'),
 ('Altria Group Inc', 'MO'),
 ('Amazon.com Inc', 'AMZN'),
 ('Ameren Corp', 'AEE'),
 ('American Electric Power', 'AEP'),
 ('American Express Co', 'AXP'),
 ('American Intl Group Inc', 'AIG'),
 ('American Tower Corp A', 'AMT'),
 ('Ameriprise Financial', 'AMP'),
 ('AmerisourceBergen Corp', 'ABC'),
 ('Amgen Inc', 'AMGN'),
 ('Amphenol C

In [14]:
# What stock do you want to analyze
stock_name="QUALCOMM Inc."


# get stock symbols
stock_symbols = sp500_stock_symbols.index.get_level_values(0).unique()


# get stock ticker data for one stock
ticker_Data = sp500_stock_symbols.loc[
    sp500_stock_symbols.Name == stock_name
]

# format ticker_Data
if isinstance(ticker_Data.index, pd.MultiIndex):
    ticker_Data = ticker_Data.droplevel([0, 1])


In [15]:
#Indicators

# calculate different indicators
# Simple Moving Average
def sma(data, period):
    # set dataframe
    sma = pd.DataFrame()

    # calculate sma
    sma["SMA"] = data["Close"].rolling(period).mean()
    sma = sma.fillna(0)
    return sma


# Exponential Moving Average
def ema(data, period):
    # set dataframe
    ema = pd.DataFrame()

    # calculate ema
    ema["EWMA"] = data["Close"].ewm(alpha=1 / period, adjust=False).mean()
    ema = ema.fillna(0)
    return ema


# Moving Average Convergence Divergence
def macd(data):
    # set dataframe
    macd = pd.DataFrame()

    # calculate macd
    macd["MACD"] = (data["Close"].ewm(span=12, adjust=False).mean()) - (
        data["Close"].ewm(span=26, adjust=False).mean()
    )
    macd["Signal_Line"] = macd["MACD"].ewm(span=9, adjust=False).mean()
    return macd


# Relative Strength Index
def rsi(data):
    # calculate ups and downs and split them in two groups
    change = data["Close"].diff()
    change.fillna(0)

    change_up = change.copy()
    change_down = change.copy()

    change_up[change_up < 0] = 0
    change_down[change_down > 0] = 0

    # verify
    change.equals(change_up + change_down)

    # set dataframe
    rsi = pd.DataFrame()

    # calculate moving average of ups and downs
    avg_up = change_up.rolling(14).mean()
    avg_down = change_down.rolling(14).mean().abs()

    # calculate rsi
    rsi["RSI"] = 100 - (100 / (1 + (avg_up + avg_down)))
    return rsi


# Stochastic Oscillator
def stochOsc(data):
    # calculate highs and lows in 14 day period
    high = data["High"].rolling(14).max()
    high = high.fillna(0)
    low = data["Low"].rolling(14).min()
    low = low.fillna(0)

    # set dataframe
    stoch = pd.DataFrame()

    # calculate oscillator (fast)
    stoch["%K"] = ((data["Close"] - low) / (high - low)) * 100
    # calculate oscillator (slow)
    stoch["%D"] = stoch["%K"].rolling(3).mean()
    stoch = stoch.replace([np.inf, -np.inf], np.nan)
    stoch = stoch.fillna(0)
    return stoch


# true range
def trueRange(data):
    # prepare data
    tr = data.copy()
    high = data["High"]
    low = data["Low"]
    close = data["Close"]
    tr["tr0"] = abs(high - low)
    tr["tr1"] = abs(high - close.shift())
    tr["tr2"] = abs(low - close.shift())

    # calculate true_range
    true_range = pd.DataFrame()
    true_range["TR"] = tr[["tr0", "tr1", "tr2"]].max(axis=1)
    true_range = true_range.fillna(0)
    return true_range


# directional movement indicator
def directionalMovement(data, tr):
    move = pd.DataFrame()

    # calculate directional movement
    up = data["High"].diff()
    down = data["Low"].diff()
    smoothedUp = up.ewm(alpha=1 / 14).mean()
    smoothedDown = down.ewm(alpha=1 / 14).mean()

    move["Up_DI"] = 100 * (smoothedUp / tr["ATR"])
    move["Down_DI"] = abs(100 * (smoothedDown.values / tr["ATR"]))

    return move


# Average True Range
def atr(data):
    # set dataframe
    atr = pd.DataFrame()

    # calculate avg true range
    true_range = trueRange(data=data)
    atr["ATR"] = true_range.ewm(alpha=1 / 14, adjust=False).mean()
    return atr


# Average Directional Index
def adx(data):
    adx = pd.DataFrame()

    # get avg true range and directional movement
    tr = atr(data)
    dm = directionalMovement(data=data, tr=tr)

    # calculate adx and its variants
    adx["DX"] = (
        abs(dm["Up_DI"] - dm["Down_DI"]) / abs(dm["Up_DI"] + dm["Down_DI"])
    ) * 100
    adx["ADX"] = ((adx["DX"].shift(1) * (14 - 1)) + adx["DX"]) / 14
    adx["ADX_smooth"] = adx["ADX"].ewm(alpha=1 / 14).mean()
    adx = adx.fillna(0)
    return adx


# Bollinger Bands
def bollinger(data):
    bollinger = pd.DataFrame()
    # middle band
    bollinger["Mid"] = data["Close"].rolling(20).mean()

    # standard deviation
    std = data["Close"].rolling(window=20).std()

    # upper and lower bands
    bollinger["Lower"] = bollinger["Mid"] - 2 * std
    bollinger["Upper"] = bollinger["Mid"] + 2 * std
    return bollinger


# On-Balance Volume
def obv(data):
    obv = pd.DataFrame()
    # calculate obv
    change = data["Close"].diff()
    obv["OBV"] = (np.sign(change) * data["Volume"]).fillna(0).cumsum()
    return obv


def moneyFlowVolumeSeries(data):
    # calculate money flow
    mfv = pd.DataFrame()
    mfv["MFV"] = (
        data["Volume"]
        * (2 * data["Close"] - data["High"] - data["Low"])
        / (data["High"] - data["Low"])
    )
    return mfv


def moneyFlowVolume(data, n):
    # calculate money flow volume
    mfv = pd.DataFrame()
    mfv["MFV"] = moneyFlowVolumeSeries(data).rolling(n).sum()
    return mfv


# Chaikin Money Flow
def chaikinMoneyFlow(data):
    # calculate chaikin money flow
    cmf = pd.DataFrame()
    cmf["CMF"] = moneyFlowVolume(data, 20)["MFV"] / data["Volume"].rolling(20).sum()
    return cmf


# make basic chart with stock data
def makeChart(data):
    graph = go.Figure()
    graph.add_traces(
        go.Scatter(
            y=data.iloc[:, 0],
            x=data.index,
            mode="lines",
        )
    )
    return graph

In [16]:
# calculate all indicators for machine learning processing
# change from yesterday close to today close
ticker_Data['Diff'] = ticker_Data['Close'].diff()

# prediction validation for tomorrows close
ticker_Data['FutureClose'] = ticker_Data['Diff'].shift(-1)
ticker_Data['FutureClose'] = np.where(ticker_Data['FutureClose'] > 0, 1, -1)

# rsi calculation
ticker_Data['rsi'] = rsi(ticker_Data)

# stochastic oscillator calculation
stoch = stochOsc(ticker_Data)
ticker_Data['stochOsc%K'] = stoch["%K"]
ticker_Data['stochOsc%D'] = stoch["%D"]

#on-balance volume calculation
ticker_Data['obv'] = obv(ticker_Data)

#average directional index calculation
adx = adx(ticker_Data)
#ticker_Data['adx'] = adx['ADX']
ticker_Data['adx_smooth'] = adx['ADX_smooth']

data = ticker_Data.iloc[20:].drop('index',axis=1).drop('Name', axis=1)

In [17]:
data_train = data['2018-01-31':'2020-12-31'].drop(['High', 'Low', 'Open', 'Adj Close'],axis=1)
data_valid = data['2021-01-04':'2023-12-29'].drop(['High', 'Low', 'Open', 'Adj Close'],axis=1)

y_train = data_train.pop('FutureClose')
X_train = data_train
y_valid = data_valid.pop('FutureClose').fillna(0)
X_valid = data_valid

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

In [19]:
# set scikit machine learning pipeline
pipeline = Pipeline(steps=[('preprocessor', SimpleImputer(strategy='constant')),
                              ('model', RandomForestRegressor(n_estimators=500,
                                                              random_state=0))])

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 0.9949216467463479


In [20]:
# set xgboost model
model = XGBRegressor()
model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)], 
             verbose=False)

predictions = model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))


Mean Absolute Error: 0.9936520671923391
