# Objective

Develop a minimum viable model that can predict which direction a stock will go

## The Data

### Input Variables

1. Sentiment
    - Bullish, Bearish, Total_compound
2. Financial
3. Technical

### Target Variable

1. 1-day price direction
2. 2-day price direction

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

import requests
# import json
# import datetime

# Cleaning the Data

In [2]:
# Load the historic sentiment dataset from CSV, then parse its 'date' column
# from text into pandas datetimes so it can be compared and sorted later on
data = pd.read_csv('historic_sentiment_analysis.csv')
data['date'] = pd.to_datetime(data['date'])

In [None]:
data.head()

In [None]:
data.info()

## Unnecessary Columns

Let's dig into dividend data. 

In [3]:
data[['divYield', 'divAmount', 'divDate', 'dividendYield', 'dividendAmount', 'dividendDate']].head(10)

Unnamed: 0,divYield,divAmount,divDate,dividendYield,dividendAmount,dividendDate
0,0.0,0.0,,0.0,0.0,
1,0.0,0.0,,0.0,0.0,
2,0.0,0.0,,0.0,0.0,
3,0.7,0.88,00:00.0,0.7,0.88,00:00.0
4,0.0,0.0,,0.0,0.0,
5,0.0,0.0,,0.0,0.0,
6,0.0,0.0,,0.0,0.0,
7,0.0,0.0,,0.0,0.0,
8,0.71,0.88,00:00.0,0.71,0.88,00:00.0
9,0.09,0.64,00:00.0,0.09,0.64,00:00.0


Most of the values are null/zero values because most stocks don't provide dividends.

Also, there are duplicate columns (ex: divAmount & dividendAmount).

For simplicity, let's consolidate these columns into one as follows:
1. Remove the dividendDate/divDate columns. Keeping this would be redundant
2. Remove divYield column, it contains the same information as divAmount
3. The information from the 6 columns is contained in divAmount:
    - Whether the stock pays a dividend or not
    - How much is paid per stock owned

In [4]:
# Remove the redundant dividend columns in place; divAmount is not listed, so it stays
redundant_dividend_columns = [
    'divYield',
    'divDate',
    'dividendYield',
    'dividendAmount',
    'dividendDate',
    'dividendPayDate',
]
data.drop(columns=redundant_dividend_columns, inplace=True)

Several columns are identifiers, duplicates, or empty; we don't need them for this project.

In [5]:
# Identifier, duplicate and empty columns that are removed from the dataset in place
unneeded_columns = [
    'cusip',
    'assetType',
    'description',
    'assetMainType',
    'symbol',
    'securityStatus',
    'symbol.1',
    'bidTick',
    'exchangeName',
    'peRatio.1',
]
data.drop(columns=unneeded_columns, inplace=True)

Categorical columns

In [6]:
data.select_dtypes(include='object')

Unnamed: 0,stock,bidId,askId,lastId,exchange
0,CLOV,P,P,P,q
1,CLNE,Q,P,P,q
2,TLRY,P,P,P,q
3,AAPL,P,P,D,q
4,WKHS,P,P,D,q
...,...,...,...,...,...
620,UPST,Q,P,P,q
621,SOFI,P,P,P,q
622,MU,P,P,P,q
623,AMZN,P,P,D,q


In [7]:
# Print the number of distinct values in each of these categorical columns, one per line
for column_name in ('bidId', 'askId', 'lastId', 'exchange'):
    print(data[column_name].nunique())

13
13
14
1


The exchange column has only 1 unique value, so it is unlikely to add any predictive power.

In [8]:
# 'exchange' holds a single value for every row, so remove it in place
data.drop(columns=['exchange'], inplace=True)

In [None]:
data.info()

## Boolean Values

In [9]:
data.select_dtypes(include='boolean')

Unnamed: 0,marginable,shortable,delayed,realtimeEntitled
0,True,True,True,False
1,True,True,True,False
2,True,True,True,False
3,True,True,True,False
4,True,True,True,False
...,...,...,...,...
620,True,True,True,False
621,True,True,True,False
622,True,True,True,False
623,True,True,True,False


In [10]:
# Print the number of distinct values in each boolean column, one per line
for column_name in ('marginable', 'shortable', 'delayed', 'realtimeEntitled'):
    print(data[column_name].nunique())

2
2
1
1


Two of these columns provide no valuable information

In [None]:
# Remove the two boolean columns in place
data.drop(columns=['delayed', 'realtimeEntitled'], inplace=True)

In [None]:
#data = data.transpose(copy=True).drop_duplicates().transpose(copy=True)

## Null Values

In [None]:
data.isna().sum().sum()

We're good to go

## Columns with minimal unique values

A column that holds a single value for every row is unlikely to have any predictive power.

In [None]:
# Collect every column with at most one distinct value (nunique ignores NaN),
# then remove them all from the dataset in a single in-place drop
single_value_columns = [
    name for name in data.columns if data[name].nunique() <= 1
]
data.drop(columns=single_value_columns, inplace=True)

# Bring in price data with TDAmeritrade API

In [None]:
# Show the first day, the last day and the span between them for the dataset
first_day = data['date'].min().date()
last_day = data['date'].max().date()

print(first_day)
print(last_day)
print(last_day - first_day)

Based on the date range of our dataset, our API call should generate about 2 months of price history

In [None]:
### Note: The API does not produce data on the weekends
n_period = 3
api_key = "***REMOVED***"

# Look-back horizons (in rows, i.e. trading days returned by the API) for the return columns
horizons = (1, 2, 5)
return_columns = [f'{n}d-{kind}' for n in horizons for kind in ('logreturn', 'direction')]
price_columns = ['date', 'stock', 'close'] + return_columns

# One prepared frame per stock; concatenated once after the loop
stock_frames = []

for stock in data['stock'].unique():
    symbol = stock
    url = f'https://api.tdameritrade.com/v1/marketdata/{symbol}/pricehistory?apikey={api_key}&periodType=month&period={n_period}&frequencyType=daily&frequency=1'

    # Fail fast on a hung connection or an HTTP error instead of hitting an
    # opaque KeyError on 'candles' further down
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    raw_data = pd.json_normalize(response.json(), record_path=['candles'])
    raw_data.rename(columns={'datetime': 'date'}, inplace=True)
    # Epoch milliseconds -> plain datetime.date objects
    raw_data['date'] = pd.to_datetime(raw_data['date'], unit='ms').dt.date
    raw_data['stock'] = stock

    for n in horizons:
        # Log return of the close versus the close n rows earlier; the first n rows are NaN
        log_return = np.log(raw_data['close'] / raw_data['close'].shift(n))
        raw_data[f'{n}d-logreturn'] = log_return
        # 1 for a gain, -1 for a loss, 0 for no change. np.sign keeps NaN as NaN, so rows
        # without enough history are not mislabelled as "no change".
        raw_data[f'{n}d-direction'] = np.sign(log_return)

    # Move every return/direction down one row so that each row carries the values of the
    # previous row. This is done per stock: shifting the concatenated frame would leak the
    # last row of one stock into the first row of the next.
    # NOTE(review): if the goal is to predict the *next* move from today's features,
    # shift(-1) is the alignment that puts the future return on today's row - confirm intent.
    raw_data[return_columns] = raw_data[return_columns].shift(1)

    stock_frames.append(raw_data[price_columns])

if stock_frames:
    price_data = pd.concat(stock_frames, ignore_index=True)
else:
    # No stocks to fetch: keep the expected schema so later cells still find their columns
    price_data = pd.DataFrame(columns=price_columns)

# Rows without a full return history contain NaN and are removed
price_data.dropna(inplace=True)
price_data.reset_index(drop=True, inplace=True)

In [None]:
# Keep only the price rows dated within the first and last day of the 'data' dataframe
# (both ends inclusive), then renumber the surviving rows from zero
range_start = data['date'].min().date()
range_end = data['date'].max().date()
filter_ = price_data['date'].between(range_start, range_end)

price_data = price_data[filter_].reset_index(drop=True)

In [None]:
# Column layout of a matched row: all price columns followed by all sentiment columns
column_list = list(price_data.columns) + list(data.columns)

# Match price rows to sentiment rows on (date, stock) with one hash join instead of a nested
# Python loop. price_data['date'] holds datetime.date objects while data['date'] holds
# Timestamps, so the price dates are converted to datetimes before they are compared.
# Row positions are carried through the join so that the original rows can be picked with
# iloc, whatever index labels the two frames have.
price_keys = pd.DataFrame({
    'date': pd.to_datetime(price_data['date']).to_numpy(),
    'stock': price_data['stock'].to_numpy(),
    'price_pos': np.arange(len(price_data)),
})
data_keys = pd.DataFrame({
    'date': data['date'].to_numpy(),
    'stock': data['stock'].to_numpy(),
    'data_pos': np.arange(len(data)),
})
matches = price_keys.merge(data_keys, on=['date', 'stock'], how='inner')

# Price side: drop its 'date' and 'stock', which the sentiment side already provides
price_part = (
    price_data.iloc[matches['price_pos'].to_numpy()]
    .drop(columns=['date', 'stock'])
    .reset_index(drop=True)
)
data_part = data.iloc[matches['data_pos'].to_numpy()].reset_index(drop=True)

# Side-by-side concat keeps every column name exactly as it is in its source frame
combined_df = pd.concat([price_part, data_part], axis=1)
combined_df.sort_values(by='date', ignore_index=True, inplace=True)

Officially ready for modeling

# Minimal Viable Product

## Features and Target Variables

In [None]:
# Cast the size/volume feature columns and the 1-day direction target to numeric dtypes
int_list = ['bidSize', 'askSize', 'lastSize', 'totalVolume', 'regularMarketLastSize']

for numeric_column in int_list + ['1d-direction']:
    combined_df[numeric_column] = pd.to_numeric(combined_df[numeric_column])

In [None]:
# Features: every column from 'Bearish' onward, minus 'date'. Target: the 1-day direction.
columns_from_bearish = combined_df.loc[:, 'Bearish':]
X = columns_from_bearish.drop(columns='date')
y = combined_df['1d-direction']

## Encode Categorical Variables

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Split the feature names by dtype: object columns get encoded, int64/float64 pass through
categoricals = X.select_dtypes(include='object').columns.tolist()
numericals = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

def encode_cats(categoricals, numericals, frame=None):
    """
    One-hot encode the categorical columns of a feature dataframe.

    Parameters
    ----------
    categoricals : list of str
        Names of the columns to one-hot encode. The first level of every column is dropped.
    numericals : list of str
        Names of the columns to carry over unchanged.
    frame : pandas.DataFrame, optional
        Dataframe to encode. Defaults to the module-level feature dataframe ``X``.

    Returns
    -------
    pandas.DataFrame
        The numerical columns followed by the encoded columns, on the index of ``frame``.
    """
    if frame is None:
        frame = X

    numeric_part = frame.loc[:, numericals]

    # OneHotEncoder refuses an input with zero columns, so there is nothing to encode here
    if len(categoricals) == 0:
        return numeric_part

    # scikit-learn 1.2 renamed `sparse` to `sparse_output`; support both spellings
    try:
        ohe = OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        ohe = OneHotEncoder(sparse=False, drop='first')
    cat_matrix = ohe.fit_transform(frame.loc[:, categoricals])

    # get_feature_names was replaced by get_feature_names_out in scikit-learn 1.0+
    if hasattr(ohe, 'get_feature_names_out'):
        encoded_names = ohe.get_feature_names_out(categoricals)
    else:
        encoded_names = ohe.get_feature_names(categoricals)

    X_ohe = pd.DataFrame(cat_matrix,
                         columns=encoded_names,  # create meaningful column names
                         index=frame.index)  # keep the same index values

    return pd.concat([numeric_part, X_ohe], axis=1)

In [None]:
X = encode_cats(categoricals, numericals)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import auc, roc_auc_score, plot_roc_curve, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, PrecisionRecallDisplay, plot_confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, label_binarize

import matplotlib.pyplot as plt

In [None]:
import warnings


def warn(*args, **kwargs):
    """Do-nothing stand-in for warnings.warn: accepts any arguments and ignores them."""


# Point warnings.warn at the no-op above so calls to it produce no output
warnings.warn = warn

# hold out 20% of the data for final testing
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then apply the same scaling to both splits
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Candidate classifiers; only logistic regression gets a non-default setting (max_iter)
knn = KNeighborsClassifier()
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier()
# NOTE(review): y holds -1/0/1; recent xgboost releases may expect labels 0..n-1 - confirm
gbm = xgb.XGBClassifier()

# 5-fold cross-validation on the scaled training split, scored with each estimator's default scorer
knn_scores = cross_val_score(knn, X_scaled, y, cv=5)
lr_scores = cross_val_score(lr, X_scaled, y, cv=5)
rf_scores = cross_val_score(rf, X_scaled, y, cv=5)
gbm_scores = cross_val_score(gbm, X_scaled, y, cv=5)

# Report the mean fold score of every model to four significant digits
score_report = (
    ("KNN", knn_scores),
    ("Logistic Regression", lr_scores),
    ("Random Forest", rf_scores),
    ("XGBoost", gbm_scores),
)
for model_label, fold_scores in score_report:
    print(f"{model_label} mean scores: {np.mean(fold_scores):.4}")

## Confusion Matrix

In [None]:
# Precision score: fit the random forest on the scaled training split, then report the
# weighted precision of its predictions on the scaled hold-out split
rf.fit(X_scaled, y)
holdout_predictions = rf.predict(X_test_scaled)
weighted_precision = precision_score(y_test, holdout_predictions, average='weighted')
print(f"Precision Score: {weighted_precision:.4f}")

In [None]:
# Recall score: weighted recall of the fitted random forest on the scaled hold-out split
holdout_predictions = rf.predict(X_test_scaled)
weighted_recall = recall_score(y_test, holdout_predictions, average='weighted')
print(f"Recall Score: {weighted_recall:.4f}")

In [None]:
# Confusion matrix on the hold-out split. rf was fitted on X_scaled, so it has to be
# evaluated on the identically scaled test features, not on the raw X_test.
plot_confusion_matrix(rf, X_test_scaled, y_test)
# Passed positionally: Matplotlib 3.5 renamed this keyword from `b` to `visible`.
# With None and no other arguments, grid() toggles the grid lines as before.
plt.grid(None)

# Optimize Precision

We want to be as accurate as possible to ensure profitability

## Remove the noise