In [1]:
import nasdaqdatalink
import os
import json
import quandl
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas_market_calendars as mcal
from datetime import datetime
import requests
from refresh_functions import *
pd.options.display.float_format = '{:.6f}'.format

# Read the API key from the local Nasdaq Data Link credentials file and
# register it with the quandl client.
with open('C:/Users/meich/.nasdaq/data_link_apikey.json') as f:
    data = json.load(f)
key = data['api_key']
quandl.ApiConfig.api_key = key

# Data Processing
-----

### Sharadar (SEP ~ Equity Prices)

For now, this cell should be run daily (except on Sundays, Mondays, and the day after a market holiday).

TODO: update the code to pass the list of dates between max(csv date) and the current date, so that a missed daily run is backfilled.

In [2]:
# sharadarSEP() comes from refresh_functions; judging by its printed output it
# brings the local SEP (equity prices) CSV up to date -- confirm. The CSV is
# then loaded into `sep`.
sharadarSEP()
sep = pd.read_csv(
    'C:/Users/meich/CareerDocs/projects/stock_prediction/Data/SHARADAR_SEP.csv'
)

Data up to date
2022-12-30
'sharadarSEP' 0.06 mins


### Tickers (filters)

In [3]:
# sharadarTICKERS() is star-imported from refresh_functions. Its return value is
# kept directly (no CSV re-read here) and is later left-joined onto `daily`
# by ticker.
tickers = sharadarTICKERS()

'sharadarTICKERS' 0.50 mins


### Daily Metrics (filters)

In [4]:
# sharadarDAILY() comes from refresh_functions; judging by its printed output it
# brings the local DAILY metrics CSV up to date -- confirm. The CSV is then
# loaded into `daily`.
sharadarDAILY()
daily = pd.read_csv('C:/Users/meich/CareerDocs/projects/stock_prediction/Data/SHARADAR_DAILY.csv')

# FILTER OUT STOCKS THAT WERE NEVER 500M MARKETCAP MINIMUM
# (threshold is 500, so marketcap is presumably in USD millions -- confirm
# against the Sharadar schema).
# transform('max') broadcasts each ticker's peak market cap back onto its own
# rows. Only the marketcap column is aggregated (the previous groupby().max()
# reduced every column, including string ones, just to read one), and no
# set_index/reset_index round trip is needed to align the result.
daily['marketcap_max'] = daily.groupby('ticker')['marketcap'].transform('max')
# Fresh 0..n-1 index after filtering; columns stay in CSV order.
daily = daily[daily['marketcap_max'] >= 500].reset_index(drop=True)

Data up to date
2022-12-30
'sharadarDAILY' 0.11 mins


### Short Interest Activity (Finra)

In [5]:
# finraSHORTS() comes from refresh_functions; judging by its printed output it
# brings the local FINRA short-interest CSV up to date -- confirm. The CSV is
# then loaded into `si`.
finraSHORTS()
si = pd.read_csv(
    'C:/Users/meich/CareerDocs/projects/stock_prediction/Data/FINRA_SI.csv'
)

Data up to date:
2022-12-30
'finraSHORTS' 0.06 mins


### Retail Trader Activity

In [6]:
# Get historic values and process the data: nasdaqRTAT() comes from
# refresh_functions; judging by its printed output it brings the local retail
# trading activity CSV up to date -- confirm. The CSV is then loaded into `rtat`.
nasdaqRTAT()
rtat = pd.read_csv(
    'C:/Users/meich/CareerDocs/projects/stock_prediction/Data/NDAQ_RTAT.csv'
)

Data up to date:
2022-12-30
'nasdaqRTAT' 0.09 mins


### Create foundational dataset (shar)

In [7]:
# Left-join everything onto the filtered daily metrics: ticker metadata by
# ticker, then prices, retail activity and short interest by (date, ticker).
# The result is ordered by ticker, then date.
combined = (
    daily
    .merge(tickers, on='ticker', how='left')
    .merge(sep, on=['date', 'ticker'], how='left')
    .merge(rtat, on=['date', 'ticker'], how='left')
    .merge(si, on=['date', 'ticker'], how='left')
    .sort_values(['ticker', 'date'])
)

# Exploration/Analysis
-----

In [36]:
# ADD PRICE METRICS
#   closeadj_pct{N} = (closeadj_lag{N} - closeadj) / closeadj * 100
#   (log returns, ln(price t1 / price t0), were noted as an alternative but are
#   not computed here)
# NOTE(review): lagged_features() lives in refresh_functions. The rendered
# outputs show NaN in the longer pct columns on the most recent dates, which
# suggests closeadj_lag{N} holds the price N rows *ahead* (a forward return),
# not a row-on-row change -- confirm against refresh_functions.
combined = lagged_features(combined)

for horizon in (1, 5, 30, 90, 180, 360):
    shifted_close = combined[f'closeadj_lag{horizon}']
    combined[f'closeadj_pct{horizon}'] = (shifted_close - combined['closeadj']) / combined['closeadj']*100

'lagged_features' 0.58 mins


In [9]:
# SI METRICS
# short_features() is star-imported from refresh_functions; presumably it adds
# the ShortRatio* columns shown in later outputs -- confirm.
combined = short_features(combined)

'short_features' 0.39 mins


In [10]:
#  ---- RTAT NOTES ---- #
# TODO: check for autocorrelation in both metrics (presumably activity and sentiment -- confirm)
# TODO: test an RMSE-based similarity index over activity/sentiment that produces a single aggregate score
# TODO: set up an activity trigger that catches daily values and sends a notification + visual

# rtat_features() is star-imported from refresh_functions; presumably it adds the
# activity_* / prod_sent_act_* / activity_Z columns shown in later outputs -- confirm.
combined = rtat_features(combined)

'rtat_features' 0.84 mins


In [92]:
# Plot one ticker's adjusted close (primary y-axis) against retail-activity
# metrics (secondary y-axis).
plot_ticker = 'FSLR'
# Columns drawn on the secondary axis. Add 'sentiment_Z' to overlay sentiment
# as well (assumes that column exists in `combined` -- confirm).
secondary_metrics = ['activity_Z']

stock = combined[combined['ticker'] == plot_ticker].sort_values(by='date')

# Create figure with secondary y-axis
fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(
    go.Scatter(x=stock['date'], y=stock['closeadj'], name="price"),
    secondary_y=False,
)

for metric in secondary_metrics:
    fig.add_trace(
        go.Scatter(x=stock['date'], y=stock[metric], name=metric),
        secondary_y=True,
    )

# Label the figure so it can be read on its own: with two y-axes the reader
# otherwise cannot tell which scale belongs to which trace.
fig.update_layout(title_text=f'{plot_ticker}: closeadj vs. ' + ', '.join(secondary_metrics))
fig.update_xaxes(title_text='date')
fig.update_yaxes(title_text='closeadj', secondary_y=False)
fig.update_yaxes(title_text=' / '.join(secondary_metrics), secondary_y=True)
fig.show()

### Short Term Model (Retail Activity + SI + Price Metrics)

In [None]:
# BUILD A SIMPLE BASELINE MODEL TO PREDICT PRICE AT DIFFERENT TIME POINTS (EX: 5 DAY, 10 DAY, 20 DAY, 1 MONTH, 3 MONTH, 6 MONTH)
#       OR COULD CONSTRUCT IT AS A CLASSIFICATION MODEL INSTEAD - PREDICT THE PROBABILITY THAT THE PRICE WILL FALL BY X%

In [87]:
# Rows with activity of at least 0.1, restricted to the columns whose names
# match Short / pct / date / ticker.
combined.loc[combined['activity'] >= .1].filter(regex='Short|pct|date|ticker')

Unnamed: 0,ticker,date,lastupdated,ShortVolume,closeadj_pct1,closeadj_pct5,closeadj_pct30,closeadj_pct90,closeadj_pct180,closeadj_pct360,ShortRatio,ShortRatio_5,ShortRatio_15,ShortRatio_30
12624,AAPL,2020-08-21,2021-12-19,19975518.000000,1.195765,0.352127,-6.327690,7.703594,1.736997,29.413203,0.477328,0.428117,0.478792,0.440836
12625,AAPL,2020-08-24,2021-12-19,19615969.000000,-0.819976,2.529199,-10.088602,5.610954,-1.972614,27.507853,0.437642,0.437683,0.466785,0.441870
12630,AAPL,2020-08-31,2021-12-19,39073388.000000,3.983031,-12.569723,-6.153665,2.509231,-2.895750,35.047529,0.380058,0.416739,0.424211,0.443849
12634,AAPL,2020-09-04,2021-12-19,50168166.000000,-6.729021,-4.633797,-4.116696,6.756677,5.427468,47.072972,0.326653,0.346856,0.401076,0.437370
12659,AAPL,2020-10-12,2021-12-19,47434124.000000,-2.653346,-6.768639,-8.322672,1.612706,10.638645,30.301598,0.393652,0.421267,0.446051,0.418625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5233610,TSLA,2022-12-23,2022-12-23,39552497.000000,-11.408851,,,,,,0.427293,0.449858,0.468959,0.464066
5233611,TSLA,2022-12-27,2022-12-27,51809655.000000,3.308891,,,,,,0.446564,0.441380,0.466133,0.463572
5233612,TSLA,2022-12-28,2022-12-28 00:00:00,57052138.000000,8.082690,,,,,,0.456871,0.438742,0.463514,0.462123
5233613,TSLA,2022-12-29,2022-12-29 00:00:00,71067150.000000,1.116401,,,,,,0.534666,0.452572,0.466899,0.465295


In [83]:
# Correlation of activity_Z with each closeadj pct-change column, using only
# rows where activity >= 0.05.
# Author's observation: very high activity has a strong inverse correlation
# with price, and the correlation gets lower as the time horizon increases.
# This is much more pronounced for rows where activity is ultra high.
active_rows = combined[combined['activity'] >= .05]
activity_corr = active_rows.filter(regex='activity_Z|pct').corr()['activity_Z']
px.bar(activity_corr)

In [90]:
# Correlation of ShortRatio with the other ShortRatio* columns and each
# closeadj pct-change column, over all rows.
short_corr = combined.filter(regex='ShortRatio|pct').corr()
px.bar(short_corr['ShortRatio'])

In [None]:
# build model here with just activity metrics vs a 90 day close price (regression) OR logistic regression for +/- current price.

### Long Term Model (Fundamentals)

In [None]:
# Fundamental metrics plus the 360-row pct change they are compared against.
features = combined[['date','ticker','ev','marketcap','pb','pe','ps','closeadj_pct360']]
target = combined['closeadj_pct360']

# CONFIRM CORRELATION B/W FUNDAMENTAL METRICS -- CAN DROP SOME (keep pe, ev, marketcap, pb, ps)
# I wonder why there's almost no corr b/w pe and 360 pct change (maybe try market-adjusting the pct change..)
# likely because these values aren't changing much - and they are repeated per day! (it would make more sense to use monthly obs)
# also industry/sector is an important factor in pe
#
# `date` and `ticker` are string columns (read from CSV without date parsing).
# pandas >= 2.0 no longer drops non-numeric columns silently in DataFrame.corr()
# and raises instead, so restrict to numeric columns explicitly. select_dtypes
# also works on older pandas versions that lack corr(numeric_only=...).
features.select_dtypes(include='number').corr()

### Notification System

In [14]:
# Distinct tickers that have at least one row with activity above 0.1.
combined.loc[combined['activity'] > 0.1, 'ticker'].unique()

array(['AAPL', 'AMC', 'BA', 'BABA', 'NFLX', 'NIO', 'TSLA'], dtype=object)

In [29]:
# TSLA rows ranked from highest to lowest activity_Z, showing only the columns
# whose names contain "act".
tsla_rows = combined.loc[combined['ticker'] == 'TSLA']
tsla_rows.sort_values('activity_Z', ascending=False).filter(regex='act')

Unnamed: 0,activity,activity_5,activity_15,activity_30,activity_recent_ratio,prod_sent_act,prod_sent_act_5,prod_sent_act_15,prod_sent_act_30,activity_Z
5225422,0.206000,0.098020,0.068567,0.055803,1.756526,0.000000,0.000000,0.000000,0.744178,6.236564
5226153,0.178900,0.146800,0.115493,0.097733,1.502046,-71.564000,-26.425800,-15.400444,-10.425956,5.254548
5225423,0.166000,0.122060,0.074720,0.060340,2.022870,-16.601000,-2.441400,-0.498200,0.804667,4.787093
5225567,0.165700,0.081120,0.077387,0.059433,1.364891,33.142000,11.358200,10.835533,7.925778,4.776222
5226151,0.156500,0.132180,0.105333,0.092257,1.432742,-46.953000,-7.931400,-11.938911,-9.226667,4.442844
...,...,...,...,...,...,...,...,...,...,...
5224652,0.004200,0.006180,,,,-0.842000,-0.371400,,,-1.076016
5224667,0.004100,0.006300,0.007707,,,-0.822000,-0.504800,-0.771667,,-1.079639
5225313,0.003600,0.005800,0.006153,0.008713,0.665647,0.722000,0.464800,-0.657422,-1.104956,-1.097758
5224920,0.003400,0.005960,0.008767,0.011257,0.529464,-0.682000,-0.477600,0.292556,0.413111,-1.105005
