# Prepare Stock Data

In this notebook, we download the necessary stock data to support later analysis. Every earnings call needs open and close data to determine the impact of the earnings announcement. We augment this data with actual and surprise earnings data which will be used as a comparison with the impact of the earnings calls. Finally, we perform calculations to create additional columns that may be useful later in our analysis.

In [12]:
import numpy as np
import pandas as pd
import datetime
import re
import random
import time
import pytz
import yfinance as yf
import os, contextlib

In [3]:
# general function to get stock data over a range of dates for a given stock symbol
def get_stock_data(ticker, timestamp):
    start = (timestamp - datetime.timedelta(days=35)).date()
    end = (timestamp + datetime.timedelta(days=7)).date()
    with open(os.devnull, 'w') as devnull:
        with contextlib.redirect_stdout(devnull), contextlib.redirect_stderr(devnull):
            data = yf.download(ticker, start=start, end=end)                
            if len(data.index) == 0:
                return None
            return data

In [2]:
stock = pd.read_pickle('data/transcript_df.pkl.bz2')

In [4]:
# download previous day close and next day open
# detect whether valid bars exist for each row
count = 0
for index, row in stock.iterrows():
    count += 1
    if (count % 1000) == 0:
        print('='*80)
        print('Completed %d rows' % (count,))
        #print(stock.iloc[count-1000:count-995].head())
    
    h = row['date'].hour
    m = row['date'].minute
    
    stock.loc[index, 'valid_bars'] = False

    sdf = get_stock_data(row['ticker'], row['date'])
    if sdf is None:
        continue

    market_open_time = sdf.index + datetime.timedelta(hours=9, minutes=30) # nyse + nasdaq both open at 0930 ET
    
    prev_bars = sdf.loc[market_open_time < row['date']]
    next_bars = sdf.loc[market_open_time >= row['date']]

    vol10 = np.average(np.abs(np.diff(prev_bars['Close'][-11:])))
    vol20 = np.average(np.abs(np.diff(prev_bars['Close'][-21:])))

    prev_close = prev_bars.iloc[-1]['Close']
    next_open = next_bars.iloc[0]['Open']

    stock.loc[index, 'vol10'] = vol10
    stock.loc[index, 'vol20'] = vol20
    stock.loc[index, 'prev_bar'] = prev_close
    stock.loc[index, 'next_bar'] = next_open
    stock.loc[index, 'valid_bars'] = True

print('='*80)
print('Completed %d rows' % (count,))
print('='*80)

Completed 1000 rows
Completed 2000 rows
Completed 3000 rows
Completed 4000 rows
Completed 5000 rows
Completed 6000 rows
Completed 7000 rows
Completed 8000 rows
Completed 9000 rows
Completed 10000 rows
Completed 11000 rows
Completed 12000 rows
Completed 13000 rows
Completed 13337 rows


In [10]:
stock = stock.query('valid_bars == True')

In [5]:
# create additional features related to price movement
stock['pct'] = stock.apply(lambda row: 100.0*(row.next_bar - row.prev_bar)/row.prev_bar, axis=1)
stock['score'] = stock.apply(lambda row: (np.sign(row.pct)*abs(row.pct))/(4.0+abs(row.pct)), axis=1)
stock['class'] = stock.apply(lambda row: (2 if abs(row.score) < 0.2 else 0 if np.sign(row.score) > 0 else 1), axis=1)
stock['vol_score'] = (stock['next_bar'] - stock['prev_bar']) / stock['vol10']

In [13]:
# download earnings data and create additional features
us_et = pytz.timezone('America/New_York')
count = 0
err_count = 0
bad_tickers = []
for ticker in stock['ticker'].unique():
    count += 1
    #print(ticker)
    try:
        t_data = yf.Ticker(ticker)
        t_num = int(random.random() * 20) + 35
        t_earn = t_data.get_earnings_dates(limit=t_num)
        #print(t_earn)
        if t_earn is None:
            continue
    except:
        err_count += 1
        print('possible issue with ticker %s. count=%d' % (ticker,count))
        bad_tickers.append(ticker)
        if err_count > 10:
            print('too many errors. exiting...')
            break
        #time.sleep(600)
        continue
    
    data_rows = stock.query('ticker == "%s"' % (ticker,))
    #print(data_rows)
    for index, row in data_rows.iterrows():
        #print(row)
        row_time = row['date']
        row_time = us_et.localize(row_time)
        t_idx = np.argmin(np.abs(t_earn.index - row_time))

        #if np.abs(t_earn.index[t_idx] - row_time) > datetime.timedelta(days=2):
        #    continue
        t_diff = row_time - t_earn.index[t_idx] 
        t_diff_hours = t_diff.total_seconds() / 3600.0

        #print('would set %d %f' % (index, t_earn.iloc[t_idx]['EPS Estimate']))
        stock.loc[index, 'release_call_time_diff'] = t_diff_hours
        stock.loc[index, 'eps_est'] = t_earn.iloc[t_idx]['EPS Estimate']
        stock.loc[index, 'eps_act'] = t_earn.iloc[t_idx]['Reported EPS']
        stock.loc[index, 'surprise_pct'] = t_earn.iloc[t_idx]['Surprise(%)']

    if count == 1:
        print(stock.head())
    
    if count % 100 == 0:
        print('='*80)
        print('completed %d tickers' % (count,))

    # rate-limited API, sleep 5 seconds between calls
    time.sleep(5)

print('='*80)
print('completed %d tickers' % (count,))
print('='*80)

  exchange        q ticker                                         transcript  \
0   nasdaq  2020-Q2   bili  Prepared Remarks:\nOperator\nGood day, and wel...   
1     nyse  2020-Q3    gff  Prepared Remarks:\nOperator\nThank you for sta...   
2   nasdaq  2020-Q1   lrcx  Prepared Remarks:\nOperator\nGood day and welc...   
4   nasdaq  2019-Q2   cste  Prepared Remarks:\nOperator\nGreetings and wel...   
5     nyse  2020-Q3   gdot  Prepared Remarks:\nOperator\nGood afternoon, a...   

                 date  tz  market_open valid_bars     vol10     vol20  \
0 2020-08-27 21:00:00  ET        False       True  1.153000  1.201000   
1 2020-07-30 16:30:00  ET        False       True  0.346000  0.402000   
2 2019-10-23 17:00:00  ET        False       True  3.161005  3.300003   
4 2019-08-07 08:30:00  ET        False       True  0.208000  0.183000   
5 2020-11-04 17:00:00  ET        False       True  1.345000  1.224000   

     prev_bar    next_bar       pct     score  class  vol_score  \
0   45.

SIR: $SIR: possibly delisted; No earnings dates found


completed 1500 tickers


SLN: $SLN: possibly delisted; No earnings dates found


completed 1600 tickers
completed 1700 tickers
completed 1800 tickers
completed 1900 tickers
completed 2000 tickers
completed 2015 tickers


In [14]:
# create additional features related to earnings data
stock['surprise_score'] = stock.apply(lambda row: (np.sign(row.surprise_pct)*abs(row.surprise_pct * 100.0))/(4.0+abs(row.surprise_pct * 100.0)), axis=1)
stock['surprise_vol'] = stock.apply(lambda row: np.NaN if row.vol10 == 0 else ((row.eps_act - row.eps_est) / row.vol10), axis=1)
stock['eps_act_vol'] = stock.apply(lambda row: np.NaN if row.vol10 == 0 else (row.eps_act / row.vol10), axis=1)
stock['eps_act_pct'] = stock.apply(lambda row: np.NaN if row.vol10 == 0 else (100.0 * row.eps_act / row.prev_bar), axis=1)

In [15]:
stock.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11902 entries, 0 to 18754
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   exchange                11902 non-null  object        
 1   q                       11902 non-null  object        
 2   ticker                  11902 non-null  object        
 3   transcript              11902 non-null  object        
 4   date                    11902 non-null  datetime64[ns]
 5   tz                      11902 non-null  object        
 6   market_open             11902 non-null  bool          
 7   valid_bars              11902 non-null  object        
 8   vol10                   11902 non-null  float64       
 9   vol20                   11902 non-null  float64       
 10  prev_bar                11902 non-null  float64       
 11  next_bar                11902 non-null  float64       
 12  pct                     11902 non-null  float64    

In [16]:
stock

Unnamed: 0,exchange,q,ticker,transcript,date,tz,market_open,valid_bars,vol10,vol20,...,class,vol_score,release_call_time_diff,eps_est,eps_act,surprise_pct,surprise_score,surprise_vol,eps_act_vol,eps_act_pct
0,nasdaq,2020-Q2,bili,"Prepared Remarks:\nOperator\nGood day, and wel...",2020-08-27 21:00:00,ET,False,True,1.153000,1.201000,...,0,0.494362,26.0,-1.53,-1.35,0.1195,0.749216,0.156115,-1.170859,-2.990696
1,nyse,2020-Q3,gff,Prepared Remarks:\nOperator\nThank you for sta...,2020-07-30 16:30:00,ET,False,True,0.346000,0.402000,...,0,2.919073,0.5,0.23,0.59,1.6029,0.975653,1.040462,1.705201,2.858527
2,nasdaq,2020-Q1,lrcx,Prepared Remarks:\nOperator\nGood day and welc...,2019-10-23 17:00:00,ET,False,True,3.161005,3.300003,...,0,6.096165,1.0,3.01,3.18,0.0571,0.588054,0.053780,1.006009,1.363753
4,nasdaq,2019-Q2,cste,Prepared Remarks:\nOperator\nGreetings and wel...,2019-08-07 08:30:00,ET,False,True,0.208000,0.183000,...,1,-4.759612,1.5,0.21,0.23,0.0926,0.698341,0.096154,1.105769,1.614035
5,nyse,2020-Q3,gdot,"Prepared Remarks:\nOperator\nGood afternoon, a...",2020-11-04 17:00:00,ET,False,True,1.345000,1.224000,...,0,1.241634,1.0,0.11,0.25,1.3133,0.970443,0.104089,0.185874,0.454545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18745,nasdaq,2021-Q3,avxl,Prepared Remarks:\nOperator\nGood afternoon. M...,2021-08-12 16:30:00,ET,False,True,0.562000,0.711500,...,2,-0.266903,9.5,-0.13,-0.14,-0.0448,-0.528302,-0.017794,-0.249110,-0.726518
18747,nasdaq,2023-Q2,nvda,Prepared Remarks:\nOperator\nGood afternoon. M...,2022-08-24 17:00:00,ET,False,True,0.430700,0.443600,...,1,-0.877641,1.0,1.26,0.51,-0.5938,-0.936889,-1.741350,1.184118,2.961329
18749,nasdaq,2022-Q2,dh,"Prepared Remarks:\nOperator\nGood day, and wel...",2022-08-04 17:00:00,ET,False,True,0.856000,0.661500,...,1,-7.429906,0.0,0.05,0.06,0.1834,0.820949,0.011682,0.070093,0.204569
18753,nasdaq,2021-Q4,dvax,"Prepared Remarks:\nOperator\nGood day, ladies ...",2022-02-28 16:30:00,ET,False,True,0.380000,0.312000,...,2,-0.289475,0.5,0.64,0.55,-0.1406,-0.778516,-0.236842,1.447369,4.486134


In [17]:
stock.to_pickle('data/stock_earnings_df.pkl.bz2')