In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

In [2]:
df_news = pd.read_csv('news_data.csv')
#df_news.info()

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# change formats
df_news['TICKER'] = df_news['COMPANY'].str[3:]
df_news['TIMESTAMP_UTC'] = pd.to_datetime(df_news['TIMESTAMP_UTC'], utc = True)
df_news['RPNA_DATE_UTC'] = df_news['TIMESTAMP_UTC'].dt.date
df_news['RPNA_TIME_UTC'] = df_news['TIMESTAMP_UTC'].dt.time

In [4]:
# get Eastern Standard Time
df_news['TIMESTAMP_EST'] = df_news['TIMESTAMP_UTC'].dt.tz_convert('America/New_York')
df_news['RPNA_DATE_EST'] = df_news['TIMESTAMP_EST'].dt.date
df_news['RPNA_TIME_EST'] = df_news['TIMESTAMP_EST'].dt.time

In [5]:
from pandas.tseries.offsets import CustomBusinessDay

from pandas.tseries.holiday import AbstractHolidayCalendar, Holiday, nearest_workday, \
    USMartinLutherKingJr, USPresidentsDay, GoodFriday, USMemorialDay, \
    USLaborDay, USThanksgivingDay

class USTradingCalendar(AbstractHolidayCalendar):
    rules = [Holiday('NewYearsDay', month = 1, day = 1, observance = nearest_workday),
             USMartinLutherKingJr,
             USPresidentsDay,
             GoodFriday,
             USMemorialDay,
             Holiday('USIndependenceDay', month = 7, day = 4, observance = nearest_workday),
             USLaborDay,
             USThanksgivingDay,
             Holiday('Christmas', month = 12, day = 25, observance = nearest_workday)]

bday_us = CustomBusinessDay(calendar = USTradingCalendar())

nasdaq_close = pd.to_datetime('16:00:00').time()

df_news['NEXT_MARKET_DATE'] = pd.to_datetime(np.where(df_news['RPNA_TIME_EST'] > nasdaq_close,
                                                      df_news['RPNA_DATE_EST'] + bday_us,
                                                      df_news['RPNA_DATE_EST'] + 0 * bday_us))
df_news['NEXT_MARKET_CLOSE'] = (df_news['NEXT_MARKET_DATE'] + pd.Timedelta(16, unit ='h')).dt.tz_localize('America/New_York')
df_news['TIME_TO_CLOSE'] = df_news['NEXT_MARKET_CLOSE'] - df_news['TIMESTAMP_EST']

In [6]:
df_market = pd.read_csv('djia.csv')
df_market.head(1)

Unnamed: 0,gvkey,iid,datadate,tic,conm,cshtrd,prccd,prchd,prcld,prcod,trfd,exchg
0,1447,1,4/1/2016,AXP,AMERICAN EXPRESS CO,9228601.0,67.59,68.18,66.77,68.09,2.417612,11


In [7]:
df_market['datadate'] = pd.to_datetime(df_market['datadate'],
                                       dayfirst = True)

In [8]:
df_market['dreturn'] = np.log(df_market.prccd / df_market.prcod)
df_market['dvar'] = ((np.log(df_market.prchd) - np.log(df_market.prcld))**2) / 4 * np.log(2) #Applying the Parkinson (1980) H-L measure
df_market['dvol'] = np.sqrt(df_market.dvar)
df_market['dreturn_flag'] = np.where(df_market['dreturn'] > 0, 1, 0)

In [9]:
market_return=pd.read_csv('gspc.csv')
market_return['mkt_return'] = np.log(market_return.Close / market_return.Open)
market_return=market_return[['Date','mkt_return']]
market_return.columns=['datadate','mkt_return']
market_return['datadate'] = pd.to_datetime(market_return['datadate'])
market_return.head(1)

Unnamed: 0,datadate,mkt_return
0,2015-12-31,-0.008113


In [10]:
df_market=pd.merge(df_market, market_return,how='left',on='datadate')
df_market.head(1)

Unnamed: 0,gvkey,iid,datadate,tic,conm,cshtrd,prccd,prchd,prcld,prcod,trfd,exchg,dreturn,dvar,dvol,dreturn_flag,mkt_return
0,1447,1,2016-01-04,AXP,AMERICAN EXPRESS CO,9228601.0,67.59,68.18,66.77,68.09,2.417612,11,-0.00737,7.6e-05,0.008699,0,-0.01261


In [11]:
#need to check tomorrow if more market return is available
len(df_market)

28248

In [12]:
df_market = df_market[np.isfinite(df_market['prcod'])]
df_market = df_market[np.isfinite(df_market['mkt_return'])]
len(df_market)

27876

In [13]:
#calculate beta = cov/var
#get covariance
grouped = df_market.groupby('conm')
group_cov=grouped.apply(lambda x: x['dreturn'].cov(x['mkt_return']))
group_cov=pd.DataFrame(data=group_cov)
group_cov.columns = ['cov']

In [14]:
group_cov.reset_index(inplace=True)
#group_cov

In [15]:
market_var=(df_market.groupby('conm')['mkt_return'].var())
market_var=pd.DataFrame(data=market_var)
market_var.columns = ['var']
market_var.reset_index(inplace=True)
#market_var

In [16]:
merged_df=pd.merge(group_cov, market_var,how='left',on='conm')
#merged_df

In [17]:
merged_df['beta']=merged_df['cov']/merged_df['var']
merged_df

Unnamed: 0,conm,cov,var,beta
0,3M CO,3.3e-05,4.1e-05,0.794666
1,AMERICAN EXPRESS CO,3.4e-05,4.1e-05,0.833967
2,APPLE INC,3.8e-05,4.1e-05,0.927551
3,BOEING CO,4.4e-05,4.1e-05,1.067496
4,CATERPILLAR INC,4.8e-05,4.1e-05,1.172869
5,CHEVRON CORP,3.2e-05,4.1e-05,0.784118
6,CISCO SYSTEMS INC,4.2e-05,4.1e-05,1.014859
7,COCA-COLA CO,1.9e-05,4.1e-05,0.450555
8,DISNEY (WALT) CO,2.7e-05,4.1e-05,0.647995
9,EXXON MOBIL CORP,2.4e-05,4.1e-05,0.575956


In [18]:
#average market return over 2016-2018 period
averagereturn=market_return["mkt_return"].mean()

In [19]:
df_market=pd.merge(df_market, merged_df,how='left',on='conm')
df_market['exp_return']=df_market['beta']*df_market['mkt_return']
df_market['abnormal_return']=df_market['dreturn']-df_market['exp_return']
df_market.head()

Unnamed: 0,gvkey,iid,datadate,tic,conm,cshtrd,prccd,prchd,prcld,prcod,...,dreturn,dvar,dvol,dreturn_flag,mkt_return,cov,var,beta,exp_return,abnormal_return
0,1447,1,2016-01-04,AXP,AMERICAN EXPRESS CO,9228601.0,67.59,68.18,66.77,68.09,...,-0.00737,7.6e-05,0.008699,0,-0.01261,3.4e-05,4.1e-05,0.833967,-0.010516,0.003146
1,1447,1,2016-01-05,AXP,AMERICAN EXPRESS CO,10807680.0,66.55,67.71,65.67,67.37,...,-0.012246,0.000162,0.012735,0,0.001454,3.4e-05,4.1e-05,0.833967,0.001212,-0.013459
2,1447,1,2016-01-06,AXP,AMERICAN EXPRESS CO,9743069.0,64.42,65.55,64.24,65.24,...,-0.012649,7.1e-05,0.008403,0,-0.01072,3.4e-05,4.1e-05,0.833967,-0.00894,-0.003709
3,1447,1,2016-01-07,AXP,AMERICAN EXPRESS CO,11317630.0,63.84,64.2516,63.084,63.31,...,0.008337,5.8e-05,0.007634,1,-0.021501,3.4e-05,4.1e-05,0.833967,-0.017931,0.026267
4,1447,1,2016-01-08,AXP,AMERICAN EXPRESS CO,10003330.0,63.63,64.4085,63.57,64.18,...,-0.008607,3e-05,0.005455,0,-0.012379,3.4e-05,4.1e-05,0.833967,-0.010323,0.001717


In [21]:
df = pd.merge(df_news,df_market , how = 'left',
              left_on = ['NEXT_MARKET_DATE', 'TICKER'],
              right_on = ['datadate', 'tic'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 823480 entries, 0 to 823479
Data columns (total 77 columns):
TIMESTAMP_UTC           823480 non-null datetime64[ns, UTC]
RPNA_DATE_UTC           823480 non-null object
RPNA_TIME_UTC           823480 non-null object
RP_ENTITY_ID            823480 non-null object
ENTITY_TYPE             823480 non-null object
ENTITY_NAME             823480 non-null object
POSITION_NAME           5568 non-null object
RP_POSITION_ID          5568 non-null object
COUNTRY_CODE            823480 non-null object
RELEVANCE               823480 non-null int64
TOPIC                   45300 non-null object
GROUP                   45300 non-null object
TYPE                    45300 non-null object
SUB_TYPE                21440 non-null object
PROPERTY                14679 non-null object
EVALUATION_METHOD       0 non-null float64
MATURITY                0 non-null float64
CATEGORY                45300 non-null object
ESS                     45300 non-null float64
AE

In [22]:
# drop columns with no explanatory worth
drop = drop = ['TIMESTAMP_UTC', 'ENTITY_TYPE', 'RP_POSITION_ID', 'COUNTRY_CODE',
        'EVALUATION_METHOD', 'ENS_KEY', 'G_ENS_KEY', 'EVENT_SIMILARITY_KEY',
        'RP_STORY_ID', 'PRODUCT_KEY', 'COMPANY', 'ISIN', 'BER', 'ANL_CHG',
        'TICKER', 'NEXT_MARKET_DATE', 'NEXT_MARKET_CLOSE', 'gvkey', 'iid', 'datadate', 'tic',
        'conm', 'cshtrd', 'prccd', 'prchd', 'prcld', 'prcod', 'trfd', 'exchg', 'RP_ENTITY_ID','cov','var','beta','exp_return']
df = df.drop(drop, axis = 1)

In [23]:
df.head()

Unnamed: 0,RPNA_DATE_UTC,RPNA_TIME_UTC,ENTITY_NAME,POSITION_NAME,RELEVANCE,TOPIC,GROUP,TYPE,SUB_TYPE,PROPERTY,...,TIMESTAMP_EST,RPNA_DATE_EST,RPNA_TIME_EST,TIME_TO_CLOSE,dreturn,dvar,dvol,dreturn_flag,mkt_return,abnormal_return
0,2016-01-01,10:30:01.663000+00:00,3M Co.,,3,,,,,,...,2016-01-01 05:30:01.663000-05:00,2016-01-01,05:30:01.663000,3 days 10:29:58.337000,-0.008343,6.9e-05,0.008277,0.0,-0.01261,0.001678
1,2016-01-01,10:44:40.104000+00:00,3M Co.,,3,,,,,,...,2016-01-01 05:44:40.104000-05:00,2016-01-01,05:44:40.104000,3 days 10:15:19.896000,-0.008343,6.9e-05,0.008277,0.0,-0.01261,0.001678
2,2016-01-03,23:35:51.992000+00:00,3M Co.,,4,,,,,,...,2016-01-03 18:35:51.992000-05:00,2016-01-03,18:35:51.992000,0 days 21:24:08.008000,-0.008343,6.9e-05,0.008277,0.0,-0.01261,0.001678
3,2016-01-04,05:08:02.003000+00:00,3M Co.,,3,,,,,,...,2016-01-04 00:08:02.003000-05:00,2016-01-04,00:08:02.003000,0 days 15:51:57.997000,-0.008343,6.9e-05,0.008277,0.0,-0.01261,0.001678
4,2016-01-04,07:05:01.705000+00:00,3M Co.,,3,,,,,,...,2016-01-04 02:05:01.705000-05:00,2016-01-04,02:05:01.705000,0 days 13:54:58.295000,-0.008343,6.9e-05,0.008277,0.0,-0.01261,0.001678


In [24]:
# show missing data in rows
missing = (df.isnull().sum(axis = 1) / df.isnull().count(axis = 1)).sort_values(ascending = False)
missing_rows = pd.concat([missing], axis = 1, keys = ['Missing'])
drop_rows = missing_rows[missing_rows['Missing'] > 0.25]
drop_rows.count()

Missing    778180
dtype: int64

In [25]:
# drop observations where most features are missing
df = df.drop(drop_rows.index, axis = 0)

In [26]:
# show missing data in columns
total = df.isnull().sum().sort_values(ascending = False)
percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending = False)
missing_clmns = pd.concat([total, percent], axis = 1, keys = ['Total', 'Percent'])
missing_clmns.head(5)

Unnamed: 0,Total,Percent
MATURITY,45300,1.0
POSITION_NAME,39732,0.877086
PROPERTY,30621,0.67596
SUB_TYPE,23860,0.526711
abnormal_return,2754,0.060795


In [27]:
# drop features where most observations are missing
df = df.drop((missing_clmns[missing_clmns['Percent'] > 0.25]).index, axis = 1)

In [28]:
len(df)

45300

In [29]:
df.head()

Unnamed: 0,RPNA_DATE_UTC,RPNA_TIME_UTC,ENTITY_NAME,RELEVANCE,TOPIC,GROUP,TYPE,CATEGORY,ESS,AES,...,TIMESTAMP_EST,RPNA_DATE_EST,RPNA_TIME_EST,TIME_TO_CLOSE,dreturn,dvar,dvol,dreturn_flag,mkt_return,abnormal_return
12,2016-01-05,18:04:50.860000+00:00,3M Co.,100,business,technical-analysis,technical-view,technical-view-bearish,41.0,29,...,2016-01-05 13:04:50.860000-05:00,2016-01-05,13:04:50.860000,02:55:09.140000,0.00435,2.9e-05,0.005368,1.0,0.001454,0.003194
38,2016-01-12,15:00:32.995000+00:00,3M Co.,100,business,technical-analysis,technical-view,technical-view-bearish,41.0,29,...,2016-01-12 10:00:32.995000-05:00,2016-01-12,10:00:32.995000,05:59:27.005000,-0.003543,8e-05,0.008939,0.0,0.005612,-0.008003
53,2016-01-19,09:26:56.787000+00:00,3M Co.,100,business,investor-relations,major-shareholders-disclosure,major-shareholders-disclosure,50.0,28,...,2016-01-19 04:26:56.787000-05:00,2016-01-19,04:26:56.787000,11:33:03.213000,-0.012281,0.000103,0.010145,0.0,-0.003889,-0.00919
65,2016-01-20,15:02:58.977000+00:00,3M Co.,100,business,technical-analysis,relative-strength-index,relative-strength-index-oversold,62.0,29,...,2016-01-20 10:02:58.977000-05:00,2016-01-20,10:02:58.977000,05:57:01.023000,7.3e-05,9e-05,0.009493,1.0,-0.009022,0.007242
86,2016-01-22,16:46:36.045000+00:00,3M Co.,100,business,insider-trading,insider-gift,insider-gift,57.0,30,...,2016-01-22 11:46:36.045000-05:00,2016-01-22,11:46:36.045000,04:13:23.955000,-0.004434,4.9e-05,0.007015,0.0,0.015591,-0.016824
