In [23]:
exec(open("./utils.py").read())
import numpy as np

def load_news(sector, tickers):
    """Load and combine the news items of every ticker in a sector.

    For each ticker, the rows returned by ``load_news_data(sector, ticker)``
    are de-duplicated on ``TITLE`` (first occurrence kept) and tagged with
    the ticker symbol in a ``tick`` column.

    Args:
        sector: Sector identifier, passed through to ``load_news_data``.
        tickers: Iterable of ticker symbols to load.

    Returns:
        One DataFrame with the rows of all tickers stacked in the order of
        ``tickers``; each per-ticker frame keeps its own index labels.
        An empty DataFrame when ``tickers`` is empty.
    """
    frames = []
    for t in tickers:
        t_data = load_news_data(sector, t).drop_duplicates(["TITLE"])
        # assign() builds a new frame instead of writing into t_data.
        frames.append(t_data.assign(tick=t))

    # pd.concat raises on an empty list; keep the "no tickers" result empty.
    if not frames:
        return pd.DataFrame()

    # A single concat replaces the repeated DataFrame.append calls
    # (append copied the accumulated frame on every iteration and was
    # removed in pandas 2.0).
    return pd.concat(frames)

def load_stocks(sector, tickers):
    """Load and combine the price history of every ticker in a sector.

    Each frame returned by ``load_stocks_data(sector, ticker)`` is tagged
    with the ticker symbol in a ``tick`` column. Loading is best-effort:
    a ticker whose data cannot be loaded is skipped, but a warning naming
    the ticker and the error is issued so the gap is visible.

    Args:
        sector: Sector identifier, passed through to ``load_stocks_data``.
        tickers: Iterable of ticker symbols to load.

    Returns:
        One DataFrame with the rows of all successfully loaded tickers
        stacked in the order of ``tickers``; each per-ticker frame keeps its
        own index labels. An empty DataFrame when nothing could be loaded.
    """
    import warnings  # local import keeps this definition self-contained

    frames = []
    for t in tickers:
        try:
            frames.append(load_stocks_data(sector, t).assign(tick=t))
        except Exception as exc:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit still stop the run; report what was skipped.
            warnings.warn("Skipping ticker {!r}: {}".format(t, exc))

    # pd.concat raises on an empty list; keep the "nothing loaded" result empty.
    if not frames:
        return pd.DataFrame()

    # A single concat replaces the repeated DataFrame.append calls
    # (removed in pandas 2.0).
    return pd.concat(frames)

In [3]:
# Sector to process; it is passed to every loader below and prefixes the
# name of the CSV written at the end of the notebook.
sector = "healthcare"
# Ticker symbols for that sector, as returned by load_tickers (not defined
# in this notebook; presumably provided by the exec'd utils.py).
tickers = load_tickers(sector)

<h3> News Data </h3>

In [39]:
import pysentiment as ps
# pysentiment's LM scorer; get_score() returns a dict that includes a
# 'Polarity' entry, which is the only value used below.
lm = ps.LM()

df_news = load_news(sector, tickers)
# Keep the first 10 characters of the publication timestamp (assumed to be
# 'YYYY-MM-DD...' strings). The .str accessor propagates missing values
# instead of raising on them.
df_news['PUBLICATION_DATE'] = df_news['PUBLICATION_DATE'].str[:10]
# str(x) guards against non-string summaries (e.g. NaN) before tokenizing.
df_news['SUMMARY_SCORES'] = df_news.SUMMARY.map(lambda x: lm.get_score(lm.tokenize(str(x))))
df_news['POLARITY'] = df_news['SUMMARY_SCORES'].map(lambda x: x['Polarity'])

# Total polarity per ticker and day. The sum is restricted to POLARITY:
# summing the whole frame relied on pandas silently dropping the
# non-numeric columns (score dicts, text), which pandas 2.0 no longer does.
df_news = (
    df_news
    .groupby(['tick', 'PUBLICATION_DATE'])['POLARITY']
    .sum()
    .reset_index()
)

df_news.head()

Unnamed: 0,tick,PUBLICATION_DATE,POLARITY
0,AAAP,2015-02-05,0.0
1,AAAP,2015-02-18,0.999999
2,AAAP,2015-10-02,0.999999
3,AAAP,2015-11-02,0.999999
4,AAAP,2015-11-07,0.0


<h3> Stocks Data </h3>

In [40]:
# Load the prices of every ticker, render 'Date' as text so it can be joined
# against string date keys later, and keep only the columns used downstream.
df_stocks = (
    load_stocks(sector, tickers)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).astype(str))
    [['tick', 'Date', 'Close']]
)

df_stocks.head()

Unnamed: 0,tick,Date,Close
0,NVRO,2017-10-20,91.49
1,NVRO,2017-10-19,91.1
2,NVRO,2017-10-18,90.65
3,NVRO,2017-10-17,90.44
4,NVRO,2017-10-16,90.35


<h3> All days in Time Period </h3>

In [41]:
from datetime import date, timedelta

# Inclusive bounds of the analysis window.
DATE_FROM = date(2016, 1, 4)
DATE_TO = date(2017, 9, 30)

d1, d2 = DATE_FROM, DATE_TO  # start date, end date
delta = d2 - d1              # timedelta spanning the window

# Every calendar day from d1 through d2, both ends included (hence the +1).
days = [d1 + timedelta(days=offset) for offset in range(delta.days + 1)]

days[0], days[-1]

(datetime.date(2016, 1, 4), datetime.date(2017, 9, 30))

<h3> Merge Data </h3>

In [42]:
# Build one row per (ticker, calendar day): closing price, summed news
# polarity, and a 0/1 flag telling whether any news was published that day.
ANALYSIS_COLUMNS = ['Date', 'Tick', 'Close', 'Polarity', 'News']

# String join keys for every day of the window. A list (not a lazy map
# object) so it can be reused for every ticker and works on Python 3.
date_keys = [d.strftime("%Y-%m-%d") for d in days]

ticks_intersect = set(df_news.tick.unique()) & set(df_stocks.tick.unique())

frames = []
# sorted(): set iteration order is arbitrary; sorting makes the row order of
# the result reproducible from run to run.
for t in sorted(ticks_intersect):
    df_news_tick = df_news.loc[df_news['tick'] == t, ['PUBLICATION_DATE', 'POLARITY']]
    df_stocks_tick = df_stocks.loc[df_stocks['tick'] == t, ['Date', 'Close']]
    df_t = pd.DataFrame({'Date': date_keys})
    df_t['Tick'] = t
    df_t = pd.merge(df_t, df_news_tick, how='left', left_on='Date', right_on='PUBLICATION_DATE')
    df_t = pd.merge(df_t, df_stocks_tick, how='left', on='Date')
    # POLARITY is NaN exactly on the days the left join found no news.
    df_t['News'] = df_t['POLARITY'].notna().astype(int)
    df_t['Polarity'] = df_t['POLARITY'].fillna(0)
    # Fill days without a close (weekends, holidays) per ticker. Interpolating
    # the stacked frame instead would fill the gaps at one ticker's edges
    # from the neighbouring ticker's prices. Days before a ticker's first
    # known close stay NaN (pandas' default fill direction is forward).
    df_t['Close'] = df_t['Close'].interpolate()
    frames.append(df_t[ANALYSIS_COLUMNS])

# One concat instead of DataFrame.append in the loop (removed in pandas 2.0);
# fall back to an empty frame when no ticker has both news and prices.
df_analysis = pd.concat(frames) if frames else pd.DataFrame(columns=ANALYSIS_COLUMNS)
df_analysis.head()

Unnamed: 0,Date,Tick,Close,Polarity,News
0,2016-01-04,AAAP,30.67,0.0,0
1,2016-01-05,AAAP,30.88,0.0,0
2,2016-01-06,AAAP,29.96,0.0,0
3,2016-01-07,AAAP,29.65,0.0,0
4,2016-01-08,AAAP,28.0,0.0,0


In [43]:
# Write the merged dataset next to the notebook as "<sector>_dataset.csv",
# leaving out the index column.
df_analysis.to_csv(path_or_buf=sector + "_dataset.csv", index=False)