In [1]:
import numpy as np
import json
import pandas as pd

The following function loads the relevant information about stock performance after an article is published. The goal of our project is to predict whether a stock price will increase, decrease, or stay the same following the publication of an article about a company. 'start_price' is the first available stock price after the article was published ('pub_time'). 'end_price_1day' is the price of the stock at the end of the trading day (or of the following trading day if the article was published after 4pm ET). 'BUY', 'SELL', and 'HOLD' are determined as follows:
- 'BUY': If the percent change in stock price from publication time ('start_price') to the end of the day ('end_price_1day') is > +0.5%
- 'SELL': If the percent change in stock price from publication time ('start_price') to the end of the day ('end_price_1day') is < -0.5%
- 'HOLD': If the percent change in stock price from publication time ('start_price') to the end of the day ('end_price_1day') is between -0.5% and +0.5% (inclusive)


In [2]:
def _classify_price_move(percent_change, threshold):
    """Map a fractional price change to 'HOLD', 'BUY', 'SELL', or None."""
    if abs(percent_change) <= threshold:
        return 'HOLD'
    if percent_change > threshold:
        return 'BUY'
    if percent_change < -threshold:
        return 'SELL'
    # Only reachable when none of the comparisons hold (e.g. a NaN price).
    return None


def load_data_into_df(path, threshold=0.005):
    """Load article/price records from a JSON file into a labelled DataFrame.

    Each record is expected to provide ``pub_time`` and a ``labels`` mapping
    with ``ticker``, ``start_price_open`` and ``end_price_1day``. Records that
    lack any of these, whose prices cannot be used in arithmetic (e.g. JSON
    ``null``), or whose start price is zero are skipped.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file; it must contain an iterable of records.
    threshold : float, default 0.005
        Fractional price change (0.005 == 0.5%) separating 'HOLD' from
        'BUY'/'SELL'. A change whose magnitude is exactly ``threshold`` is
        labelled 'HOLD'.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``pub_time``, ``ticker``, ``start_price``,
        ``end_price_1day``, ``price_change`` and ``action``. ``id`` counts
        the kept records from 0, and ``price_change`` is the absolute
        difference ``end_price_1day - start_price``.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    columns = {
        'id': [],
        'pub_time': [],
        'ticker': [],
        'start_price': [],
        'end_price_1day': [],
        'price_change': [],
        'action': [],
    }

    for item in data:
        try:
            pub_time = item['pub_time']
            labels = item['labels']
            ticker = labels['ticker']
            start_price = labels['start_price_open']
            end_price = labels['end_price_1day']
        except (KeyError, TypeError):
            continue  # Skip any articles that have missing data

        # Compute everything before appending so a bad record cannot leave
        # the column lists with mismatched lengths.
        try:
            price_change = end_price - start_price
            percent_change = price_change / start_price
        except (TypeError, ZeroDivisionError):
            continue  # Null/non-numeric prices or a zero start price

        columns['id'].append(len(columns['id']))
        columns['pub_time'].append(pub_time)
        columns['ticker'].append(ticker)
        columns['start_price'].append(start_price)
        columns['end_price_1day'].append(end_price)
        columns['price_change'].append(price_change)
        columns['action'].append(_classify_price_move(percent_change, threshold))

    return pd.DataFrame(columns)

In [3]:
# Build the labelled DataFrame from the evaluate_news.json benchmark file
# (the path is relative to the notebook's working directory).
df = load_data_into_df('data/Trading_benchmark/evaluate_news.json')

In [4]:
# Show the loaded frame as this cell's output.
df

Unnamed: 0,id,pub_time,ticker,start_price,end_price_1day,price_change,action
0,0,2020-12-08 09:00:00-05:00,MIK,12.070,12.80,0.730,BUY
1,1,2020-10-01 12:11:00-04:00,DUK,89.740,90.05,0.310,HOLD
2,2,2020-10-05 08:00:00-04:00,PLRX,20.000,21.43,1.430,BUY
3,3,2021-02-22 06:05:00-05:00,DISH,34.250,32.00,-2.250,SELL
4,4,2021-01-21 07:30:00-05:00,TGT,188.350,191.00,2.650,BUY
...,...,...,...,...,...,...,...
106614,106614,2020-12-07 20:00:00-05:00,PPD,35.570,34.94,-0.630,SELL
106615,106615,2020-11-24 10:00:00-05:00,TTWO,169.565,169.90,0.335,HOLD
106616,106616,2020-12-22 12:56:00-05:00,DRIV,23.230,23.50,0.270,BUY
106617,106617,2021-01-13 06:00:00-05:00,STT,80.330,78.66,-1.670,SELL
