In [1]:
import pandas as pd
import numpy as np
from dataloaders.finbert_dataloaders import ValDataset
import datetime

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader
from dataloaders.finbert_dataloaders import ValDataset
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tweets = pd.read_csv("Tweet.csv")
company_tweet = pd.read_csv("Company_Tweet.csv")
aapl_tweet_ids = company_tweet[company_tweet['ticker_symbol'] == 'AAPL']
ids = aapl_tweet_ids['tweet_id'].tolist()
aapl_tweets = tweets[tweets['tweet_id'].isin(ids)]

In [3]:
aapl_tweets['post_date'] = pd.to_datetime(aapl_tweets['post_date'].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))
mask = (aapl_tweets['post_date'] >= pd.to_datetime(1514790000, unit='s')) & (aapl_tweets['post_date'] <= pd.to_datetime(1546239600, unit='s'))
time_tweets = aapl_tweets[mask]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aapl_tweets['post_date'] = pd.to_datetime(aapl_tweets['post_date'].apply(lambda x: datetime.datetime.fromtimestamp(x).isoformat()))


In [5]:
len(time_tweets), len(aapl_tweets)

(222646, 1425013)

In [6]:
hour_mask = (time_tweets['post_date'].dt.hour >= 9) & (time_tweets['post_date'].dt.hour <= 16)
time_tweets = time_tweets[hour_mask]
len(time_tweets)

108605

In [7]:
gb_date = time_tweets.groupby(time_tweets.post_date.dt.strftime('%y-%m-%d'))  

In [8]:
num_tweets_per_day = gb_date.count()['tweet_id']
idx = list(gb_date.indices.keys())

In [9]:
m = num_tweets_per_day.rolling(20).mean()
s = num_tweets_per_day.rolling(20).std()

In [10]:
trading_days = num_tweets_per_day.to_numpy() > (m+s)

In [11]:
trading_date = np.array(idx)[trading_days == True]
len(trading_date)

45

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device == 'cuda':
    print('cleaning cache')
    torch.cuda.empty_cache()

In [13]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# The model will give softmax outputs for three labels: positive, negative or neutral (in this order)
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
model = model.to(device=device)
model.eval()
print('ready!')

ready!


In [14]:
def maj_sent(news_articles):
    torch.cuda.empty_cache()        
    tokens_encoding = tokenizer(news_articles,max_length=64, padding='max_length', truncation=True, return_tensors="pt")
    val_loader = ValDataset(tokens_encoding)
    val_dataloader = DataLoader(val_loader, shuffle=False, batch_size=2)

    softmax = nn.Softmax(dim=1)
    
    mood = []

    for ids,attn_mask,ttype_ids in val_dataloader:
        ids = ids.to(device=device)
        attn_mask = attn_mask.to(device=device)
        ttype_ids = ttype_ids.to(device=device)

        output = model(input_ids=ids, attention_mask=attn_mask, token_type_ids=ttype_ids)
        output = softmax(output.logits)
     
        s = output.argmax(dim=1).cpu().numpy()
        s[s == 1] = -1
        s[s == 0] = 1
        s[s == 2] = 0

        mood.extend(s)

    return np.sign(sum(mood))


In [15]:
date_format = '%y-%m-%d'
date_format_chart = '%Y-%m-%d'

In [16]:
chart = pd.read_csv("./charts/APPLE1440.csv")
chart['date'] = pd.to_datetime(chart['date'].apply(lambda x: datetime.datetime.strptime(x, date_format_chart)))

In [17]:
pred = []
ans = []
for i, d in enumerate(trading_date):
    date = datetime.datetime.strptime(d, date_format).strftime('%y-%m-%d')

    #THIS HAS LOOKAHEAD BIAS!!! WE CANNOT USE THIS!
    stk_mv = chart[chart['date'].dt.strftime('%y-%m-%d') == date]
    
    #if the market isn't open
    if len(stk_mv) == 0:
        print('market closed', date, stk_mv,)
        break
    
    direction = np.sign(stk_mv['close'].values - stk_mv['open'].values)

    mask = (time_tweets['post_date'].dt.strftime('%y-%m-%d') == date)
    daily_tweets = time_tweets[mask]

    sent = maj_sent(daily_tweets['body'].tolist())
    
    ans.extend(direction)
    pred.append(sent)
    print(f"{i+1}/53")

acc = sum(np.array(pred) == np.array(ans)) / len(ans)
print('ACC:',acc)
# ACC: 0.8222222222222222 (tweets all hours)

1/53
2/53
3/53
4/53
5/53
6/53
7/53
8/53
9/53
10/53
11/53
12/53
13/53
14/53
15/53
16/53
17/53
18/53
19/53
20/53
21/53
22/53
23/53
24/53
25/53
26/53
27/53
28/53
29/53
30/53
31/53
32/53
33/53
34/53
35/53
36/53
37/53
38/53
39/53
40/53
41/53
42/53
43/53
44/53
45/53
ACC: 0.8222222222222222


In [19]:
pred = []
ans = []
for i, d in enumerate(trading_date):
    date = datetime.datetime.strptime(d, date_format).strftime('%y-%m-%d')

    stk_idx = chart[chart['date'].dt.strftime('%y-%m-%d') == date].index.values
    
    #go forward one day so no lookahead bias
    stk_mv = chart.iloc[stk_idx[0]+1]

    #if the market isn't open
    if len(stk_mv) == 0:
        print('market closed', date, stk_mv,)
        break
    
    direction = np.sign(stk_mv['close'] - stk_mv['open'])
 
    mask = (time_tweets['post_date'].dt.strftime('%y-%m-%d') == date)
    daily_tweets = time_tweets[mask]
    sent = maj_sent(daily_tweets['body'].tolist())
    
    ans.append(direction)
    pred.append(sent)
    print(f"{i+1}/53")

acc = sum(np.array(pred) == np.array(ans)) / len(ans)
print('ACC:',acc)

1/53
2/53
3/53
4/53
5/53
6/53
7/53
8/53
9/53
10/53
11/53
12/53
13/53
14/53
15/53
16/53
17/53
18/53
19/53
20/53
21/53
22/53
23/53
24/53
25/53
26/53
27/53
28/53
29/53
30/53
31/53
32/53
33/53
34/53
35/53
36/53
37/53
38/53
39/53
40/53
41/53
42/53
43/53
44/53
45/53
ACC: 0.5333333333333333


In [19]:
#THIS IS A BENCH MARK!
pred = []
ans = []
for i, d in enumerate(trading_date):
    date = datetime.datetime.strptime(d, date_format).strftime('%y-%m-%d')

    stk_mv = chart[chart['date'].dt.strftime('%y-%m-%d') == date]
    
    #if the market isn't open
    if len(stk_mv) == 0:
        print('market closed', date, stk_mv,)
        break
    
    direction = np.sign(stk_mv['close'].values - stk_mv['open'].values)

    mask = (time_tweets['post_date'].dt.strftime('%y-%m-%d') == date)
    daily_tweets = time_tweets[mask]

    #always buy!!
    sent = 1
    
    ans.extend(direction)
    pred.append(sent)
    print(f"{i+1}/53")

acc = sum(np.array(pred) == np.array(ans)) / len(ans)
print('ACC:',acc)
# ACC: 0.7547169811320755 (tweets)

1/53
2/53
3/53
4/53
5/53
6/53
7/53
8/53
9/53
10/53
11/53
12/53
13/53
14/53
15/53
16/53
17/53
18/53
19/53
20/53
21/53
22/53
23/53
24/53
25/53
26/53
27/53
28/53
29/53
30/53
31/53
32/53
33/53
34/53
35/53
36/53
37/53
38/53
39/53
40/53
41/53
42/53
43/53
44/53
45/53
46/53
47/53
48/53
49/53
50/53
51/53
52/53
53/53
ACC: 0.5471698113207547
