In [132]:
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoModel
from typing import List
from torch_geometric.data import HeteroData
from datetime import timedelta
import torch
from tqdm.notebook import tqdm

import spacy
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import os

%run "../src/helper_fns.py"


### Load processed data

In [47]:
nasdaq_screener = pd.read_csv("../../data/saved_files/nasdaq_screener.csv")
news_df = pd.read_csv("../../data/saved_files/processed_news_df.csv")
news_df["mentioned_companies"] = news_df["mentioned_companies"].apply(lambda x: str(x)[1:-1].replace("'","").replace(" ", "").split(","))
stock_df = pd.read_csv("../../data/saved_files/processed_stock_df.csv")

In [48]:
# Dict of a stock history df for each company
company_stocks = {}
for symbol, data in stock_df.groupby('symbol')[stock_df.columns]: #groupby company
  # checks every close of the day if it's higher that the close in the day before and creates a binary array
  # data["out"] is the target
  data["target"] = np.where(data["close"] - data["close"].shift(1).fillna(0) > 0, 1, 0)
  data.index = pd.to_datetime(data.dateOfPrice) # setting the index as the dateOfPrice for faster look ups when creating target output for each week_graph
  company_stocks[symbol] = data

# Each Symbol has its own historical data


# Target length is 1296
data["target"].value_counts()[1] # 1 --> 690  & 0 --> 606

690

In [124]:
tokenizer = AutoTokenizer.from_pretrained("Sigma/financial-sentiment-analysis")

sigma =  AutoModel.from_pretrained("Sigma/financial-sentiment-analysis")

sigma.eval()

if torch.cuda.is_available():
  sigma = sigma.to("cuda")

device = next(sigma.parameters()).device

news_vectors = get_article_vectors(list(news_df["content"]), sigma, tokenizer, device)
torch.save(news_vectors, "../../data/weights/news_vectors_sota.pt")

Some weights of the model checkpoint at Sigma/financial-sentiment-analysis were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2546/2546 [1:29:34<00:00,  2.11s/it]


In [133]:
news_vectors = torch.load("../../data/weights/news_vectors_sota.pt")
# news_vectors.shape = torch.Size([10, 768])

In [135]:
# edge_types
# article - main_company - company
# article - mentioned - company
# article - in_industry - industry
# company - in_industry - industry


lag = 21
stock_days = lag - lag// 7 * 2 # saturday and friday for each week

news_df["release_date"] = pd.to_datetime(news_df["release_date"])
company_to_industry = nasdaq_screener.groupby("Symbol")["Sector"].agg(lambda x: list(x)[0]) # agg list(x)[0] as x is just a list of repeated sectors and we need 1 sector for each company
# company_to_industry["ABMD"] = "Health Care"
industry_to_index = {k : v for v,k in enumerate(nasdaq_screener["Sector"].unique()) }
company_to_index = {k :v for v,k in zip(range(news_df["symbol"].nunique()), news_df["symbol"].unique()) }
index_to_company = {v:k for k,v in company_to_index.items()} #reverse
week_graphs = []

for day in pd.to_datetime(pd.Series(stock_df["dateOfPrice"].unique()[lag:])):

  start = day - timedelta(lag)
  # print(day,  start)

  target_news = news_df[(news_df["release_date"]>= start) & (news_df["release_date"] < day) ].copy()

  week_graph = HeteroData()
  edges = {}
  # creating edge_index
  edges["article-main_company-company"] = [[],[]]
  edges["article-mentioned-company"] = [[],[]]
  edges["article-in_industry-industry"] = [[],[]]
  edges["company-mentioned_in-article"] = [[],[]]
  edges["company-in_industry-industry"] = [[],[]]
  edges["industry-has_company-company"] = [[],[]]

  for company in company_to_index.keys():
    edges["company-in_industry-industry"][0].append(company_to_index[company])
    edges["company-in_industry-industry"][1].append(industry_to_index[company_to_industry[company]])
    edges["industry-has_company-company"][0].append(industry_to_index[company_to_industry[company]])
    edges["industry-has_company-company"][1].append(company_to_index[company])
    # print(company)
    # print(company_to_index[company])
    # print(company_to_industry[company])
    # print(edges["company-in_industry-industry"])
    # print(edges["industry-has_company-company"])

    
  # creating an array that says stock price info(gone up or down) exists for that day and company or not
  # if info doesn't exist we won't consider the predictions for that day and company
  info_exists = [ day in company_stocks[index_to_company[i]].index for i in range(news_df["symbol"].nunique() )]

  y = [ company_stocks[index_to_company[i]].loc[day]["target"] if info_exists[i] else 0  for i in range(news_df["symbol"].nunique()) ]

  company_timeseries = [company_stocks[index_to_company[i]].loc[start:day-timedelta(1)][["open", "high", "low", "close", "volume"]].to_numpy()  for i in range(news_df["symbol"].nunique()) ]

  ## Check for missing prices
  missing_prices = [ False if x.shape[0] == stock_days else True  for x in company_timeseries ]

  company_timeseries =[ x if len(x) == stock_days else np.concatenate([x, np.nan_to_num(x.mean(axis=0).reshape(1,-1), nan=0)+np.zeros((int(stock_days-len(x)), x.shape[1]))])   for x in company_timeseries]
  company_timeseries = np.concatenate([np.expand_dims(x, 0) for x in company_timeseries])

  # # creating gaph edge_index
  for i, (_, r) in enumerate(target_news.iterrows()):
    edges["article-main_company-company"][0].append(i)
    edges["article-main_company-company"][1].append(company_to_index[r["symbol"]])
    
    for comp in r["mentioned_companies"]:
      if comp not in company_to_index.keys(): continue
      edges["article-mentioned-company"][0].append(i)
      edges["article-mentioned-company"][1].append(company_to_index[comp])
      edges["company-mentioned_in-article"][1].append(i)
      edges["company-mentioned_in-article"][0].append(company_to_index[comp])
    edges["article-in_industry-industry"][0].append(i)
    edges["article-in_industry-industry"][1].append(industry_to_index[company_to_industry[r["symbol"]]])

  for k, v in edges.items():
    edge_name = k.split("-")
    week_graph[edge_name[0],edge_name[1],edge_name[2]].edge_index  = torch.tensor(v)

  week_graph["target"] = torch.as_tensor(y)
  week_graph["missing_prices"] = torch.as_tensor(missing_prices)
  week_graph["info_exists"] = torch.as_tensor(info_exists)
  week_graph["company_timeseries"] =torch.from_numpy(company_timeseries)
  week_graph["company"].x = torch.as_tensor(range(len(index_to_company.keys()))) # just the ids to be passed to the embedding layer
  week_graph["article"].x =  news_vectors[target_news.index]  # get_article_vectors(target_news["content"].to_list()) # Nx768 (number of articles X the embedding dim of finbert)
  week_graph["industry"].x = torch.as_tensor(range(len(industry_to_index.values())))
  week_graph["date"] = day
  week_graphs.append(week_graph)

# 25 24 23 22 21 20

  company_timeseries =[ x if len(x) == stock_days else np.concatenate([x, np.nan_to_num(x.mean(axis=0).reshape(1,-1), nan=0)+np.zeros((int(stock_days-len(x)), x.shape[1]))])   for x in company_timeseries]
  ret = um.true_divide(
  company_timeseries =[ x if len(x) == stock_days else np.concatenate([x, np.nan_to_num(x.mean(axis=0).reshape(1,-1), nan=0)+np.zeros((int(stock_days-len(x)), x.shape[1]))])   for x in company_timeseries]
  ret = um.true_divide(
  company_timeseries =[ x if len(x) == stock_days else np.concatenate([x, np.nan_to_num(x.mean(axis=0).reshape(1,-1), nan=0)+np.zeros((int(stock_days-len(x)), x.shape[1]))])   for x in company_timeseries]
  ret = um.true_divide(
  company_timeseries =[ x if len(x) == stock_days else np.concatenate([x, np.nan_to_num(x.mean(axis=0).reshape(1,-1), nan=0)+np.zeros((int(stock_days-len(x)), x.shape[1]))])   for x in company_timeseries]
  ret = um.true_divide(
  company_timeseries =[ x if len(x) == stock_days else np.concatenate([x, np.nan_to_

KeyboardInterrupt: 

In [128]:
week_graphs[0]

HeteroData(
  target=[310],
  missing_prices=[310],
  info_exists=[310],
  company_timeseries=[310, 15, 5],
  date=2015-01-28 00:00:00,
  [1mcompany[0m={ x=[310] },
  [1marticle[0m={ x=[235, 768] },
  [1mindustry[0m={ x=[12] },
  [1m(article, main_company, company)[0m={ edge_index=[2, 235] },
  [1m(article, mentioned, company)[0m={ edge_index=[2, 395] },
  [1m(article, in_industry, industry)[0m={ edge_index=[2, 235] },
  [1m(company, mentioned_in, article)[0m={ edge_index=[2, 395] },
  [1m(company, in_industry, industry)[0m={ edge_index=[2, 310] },
  [1m(industry, has_company, company)[0m={ edge_index=[2, 310] }
)

In [129]:
import torch
"""
https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html
"""
from torch_geometric.data  import InMemoryDataset

class WeekGraphs(InMemoryDataset):
    def __init__(self, root, data_list, transform=None):
        self.data_list = data_list
        super().__init__(root, transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        return 'data.pt'

    def process(self):
        torch.save(self.collate(self.data_list), self.processed_paths[0])
# dataset = WeekGraphs("/content/drive/MyDrive/Stock Market Prediction Graduation Project/graph_dataset_1st_year", week_graphs)

In [130]:
WeekGraphs("./graph_trainset_sota_financial_1st_year", week_graphs[:-30])
WeekGraphs("./graph_testset_sota_financial_1st_year", week_graphs[-30:])

Processing...
Done!
Processing...
Done!


WeekGraphs(30)

In [None]:
!cp -r "./Stock Market Prediction Graduation Project" ./wandb

In [None]:
wandb.save("./graph_trainset_sota_financial_1st_year/processed/data.pt",)
wandb.save("./graph_trainset_sota_financial_1st_year/processed/pre_filter.pt",)
wandb.save("./graph_trainset_sota_financial_1st_year/processed/pre_transform.pt",)

In [None]:
torch.save(news_vectors, "../../data/weights/news_vectors_sota.pt")

In [None]:
wandb.save("./graph_testset_sota_financial_1st_year/processed/data.pt",)
wandb.save("./graph_testset_sota_financial_1st_year/processed/pre_filter.pt",)
wandb.save("./graph_testset_sota_financial_1st_year/processed/pre_transform.pt",)

In [None]:
wandb.finish()

In [None]:
test.data

In [None]:
test[0]

In [None]:
# HeteroData(
#   target=[325], price movement 0 or 1 on the target
#   info_exists=[325],  0 or 1 whether we have information about the price movement on the target day
#   company_timeseries=[325, 6, 5], tensor of shape (n_companies, 6 days, 5 features(low, high, open, close, volume) ) that will be input for lstm part of the model
#   company={ x=[325] }, just a range(325) which wil be used to index the company embedding layer
#   article={ x=[N, 768] }, article vectors for that week
#   industry={ x=[12] }, a range(12) which will be used to index the industries embedding layer
#   (article, main_company, company)={ edge_index=[2, n_articles] },
#   (article, mentioned, company)={ edge_index=[2, n] },
#   (article, in_industry, industry)={ edge_index=[2, n_articles] },
#   (company, mentioned_in, article)={ edge_index=[2, n_articles] }
# )

# week_graphs[4]