In [None]:
# Install the vaderSentiment package, which is imported further down in this notebook.
!pip install vaderSentiment

In [None]:
# Install the transformers package, which supplies the BERT tokenizer and classifier imported further down.
!pip install transformers

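Output format: `{date: [{company: {feature: value}}]}` — each date maps to a list of single-company dictionaries, and each company maps to its sentiment features.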

In [None]:
# Load the folder names, one per line. The context manager closes the file
# handle as soon as the contents have been read (the bare open() never did).
with open('/content/drive/MyDrive/get_financials/list_of_folders.txt') as folder_file:
  folder_list = folder_file.read().splitlines()

In [None]:
# Clean every folder name in place: strip surrounding double quotes first,
# then single quotes, then spaces (the order matters for mixed wrappers).
folder_list[:] = [
  raw_name.strip('""').strip("''").strip(' ')
  for raw_name in folder_list
]

In [None]:
import numpy as np
import pandas as pd
import tqdm.notebook as tq

# VADER: a single analyser instance shared by every cell below.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

# finBERT: stock BERT tokenizer plus a 3-label sequence classifier whose
# weights and config are read from Google Drive.
import torch
from transformers import BertTokenizer, BertForSequenceClassification
finbert_weights = '/content/drive/MyDrive/get_financials/pytorch_model.bin'
finbert_config = '/content/drive/MyDrive/get_financials/config.json'
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
  finbert_weights,
  config=finbert_config,
  num_labels=3,
)

In [None]:
path_start = '/content/drive/MyDrive/get_financials/ReleasedDataset_mp3/'
path_end = '/Text.txt'
# Maps date -> list of {company: {feature: value}} dicts, one dict per transcript.
feature_df = dict()
for path_middle in tq.tqdm(folder_list):
  # Folder names are '<company>_<date>'; split on the first underscore only.
  # A name without '_' still raises ValueError, as the index() lookup did.
  company, date = path_middle.split('_', 1)
  path = path_start + path_middle + path_end

  # Read the transcript and flatten it onto one line. Read-only mode is all
  # that is needed, and the context manager closes the handle even on error.
  with open(path, "r") as transcript:
    document = transcript.read().replace('\n', ' ')

  # extract features from VADER
  v_scores = analyser.polarity_scores(document)

  # extract features from finBERT; input is truncated to the first 512 tokens
  inputs = tokenizer(document, return_tensors="pt", max_length=512, truncation=True)
  with torch.no_grad():  # inference only: skip building autograd graphs
    outputs = model(**inputs)
  # First three values of the first output row, stored raw (no softmax applied here).
  # NOTE(review): the pos/neg/neu index order is assumed to match the loaded
  # model's label mapping -- confirm against config.json.
  f_scores = [float(outputs[0][0][i]) for i in range(3)]

  feature_to_value = {
    'VADER_neg': v_scores['neg'],
    'VADER_neu': v_scores['neu'],
    'VADER_pos': v_scores['pos'],
    'VADER_compound': v_scores['compound'],
    'finBERT_pos': f_scores[0],
    'finBERT_neg': f_scores[1],
    'finBERT_neu': f_scores[2],
  }
  company_to_feature = {company: feature_to_value}

  # Group the per-company records under their date.
  feature_df.setdefault(date, []).append(company_to_feature)

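No maximum sequence length: instead of truncating each document, split it into 512-character chunks, score every chunk with finBERT, and average the chunk scores.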

In [None]:
path_start = '/content/drive/MyDrive/get_financials/ReleasedDataset_mp3/'
path_end = '/Text.txt'
n = 512 # chunk length, in characters of the document string
# Maps date -> list of {company: {feature: value}} dicts, one dict per transcript.
text_paths = dict()
for path_middle in tq.tqdm(folder_list):
  # Folder names are '<company>_<date>'; split on the first underscore only.
  # A name without '_' still raises ValueError, as the index() lookup did.
  company, date = path_middle.split('_', 1)
  path = path_start + path_middle + path_end

  # Read the transcript and flatten it onto one line. Read-only mode is all
  # that is needed, and the context manager closes the handle even on error.
  with open(path, "r") as transcript:
    document = transcript.read().replace('\n', ' ')

  # extract features from VADER
  v_scores = analyser.polarity_scores(document)

  # extract features from finBERT: score each chunk, then average the chunks.
  # The original sliced an undefined name (`call`); the document is what is chunked.
  # NOTE(review): chunks are character windows, not token windows, so a chunk
  # may begin or end in the middle of a word.
  chunks = [document[i:i+n] for i in range(0, len(document), n)]
  res = []
  with torch.no_grad():  # inference only: skip building autograd graphs
    for chunk in chunks:
      # Truncation is a safety net in case a chunk tokenises to more than 512 tokens.
      inputs = tokenizer(chunk, return_tensors="pt", max_length=512, truncation=True)
      outputs = model(**inputs)
      res.append([float(outputs[0][0][i]) for i in range(3)])
  if res:
    f_scores = np.mean(np.array(res), axis = 0)
  else:
    # Empty transcript: no chunks to average, so record NaN for all three scores.
    f_scores = np.full(3, np.nan)

  # NOTE(review): the pos/neg/neu index order is assumed to match the loaded
  # model's label mapping -- confirm against config.json.
  feature_to_value = {
    'VADER_neg': v_scores['neg'],
    'VADER_neu': v_scores['neu'],
    'VADER_pos': v_scores['pos'],
    'VADER_compound': v_scores['compound'],
    # Plain floats keep the record JSON-serialisable in the later write cell.
    'finBERT_pos': float(f_scores[0]),
    'finBERT_neg': float(f_scores[1]),
    'finBERT_neu': float(f_scores[2]),
  }
  company_to_feature = {company: feature_to_value}

  # Group the per-company records under their date.
  text_paths.setdefault(date, []).append(company_to_feature)
text_paths

Write the extracted features to disk as JSON.

In [None]:
import json

# Serialise the date -> features mapping and write it out as JSON text.
features_path = '/content/drive/MyDrive/get_financials/vader_finbert_features.txt'
with open(features_path, 'w') as out_file:
  out_file.write(json.dumps(text_paths))

Read the saved features back from disk.

In [None]:
# Load the saved JSON text and parse it back into a dictionary.
features_path = '/content/drive/MyDrive/get_financials/vader_finbert_features.txt'
with open(features_path, 'r') as in_file:
  new_d = json.loads(in_file.read())

In [None]:
# Spot check: VADER scores for a single sample sentence.
sample_sentence = """Showcasing its continued global appeal, Overwatch has players all over the world, including great success in Korea and China"""
analyser.polarity_scores(sample_sentence)

In [None]:
# Spot check: finBERT scores for a single sample sentence.
sample_sentence = """Showcasing its continued global appeal, Overwatch has players all over the world, including great success in Korea and China"""
inputs = tokenizer(sample_sentence, return_tensors="pt", max_length=512, truncation=True)
outputs = model(**inputs)
first_row = outputs[0][0]  # first row of the model's first output
f_scores = [float(first_row[i]) for i in range(3)]
pos, neg, neu = f_scores
(pos, neg, neu)

In [None]:
# Sample sentence scored by the next two cells; as the cell's only expression it is simply echoed.
"""In-game purchases were yet again close to $1 billion in the quarter, and a record $3.8 billion for the year, more than double last year's $1.7 billion"""

In [None]:
# Spot check: VADER scores for a sentence made up mostly of figures.
sample_sentence = """In-game purchases were yet again close to $1 billion in the quarter, and a record $3.8 billion for the year, more than double last year's $1.7 billion"""
analyser.polarity_scores(sample_sentence)

In [None]:
# Spot check: finBERT scores for a sentence made up mostly of figures.
sample_sentence = """In-game purchases were yet again close to $1 billion in the quarter, and a record $3.8 billion for the year, more than double last year's $1.7 billion"""
inputs = tokenizer(sample_sentence, return_tensors="pt", max_length=512, truncation=True)
outputs = model(**inputs)
first_row = outputs[0][0]  # first row of the model's first output
f_scores = [float(first_row[i]) for i in range(3)]
pos, neg, neu = f_scores
(pos, neg, neu)

In [None]:
# Re-display the three finBERT scores most recently assigned to pos, neg, neu.
pos,neg,neu

In [None]:
# Sample sentence scored by the VADER and finBERT cells that follow.
test = """Shares of food delivery companies surged despite the catastrophic impact of coronavirus on global markets."""

In [None]:
# Spot check: VADER scores for the `test` sentence.
vader_test_scores = analyser.polarity_scores(test)
vader_test_scores

In [None]:
# Spot check: finBERT scores for the `test` sentence.
inputs = tokenizer(test, return_tensors="pt", max_length=512, truncation=True)
outputs = model(**inputs)
first_row = outputs[0][0]  # first row of the model's first output
f_scores = [float(first_row[i]) for i in range(3)]
pos, neg, neu = f_scores
(pos, neg, neu)