# Modules

installation of packages and dependencies

In [None]:
!pip install "transformers==4.10.3"
!pip install datasets
!pip install optuna
!pip install "tqdm==4.44.1"

In [None]:
import ast
import json
import numpy as np
import optuna
import pandas as pd
import random as rn
import re
import string
import sys
import torch
import seaborn as sns
import math

from datasets import load_metric
from IPython.display import clear_output
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, confusion_matrix, r2_score, f1_score, mean_squared_error
from torch import nn
from torch.nn import MSELoss
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, BertModel, TrainingArguments, Trainer, AutoConfig,AutoModelForSequenceClassification, BertTokenizer, RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig, RobertaModel
from transformers.trainer_utils import set_seed

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

%cd 'drive/MyDrive/MasterThesis'
pd.set_option('max_colwidth', None)
pd.set_option('max_rows', 100)
SEED = 454616
KFOLDS = 5
sns.set_theme()
rn.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

# Data handler

Class which reads, transforms, edits, combines, subsets, tokenizes data to be used by the various BERT models

In [None]:
class DataHandler(object):
  '''
  Reads the train/test JSON files, normalizes the text and builds tokenized datasets.

  Attributes set by __init__:
    train, test    dataframes returned by format_data, with a 'phase' column added
    original_df    train and test concatenated, with a 'uuid' column
    df             normalized copy of original_df (None until normalize_text is called)
    label_dict     label -> integer mapping (None until get_dataset is called with
                   classification=True)
    tokenizer      BertTokenizer or RobertaTokenizer loaded from tokenizer_version
  '''

  # Patterns used by the text normalization, compiled once for all rows.
  _URL_RE = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')
  _USERNAME_RE = re.compile(r'@(\w){1,15}')
  _HASHTAG_RE = re.compile(r'#(\w){1,15}')
  # All punctuation except the period, which is handled separately.
  _PUNCTUATION_RE = re.compile(r'[,\\/#!%\^@&\*;:{}=\-_`~()]')
  # A period that is NOT enclosed by a digit on both sides (i.e. not a decimal point).
  _NON_DECIMAL_PERIOD_RE = re.compile(r'(?<!\d)\.|\.(?!\d)')
  _WHITESPACE_RE = re.compile(r'( |\t)+')
  _CURRENCY_RE = re.compile(r'[$€]')

  def __init__(self, tokenizer_model = 'bert', tokenizer_version = 'bert-base-uncased', maxtokens = 48):
    '''
    tokenizer_model:   'bert' or 'roberta'; any other value raises ValueError
    tokenizer_version: checkpoint name/path passed to the tokenizer's from_pretrained
    maxtokens:         length every text is padded/truncated to when tokenized
    '''
    # Checked before any file is read so that a typo fails immediately instead of
    # leaving the instance without a tokenizer.
    if tokenizer_model not in ('bert', 'roberta'):
      raise ValueError("tokenizer_model must be 'bert' or 'roberta', got {!r}".format(tokenizer_model))

    self.train = self.format_data('data/train.json')
    self.train['phase'] = 'train'
    self.test = self.format_data('data/test.json')
    self.test['phase'] = 'test'
    self.original_df = pd.concat([self.train, self.test], ignore_index=True)
    ## add UUID: phase plus the row position in the concatenated frame
    self.original_df['uuid'] = self.original_df['phase'] + '_' + self.original_df.index.astype(str)

    self.df = None
    self.label_dict = None
    self.maxtokens = maxtokens

    self.tokenizer_model = tokenizer_model
    if tokenizer_model == 'bert':
      self.tokenizer = BertTokenizer.from_pretrained(tokenizer_version)
    else:
      self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_version)

  def format_data(self, path):
    '''
    Loads and formats original jsons to dataframe

    Every JSON value contributes one row: its 'sentence', the first two
    '/'-separated levels of its first aspect and its sentiment score rescaled
    with (score + 1) / 2. The rows are shuffled with the global SEED.

    Returns a dataframe with the columns text, aspect_1, aspect_2, sentiment.
    '''
    with open(path, encoding='utf-8') as f:
      data = json.load(f).values()

    columns = ['text', 'aspect_1', 'aspect_2', 'sentiment']
    records = []
    for entry in data:
      info = entry['info'][0]
      aspects = info['aspects']
      # the aspect list is sometimes stored as its string representation
      if isinstance(aspects, str):
        aspects = ast.literal_eval(aspects)
      levels = aspects[0].split('/')

      records.append({
          'text': entry['sentence'],
          # only first two levels
          'aspect_1': levels[0].lstrip(),
          'aspect_2': levels[1].lstrip(),
          'sentiment': (float(info['sentiment_score']) + 1) / 2,
      })
    df = pd.DataFrame(records, columns=columns)
    # shuffle df
    df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
    return df

  def normalize_text(self, config):
    '''
    Normalizes string based on a config dict

    config must contain the boolean keys make_lower, remove_url,
    remove_twitter_usernames, remove_hashtags, remove_numbers,
    remove_punctuation, remove_whitespaces and remove_dollar.
    The result is stored in self.df; self.original_df is left untouched.
    '''
    df = self.original_df.copy()
    df['text'] = df['text'].apply(lambda text: self._normalize_string(text, config))
    self.df = df

  @classmethod
  def _normalize_string(cls, text, config):
    '''
    Applies the normalization steps enabled in config to a single string
    '''
    if config['make_lower']:
      text = text.lower()
    if config['remove_url']:
      text = cls._URL_RE.sub(' ', text)
    if config['remove_twitter_usernames']:
      text = cls._USERNAME_RE.sub('', text)
    if config['remove_hashtags']:
      text = cls._HASHTAG_RE.sub('', text)

    if config['remove_punctuation']:
      text = cls._PUNCTUATION_RE.sub('', text)
      if config['remove_numbers']:
        # NOTE(review): with remove_punctuation enabled, remove_numbers only drops
        # the periods and keeps the digits (digits are stripped only in the elif
        # below). Kept as-is because earlier results were produced this way --
        # confirm whether that is intended.
        text = text.replace('.', '')
      else:
        # Numbers are kept, so decimal points must survive: drop every period that
        # is not enclosed by digits (this also covers ellipses and leading or
        # trailing periods).
        text = cls._NON_DECIMAL_PERIOD_RE.sub('', text)
    elif config['remove_numbers']:
      text = text.translate(str.maketrans('', '', string.digits))

    if config['remove_whitespaces']:
      text = cls._WHITESPACE_RE.sub(' ', text.strip())
    if config['remove_dollar']:
      text = cls._CURRENCY_RE.sub('', text)
    return text

  def get_dataset(self, label_col, kfolds = None, phase = 'traintest', classification = True):
    '''
    Builds tokenized FIQaDataset objects from the normalized dataframe
    (or from the original dataframe when normalize_text was not called).

    label_col:      column used as label
    kfolds:         number of folds; required for phase='trainvalid'. For
                    classification it also removes labels that occur at most
                    kfolds times in the train phase.
    phase:          'traintest'  -> returns (train, test, df)
                    'trainvalid' -> returns (list of {'train','valid'} dicts, train dataframe)
    classification: when True the labels are mapped to integers and the mapping
                    is stored in self.label_dict

    Raises ValueError for an unknown phase or a missing kfolds.
    '''
    if phase not in ('traintest', 'trainvalid'):
      raise ValueError("phase must be 'traintest' or 'trainvalid', got {!r}".format(phase))
    if phase == 'trainvalid' and kfolds is None:
      raise ValueError("kfolds is required when phase is 'trainvalid'")

    df = self.df if self.df is not None else self.original_df
    self.label_col = label_col

    # filter necessary for hypertuning
    if kfolds is not None and classification:
      label_counts = df[df['phase'] == 'train'].groupby(label_col).size()
      frequent_labels = label_counts[label_counts > kfolds].index
      df = df[df[label_col].isin(frequent_labels)]
    # reset_index returns a new frame, so the label mapping below does not
    # write into self.df / self.original_df
    df = df.reset_index(drop=True)

    # transform label_col to integer mapping
    if classification:
      label_list = sorted(set(df[label_col]))
      self.label_dict = {label: i for i, label in enumerate(label_list)}
      df[label_col] = df[label_col].map(self.label_dict)

    if phase == 'traintest':
      train = FIQaDataset(*self.get_encodings_labels(df[df.phase == 'train']))
      test = FIQaDataset(*self.get_encodings_labels(df[df.phase == 'test']))
      return train, test, df

    # phase == 'trainvalid': cross-validation folds over the train phase only
    train_valid = df[df.phase == 'train']
    kf = KFold(n_splits=kfolds, shuffle=True, random_state=SEED)
    train_valid_folds = [
        {
            'train' : FIQaDataset(*self.get_encodings_labels(train_valid.iloc[train_index])),
            'valid' : FIQaDataset(*self.get_encodings_labels(train_valid.iloc[valid_index]))
        }
        for train_index, valid_index in kf.split(train_valid)
    ]
    return train_valid_folds, train_valid

  def get_encodings_labels(self, df):
    '''
    Tokenizes the text column of df and returns (encodings, labels, text, uuid),
    i.e. the constructor arguments of FIQaDataset. Uses the label column chosen
    in the last get_dataset call.
    '''
    df = df.reset_index(drop=True)
    text = list(df['text'])
    encodings = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.maxtokens, return_tensors = 'pt')
    labels = df[self.label_col]
    uuid = df['uuid']
    return (encodings, labels, text, uuid)

class FIQaDataset(torch.utils.data.Dataset):
    '''
    Torch dataset holding tokenizer encodings together with labels, raw text,
    uuids and an optional per-sample `state`.

    encodings: mapping from key to an indexable container (e.g. a tensor with
               one row per sample)
    labels:    indexable container with one label per sample
    text:      indexable container with the raw text per sample
    uuid:      indexable container with the identifier per sample
    '''
    def __init__(self, encodings, labels, text, uuid):
      self.encodings = encodings
      self.labels = labels
      self.text = text
      # when False, __getitem__ leaves the 'labels' entry out
      self.return_labels = True
      # optional extra per-sample data, see set_state
      self.state = None
      self.uuid = uuid

    @staticmethod
    def _as_tensor(value):
      '''
      Returns value as a tensor that does not share memory with the stored data
      '''
      # torch.tensor(existing_tensor) emits a copy-construct warning;
      # clone().detach() is the equivalent recommended by PyTorch.
      if isinstance(value, torch.Tensor):
        return value.clone().detach()
      return torch.tensor(value)

    def __getitem__(self, idx):
      '''
      Returns a dict with one tensor per encoding key plus 'text', 'uuid',
      'labels' (if return_labels) and 'state' (if a state was set).
      '''
      item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
      item['text'] = self.text[idx]
      item['uuid'] = self.uuid[idx]
      if self.return_labels:
        item['labels'] = self._as_tensor(self.labels[idx])
      # `if self.state:` is ambiguous (and raises) for tensors/arrays with more
      # than one element, so test for presence and non-emptiness explicitly.
      if self.state is not None and len(self.state) > 0:
        item['state'] = self._as_tensor(self.state[idx])
      return item

    def __len__(self):
      return len(self.labels)

    def set_state(self, state):
      '''
      Attaches an indexable container with one extra entry per sample
      '''
      self.state = state

    def get_subset(self, selection):
      '''
      Returns a new FIQaDataset containing the samples at the given indices,
      in the given order. Raises ValueError when selection is empty.
      '''
      selection = list(selection)
      if not selection:
        raise ValueError('selection must contain at least one index')
      items = [self[idx] for idx in selection]

      # stack every tensor entry; text and uuid are plain python values
      stacked = {
          key: torch.stack([item[key] for item in items])
          for key in items[0]
          if key not in ('uuid', 'text')
      }

      if 'labels' in stacked:
        labels = stacked.pop('labels')
      else:
        # return_labels is False: items carry no labels, read them directly
        labels = [self.labels[idx] for idx in selection]
      state = stacked.pop('state', None)

      subset = FIQaDataset(
          stacked,
          labels,
          [item['text'] for item in items],
          [item['uuid'] for item in items],
      )
      subset.return_labels = self.return_labels
      if state is not None:
        subset.set_state(state)
      return subset

# General function(s)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def make_confusion_matrix(y_true,
                          y_pred,
                          categories='auto',
                          figsize=None,
                          title=None):
    '''
    Plots the confusion matrix of y_true versus y_pred as an annotated heatmap.

    y_true, y_pred: true and predicted labels, passed to sklearn's confusion_matrix
    categories:     tick labels for both axes ('auto' lets seaborn pick them)
    figsize:        figure size; defaults to matplotlib's 'figure.figsize' setting
    title:          optional plot title
    '''
    cf = confusion_matrix(y_true, y_pred)

    # identity check: `== None` is evaluated element-wise for array-like sizes
    if figsize is None:
        figsize = plt.rcParams.get('figure.figsize')
    plt.figure(figsize=figsize)

    sns.heatmap(cf,annot=True,fmt="",cmap='Blues',cbar=False, xticklabels=categories,yticklabels=categories)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    if title:
        plt.title(title)

# Data examples
Code which is run to create general data examples/figures/graphs/statistics.

In [None]:
# Normalization switches consumed by DataHandler.normalize_text.
normalize_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':False
}
dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')
dh.normalize_text(normalize_config)

# Number of sentences per (aspect_1, aspect_2) pair, printed as a LaTeX table.
print(dh.df.groupby(by=['aspect_1','aspect_2']).size().to_latex())
temp = dh.df.copy()
# Undo the (score + 1) / 2 rescaling applied in DataHandler.format_data.
temp['sentiment'] = 2*temp['sentiment']-1
# Sentiment distribution, coloured by first-level aspect.
sns.displot(temp, x='sentiment', hue='aspect_1', height=5, aspect=1.5)

# Show the tokens and the input ids the tokenizer produces for one sentence.
example_sentence = 'acom downside breakout looks to be coming soon'
print(dh.tokenizer.tokenize(example_sentence))
print(dh.tokenizer(example_sentence)['input_ids'])

In [None]:
# Show how the uncased BERT tokenizer splits a ticker symbol with a dollar sign.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Deliberately not called `string`: that would shadow the imported `string`
# module that DataHandler.normalize_text uses (string.digits).
example_string = '$amzn'
tokens = bert_tokenizer.tokenize(example_string)
token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)
print(example_string)
print(tokens)
print(token_ids)

In [None]:
# Normalization switches consumed by DataHandler.normalize_text.
normalize_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':False
}

dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')
dh.normalize_text(normalize_config)
# Regression setup: sentiment is used as label, so no label filtering/mapping is applied.
folds, df = dh.get_dataset('sentiment', kfolds=5, phase='trainvalid', classification=False)
# Replace the train-only frame returned above with the full normalized frame.
df = dh.df

In [None]:
# Display the full normalized dataframe (train and test phase).
df = dh.df
# print(np.var(2*df[df.phase == 'train']['sentiment']), np.var(2*df[df.phase == 'test']['sentiment']))

# Last expression of the cell: rendered by the notebook.
df

# Aspect 1 models

## Standard models hypertuning
The standard model is BERT/RoBERTa without modifications or hierarchical features.

In [None]:
# Normalization switches consumed by DataHandler.normalize_text.
normalize_config = {
    'make_lower':True,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':False
}

# Exactly one handler is active; the commented lines select the other tokenizers.
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-cased')
dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')

dh.normalize_text(normalize_config)
# 5 train/valid folds over the train phase, labelled with the first-level aspect.
folds, df = dh.get_dataset('aspect_1', kfolds=5, phase='trainvalid')
# Label -> integer mapping built by get_dataset; its size is the number of classes.
label_dict = dh.label_dict

def my_hp_space(trial):
    '''
    Search space for the hyperparameter search: asks the trial for a learning
    rate, a number of epochs (2 to 5), a train batch size and a warmup ratio and
    returns them keyed by the matching TrainingArguments name.
    '''
    space = {}
    space["learning_rate"] = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32]
    )
    space["warmup_ratio"] = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return space

# F1 metric object shared by every compute_f1 call.
metric = load_metric("f1")
def compute_f1(eval_pred):
    '''
    Weighted F1 for a (logits, labels) pair: the class with the highest logit is
    taken as prediction and scored against the labels.
    '''
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references, average='weighted')

def model_init():
  '''
  Resets the seed and returns a roberta-base sequence classifier with one
  output per entry of label_dict.
  '''
  set_seed(SEED)
  num_classes = len(label_dict)
  # Alternative checkpoints (BertForSequenceClassification):
  #   "bert-base-uncased", "bert-base-cased", "models/language_model/TRC2"
  return RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_classes)

# Best run found by the hyperparameter search on each fold.
best_runs = []

# Evaluate after every epoch; logging and checkpointing are switched off.
training_args = TrainingArguments("trainer_aspect1_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no')

# One independent search (10 trials, maximizing the objective) per fold.
for f in folds:
  train = f['train']
  valid = f['valid']

  # model_init (instead of a model instance) lets the trainer build a new model per trial.
  trainer = Trainer(
      model_init=model_init,
      args=training_args,
      train_dataset=train,
      eval_dataset=valid,
      compute_metrics=compute_f1
  )

  best_run = trainer.hyperparameter_search(
      direction="maximize",
      hp_space=my_hp_space,
      n_trials = 10
  )
  best_runs.append(best_run)


# Weighted F1 per fold for every distinct hyperparameter set that won a search,
# keyed by the string representation of the hyperparameter dict.
hp_performance = {}
for run in best_runs:
  fold_hp = run.hyperparameters
  hp_key = str(fold_hp)

  # check if hyperparameters already evaluated (several folds can pick the same
  # set); done before TrainingArguments is built so duplicates cost nothing
  if hp_key in hp_performance:
    continue

  training_args = TrainingArguments("trainer_aspect1_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **fold_hp)

  hp_performance[hp_key] = []
  for f in folds:
    train = f['train']
    valid = f['valid']
    trainer = Trainer(
      model_init=model_init,
      args=training_args,
      train_dataset=train,
      eval_dataset=valid,
    )
    trainer.train()
    # the first two fields of the prediction output are (predictions, label_ids)
    hp_performance[hp_key].append(compute_f1(trainer.predict(valid)[:2])['f1'])

print('BEST RUNS', best_runs)
for k,v in hp_performance.items():
  print('HYPERPARAMETER PERFORMANCE:', k, v, np.median(v), np.mean(v))

# ### BERT BASE UNCASED
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.8872531309962565, 0.8822281295450474, 0.8301586303911884, 0.8957098550569582, 0.8462152773702927] 0.8822281295450474 0.8683130046719487
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.908247479722948, 0.8971699532044358, 0.8401958838016848, 0.8855041482160124, 0.8784665656085763] 0.8855041482160124 0.8819168061107314
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.8946693453698956, 0.903564791115353, 0.8685250361944816, 0.8807188931670259, 0.8945845487064126] 0.8945845487064126 0.8884125229106339
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.86781977364937, 0.8916428744014951, 0.8607978874138569, 0.8928055337316835, 0.8888682072320515] 0.8888682072320515 0.8803868552856914
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.8633645374366503, 0.8895605103245131, 0.8418656481617341, 0.8945476726726727, 0.8684684684684685] 0.8684684684684685 0.8715613674128078

# ### BERT BASE CASED
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.8884072916642043, 0.8654759248412498, 0.8352086690796366, 0.8666666666666667, 0.8911541643684501] 0.8666666666666667 0.8693825433240414
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.8625719074149567, 0.8713630201022929, 0.7861042861042862, 0.8598638676358884, 0.8843819395878892] 0.8625719074149567 0.8528570041690628
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.8229927573387393, 0.8599775336617442, 0.823879441354836, 0.827159174985262, 0.848943149687805] 0.827159174985262 0.8365904114056771
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.8773561929797433, 0.8857606519314004, 0.832575236971915, 0.8567824403350719, 0.8712910162435615] 0.8712910162435615 0.8647531076923386
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.1} [0.8395947398881844, 0.8536273404316258, 0.8257995851913561, 0.8440741719962499, 0.885546950459231] 0.8440741719962499 0.8497285575933295

# ### TRC2
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.8931163879871493, 0.8948051278561693, 0.829247823358924, 0.8654515441805244, 0.8809867984740649] 0.8809867984740649 0.8727215363713665
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.878333077392467, 0.9164462486201617, 0.8478097341611691, 0.8797640411897574, 0.8802147300965267] 0.8797640411897574 0.8805135662920163
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.8995269555465358, 0.9043516819436551, 0.8823863018835845, 0.8787094789534549, 0.896631872416722] 0.896631872416722 0.8923212581487905
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.8831137725968085, 0.8978044295963371, 0.8252217344887671, 0.883707067251371, 0.8901769152665211] 0.883707067251371 0.8760047838399609
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.9037434911874374, 0.8986924371539756, 0.8704650824078914, 0.8850081932049144, 0.8956038540322219] 0.8956038540322219 0.8907026115972881

# ### RoBERTa
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.8901702935431512, 0.867547311478889, 0.86077792341168, 0.8842503065290643, 0.8490431076237528] 0.867547311478889 0.8703577885173074
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.8962626287705042, 0.8801801801801801, 0.8876385488079642, 0.8996106659423243, 0.8828132392205617] 0.8876385488079642 0.8893010525843069
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.8925819919827243, 0.8854568854568854, 0.880060223266745, 0.9067029411856998, 0.9024708045018156] 0.8925819919827243 0.893454569278774
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.8811582585183102, 0.8636564869138824, 0.8662567313413645, 0.900619467913433, 0.8752138535922319] 0.8752138535922319 0.8773809596558445
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.8599415905319729, 0.8778492486357656, 0.8766645767700215, 0.9123624131230621, 0.8832391068177603] 0.8778492486357656 0.8820113871757165

### Text normalization influence
Investigation of which normalization methods work best for financial data.

In [None]:
# Normalization switches under test; edit these between runs to compare settings.
normalize_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':False,
    'remove_whitespaces':True,
    'remove_dollar': False
}

# Exactly one handler is active; the commented lines select the other tokenizers.
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-cased')
dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')

dh.normalize_text(normalize_config)
# 5 train/valid folds over the train phase, labelled with the first-level aspect.
folds, df = dh.get_dataset('aspect_1', kfolds=5, phase='trainvalid')
# Label -> integer mapping built by get_dataset; its size is the number of classes.
label_dict = dh.label_dict

def model_init():
  '''
  Resets the seed and returns a roberta-base sequence classifier with one
  output per entry of label_dict.
  '''
  set_seed(SEED)
  num_classes = len(label_dict)
  # Alternative checkpoints (BertForSequenceClassification):
  #   "bert-base-uncased", "bert-base-cased", "models/language_model/TRC2"
  return RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_classes)

# F1 metric object shared by every compute_f1 call.
metric = load_metric("f1")
def compute_f1(eval_pred):
    '''
    Weighted F1 for a (logits, labels) pair: the class with the highest logit is
    taken as prediction and scored against the labels.
    '''
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references, average='weighted')

# Hyperparameters selected per model; only the set of the active model is uncommented.
best_roberta_a1_hp = {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0}
# best_bert_uncased_a1_hp = {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1}
# best_bert_cased_a1_hp = {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0}
# best_trc2_a1_hp = {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0}

# Evaluate after every epoch; logging and checkpointing are switched off.
training_args = TrainingArguments("trainer_aspect1_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **best_roberta_a1_hp)

# Weighted F1 on the validation part of each fold for the current normalize_config.
normalization_results = []
for f in folds:
  train = f['train']
  valid = f['valid']
  trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid,
  )
  trainer.train()
  # the first two fields of the prediction output are (predictions, label_ids)
  normalization_results.append(compute_f1(trainer.predict(valid)[:2])['f1'])

# Per-fold scores followed by their median and mean.
print(normalization_results, np.median(normalization_results), np.mean(normalization_results))

# BERT BASE UNCASED
# NO DOLLAR [0.895990695302052, 0.8995570597883215, 0.8452738262894254, 0.874176951563644, 0.8630821998469058] 0.874176951563644 0.8756161465580696
# YES NUMBER [0.8981563051124465, 0.8974839668481287, 0.8488267713019956, 0.8780604165321508, 0.8606391117911854] 0.8780604165321508 0.8766333143171814
# YES PUNCTUATION [0.8593687909973269, 0.8985031143817941, 0.8532473409399566, 0.8588795317056186, 0.87112825716739] 0.8593687909973269 0.8682254070384172

# BERT BASE CASED
# NO DOLLAR [0.8602370173292995, 0.8636851136851137, 0.8321037400237928, 0.8672544093240417, 0.8853935903116231] 0.8636851136851137 0.8617347741347741
# YES NUMBER [0.8841404245414737, 0.8641368641368641, 0.8604501480779747, 0.8598638676358884, 0.8780107862395172] 0.8641368641368641 0.8693204181263436
# YES PUNCTUATION [0.8429445067825102, 0.8758569615421028, 0.7944195786972511, 0.8654814253703731, 0.905949192435679] 0.8654814253703731 0.8569303329655831

# TRC2
# NO DOLLAR [0.8958042264293914, 0.9000626605417025, 0.8864232982982982, 0.8752254517362336, 0.8717673673133249] 0.8864232982982982 0.88585660086379 
# YES NUMBERS [0.8966728338925649, 0.9048582170754117, 0.8664071680376028, 0.8840867771800712, 0.8863606910842865] 0.8863606910842865 0.8876771374539875 
# YES PUNCTUATION [0.878276353816054, 0.9047837310995207, 0.8913983591644762, 0.8917766804291627, 0.9000133269973453] 0.8917766804291627 0.8932496903013117

# ROBERTA 
# NO DOLLAR [0.900967368925741, 0.8862712530406562, 0.8862874639629842, 0.9094908544908545, 0.891415085298064] 0.891415085298064 0.8948864051436601
# YES NUMBERS [0.8734717754207765, 0.8815958815958816, 0.8719846777279208, 0.8977637611704969, 0.8872622079309009] 0.8815958815958816 0.8824156607691952 
# YES PUNCTUATION [0.8672120554186019, 0.8705789924783546, 0.8818668943250232, 0.9105183589841844, 0.8918383162622794] 0.8818668943250232 0.8844029234936887



### Aspect 1 model testing

In [None]:
# Hyperparameters selected per model; only the set of the active model is uncommented.
# best_bert_uncased_a1_hp = {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1}
# best_bert_cased_a1_hp = {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0}
# best_trc2_a1_hp = {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0}
best_roberta_a1_hp = {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0}

# Normalization switches consumed by DataHandler.normalize_text.
normalize_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':True,
}
# Exactly one handler is active; the commented lines select the other tokenizers.
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-cased')
dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')

dh.normalize_text(normalize_config)
# Alternative setup (disabled): train/validate on a single cross-validation fold.
# trainvalid, df = dh.get_dataset('aspect_1', kfolds = 5, phase='trainvalid')
# label_dict = dh.label_dict

# train = trainvalid[2]['train']
# valid = trainvalid[2]['valid']

# Default phase 'traintest': full train split, test split and the filtered dataframe.
# kfolds is still passed so the same rare-label filter as during tuning is applied.
train, test, df = dh.get_dataset('aspect_1', kfolds = 5)
label_dict = dh.label_dict



# Evaluate and log once per epoch; no checkpoints are written during training.
training_args = TrainingArguments('trainer_aspect1',
                                  evaluation_strategy='epoch',
                                  logging_strategy='epoch',
                                  save_strategy='no',
                                  **best_roberta_a1_hp)

# F1 metric object shared by every compute_f1 call.
metric = load_metric("f1")

def compute_f1(eval_pred):
    '''
    Weighted F1 for a (logits, labels) pair: the class with the highest logit is
    taken as prediction and scored against the labels.
    '''
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references, average='weighted')

def model_init():
  '''
  Resets the seed and returns a roberta-base sequence classifier with one
  output per entry of label_dict.
  '''
  set_seed(SEED)
  num_classes = len(label_dict)
  # Alternative checkpoints (BertForSequenceClassification):
  #   "bert-base-uncased", "bert-base-cased", "models/language_model/TRC2"
  return RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=num_classes)

# Train on the full train split; the test split is the evaluation set.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=test
)
trainer.train()

# Persist the fine-tuned model.
trainer.save_model('models/aspect1model_roberta')

# Weighted F1 on the test split; the first two fields of the prediction output
# are (predictions, label_ids).
print('F1', compute_f1(trainer.predict(test)[:2])['f1'])

In [None]:
# Number of samples in the test dataset.
len(test)

### Aspect 1 model test evaluation

In [None]:
# Evaluate on the test split. The training cell above only defines `train` and
# `test`; `valid` would be a stale variable left over from an earlier cell.
preds = trainer.predict(test)
# Predicted class = index of the highest logit.
y_pred = np.argmax(preds.predictions, axis=-1)
y_true = preds.label_ids

# pd.DataFrame({'y_pred':y_pred, 'y_true':y_true}).to_csv('a1_trc2_predictions.csv', index=False)

# Names of the classes that occur in the labels or predictions. They are sorted
# by class id because classification_report and confusion_matrix order their
# rows by sorted label, and set iteration order is not guaranteed.
inverse_a1label_dict  = {value : key for (key, value) in label_dict.items()}
present_labels = sorted(set(y_true).union(set(y_pred)))
aspect1_target_names = [inverse_a1label_dict[k] for k in present_labels]

print(classification_report(y_true, y_pred,target_names=aspect1_target_names))
# NOTE(review): the title names BERT-base-uncased; update it when another model is evaluated.
make_confusion_matrix(y_true, y_pred, categories=aspect1_target_names, title='Aspect 1 BERT-base-uncased without dollar signs', figsize=(7,7))

## Preparing twostep model

In [None]:
# Predicted aspect-1 class (index of the highest logit) for every train sample.
trainpreds = trainer.predict(train)
y_trainpred = list(map(lambda x: np.argmax(x), trainpreds.predictions))

# Same for every test sample.
testpreds = trainer.predict(test)
y_testpred = list(map(lambda x: np.argmax(x), testpreds.predictions))

df['aspect_1_pred'] = None

# Assigned by position: this relies on `train`/`test` having been built from the
# rows of `df` with the matching phase, in the same order.
df.loc[df.phase == 'train','aspect_1_pred'] = y_trainpred
df.loc[df.phase == 'test','aspect_1_pred'] = y_testpred

# Store the dataframe including the predictions.
df.to_csv('aspect_1_roberta_pred_df.csv',  index=False)

## Preparing fusing model (with aspect 1)

In [None]:
def get_pooled_output(data, trainer):
  '''
  Returns, for every sample of a batch, the last-layer hidden state of the first
  token (position 0) produced by the trainer's base transformer.

  data:    batch dict as produced by a DataLoader over a FIQaDataset; the
           entries 'labels', 'text' and 'uuid' are ignored, every other entry
           is passed to the model as keyword argument. The dict is not modified.
  trainer: object whose `model` attribute is the fine-tuned classifier

  Returns a list with one numpy vector per sample.
  '''
  model = trainer.model
  # switch off dropout so the extracted features are deterministic
  model.eval()
  # move the inputs to wherever the model's weights live
  model_device = next(model.parameters()).device

  # build a new dict instead of popping from `data`: the caller's batch stays
  # intact and batches without labels are accepted as well
  ignored = ('labels', 'text', 'uuid')
  model_inputs = {k: v.to(model_device) for k, v in data.items() if k not in ignored}

  with torch.no_grad():
    # base_model is the transformer without the classification head
    # (`.roberta` for RoBERTa models, `.bert` for BERT models)
    hidden_states = model.base_model(**model_inputs).last_hidden_state
  first_token_states = hidden_states[:, 0].detach().cpu().numpy()

  # drop the device tensors before asking torch to release cached GPU memory
  del model_inputs
  del hidden_states
  torch.cuda.empty_cache()
  return list(first_token_states)

# Unshuffled loaders keep the extracted vectors aligned with the dataset order.
train_loader = DataLoader(train, batch_size=16, shuffle=False)
train_pooled_output = []
for batch in train_loader:
  train_pooled_output += get_pooled_output(batch, trainer)

test_loader = DataLoader(test, batch_size=16, shuffle=False)
test_pooled_output = []
for batch in test_loader:
  test_pooled_output += get_pooled_output(batch, trainer)

# uuids in dataset order, i.e. in the same order as the vectors collected above.
A1_train_uuids = list(train.uuid)
A1_test_uuids = list(test.uuid)

## CV model
Same aspect 1 model, but using only the training data, which is cross-validated.

In [None]:
# Normalization switches consumed by DataHandler.normalize_text.
normalize_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':False
}

# Exactly one handler is active; the commented lines select the other tokenizers.
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-cased')
dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')

dh.normalize_text(normalize_config)

# Keep the tokenized test split of the train/test partition aside.
_, originaltest, _ = dh.get_dataset('aspect_1', kfolds=5, phase='traintest')
# 5 train/valid folds over the train phase, labelled with the first-level aspect.
folds, df = dh.get_dataset('aspect_1', kfolds=5, phase='trainvalid')

# Label -> integer mapping built by the last get_dataset call.
label_dict = dh.label_dict

def my_hp_space(trial):
    '''
    Search space for the hyperparameter search: asks the trial for a learning
    rate, a number of epochs (3 to 5), a train batch size and a warmup ratio and
    returns them keyed by the matching TrainingArguments name.
    '''
    space = {}
    space["learning_rate"] = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16]
    )
    space["warmup_ratio"] = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return space

# F1 metric object shared by every compute_f1 call.
metric = load_metric("f1")
def compute_f1(eval_pred):
    '''
    Weighted F1 for a (logits, labels) pair: the class with the highest logit is
    taken as prediction and scored against the labels.
    '''
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=references, average='weighted')

def model_init():
  """Build a fresh roberta-base classifier; used as the Trainer's model factory.

  The seed is reset first so every call starts from the same random state.
  The head gets one output per entry in ``label_dict``.
  """
  set_seed(SEED)
  n_classes = len(label_dict)
  model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=n_classes)
  return model

# ---- Step 1: hyperparameter search, one search per fold --------------------
best_runs = []

# Evaluate once per epoch; no logging and no checkpoints during the search.
training_args = TrainingArguments(
    output_dir="trainer_aspect1_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
)

for f in folds:
  fold_train = f['train']
  len_train = len(fold_train)
  split_at = int(0.9 * len_train)
  # Last 10% of the fold's training part is the validation set, first 90% the
  # training set; f['valid'] stays untouched as test data.
  # (Alternative that was tried: valid = originaltest)
  valid = fold_train.get_subset(list(range(split_at, len_train)))
  train = fold_train.get_subset(list(range(split_at)))

  trainer = Trainer(
      args=training_args,
      model_init=model_init,
      train_dataset=train,
      eval_dataset=valid,
      compute_metrics=compute_f1,
  )

  best_run = trainer.hyperparameter_search(
      hp_space=my_hp_space,
      direction="maximize",
      n_trials=10,
  )
  best_runs.append(best_run)


# ---- Step 2: score every distinct winning configuration on all folds -------
hp_performance = {}
for run in best_runs:
  fold_hp = run.hyperparameters
  hp_key = str(fold_hp)
  training_args = TrainingArguments(
      output_dir="trainer_aspect1_hyper",
      evaluation_strategy='epoch',
      logging_strategy='no',
      log_on_each_node=False,
      save_strategy='no',
      **fold_hp,
  )

  # Several folds can pick the same configuration; score it only once.
  if hp_performance.get(hp_key):
    continue

  fold_scores = []
  hp_performance[hp_key] = fold_scores
  for f in folds:
    fold_train = f['train']
    len_train = len(fold_train)
    split_at = int(0.9 * len_train)
    # Same 90/10 split as in the search above.
    valid = fold_train.get_subset(list(range(split_at, len_train)))
    train = fold_train.get_subset(list(range(split_at)))

    trainer = Trainer(
        args=training_args,
        model_init=model_init,
        train_dataset=train,
        eval_dataset=valid,
    )
    trainer.train()
    outcome = trainer.predict(valid)
    fold_scores.append(compute_f1((outcome.predictions, outcome.label_ids))['f1'])

print('BEST RUNS', best_runs)
for hp_key, scores in hp_performance.items():
  print('HYPERPARAMETER PERFORMANCE:', hp_key, scores, np.median(scores), np.mean(scores))

# Validation on original test set
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.9035730460662524, 0.8827593654164335, 0.8660351899827289, 0.8342646931831318, 0.8590336510493578] 0.8660351899827289 0.8691331891395808
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.9125400306321358, 0.894620869924997, 0.9061175460269174, 0.8467879331917691, 0.8918708809059686] 0.894620869924997 0.8903874521363576
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.8863784315354998, 0.87097293814433, 0.8757058044886993, 0.885447318658103, 0.8682727893166723] 0.8757058044886993 0.877355456428661
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.8760717055874124, 0.9061545910230121, 0.8861535877983245, 0.8452529272754287, 0.8945545286218861] 0.8861535877983245 0.8816374680612128

# Validation on 10% of train set
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.9145329671938217, 0.9047064375272534, 0.9109392480178997, 0.8823719154884144, 0.9161495891329867] 0.9109392480178997 0.9057400314720752
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.9145329671938217, 0.9097140926151654, 0.898495062989445, 0.8885704750250307, 0.9030734799957996] 0.9030734799957996 0.9028772155638525
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.9095093980821186, 0.9047064375272534, 0.9220362302224261, 0.8886618998978549, 0.9330841645448388] 0.9095093980821186 0.9115996260548984
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.9145329671938217, 0.8935606929186384, 0.8885704750250307, 0.9131874630396215, 0.9270232310962647] 0.9131874630396215 0.9073749658546755
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.9145329671938217, 0.9160117571032498, 0.8934831460674156, 0.8548614708561834, 0.9330841645448388] 0.9145329671938217 0.9023947011531019

### Evaluation and preparation of two step model

In [None]:
# Configuration selected from the cross-validated tuning results.
best_cv_hypers = dict(
    learning_rate=5e-05,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    warmup_ratio=0,
)

# Text clean-up switches handed to DataHandler.normalize_text below.
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')
dh.normalize_text(normalize_config)

# Cross-validation folds for aspect 1 plus the label mapping from the handler.
folds, df = dh.get_dataset('aspect_1', phase='trainvalid', kfolds=5)
label_dict = dh.label_dict

metric = load_metric("f1")
def compute_f1(eval_pred):
    """Return the weighted-average F1 result for ``(logits, labels)``.

    Args:
        eval_pred: two-element sequence holding the logits and the gold labels.

    Returns:
        The result of ``metric.compute`` (the score is under ``'f1'``).
    """
    raw_scores, references = eval_pred
    hard_predictions = np.argmax(raw_scores, axis=-1)  # most likely class per row
    return metric.compute(
        predictions=hard_predictions,
        references=references,
        average='weighted',
    )

def model_init():
  """Model factory for the Trainer: reseed, then load roberta-base.

  The classification head has ``len(label_dict)`` outputs.
  """
  set_seed(SEED)
  head_size = len(label_dict)
  return RobertaForSequenceClassification.from_pretrained(
      "roberta-base",
      num_labels=head_size,
  )

training_args = TrainingArguments(
    output_dir="trainer_aspect1_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
    **best_cv_hypers,
)

# Out-of-fold bookkeeping: every fold appends its validation samples, so after
# the loop the three arrays are aligned element by element.
uuids = np.array([])
trues = np.array([])
predicts = np.array([])

for f in folds:
  train, valid = f['train'], f['valid']

  trainer = Trainer(
      args=training_args,
      model_init=model_init,
      train_dataset=train,
      eval_dataset=valid,
      compute_metrics=compute_f1,
  )
  trainer.train()

  preds = trainer.predict(valid)
  uuids = np.append(uuids, valid.uuid)
  # Predicted class = index of the largest logit in each row.
  predicts = np.append(predicts, np.argmax(preds.predictions, axis=-1))
  trues = np.append(trues, preds.label_ids)
print(classification_report(trues, predicts))

# The accumulators are float arrays, so cast the class ids back to int.
a1_cv_model_df = pd.DataFrame({
    'UUID': uuids,
    'A1_pred': predicts.astype(int),
    'A1_true': trues.astype(int),
})

# Map the numeric class ids back to the original label names.
inverse_a1label_dict = {0: 'Corporate', 1: 'Economy', 2: 'Market', 3: 'Stock'}
for column in ('A1_pred', 'A1_true'):
  a1_cv_model_df[column] = a1_cv_model_df[column].apply(lambda x: inverse_a1label_dict[x])

# Aspect 2

## Standard model

### Hypertuning

In [None]:
# Text clean-up switches handed to DataHandler.normalize_text below.
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# RoBERTa is the active tokenizer. Alternatives tried earlier:
#   DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-uncased')
#   DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-cased')
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')
dh.normalize_text(normalize_config)

# Aspect 2 cross-validation folds and the label mapping from the handler.
folds, df = dh.get_dataset('aspect_2', phase='trainvalid', kfolds=5)
label_dict = dh.label_dict

def my_hp_space(trial):
    """Describe the hyperparameter search space for aspect 2 tuning.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_int``;
            it is supplied by ``Trainer.hyperparameter_search``.

    Returns:
        dict mapping training-argument names to the values suggested by
        ``trial``.
    """
    # Suggestions are requested in the same order as the keys are returned.
    lr = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    epochs = trial.suggest_int("num_train_epochs", 2, 5)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32])
    warmup = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return {
        "learning_rate": lr,
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "warmup_ratio": warmup,
    }

metric = load_metric("f1")
def compute_f1(eval_pred):
    """Score a ``(logits, labels)`` pair with the weighted-average F1 metric.

    Args:
        eval_pred: two-element sequence ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns; callers read its ``'f1'`` entry.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted, average='weighted')

def model_init():
  """Build a fresh roberta-base classifier; used as the Trainer's model factory.

  The seed is reset first so every call starts from the same random state.
  The head gets one output per entry in ``label_dict``.

  Checkpoints used in earlier experiments (via BertForSequenceClassification):
  "bert-base-uncased", "bert-base-cased", "models/language_model/TRC2".
  """
  set_seed(SEED)
  n_classes = len(label_dict)
  model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=n_classes)
  return model

# ---- Step 1: hyperparameter search, one search per fold --------------------
best_runs = []

# Evaluate once per epoch; no logging and no checkpoints during the search.
training_args = TrainingArguments(
    output_dir="trainer_aspect2_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
)

for f in folds:
  train, valid = f['train'], f['valid']

  trainer = Trainer(
      args=training_args,
      model_init=model_init,
      train_dataset=train,
      eval_dataset=valid,
      compute_metrics=compute_f1,
  )

  best_run = trainer.hyperparameter_search(
      hp_space=my_hp_space,
      direction="maximize",
      n_trials=10,
  )
  best_runs.append(best_run)


# ---- Step 2: score every distinct winning configuration on all folds -------
hp_performance = {}
for run in best_runs:
  fold_hp = run.hyperparameters
  hp_key = str(fold_hp)
  training_args = TrainingArguments(
      output_dir="trainer_aspect2_hyper",
      evaluation_strategy='epoch',
      logging_strategy='no',
      log_on_each_node=False,
      save_strategy='no',
      **fold_hp,
  )

  # Several folds can pick the same configuration; score it only once.
  if hp_performance.get(hp_key):
    continue

  fold_scores = []
  hp_performance[hp_key] = fold_scores
  for f in folds:
    train, valid = f['train'], f['valid']
    trainer = Trainer(
        args=training_args,
        model_init=model_init,
        train_dataset=train,
        eval_dataset=valid,
    )
    trainer.train()
    outcome = trainer.predict(valid)
    fold_scores.append(compute_f1((outcome.predictions, outcome.label_ids))['f1'])

print('BEST RUNS', best_runs)
for hp_key, scores in hp_performance.items():
  print('HYPERPARAMETER PERFORMANCE:', hp_key, scores, np.median(scores), np.mean(scores))

# ROBERTA
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.6526084664140335, 0.6665022410958468, 0.6694007736697882, 0.641710405110911, 0.5943585352185264] 0.6526084664140335 0.6449160843018211
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.6475895284221357, 0.7568732994565925, 0.7432191425201887, 0.6960984434164369, 0.7327128128165565] 0.7327128128165565 0.7152986453263821
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.6600150478132129, 0.7131776742054083, 0.7248202720732881, 0.6878489961505208, 0.7390275128153403] 0.7131776742054083 0.7049779006115541
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.642894874721287, 0.7158791488662588, 0.700818972906584, 0.6937520151414794, 0.746179700481188] 0.700818972906584 0.6999049424233594
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.6188457393679594, 0.6757389392339054, 0.68783362632719, 0.6724835801152808, 0.6837608255562037] 0.6757389392339054 0.6677325421201079

# TRC2
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.646481463623466, 0.7047287202666461, 0.6850114237226569, 0.6833790113162709, 0.4146749275189642] 0.6833790113162709 0.6268551092896008
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.643322078918974, 0.7018295329328645, 0.6503442183982213, 0.6599179702525628, 0.660715544372432] 0.6599179702525628 0.663225868975011
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.5713435503203186, 0.6098379526494354, 0.6323345117863659, 0.6369974353166767, 0.6251958368077422] 0.6251958368077422 0.6151418573761077
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.570194440081343, 0.5994899327830279, 0.5732228961565424, 0.5986512907977389, 0.5773772335591892] 0.5773772335591892 0.5837871586755683

### Final model

In [None]:
# Selected configuration for RoBERTa.
# TRC2 alternative: {'learning_rate': 5e-05, 'num_train_epochs': 5,
#                    'per_device_train_batch_size': 8, 'warmup_ratio': 0}
best_hp = dict(
    learning_rate=5e-05,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    warmup_ratio=0.1,
)

# Text clean-up switches handed to DataHandler.normalize_text below.
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# Alternative: DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-uncased')
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')
dh.normalize_text(normalize_config)

train, test, df = dh.get_dataset('aspect_2', kfolds=5)
label_dict = dh.label_dict

# Evaluate and log once per epoch; never write checkpoints.
training_args = TrainingArguments(
    output_dir='trainer_aspect2',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='no',
    **best_hp,
)

metric = load_metric("f1")

def compute_f1(eval_pred):
    """Return the weighted-average F1 result for ``(logits, labels)``.

    Args:
        eval_pred: two-element sequence holding the logits and the gold labels.

    Returns:
        The result of ``metric.compute`` (the score is under ``'f1'``).
    """
    raw_scores, references = eval_pred
    hard_predictions = np.argmax(raw_scores, axis=-1)  # most likely class per row
    return metric.compute(
        predictions=hard_predictions,
        references=references,
        average='weighted',
    )

def model_init():
  """Model factory for the Trainer: reseed, then load roberta-base.

  The classification head has ``len(label_dict)`` outputs.
  Alternative checkpoint used earlier (via BertForSequenceClassification):
  "models/language_model/TRC2".
  """
  set_seed(SEED)
  head_size = len(label_dict)
  return RobertaForSequenceClassification.from_pretrained(
      "roberta-base",
      num_labels=head_size,
  )

# Train the final aspect 2 model on the full training split.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=test
)
trainer.train()

# trainer.save_model('models/aspect2model_roberta')

# Run inference on the test split once and reuse the result for both the F1
# score and the per-class report (previously the test set was predicted twice).
preds = trainer.predict(test)
print('F1', compute_f1((preds.predictions, preds.label_ids))['f1'])

# Predicted class = index of the largest logit in each row.
y_pred = list(np.argmax(preds.predictions, axis=-1))
y_true = preds.label_ids

# pd.DataFrame({'y_pred':y_pred, 'y_true':y_true}).to_csv('a2_TRC2_predictions.csv', index=False)

# NOTE: the variable names say "a1"/"aspect1", but label_dict here is the
# aspect 2 mapping; the names are kept because other cells may refer to them.
inverse_a1label_dict  = {value : key for (key, value) in label_dict.items()}
# classification_report lists the labels in sorted order, so the names must be
# produced in sorted order too; plain set iteration does not guarantee that.
aspect1_target_names = [inverse_a1label_dict[k] for k in sorted(set(y_true).union(y_pred))]

print(classification_report(y_true, y_pred,target_names=aspect1_target_names))
# make_confusion_matrix(y_true, y_pred, categories=aspect1_target_names, title='Aspect 2 RoBERTa', figsize=(7,7))

# Per-sample predictions kept for the McNemar test.
test_mcnemar_df = pd.DataFrame()
test_mcnemar_df['uuids'] = test.uuid
test_mcnemar_df['a2_pred'] = y_pred
test_mcnemar_df['a2_true'] = y_true

### Preparing fusing model (with aspect 2)

In [None]:
def get_pooled_output(data, trainer):
  """Return the first-token hidden state of the RoBERTa encoder for a batch.

  Args:
    data: batch dict as produced by the DataLoader. The 'labels', 'text' and
      'uuid' entries are ignored; every remaining value is moved to `device`
      and passed to the encoder as a keyword argument. The dict itself is
      left unmodified.
    trainer: Trainer whose `model.roberta` encoder is used.

  Returns:
    A list with one numpy array per sample: the encoder's last hidden state
    at sequence position 0.
  """
  # Switch to inference mode so dropout cannot randomise the extracted vectors
  # if the model was left in training mode.
  trainer.model.eval()

  skipped = ('labels', 'text', 'uuid')
  model_inputs = {k: v.to(device) for k, v in data.items() if k not in skipped}

  with torch.no_grad():
    hidden = trainer.model.roberta(**model_inputs).last_hidden_state
    first_token = hidden[:, 0].detach().cpu().numpy()

  # Drop the GPU references before asking CUDA to release cached memory.
  del model_inputs
  del hidden
  torch.cuda.empty_cache()
  return list(first_token)

# Feed both splits through get_pooled_output in fixed-size, unshuffled batches
# so the collected vectors keep the dataset order (same order as the uuid
# lists built at the bottom).
train_loader = DataLoader(train, shuffle=False, batch_size=16)
test_loader = DataLoader(test, shuffle=False, batch_size=16)

train_pooled_output = []
for batch in train_loader:
  train_pooled_output.extend(get_pooled_output(batch, trainer))

test_pooled_output = []
for batch in test_loader:
  test_pooled_output.extend(get_pooled_output(batch, trainer))

# Identifiers of the samples, in dataset order.
A2_train_uuids = [*train.uuid]
A2_test_uuids = [*test.uuid]

## CV model

In [None]:
# Text clean-up switches handed to DataHandler.normalize_text below.
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# Alternative: DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-uncased')
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')
dh.normalize_text(normalize_config)

# Disabled variant that also fetched the original test split:
#   _, originaltest, _ = dh.get_dataset('aspect_2', kfolds=5, phase='traintest')
folds, df = dh.get_dataset('aspect_2', phase='trainvalid', kfolds=5)
label_dict = dh.label_dict

def my_hp_space(trial):
    """Describe the hyperparameter search space for the aspect 2 CV tuning run.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_int``;
            it is supplied by ``Trainer.hyperparameter_search``.

    Returns:
        dict mapping training-argument names to the values suggested by
        ``trial``.
    """
    # Suggestions are requested in the same order as the keys are returned.
    lr = trial.suggest_categorical("learning_rate", [5e-5, 3e-5])
    epochs = trial.suggest_int("num_train_epochs", 2, 5)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16])
    warmup = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return {
        "learning_rate": lr,
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "warmup_ratio": warmup,
    }

metric = load_metric("f1")
def compute_f1(eval_pred):
    """Score a ``(logits, labels)`` pair with the weighted-average F1 metric.

    Args:
        eval_pred: two-element sequence ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns; callers read its ``'f1'`` entry.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted, average='weighted')

def model_init():
  """Build a fresh roberta-base classifier; used as the Trainer's model factory.

  The seed is reset first so every call starts from the same random state.
  The head gets one output per entry in ``label_dict``.
  Alternative checkpoint used earlier (via BertForSequenceClassification):
  "models/language_model/TRC2".
  """
  set_seed(SEED)
  n_classes = len(label_dict)
  model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=n_classes)
  return model

# ---- Step 1: hyperparameter search, one search per fold --------------------
best_runs = []

# Evaluate once per epoch; no logging and no checkpoints during the search.
training_args = TrainingArguments(
    output_dir="trainer_aspect2_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
)

for f in folds:
  fold_train = f['train']
  len_train = len(fold_train)
  split_at = int(0.9 * len_train)
  # Last 10% of the fold's training part is the validation set, first 90% the
  # training set; f['valid'] stays untouched as test data.
  # (Alternative that was tried: valid = originaltest)
  valid = fold_train.get_subset(list(range(split_at, len_train)))
  train = fold_train.get_subset(list(range(split_at)))

  trainer = Trainer(
      args=training_args,
      model_init=model_init,
      train_dataset=train,
      eval_dataset=valid,
      compute_metrics=compute_f1,
  )

  best_run = trainer.hyperparameter_search(
      hp_space=my_hp_space,
      direction="maximize",
      n_trials=10,
  )
  best_runs.append(best_run)


# ---- Step 2: score every distinct winning configuration on all folds -------
hp_performance = {}
for run in best_runs:
  fold_hp = run.hyperparameters
  hp_key = str(fold_hp)
  training_args = TrainingArguments(
      output_dir="trainer_aspect2_hyper",
      evaluation_strategy='epoch',
      logging_strategy='no',
      log_on_each_node=False,
      save_strategy='no',
      **fold_hp,
  )

  # Several folds can pick the same configuration; score it only once.
  if hp_performance.get(hp_key):
    continue

  fold_scores = []
  hp_performance[hp_key] = fold_scores
  for f in folds:
    fold_train = f['train']
    len_train = len(fold_train)
    split_at = int(0.9 * len_train)
    # Same 90/10 split as in the search above.
    valid = fold_train.get_subset(list(range(split_at, len_train)))
    train = fold_train.get_subset(list(range(split_at)))

    trainer = Trainer(
        args=training_args,
        model_init=model_init,
        train_dataset=train,
        eval_dataset=valid,
    )
    trainer.train()
    outcome = trainer.predict(valid)
    fold_scores.append(compute_f1((outcome.predictions, outcome.label_ids))['f1'])

print('BEST RUNS', best_runs)
for hp_key, scores in hp_performance.items():
  print('HYPERPARAMETER PERFORMANCE:', hp_key, scores, np.median(scores), np.mean(scores))

# # TRC2 hyperparameters
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.7653568917018283, 0.6711358205940248, 0.6198501790607054, 0.6393395615822087, 0.647267316017316] 0.647267316017316 0.6685899537912167
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.752020202020202, 0.7638387718413326, 0.6641118543292457, 0.6980974102983672, 0.6929161747343565] 0.6980974102983672 0.7141968826447008
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.6852337477337477, 0.6760642135642135, 0.6268127705627705, 0.6975895316804408, 0.6395913664209525] 0.6760642135642135 0.665058325992425
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.7153495973976605, 0.7034384722918096, 0.6398691693583334, 0.6568310614506266, 0.6267550332442675] 0.6568310614506266 0.6684486667485394
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.6806550506662196, 0.6757333675096833, 0.6418494406985455, 0.7076655982905983, 0.6620134951120865] 0.6757333675096833 0.6735833904554266

# Roberta hyperparameters
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.7659033948507634, 0.709205089169204, 0.7096635204821055, 0.7248882364312985, 0.690716007477371] 0.7096635204821055 0.7200752496821485
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.788627097219115, 0.7510007192669732, 0.764397701229252, 0.6076573185268838, 0.6985223442843306] 0.7510007192669732 0.7220410361053109
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.8108141858141859, 0.726739125402227, 0.7406432970792238, 0.7615180141495931, 0.6760443407502231] 0.7406432970792238 0.7431517926390906
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.7495352686529156, 0.7249146874146875, 0.7237573404017253, 0.7519803296119085, 0.6923324149910838] 0.7249146874146875 0.7285040082144641

### Evaluation

In [None]:
# Selected cross-validated configuration for RoBERTa.
best_cv_hypers = {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1}

# TRC2 alternative:
# best_cv_hypers = {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0}

# Text clean-up switches handed to DataHandler.normalize_text below.
normalize_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':False,
}

dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-uncased')


dh.normalize_text(normalize_config)
# _, originaltest, _ = dh.get_dataset('aspect_2', phase='traintest',kfolds = 5)
folds, df = dh.get_dataset('aspect_2',phase='trainvalid', kfolds = 5)
label_dict = dh.label_dict

# NOTE: a TrainingArguments('trainer_aspect2', ...) object used to be built
# here, but it was overwritten by the TrainingArguments assignment further
# down in this cell before any Trainer used it. The dead assignment was
# removed; the effective training arguments are the ones defined below the
# helper functions.

metric = load_metric("f1")
def compute_f1(eval_pred):
    """Return the weighted-average F1 result for ``(logits, labels)``.

    Args:
        eval_pred: two-element sequence holding the logits and the gold labels.

    Returns:
        The result of ``metric.compute`` (the score is under ``'f1'``).
    """
    raw_scores, references = eval_pred
    hard_predictions = np.argmax(raw_scores, axis=-1)  # most likely class per row
    return metric.compute(
        predictions=hard_predictions,
        references=references,
        average='weighted',
    )

def model_init():
  """Model factory for the Trainer: reseed, then load roberta-base.

  The classification head has ``len(label_dict)`` outputs.
  Alternative checkpoint used earlier (via BertForSequenceClassification):
  "models/language_model/TRC2".
  """
  set_seed(SEED)
  head_size = len(label_dict)
  return RobertaForSequenceClassification.from_pretrained(
      "roberta-base",
      num_labels=head_size,
  )

# NOTE(review): the output directory is named after aspect 1 although this
# cell evaluates aspect 2 — looks like a copy/paste leftover; confirm before
# renaming.
training_args = TrainingArguments(
    output_dir="trainer_aspect1_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
    **best_cv_hypers,
)

# Out-of-fold bookkeeping: every fold appends its validation samples, so after
# the loop the three arrays are aligned element by element.
trues = np.array([])
predicts = np.array([])
a2uuids = np.array([])

for f in folds:
  train, valid = f['train'], f['valid']

  trainer = Trainer(
      args=training_args,
      model_init=model_init,
      train_dataset=train,
      eval_dataset=valid,
      compute_metrics=compute_f1,
  )
  trainer.train()

  preds = trainer.predict(valid)
  a2uuids = np.append(a2uuids, valid.uuid)
  trues = np.append(trues, preds.label_ids)
  # Predicted class = index of the largest logit in each row.
  predicts = np.append(predicts, np.argmax(preds.predictions, axis=-1))

print(classification_report(trues, predicts))

# Recorded results:
#   Roberta F1: 0.7
#   TRC2 F1: 0.68

# Per-sample out-of-fold predictions kept for the McNemar test.
a2_df = pd.DataFrame({
    'uuids': a2uuids,
    'trues': trues,
    'a2_preds': predicts,
})

## Transfer model

In [None]:
# Load the fine-tuned aspect 1 model and give it a new 21-way classification
# head for aspect 2.
config = AutoConfig.from_pretrained('models/aspect1model_roberta')
# Change the classification head size through the config.
# NOTE(review): 21 is presumably the number of aspect 2 classes — confirm
# against the aspect 2 label_dict.
config.num_labels = 21
# `from_config` would only build the architecture with randomly initialised
# weights, so nothing would be transferred from aspect 1. `from_pretrained`
# loads the saved weights; the old head has a different output size, so it
# must be skipped and re-initialised (`ignore_mismatched_sizes`).
transfermodel = AutoModelForSequenceClassification.from_pretrained(
    'models/aspect1model_roberta',
    config=config,
    ignore_mismatched_sizes=True,
)

# Freeze the encoder so that only the new classification head is trained first.
for param in transfermodel.base_model.parameters():
    param.requires_grad = False

transfer_hp = {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1}

# Text clean-up switches handed to DataHandler.normalize_text below.
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# Alternative: DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-uncased')
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')
dh.normalize_text(normalize_config)

train, test, df = dh.get_dataset('aspect_2', kfolds=5)
label_dict = dh.label_dict

# Evaluate and log once per epoch; never write checkpoints.
training_args = TrainingArguments(
    output_dir='trainer_aspect2',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='no',
    **transfer_hp,
)

metric = load_metric("f1")

def compute_f1(eval_pred):
    """Score a ``(logits, labels)`` pair with the weighted-average F1 metric.

    Args:
        eval_pred: two-element sequence ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute`` returns; callers read its ``'f1'`` entry.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted, average='weighted')

trainer = Trainer(
    model=transfermodel,
    args=training_args,
    train_dataset=train,
    eval_dataset=test
)

# Stage 1: train with the encoder parameters frozen (requires_grad is False
# on transfermodel.base_model at this point).
trainer.train()

# Stage 2: unfreeze the encoder and train everything.
# (This loop used to set requires_grad = False again, which left the encoder
# frozen and made the second training run a repeat of stage 1.)
for param in transfermodel.base_model.parameters():
    param.requires_grad = True

trainer = Trainer(
    model=transfermodel,
    args=training_args,
    train_dataset=train,
    eval_dataset=test
)
trainer.train()


# Run inference on the test split once and reuse the result for both the F1
# score and the per-class report (previously the test set was predicted twice).
preds = trainer.predict(test)
print('F1', compute_f1((preds.predictions, preds.label_ids))['f1'])

# Predicted class = index of the largest logit in each row.
y_pred = list(np.argmax(preds.predictions, axis=-1))
y_true = preds.label_ids

# NOTE: the variable names say "a1"/"aspect1", but label_dict here is the
# aspect 2 mapping; the names are kept because other cells may refer to them.
inverse_a1label_dict  = {value : key for (key, value) in label_dict.items()}
# classification_report lists the labels in sorted order, so the names must be
# produced in sorted order too; plain set iteration does not guarantee that.
aspect1_target_names = [inverse_a1label_dict[k] for k in sorted(set(y_true).union(y_pred))]

print(classification_report(y_true, y_pred,target_names=aspect1_target_names))
make_confusion_matrix(y_true, y_pred, categories=aspect1_target_names, title='Transfer model', figsize=(7,7))

## Two step model

### Hypertuning

In [None]:
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')

# Normalized data frame carrying the labels predicted by the best aspect 1
# model; it is filtered per aspect 1 class and assigned to dh.df further down.
aspect1_pred_df = pd.read_csv(
    'aspect_1_roberta_pred_df.csv',
)

def my_hp_space(trial):
    """Describe the hyperparameter search space for the two-step aspect 2 models.

    Args:
        trial: object exposing ``suggest_categorical`` and ``suggest_int``;
            it is supplied by ``Trainer.hyperparameter_search``.

    Returns:
        dict mapping training-argument names to the values suggested by
        ``trial``.
    """
    # Suggestions are requested in the same order as the keys are returned.
    lr = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    epochs = trial.suggest_int("num_train_epochs", 2, 5)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32])
    warmup = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return {
        "learning_rate": lr,
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "warmup_ratio": warmup,
    }

metric = load_metric("f1")
def compute_f1(eval_pred):
    """Return the weighted-average F1 result for ``(logits, labels)``.

    Args:
        eval_pred: two-element sequence holding the logits and the gold labels.

    Returns:
        The result of ``metric.compute`` (the score is under ``'f1'``).
    """
    raw_scores, references = eval_pred
    hard_predictions = np.argmax(raw_scores, axis=-1)  # most likely class per row
    return metric.compute(
        predictions=hard_predictions,
        references=references,
        average='weighted',
    )

def model_init():
  """Model factory for the Trainer: reseed, then load roberta-base.

  The head size comes from the module-level ``numlabels``, which the training
  loops set for the current subset before creating each Trainer.

  Checkpoints used in earlier experiments (via BertForSequenceClassification,
  with num_labels=len(label_dict)): "models/language_model/TRC2",
  "bert-base-uncased".
  """
  set_seed(SEED)
  model = RobertaForSequenceClassification.from_pretrained(
      "roberta-base",
      num_labels=numlabels,
  )
  return model

# Evaluate once per epoch; no logging and no checkpoints during the search.
training_args = TrainingArguments(
    output_dir="trainer_aspect2_twostep_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
)

best_runs = []

# ---- Step 1: search per aspect 1 class (0 and 3) and per fold --------------
for a1 in [0, 3]:
  # Restrict the handler to the samples predicted as this aspect 1 class.
  dh.df = aspect1_pred_df[aspect1_pred_df.aspect_1 == a1]
  folds, df = dh.get_dataset('aspect_2', phase='trainvalid', kfolds=5)

  for fold in folds:
    train, valid = fold['train'], fold['valid']
    # model_init reads this global to size the classification head.
    numlabels = len(set(train.labels))
    trainer = Trainer(
        args=training_args,
        model_init=model_init,
        train_dataset=train,
        eval_dataset=valid,
        compute_metrics=compute_f1,
    )

    best_run = trainer.hyperparameter_search(
        hp_space=my_hp_space,
        direction="maximize",
        n_trials=5,
    )
    best_runs.append(best_run.hyperparameters)

# ---- Step 2: score every distinct winning configuration on all folds -------
hp_performance = {}
for fold_hp in best_runs:
  hp_key = str(fold_hp)
  training_args = TrainingArguments(
      output_dir="trainer_aspect2_hyper",
      evaluation_strategy='epoch',
      logging_strategy='no',
      log_on_each_node=False,
      save_strategy='no',
      **fold_hp,
  )

  # Skip configurations that have already been evaluated.
  if hp_performance.get(hp_key):
    continue

  # One list of fold scores per aspect 1 class.
  per_class_scores = {'0': [], '3': []}
  hp_performance[hp_key] = per_class_scores

  for a1 in [0, 3]:
    dh.df = aspect1_pred_df[aspect1_pred_df.aspect_1 == a1]
    folds, df = dh.get_dataset('aspect_2', phase='trainvalid', kfolds=5)

    for fold in folds:
      train, valid = fold['train'], fold['valid']
      numlabels = len(set(train.labels))

      trainer = Trainer(
          args=training_args,
          model_init=model_init,
          train_dataset=train,
          eval_dataset=valid,
          compute_metrics=compute_f1,
      )
      trainer.train()
      outcome = trainer.predict(valid)
      per_class_scores[str(a1)].append(
          compute_f1((outcome.predictions, outcome.label_ids))['f1']
      )

# NOTE(review): `df_full` is not defined anywhere in this cell — presumably the
# full data frame from another cell (or meant to be aspect1_pred_df); confirm.
a1_0_count = np.sum(df_full.aspect_1 == 0)
a1_3_count = np.sum(df_full.aspect_1 == 3)

# Combine the per-class fold scores into one score per fold, weighting each
# class by its sample count.
for hp_key, scores in hp_performance.items():
  weighted_sum = a1_0_count * np.array(scores['0']) + a1_3_count * np.array(scores['3'])
  pooled_result = weighted_sum/(a1_0_count + a1_3_count)
  print(hp_key, pooled_result, np.median(pooled_result), np.mean(pooled_result))

# {'learning_rate': 5e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.58726702 0.57455922 0.71011439 0.63498631 0.68435558] 0.6349863131776315 0.6382565047779636
# {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.70496559 0.68638129 0.76436387 0.74297122 0.71105937] 0.7110593711476345 0.7219482683784861
# {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.73175152 0.68553827 0.77288434 0.73554909 0.70691327] 0.7317515153914775 0.726527295879358
# {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.76615316 0.72663043 0.79668931 0.77069992 0.74538922] 0.7661531575081916 0.7611124089954359
# {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.75590887 0.69984951 0.79212312 0.73918664 0.72560519] 0.7391866396153463 0.742534665950539
# {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.72615128 0.6530492  0.75573929 0.71100048 0.69800873] 0.7110004770134158 0.7087897928853344
# {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.73412847 0.72012767 0.8105408  0.77103883 0.75615304] 0.7561530432327108 0.7583977631419405
## {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.70560902 0.73237456 0.81207968 0.77749339 0.76178065] 0.7617806537627843 0.7578674586320739
# {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.1} [0.53902208 0.60383949 0.71900151 0.65583886 0.63494506] 0.6349450566074124 0.6305294001137579
# {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.7182183  0.66575107 0.76606586 0.71700935 0.71922731] 0.7182182995612303 0.717254377293006

# {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.1} [0.71385247 0.65438315 0.71598956 0.68150431 0.69873132] 0.6987313238143349 0.6928921638248656
# {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'warmup_ratio': 0} [0.65969743 0.63549088 0.73061845 0.71499474 0.65329439] 0.6596974288475521 0.6788191758811921
# {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.63153881 0.67732481 0.7568956  0.65208737 0.61387556] 0.6520873711593047 0.6663444305678556
## {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.75419064 0.70844385 0.75311565 0.78136591 0.71007925] 0.753115652578985 0.7414390604157515
# {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.72517906 0.69823074 0.79620278 0.69032743 0.73738132] 0.72517905675456 0.7294642656570727
# {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.72558784 0.65305125 0.76968336 0.71427643 0.67667587] 0.7142764251521059 0.7078549472348589
# {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.68290076 0.6615136  0.76214394 0.63268518 0.71419997] 0.6829007606042694 0.6906886903763945
# {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.61603099 0.61625108 0.72357912 0.66086252 0.67923476] 0.6608625249014375 0.6591916956781776
# {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.72445905 0.7145566  0.79147944 0.76269183 0.73192616] 0.731926163746879 0.745022618250809

### Final two-step models

In [None]:
# Data handler configured with the RoBERTa tokenizer.
dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')
## set normalized df with predicted labels from best aspect 1 model
aspect1_pred_df = pd.read_csv('aspect_1_roberta_pred_df.csv')
dh.df = aspect1_pred_df

# Full aspect-2 split; `test` is reused further down as the prediction set of
# every per-aspect-1 model, `df` as the frame the subsets are filtered from.
train, test, df = dh.get_dataset('aspect_2',kfolds=5)
# NOTE(review): presumably makes the dataset yield inputs without labels -- confirm in the dataset class
test.return_labels = False

# inverse_a1label_dict = {0: 'Corporate', 1: 'Economy', 2: 'Market', 3: 'Stock'}
# Aspect-1 ids that get their own aspect-2 model; id 1 ('Economy') is left out.
inverse_a1label_dict = {0: 'Corporate', 2: 'Market', 3: 'Stock'}

# Hyperparameters used for the final two-step models; the trailing comment keeps an alternative set.
best_twostep_hp = {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0}#{'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0}
# F1 scorer shared by every compute_f1 call in this cell
metric = load_metric("f1")


def compute_f1(eval_pred):
    """Score a (logits, labels) pair with the weighted F1 metric.

    The predicted class of each row is the position of its largest logit.
    Returns whatever `metric.compute` returns for those predictions.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=gold,
        average='weighted',
    )

def model_init():
  """Create a new roberta-base sequence classifier for the Trainer.

  Seeds with SEED before loading and sizes the classification head from the
  module-level `numlabels`, which must be set before this is called.
  """
  set_seed(SEED)
  # backbones tried earlier:
  #   BertForSequenceClassification.from_pretrained("models/language_model/TRC2", num_labels=len(label_dict))
  #   BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict))
  classifier = RobertaForSequenceClassification.from_pretrained(
      "roberta-base",
      num_labels=numlabels,
  )
  return classifier

# One shared training configuration for all per-aspect-1 models.
training_args = TrainingArguments('trainer_aspect2_twostep',
                                  evaluation_strategy='epoch',
                                  logging_strategy='no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **best_twostep_hp)

# aspect-1 id (as str) -> aspect-2 prediction for EVERY row of the full test set
twostep_results = {}


# Train one aspect-2 model per aspect-1 class, on the rows predicted to be that class.
for k,v in inverse_a1label_dict.items():
  print(k,v)
  
  # restrict the handler to rows whose predicted aspect 1 is k, then rebuild the split
  dh.df = df[df.aspect_1_pred == k]
  train, valid, _ = dh.get_dataset('aspect_2', kfolds=5)
  # Inverse of the label mapping the handler built for this subset.
  # The comprehension's k, v are local to it and do not overwrite the loop's k, v.
  inverse_twostep_a2_dict = {v: k for k, v in dh.label_dict.items()}

  # module-level on purpose: model_init reads numlabels to size the classifier head
  numlabels = len(set(train.labels))
  print('NUMLABELS', numlabels)
  twosteptrainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid,
    compute_metrics=compute_f1
  )
  twosteptrainer.train()
  # get argmax for prediction and link to original label using inverse dict
  # (`test` is the full test set from the earlier split, not this subset's)
  twostep_results[str(k)] = list(map(lambda x: inverse_twostep_a2_dict[np.argmax(x)], twosteptrainer.predict(test).predictions))


## PICKING PREDICTIONS USING Aspect 1 predictions
# For each test row take the prediction of the model that matches its predicted aspect 1.
# NOTE(review): raises KeyError when a test row's aspect_1_pred is not a key of
# inverse_a1label_dict (e.g. 1), because no model was trained for it.
twostep_final_preds = []
for idx, value in enumerate(df[df['phase'] == 'test']['aspect_1_pred']):
  twostep_final_preds.append(twostep_results[str(value)][idx])
twostep_final_preds

## EVALUATION
# Rebuild the full aspect-2 split so labels and the label dictionary refer to
# the complete label set rather than the last aspect-1 subset.
dh.df = aspect1_pred_df
train, test, df = dh.get_dataset('aspect_2',kfolds=5)
label_dict = dh.label_dict

# NOTE(review): this column is never filled -- the predictions are written to
# 'aspect_2_twostep_pred' on the next line. Kept so the frame's columns stay unchanged.
df['aspect_2_twostep_final_pred'] = None
df.loc[df.phase == 'test','aspect_2_twostep_pred'] = twostep_final_preds

y_true = df[df.phase=='test']['aspect_2']
y_pred = df[df.phase=='test']['aspect_2_twostep_pred']#.astype(int)

inverse_a2label_dict  = {value : key for (key, value) in label_dict.items()}
# classification_report lists the labels in sorted order and pairs target_names
# with them positionally. Set iteration order is not guaranteed to be sorted,
# so sort explicitly to keep every name on its own row.
present_labels = sorted(set(y_true).union(set(y_pred)))
aspect2_target_names = [inverse_a2label_dict[k] for k in present_labels]

print(classification_report(y_true, y_pred,target_names=aspect2_target_names))
# make_confusion_matrix(y_true, y_pred, categories=aspect2_target_names, title='Aspect 2 two-step conditional model', figsize=(7,7))

print('F1 score twostep conditional:', f1_score(y_true, y_pred, average='weighted'))

# for mcnemar
# per-sample truth/prediction table, keyed by uuid so it can be merged with another model's
twostep_test_mcnemar_df = pd.DataFrame()
twostep_test_mcnemar_df['uuids'] = np.array(test.uuid)
twostep_test_mcnemar_df['twosteptrues'] = np.array(y_true)
twostep_test_mcnemar_df['twosteppreds'] = np.array(y_pred.astype(int))

### McNemar test

In [None]:
# notebook leftovers: bare references to the two frames being compared
test_mcnemar_df
twostep_test_mcnemar_df

from scipy import stats

# line both models' test predictions up on the shared sample id
mcn_df = test_mcnemar_df.merge(twostep_test_mcnemar_df, on='uuids')

# 2x2 agreement table: a = both right, b = only the first model right,
# c = only the two-step model right, d = both wrong
a = mcn_df.query('a2_pred == a2_true and twosteptrues == twosteppreds').shape[0]
b = mcn_df.query('a2_pred == a2_true and twosteptrues != twosteppreds').shape[0]
c = mcn_df.query('a2_pred != a2_true and twosteptrues == twosteppreds').shape[0]
d = mcn_df.query('a2_pred != a2_true and twosteptrues != twosteppreds').shape[0]

# McNemar statistic from the discordant pairs, compared against chi-square with 1 dof
teststat = (b - c) ** 2 / (b + c)
pval = 1 - stats.chi2.cdf(teststat, 1)
print(teststat, pval)
#3.9302325581395348 0.04742538550874109

# Two-step model CV

In [None]:
# best hyperparameters, taken over from L2AC
best_hp = dict(
    learning_rate=3e-05,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    warmup_ratio=0.1,
)

# text clean-up switches handed to dh.normalize_text
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

dh = DataHandler(tokenizer_model='roberta', tokenizer_version='roberta-base')
dh.normalize_text(normalize_config)

# keep the normalised frame; dh.df is replaced by per-aspect-1 subsets later on
base_df = dh.df

inverse_a1label_dict = {0: 'Corporate', 2: 'Market', 3: 'Stock'}
# F1 scorer shared by every compute_f1 call in this cell
metric = load_metric("f1")


def compute_f1(eval_pred):
    """Score a (logits, labels) pair with the weighted F1 metric.

    The predicted class of each row is the position of its largest logit.
    Returns whatever `metric.compute` returns for those predictions.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=gold,
        average='weighted',
    )

def model_init():
  """Create a new roberta-base sequence classifier for the Trainer.

  Seeds with SEED before loading and sizes the classification head from the
  module-level `numlabels`, which must be set before this is called.
  """
  set_seed(SEED)
  classifier = RobertaForSequenceClassification.from_pretrained(
      "roberta-base",
      num_labels=numlabels,
  )
  return classifier

training_args = TrainingArguments(
    "trainer_aspect2_twostep_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
    **best_hp,
)

# per aspect-1 class: one dict per fold holding predictions, labels and uuids
res = {a1_name: [] for a1_name in ('Corporate', 'Market', 'Stock')}

# the same information flattened into columns, ready for a DataFrame
res2 = {'UUID': [], 'A1': [], 'A2pred': [], 'A2true': []}

# running offset that keeps aspect-2 label ids unique across the aspect-1 subsets
acum_labels = 0
for a1 in ['Corporate','Market','Stock']:
  dh.df = base_df[base_df.aspect_1 == a1]
  folds, df = dh.get_dataset('aspect_2', kfolds=5, phase='trainvalid')

  for fold in folds:
    train, valid = fold['train'], fold['valid']
    # module-level on purpose: model_init reads numlabels to size the classifier head
    numlabels = len(set(train.labels))

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        compute_metrics=compute_f1,
    )
    trainer.train()

    preds = trainer.predict(valid)
    # adjust with acum_labels to keep unique labels over A1 divisions
    predictions = [np.argmax(x) + acum_labels for x in preds.predictions]
    labels = (preds.label_ids + acum_labels).tolist()

    temp_res = {
        'predictions': predictions,
        'labels': labels,
        'uuid': valid.uuid.tolist(),
    }
    res[a1].append(temp_res)

    res2['UUID'].extend(valid.uuid.tolist())
    res2['A1'].extend([a1] * len(labels))
    res2['A2pred'].extend(predictions)
    res2['A2true'].extend(labels)

  # shift the next subset's ids past the ones used by this subset
  acum_labels += len(dh.label_dict)


# pick correct prediction using a1_cv_model_df
a2_cv_model_df = pd.DataFrame(res2)
a1_a2_merged_df = pd.merge(a1_cv_model_df, a2_cv_model_df,on='UUID')

# An aspect-2 prediction only counts when aspect 1 was predicted correctly.
# Rows where it was not get -1 as their prediction; the true labels built in
# this cell are offsets plus label ids, so -1 is presumably never a real label.
# Done as one mask assignment instead of a row-by-row iterrows loop.
a1_mismatch = a1_a2_merged_df['A1_pred'] != a1_a2_merged_df['A1']
a1_a2_merged_df.loc[a1_mismatch, 'A2pred'] = -1

print(classification_report(a1_a2_merged_df['A2true'], a1_a2_merged_df['A2pred'], zero_division=1))

## McNemar test

In [None]:
from scipy import stats

# pair the two-step CV predictions with the other model's predictions per sample
mcn_df = a1_a2_merged_df.merge(a2_df, left_on='UUID', right_on='uuids')

# 2x2 agreement table: a = both right, b = only the two-step model right,
# c = only the other model right, d = both wrong
a = mcn_df.query('A2pred == A2true and trues == a2_preds').shape[0]
b = mcn_df.query('A2pred == A2true and trues != a2_preds').shape[0]
c = mcn_df.query('A2pred != A2true and trues == a2_preds').shape[0]
d = mcn_df.query('A2pred != A2true and trues != a2_preds').shape[0]

# McNemar statistic from the discordant pairs, compared against chi-square with 1 dof
teststat = (b - c) ** 2 / (b + c)
pval = 1 - stats.chi2.cdf(teststat, 1)
print(teststat, pval)

#1.0140845070422535 0.31392630211928474

In [None]:
dh = DataHandler(
    tokenizer_model='roberta',
    tokenizer_version='roberta-base',
)

## set normalized df with predicted labels from best aspect 1 model
# (left disabled: aspect1_pred_df is expected to exist already when this cell runs)
# aspect1_pred_df = pd.read_csv('aspect_1_roberta_pred_df.csv')

def my_hp_space(trial):
    """Describe the hyperparameter search space for one trial.

    Asks `trial` for a learning rate, an epoch count between 2 and 5, a
    per-device train batch size and a warmup ratio, and returns them keyed
    by the corresponding TrainingArguments names.
    """
    space = {}
    space["learning_rate"] = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32]
    )
    space["warmup_ratio"] = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return space

# F1 scorer shared by every compute_f1 call in this cell
metric = load_metric("f1")


def compute_f1(eval_pred):
    """Score a (logits, labels) pair with the weighted F1 metric.

    The predicted class of each row is the position of its largest logit.
    Returns whatever `metric.compute` returns for those predictions.
    """
    scores, gold = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(
        predictions=predicted_ids,
        references=gold,
        average='weighted',
    )

def model_init():
  """Create a new roberta-base sequence classifier for the Trainer.

  Seeds with SEED before loading and sizes the classification head from the
  module-level `numlabels`, which must be set before this is called.
  """
  set_seed(SEED)
  # backbones tried earlier:
  #   BertForSequenceClassification.from_pretrained("models/language_model/TRC2", num_labels=len(label_dict))
  #   BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_dict))
  classifier = RobertaForSequenceClassification.from_pretrained(
      "roberta-base",
      num_labels=numlabels,
  )
  return classifier

# Base configuration for the search; no hyperparameters are fixed here, the
# values under test come from my_hp_space.
training_args = TrainingArguments("trainer_aspect2_twostep_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no')

# winning hyperparameter dict of every (aspect-1 subset, fold) search
best_runs = []

# Only the aspect-1 ids 0 and 3 are searched.
for a1 in [0,3]:
  # restrict the handler to one aspect-1 subset before building its folds
  dh.df = aspect1_pred_df[aspect1_pred_df.aspect_1 == a1]
  folds, df = dh.get_dataset('aspect_2', kfolds=5, phase='trainvalid')

  for fold in folds:
    train = fold['train']
    valid = fold['valid']
    # module-level on purpose: model_init reads numlabels to size the classifier head
    numlabels = len(set(train.labels))
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        compute_metrics=compute_f1
    )

    # 5 trials per fold, keeping the trial with the highest objective
    best_run = trainer.hyperparameter_search(
        direction="maximize",
        hp_space=my_hp_space,
        n_trials = 5
    )
    best_runs.append(best_run.hyperparameters)

# Re-train every distinct winning hyperparameter set on all folds of both
# aspect-1 subsets so the candidates can be compared on the same data.
hp_performance = {}
for fold_hp in best_runs:
  hp_key = str(fold_hp)

  # Several folds can pick the same hyperparameters; evaluate each set once.
  # The check runs before TrainingArguments is built so duplicates cost nothing.
  if hp_key in hp_performance:
    continue

  training_args = TrainingArguments("trainer_aspect2_hyper",
                                evaluation_strategy = 'epoch',
                                logging_strategy = 'no',
                                log_on_each_node = False,
                                save_strategy = 'no',
                                **fold_hp)

  # per-fold weighted F1, keyed by the aspect-1 id the data was filtered on
  hp_performance[hp_key] = {'0':[],
                            '3':[]}

  for a1 in [0,3]:
    dh.df = aspect1_pred_df[aspect1_pred_df.aspect_1 == a1]
    folds, df = dh.get_dataset('aspect_2', kfolds = 5, phase='trainvalid')

    for fold in folds:
      train = fold['train']
      valid = fold['valid']
      # module-level on purpose: model_init reads numlabels to size the classifier head
      numlabels = len(set(train.labels))

      trainer = Trainer(
          model_init=model_init,
          args=training_args,
          train_dataset=train,
          eval_dataset=valid,
          compute_metrics=compute_f1
      )
      trainer.train()
      # the first two fields of the predict() output are what compute_f1 unpacks
      fold_f1 = compute_f1(trainer.predict(valid)[:2])['f1']
      hp_performance[hp_key][str(a1)].append(fold_f1)

# subset sizes used as weights when pooling the two aspect-1 results
a1_0_count = np.sum(df_full.aspect_1 == 0)
a1_3_count = np.sum(df_full.aspect_1 == 3)

for k,v in hp_performance.items():
  # fold-wise average of both subsets, weighted by subset size
  pooled_result = (a1_0_count * np.array(v['0']) + a1_3_count * np.array(v['3']))/(a1_0_count + a1_3_count)
  print(k, pooled_result, np.median(pooled_result), np.mean(pooled_result))

# Polarity classification

## Standard model

In [None]:
import torch
from torch import nn
from transformers import BertModel

class FinRoberta(torch.nn.Module):
    """RoBERTa encoder with a single sigmoid output, trained with MSE loss."""

    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.dropout = nn.Dropout(0.1)
        # one output unit on top of the 768-wide pooled representation
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids=None, attention_mask=None, labels=None, token_type_ids=None):
        """Run the encoder and the regression head.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            labels: optional targets; when given, the MSE loss is computed.
            token_type_ids: accepted so tokenizer output can be passed
                through unchanged; not used.

        Returns:
            `(loss, proba)` when `labels` is given, otherwise `(proba,)`.
        """
        _, pooled_output = self.roberta(input_ids, attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output).float()

        # squash the score into (0, 1)
        proba = self.sigmoid(linear_output)

        # `labels` defaults to None (e.g. prediction on unlabeled data); in that
        # case there is nothing to compute a loss against.
        if labels is None:
            return (proba,)

        loss_fct = MSELoss()
        loss = loss_fct(proba.view(-1), labels.float().view(-1))
        return (loss, proba)

class FinBert(torch.nn.Module):
    """BERT encoder with a single sigmoid output, trained with MSE loss.

    The encoder weights are loaded from 'models/language_model/TRC2'.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('models/language_model/TRC2')
        # self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(0.1)
        # one output unit on top of the 768-wide pooled representation
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids=None, attention_mask=None, labels=None, token_type_ids=None, a2state=None):
        """Run the encoder and the regression head.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            labels: optional targets; when given, the MSE loss is computed.
            token_type_ids: accepted so tokenizer output can be passed
                through unchanged; not used.
            a2state: accepted for interface compatibility; not used.

        Returns:
            `(loss, proba)` when `labels` is given, otherwise `(proba,)`.
        """
        _, pooled_output = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output).float()

        # squash the score into (0, 1)
        proba = self.sigmoid(linear_output)

        # `labels` defaults to None (e.g. prediction on unlabeled data); in that
        # case there is nothing to compute a loss against.
        if labels is None:
            return (proba,)

        loss_fct = MSELoss()
        loss = loss_fct(proba.view(-1), labels.float().view(-1))
        return (loss, proba)

### Hyperparameter tuning

In [None]:
# text clean-up switches handed to dh.normalize_text
normalize_config = dict(
    make_lower=True,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# tokenizer in use; the alternatives below were used for the other backbones
dh = DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-cased')
# dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')

dh.normalize_text(normalize_config)
# sentiment as a regression target, split into 5 train/valid folds
folds, df = dh.get_dataset(
    'sentiment',
    kfolds=5,
    phase='trainvalid',
    classification=False,
)
label_dict = dh.label_dict

def my_hp_space(trial):
    """Describe the hyperparameter search space for one trial.

    Asks `trial` for a learning rate, an epoch count between 2 and 5, a
    per-device train batch size and a warmup ratio, and returns them keyed
    by the corresponding TrainingArguments names.
    """
    space = {}
    space["learning_rate"] = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32]
    )
    space["warmup_ratio"] = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return space

def model_init():
  """Create a new FinBert regressor for the Trainer, seeding with SEED first."""
  set_seed(SEED)
  regressor = FinBert()  # use FinRoberta() here for the RoBERTa variant
  return regressor

# Base configuration for the search; no hyperparameters are fixed here, the
# values under test come from my_hp_space.
training_args = TrainingArguments("trainer_polarity_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no')

# best run object of each fold's search (hyperparameters are read from it later)
best_runs = []
for f in folds:
  train = f['train']
  valid = f['valid']

  # no compute_metrics is passed, so the search objective is presumably the
  # evaluation loss -- hence direction="minimize" below
  trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid
  )

  # 10 trials per fold, keeping the trial with the lowest objective
  best_run = trainer.hyperparameter_search(
      direction="minimize",
      hp_space=my_hp_space,
      n_trials = 10
  )
  best_runs.append(best_run)


# Re-train every distinct winning hyperparameter set on all folds so the
# candidates can be compared on the same data.
hp_performance = {}
for run in best_runs:
  fold_hp = run.hyperparameters
  hp_key = str(fold_hp)

  # Several folds can pick the same hyperparameters; evaluate each set once.
  # The check runs before TrainingArguments is built so duplicates cost nothing.
  if hp_key in hp_performance:
    continue

  training_args = TrainingArguments("trainer_polarity_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **fold_hp)

  # one validation score per fold
  hp_performance[hp_key] = []
  for f in folds:
    train = f['train']
    valid = f['valid']
    trainer = Trainer(
      model_init=model_init,
      args=training_args,
      train_dataset=train,
      eval_dataset=valid,
    )
    trainer.train()
    # the loss is computed on the sigmoid scale; x4 converts it to the original scale
    hp_performance[hp_key].append(4 * trainer.evaluate(valid)['eval_loss'])

print('BEST RUNS', best_runs)
for k,v in hp_performance.items():
  print('HYPERPARAMETER PERFORMANCE:', k, v, np.median(v), np.mean(v))

# Bert uncased
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.052832238376140594, 0.0644736960530281, 0.06636153906583786, 0.06600579619407654, 0.06577026844024658] 0.06577026844024658 0.06308870762586594
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.056505460292100906, 0.06985578685998917, 0.06044714152812958, 0.056631673127412796, 0.06868556886911392] 0.06044714152812958 0.06242512613534927
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.05759287625551224, 0.06616748869419098, 0.06753169745206833, 0.06896571069955826, 0.06897512823343277] 0.06753169745206833 0.06584658026695252
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.05113855376839638, 0.06078331917524338, 0.07318360358476639, 0.06521035730838776, 0.071893610060215] 0.06521035730838776 0.06444188877940178

# #TRC2 Sigmoid (needs x4 to convert to original scale)
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'warmup_ratio': 0} [0.013998747803270817, 0.017447324469685555, 0.015128359198570251, 0.015227871015667915, 0.019011231139302254] 0.015227871015667915 0.016162706725299358
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.015402193181216717, 0.015639686957001686, 0.016010113060474396, 0.015499853529036045, 0.017030823975801468] 0.015639686957001686 0.01591653414070606
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.013562491163611412, 0.01695111393928528, 0.013968972489237785, 0.016102008521556854, 0.016268499195575714] 0.016102008521556854 0.01537061706185341
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.013963154517114162, 0.017033405601978302, 0.0139283062890172, 0.01466009858995676, 0.020511159673333168] 0.01466009858995676 0.01601922493427992
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.014055843465030193, 0.01622643508017063, 0.015630172565579414, 0.014675182290375233, 0.02019800804555416] 0.015630172565579414 0.016157128289341925

# # RoBERTa Sigmoid (needs x4 to convert to original scale)
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.01412577647715807, 0.016138076782226562, 0.012905549257993698, 0.013664200901985168, 0.01638566330075264] 0.01412577647715807 0.014643853344023228
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.015271838754415512, 0.01469334401190281, 0.01225187350064516, 0.01618705503642559, 0.015779796987771988] 0.015271838754415512 0.014836781658232212
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'warmup_ratio': 0} [0.013538007624447346, 0.017548933625221252, 0.012067773379385471, 0.01463028509169817, 0.018420834094285965] 0.01463028509169817 0.01524116676300764
## HYPERPARAMETER PERFORMANCE:  {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'warmup_ratio': 0} [0.012071726843714714, 0.01826534792780876, 0.013576824218034744, 0.012850789353251457, 0.018574798479676247] 0.013576824218034744 0.015067897364497184
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.013379896990954876, 0.018307488411664963, 0.010960334911942482, 0.014462033286690712, 0.014677247032523155] 0.014462033286690712 0.014357400126755238

### Model

In [None]:
# Chosen hyperparameters per backbone, each followed by the test scores
# recorded for it. Only the TRC2 set is active.

# Bert uncased
# best_hp = {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1}
# MSE: 0.098369300365448
# R2: 0.3123079809334096

# TRC2
best_hp = dict(
    learning_rate=5e-05,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    warmup_ratio=0.1,
)
# MSE: 0.0978766679763794
# R2: 0.4014479019971946

# RoBERTa
# best_hp = {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'warmup_ratio': 0}
# MSE: 0.08234352618455887
# R2: 0.5440705770707137

# best_hp = {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0}

# text clean-up switches handed to dh.normalize_text
normalize_config = dict(
    make_lower=True,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

dh = DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')

dh.normalize_text(normalize_config)
# sentiment as a regression target, split into train and test
train, test, df = dh.get_dataset(
    'sentiment',
    kfolds=5,
    phase='traintest',
    classification=False,
)
label_dict = dh.label_dict

def model_init():
    """Create a new FinBert regressor for the Trainer, seeding with SEED first."""
    set_seed(SEED)
    regressor = FinBert()  # use FinRoberta() here for the RoBERTa variant
    return regressor

# Train the final polarity model on the full training set with the chosen
# hyperparameters and score it on the test set.
training_args = TrainingArguments("trainer_polarity_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **best_hp)

trainer = Trainer(
  model_init=model_init,
  args=training_args,
  train_dataset=train,
  eval_dataset=test
)
  
trainer.train()

# a single pass over the test set gives both the predictions and the labels
test_output = trainer.predict(test)
y_pred = test_output.predictions.reshape(-1)
y_true = test_output.label_ids

# the loss is computed on the sigmoid scale; x4 converts it to the original scale
print('MSE:', 4 * trainer.evaluate(test)['eval_loss'])
# r2_score expects (y_true, y_pred) and is not symmetric in its arguments.
# NOTE: R2 values recorded in this notebook before this fix were computed
# with the two arguments swapped.
print('R2:', r2_score(y_true, y_pred))

## CV standard model

### Hyperparameter tuning

In [None]:
# text clean-up switches handed to dh.normalize_text
normalize_config = dict(
    make_lower=True,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')
dh = DataHandler(tokenizer_model='bert', tokenizer_version='bert-base-uncased')

dh.normalize_text(normalize_config)
# _, originaltest, _ = dh.get_dataset('sentiment', kfolds=5, phase='traintest', classification=False)
# sentiment as a regression target, split into 5 train/valid folds
folds, df = dh.get_dataset(
    'sentiment',
    kfolds=5,
    phase='trainvalid',
    classification=False,
)
label_dict = dh.label_dict

def my_hp_space(trial):
    """Describe the hyperparameter search space for one trial.

    Asks `trial` for a learning rate, an epoch count between 2 and 5, a
    per-device train batch size and a warmup ratio, and returns them keyed
    by the corresponding TrainingArguments names.
    """
    space = {}
    space["learning_rate"] = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32]
    )
    space["warmup_ratio"] = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return space

def model_init():
  """Create a new FinBert regressor for the Trainer, seeding with SEED first."""
  set_seed(SEED)
  regressor = FinBert()  # use FinRoberta() here for the RoBERTa variant
  return regressor

training_args = TrainingArguments(
    "trainer_polarity_hyper",
    evaluation_strategy='epoch',
    logging_strategy='no',
    log_on_each_node=False,
    save_strategy='no',
)

# best run object of each fold's search
best_runs = []
for f in folds:
  train = f['train']

  len_train = len(train)
  # use first 90% as train and last 10% as valid, use f['valid'] as test
  split_at = int(0.9 * len_train)
  valid = train.get_subset(list(range(split_at, len_train)))
  train = train.get_subset(list(range(0, split_at)))

  trainer = Trainer(
      model_init=model_init,
      args=training_args,
      train_dataset=train,
      eval_dataset=valid,
  )

  # 10 trials per fold, keeping the trial with the lowest objective
  best_run = trainer.hyperparameter_search(
      direction="minimize",
      hp_space=my_hp_space,
      n_trials=10,
  )
  best_runs.append(best_run)


# Re-train every distinct winning hyperparameter set on all folds so the
# candidates can be compared on the same data.
hp_performance = {}
for run in best_runs:
  fold_hp = run.hyperparameters
  hp_key = str(fold_hp)

  # Several folds can pick the same hyperparameters; evaluate each set once.
  # The check runs before TrainingArguments is built so duplicates cost nothing.
  if hp_key in hp_performance:
    continue

  training_args = TrainingArguments("trainer_polarity_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **fold_hp)

  # one validation score per fold
  hp_performance[hp_key] = []
  for f in folds:
    train = f['train']

    len_train = len(train)
    # use first 90% as train and last 10% as valid, use f['valid'] as test
    split_at = int(0.9*len_train)
    valid = train.get_subset(list(range(split_at, len_train)))
    train = train.get_subset(list(range(0, split_at)))

    trainer = Trainer(
      model_init=model_init,
      args=training_args,
      train_dataset=train,
      eval_dataset=valid,
    )
    trainer.train()
    # the loss is computed on the sigmoid scale; x4 converts it to the original scale
    hp_performance[hp_key].append(4 * trainer.evaluate(valid)['eval_loss'])

print('BEST RUNS', best_runs)
for k,v in hp_performance.items():
  print('HYPERPARAMETER PERFORMANCE:', k, v, np.median(v), np.mean(v))

# RoBERTa params
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.03710188344120979, 0.060917071998119354, 0.04569103941321373, 0.047584839165210724, 0.039142075926065445] 0.04569103941321373 0.04608738198876381
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.04339892044663429, 0.055397458374500275, 0.04764453321695328, 0.04351646080613136, 0.040668729692697525] 0.04351646080613136 0.046125220507383345
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.06353326886892319, 0.05331432819366455, 0.05189724266529083, 0.050210222601890564, 0.05598332732915878] 0.05331432819366455 0.054987677931785585
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.05958247929811478, 0.06423655897378922, 0.054369255900382996, 0.0485914908349514, 0.0451374426484108] 0.054369255900382996 0.05438344553112984
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.1} [0.04158683493733406, 0.07022882997989655, 0.056970082223415375, 0.07796427607536316, 0.03995445743203163] 0.056970082223415375 0.05734089612960815


#TRC2 params
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.042494893074035645, 0.04691098630428314, 0.04905678704380989, 0.041579488664865494, 0.06071660667657852] 0.04691098630428314 0.04815175235271454
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.05296031013131142, 0.05256319418549538, 0.05840889737010002, 0.05776962637901306, 0.05929775908589363] 0.05776962637901306 0.0561999574303627
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.04763873666524887, 0.05571665242314339, 0.06414235383272171, 0.057232439517974854, 0.058037012815475464] 0.057232439517974854 0.056553439050912854
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.05371842160820961, 0.04786146804690361, 0.049455560743808746, 0.03541581705212593, 0.056039273738861084] 0.049455560743808746 0.0484981082379818
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.05104272440075874, 0.05990688502788544, 0.05650267377495766, 0.06502795219421387, 0.05511854588985443] 0.05650267377495766 0.05751975625753403

#OLD
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.045946549624204636, 0.057897359132766724, 0.055134452879428864, 0.052269693464040756, 0.038776498287916183] 0.052269693464040756 0.05000491067767143
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.05887581780552864, 0.053627628833055496, 0.04601342976093292, 0.06552106887102127, 0.03700460121035576] 0.053627628833055496 0.05220850929617882
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.03710188344120979, 0.060917071998119354, 0.04569103941321373, 0.047584839165210724, 0.039142075926065445] 0.04569103941321373 0.04608738198876381
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.05276298150420189, 0.04673157259821892, 0.06633079797029495, 0.04586451128125191, 0.04022789001464844] 0.04673157259821892 0.05038355067372322

### Model

In [None]:
# Hyperparameters for the final cross-validated run. Keep exactly one
# `best_hp` assignment active; the trailing tag names the language model the
# values belong to.
# best_hp = dict(learning_rate=2e-05, num_train_epochs=5, per_device_train_batch_size=4, warmup_ratio=0)  # RoBERTa
best_hp = dict(
    learning_rate=5e-05,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    warmup_ratio=0,
)  # TRC2

# Switches handed to DataHandler.normalize_text for this experiment.
normalize_config = dict(
    make_lower=True,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# Tokenizer choice; switch to the commented line for the RoBERTa variant.
dh = DataHandler(tokenizer_version='bert-base-uncased', tokenizer_model='bert')
# dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')

dh.normalize_text(normalize_config)

# Sentiment data as a regression target (classification=False), split into
# five train/valid folds.
folds, df = dh.get_dataset(
    'sentiment',
    kfolds=5,
    phase='trainvalid',
    classification=False,
)

def model_init():
    """Return a new model instance for the Trainer.

    The global seed is reset first, so every call starts from the same RNG
    state.
    """
    set_seed(SEED)
    # Use FinRoberta() instead of FinBert() for the RoBERTa variant.
    # model = FinRoberta()
    model = FinBert()
    return model

# Training configuration for the cross-validated evaluation: evaluate after
# every epoch, no logging and no checkpoints; everything else comes from
# `best_hp`.
training_args = TrainingArguments("trainer_polarity_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **best_hp)

# Out-of-fold targets and predictions, accumulated over all folds.
trues = np.array([])
predicts = np.array([])

for f in folds:
    train = f['train']
    valid = f['valid']

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid
    )

    trainer.train()

    # Run inference once per fold: the prediction output already carries both
    # the predictions and the label ids, so a second predict() call would only
    # repeat the same forward passes.
    fold_output = trainer.predict(valid)
    predicts = np.append(predicts, fold_output.predictions.reshape(-1))
    trues = np.append(trues, fold_output.label_ids)

# Multiply by four because the scale is twice as big in the original and we
# take squares (2^2 = 4).
print("MSE:", 4 * mean_squared_error(trues, predicts), "R2:", r2_score(trues, predicts))

#BERT TRC2 MSE: 0.06313041194622943 R2: 0.6114493690516956
#RoBERTa MSE: 0.05205366875539488 R2: 0.6796237310266637

## Fusing FinRoBERTa

In [None]:
class HierarchFinRoBerta(torch.nn.Module):
    """RoBERTa regressor that fuses the pooled output with an external state.

    The pooled RoBERTa output goes through dropout, is concatenated with
    ``state`` along dim 1 and is mapped to a single sigmoid-activated score.
    When labels are supplied the mean squared error against them is returned
    as well.

    Args:
        midlayer: Width of the intermediate linear layer. A falsy value
            (``0`` / ``None``) maps the concatenated features directly to the
            output.
    """

    def __init__(self, midlayer = 768):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('models/language_model/RoBERTa')
        self.dropout = nn.Dropout(0.1)
        self.midlayer = midlayer
        # The head expects 768*2 input features: presumably 768 from the pooled
        # RoBERTa output and 768 from `state` -- TODO confirm the state width.
        if midlayer:
            self.linear1 = nn.Linear(768*2, midlayer)
            self.linear2 = nn.Linear(midlayer, 1)
        else:
            self.linear = nn.Linear(768*2, 1)

        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids=None, attention_mask=None, labels=None, token_type_ids=None, state=None):
        """Score a batch.

        Args:
            input_ids: Token ids passed to the RoBERTa encoder.
            attention_mask: Attention mask passed to the RoBERTa encoder.
            labels: Optional regression targets. When omitted no loss is
                computed.
            token_type_ids: Accepted for interface compatibility; not used.
            state: Tensor concatenated to the pooled output along dim 1.

        Returns:
            ``[loss, proba]`` when ``labels`` is given, otherwise ``[proba]``.

        Raises:
            ValueError: If ``state`` is not provided.
        """
        if state is None:
            # Fail early with a readable message instead of an opaque
            # torch.cat error after the encoder has already run.
            raise ValueError("HierarchFinRoBerta.forward requires a `state` tensor")

        _, pooled_output = self.roberta(input_ids, attention_mask=attention_mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)

        concat = torch.cat((dropout_output, state), dim=1)

        if self.midlayer:
            # NOTE(review): there is no activation between linear1 and linear2.
            hidden = self.linear1(concat)
            logits = self.linear2(hidden).float()
        else:
            logits = self.linear(concat).float()
        proba = self.sigmoid(logits)

        if labels is None:
            # Label-free inference: return the outputs without a loss entry.
            return [proba]

        loss_fct = MSELoss()
        loss = loss_fct(proba.view(-1), labels.float().view(-1))
        return [loss, proba]

### Data (aspect 2)

In [None]:
# Normalization switches for the RoBERTa pipeline (casing is kept).
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')
dh.normalize_text(normalize_config)

# Build the aspect_2 dataset first and make its dataframe the handler's
# working frame, to match the data of L2AC.
_, _, df_a2 = dh.get_dataset('aspect_2', kfolds=5)
dh.df = df_a2

# Sentiment data (regression target) drawn from that frame.
train, test, _ = dh.get_dataset('sentiment', kfolds=5, classification=False)

# For every sentiment example, pick the pooled output stored at the position
# of its uuid in the aspect-2 uuid lists.
train_state = [
    train_pooled_output[A2_train_uuids.index(sample_uuid)]
    for sample_uuid in train.uuid
]
test_state = [
    test_pooled_output[A2_test_uuids.index(sample_uuid)]
    for sample_uuid in test.uuid
]

train.set_state(train_state)
test.set_state(test_state)

### Data (aspect 1)

In [None]:
# Normalization switches for the RoBERTa pipeline (casing is kept).
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')
dh.normalize_text(normalize_config)

# Build the aspect_1 dataset first and make its dataframe the handler's
# working frame, to match the data of L2AC.
_, _, df_a1 = dh.get_dataset('aspect_1', kfolds=5)
dh.df = df_a1

# Sentiment data (regression target) drawn from that frame.
train, test, _ = dh.get_dataset('sentiment', kfolds=5, classification=False)

# For every sentiment example, pick the pooled output stored at the position
# of its uuid in the aspect-1 uuid lists.
# NOTE(review): `train_pooled_output` / `test_pooled_output` are the same names
# the aspect-2 cell reads; presumably they are recomputed for aspect 1 before
# this cell runs -- confirm.
train_state = [
    train_pooled_output[A1_train_uuids.index(sample_uuid)]
    for sample_uuid in train.uuid
]
test_state = [
    test_pooled_output[A1_test_uuids.index(sample_uuid)]
    for sample_uuid in test.uuid
]

train.set_state(train_state)
test.set_state(test_state)

### Hypertuning

In [None]:
# Number of cross-validation folds used by the search below.
KFOLDS = 5


def my_hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_int``
            (e.g. an Optuna trial).

    Returns:
        Dict with learning rate, epoch count, per-device train batch size and
        warmup ratio.
    """
    space = {}
    space["learning_rate"] = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32]
    )
    space["warmup_ratio"] = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return space

def model_init():
    """Return a new HierarchFinRoBerta with a 96-unit mid layer.

    The global seed is reset first, so every call starts from the same RNG
    state.
    """
    set_seed(SEED)
    model = HierarchFinRoBerta(midlayer=96)
    return model

# Base training configuration for the search: evaluate after every epoch, no
# logging and no checkpoints. The searched values are supplied per trial.
training_args = TrainingArguments("trainer_hyperhierarchfinroberta",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no')

best_runs = []

kfold=KFold(n_splits=KFOLDS, shuffle=True, random_state=SEED)

# Stage 1: run a separate hyperparameter search on every fold and keep the
# best run of each.
for train_idx, valid_idx in kfold.split(train):
    fold_train = train.get_subset(train_idx)
    fold_valid = train.get_subset(valid_idx)

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=fold_train,
        eval_dataset=fold_valid
    )

    best_run = trainer.hyperparameter_search(
        direction="minimize",
        hp_space=my_hp_space,
        n_trials = 10
    )
    best_runs.append(best_run)


# Stage 2: score every distinct winning configuration on all folds.
hp_performance = {}
for run in best_runs:
    fold_hp = run.hyperparameters
    hp_key = str(fold_hp)

    # Several folds can select the same configuration; evaluate it only once.
    # The check is an explicit key lookup and runs before any per-config
    # objects are built.
    if hp_key in hp_performance:
        continue

    training_args = TrainingArguments("trainer_aspect2_hyper",
                                      evaluation_strategy = 'epoch',
                                      logging_strategy = 'no',
                                      log_on_each_node = False,
                                      save_strategy = 'no',
                                      **fold_hp)

    hp_performance[hp_key] = []
    for train_idx, valid_idx in kfold.split(train):
        fold_train = train.get_subset(train_idx)
        fold_valid = train.get_subset(valid_idx)

        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=fold_train,
            eval_dataset=fold_valid,
        )

        trainer.train()
        # The evaluation loss is multiplied by four before it is stored.
        hp_performance[hp_key].append(4 * trainer.evaluate(fold_valid)['eval_loss'])

print('BEST RUNS', best_runs)
for k, v in hp_performance.items():
    print('HYPERPARAMETER PERFORMANCE:', k, v, np.median(v), np.mean(v))

################### Aspect 2

# NO MIDLAYER
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.06423457711935043, 0.054951559752225876, 0.058882828801870346, 0.05863616615533829, 0.05030940845608711] 0.05863616615533829 0.057402908056974414
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.06559525430202484, 0.04588118568062782, 0.06181681528687477, 0.06266765296459198, 0.045842092484235764] 0.06181681528687477 0.05636060014367104
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.062031447887420654, 0.05466155707836151, 0.04636817052960396, 0.051338955760002136, 0.052864573895931244] 0.052864573895931244 0.0534529410302639
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.0638841763138771, 0.05140890181064606, 0.054602351039648056, 0.05610606446862221, 0.050046686083078384] 0.054602351039648056 0.055209635943174365

# MIDLAYER 12
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.06518840789794922, 0.06324437260627747, 0.056476250290870667, 0.07012335956096649, 0.053130973130464554] 0.06324437260627747 0.06163267269730568
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.07025410979986191, 0.04925870895385742, 0.05584905669093132, 0.053993355482816696, 0.058355122804641724] 0.05584905669093132 0.057542070746421814
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.06383845955133438, 0.05137746408581734, 0.049238625913858414, 0.05201975628733635, 0.049674857407808304] 0.05137746408581734 0.05322983264923096
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.06984138488769531, 0.047495946288108826, 0.05617126077413559, 0.052930884063243866, 0.051594849675893784] 0.052930884063243866 0.05560686513781547
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.07571060955524445, 0.05503875017166138, 0.05336594954133034, 0.059144191443920135, 0.049123212695121765] 0.05503875017166138 0.058476542681455614

# MIDLAYER 24
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.06454502046108246, 0.04716330021619797, 0.05858182534575462, 0.05355722829699516, 0.049986228346824646] 0.05355722829699516 0.05476672053337097
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.06531694531440735, 0.045436955988407135, 0.059730321168899536, 0.05764250457286835, 0.1376320719718933] 0.059730321168899536 0.07315175980329514
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.07911954820156097, 0.05356360971927643, 0.05369555205106735, 0.15020723640918732, 0.04875458776950836] 0.05369555205106735 0.07706810683012008
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.07241515815258026, 0.14234702289104462, 0.06845612078905106, 0.052341461181640625, 0.06812967360019684] 0.06845612078905106 0.08073788732290268
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.05936623364686966, 0.0962679460644722, 0.06711787730455399, 0.1498887985944748, 0.04821263626217842] 0.06711787730455399 0.08417069837450981


# MIDLAYER 48
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.07207576185464859, 0.06664641946554184, 0.0653189942240715, 0.06142018735408783, 0.054214026778936386] 0.0653189942240715 0.06393507793545723
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.07160342484712601, 0.04879377409815788, 0.05592978745698929, 0.1511913686990738, 0.05646868795156479] 0.05646868795156479 0.07679740861058235
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.07211726158857346, 0.05931711941957474, 0.04967755079269409, 0.05973115935921669, 0.05038229748606682] 0.05931711941957474 0.05824507772922516
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.1} [0.06810101866722107, 0.05706861615180969, 0.06595220416784286, 0.048007313162088394, 0.057626355439424515] 0.057626355439424515 0.05935110151767731
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.0755169615149498, 0.059627968817949295, 0.06216586008667946, 0.07027994096279144, 0.04869387671351433] 0.06216586008667946 0.06325692161917687

# MIDLAYER 96
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.06152065098285675, 0.057252999395132065, 0.07370685786008835, 0.0583922453224659, 0.059187836945056915] 0.059187836945056915 0.06201211810111999
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.06609219312667847, 0.047976233065128326, 0.058343227952718735, 0.05523522570729256, 0.05812292546033859] 0.05812292546033859 0.057153961062431334
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.06144116073846817, 0.04515719786286354, 0.0514652281999588, 0.05252969264984131, 0.04920241981744766] 0.0514652281999588 0.0519591398537159
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.0729515478014946, 0.05304001271724701, 0.058123815804719925, 0.05319353565573692, 0.05316171795129776] 0.05319353565573692 0.05809412598609924

# MIDLAYER 192
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.06979982554912567, 0.06099386885762215, 0.0742906853556633, 0.0672699362039566, 0.05182266607880592] 0.0672699362039566 0.06483539640903473
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.0676109567284584, 0.04469650238752365, 0.0547521747648716, 0.05361950770020485, 0.043778903782367706] 0.05361950770020485 0.05289160907268524
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.07491818070411682, 0.05735928937792778, 0.048223864287137985, 0.06419715285301208, 0.05625413358211517] 0.05735928937792778 0.06019052416086197
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.06873898208141327, 0.05153900012373924, 0.04935310781002045, 0.052255138754844666, 0.1284651905298233] 0.052255138754844666 0.07007028385996819
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.069391168653965, 0.047357410192489624, 0.05249038338661194, 0.07116805016994476, 0.05458672717213631] 0.05458672717213631 0.05899874791502953

# MIDLAYER 384
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.06611419469118118, 0.05409218370914459, 0.06084262952208519, 0.04743494838476181, 0.043441012501716614] 0.05409218370914459 0.054384993761777876
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.07011972367763519, 0.04322606325149536, 0.05507131293416023, 0.06185067072510719, 0.04764164984226227] 0.05507131293416023 0.05558188408613205
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.07215578854084015, 0.04281898960471153, 0.05355256795883179, 0.14978887140750885, 0.05030406266450882] 0.05355256795883179 0.07372405603528023
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.14099563658237457, 0.13644950091838837, 0.0535721480846405, 0.056096725165843964, 0.051422521471977234] 0.056096725165843964 0.08770730644464493
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.0687575414776802, 0.06393096596002579, 0.061236366629600525, 0.05772775411605835, 0.04796048253774643] 0.061236366629600525 0.05992262214422226

# MIDLAYER 768
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.06230446323752403, 0.1386440545320511, 0.05124799534678459, 0.054485201835632324, 0.053950872272253036] 0.054485201835632324 0.07212651744484902
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.06333993375301361, 0.0445093959569931, 0.05998576432466507, 0.04966750741004944, 0.06016889587044716] 0.05998576432466507 0.055534299463033676
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.07341684401035309, 0.04749355837702751, 0.046480435878038406, 0.050744179636240005, 0.045124221593141556] 0.04749355837702751 0.052651847898960116

################### Aspect 1

# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'warmup_ratio': 0.1} [0.05107186734676361, 0.07912354916334152, 0.04587465524673462, 0.06579069048166275, 0.07370642572641373] 0.06579069048166275 0.06311343759298324
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.1415610909461975, 0.059526797384023666, 0.04489831253886223, 0.05882678180932999, 0.06631062924861908] 0.059526797384023666 0.07422472238540649
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.051765017211437225, 0.06288894265890121, 0.04320967569947243, 0.06698088347911835, 0.07290225476026535] 0.06288894265890121 0.05954935476183891
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1} [0.05438518151640892, 0.06760481745004654, 0.04291054233908653, 0.052228737622499466, 0.0739610567688942] 0.05438518151640892 0.05821806713938713
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.049329522997140884, 0.07380255311727524, 0.04255344718694687, 0.06843607872724533, 0.06130211055278778] 0.06130211055278778 0.05908474251627922

### Model

In [None]:
# Width of the intermediate linear layer passed to HierarchFinRoBerta.
MIDLAYER = 96
# Selected hyperparameters for the aspect-1 and aspect-2 fused models.
best_hp_a1 = {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 4, 'warmup_ratio': 0.1}
best_hp_a2 = {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1}

def model_init():
    """Return a new HierarchFinRoBerta whose mid layer has MIDLAYER units.

    The global seed is reset first, so every call starts from the same RNG
    state.
    """
    set_seed(SEED)
    model = HierarchFinRoBerta(midlayer=MIDLAYER)
    return model

# Training configuration for the final train/test run: evaluate after every
# epoch, no logging and no checkpoints; the remaining values come from
# `best_hp_a1` (swap in `best_hp_a2` for the aspect-2 state).
training_args = TrainingArguments("trainer_hierarchfinroberta",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **best_hp_a1)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=test
)

trainer.train()

# Predict once; the output carries both the predictions and the label ids.
test_output = trainer.predict(test)
y_pred = test_output.predictions.reshape(-1)
y_true = test_output.label_ids

print('MSE:', 4 * trainer.evaluate(test)['eval_loss'])
# r2_score expects (y_true, y_pred); R2 is not symmetric in its arguments.
# NOTE(review): this call previously passed (y_pred, y_true), so R2 values
# recorded from earlier runs of this cell are not comparable.
print('R2:', r2_score(y_true, y_pred))

#96 A2 MSE: 0.09156506508588791 R2: 0.5462602636069516
#96 A1 MSE: 0.09040633589029312 R2: 0.43695038134566055

## CV Fusing FinRoBERTa

### Hypertuning

In [None]:
# Number of cross-validation folds and width of the model's mid layer for
# this search.
KFOLDS = 5
MIDLAYER = 192


def my_hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_int``
            (e.g. an Optuna trial).

    Returns:
        Dict with learning rate, epoch count, per-device train batch size and
        warmup ratio.
    """
    space = {}
    space["learning_rate"] = trial.suggest_categorical("learning_rate", [5e-5, 3e-5, 2e-5])
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 2, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [4, 8, 16, 32]
    )
    space["warmup_ratio"] = trial.suggest_categorical("warmup_ratio", [0, 0.1])
    return space

def model_init():
    """Return a new HierarchFinRoBerta whose mid layer has MIDLAYER units.

    The global seed is reset first, so every call starts from the same RNG
    state.
    """
    set_seed(SEED)
    model = HierarchFinRoBerta(midlayer=MIDLAYER)
    return model

def _inner_split(fold_data):
    """Split ``fold_data`` by position into (first 90%, last 10%)."""
    size = len(fold_data)
    cut = int(0.9 * size)
    inner_valid = fold_data.get_subset(list(range(cut, size)))
    inner_train = fold_data.get_subset(list(range(0, cut)))
    return inner_train, inner_valid


# Base training configuration for the search: evaluate after every epoch, no
# logging and no checkpoints. The searched values are supplied per trial.
training_args = TrainingArguments("trainer_hyperhierarchfinroberta",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no')

best_runs = []

kfold=KFold(n_splits=KFOLDS, shuffle=True, random_state=SEED)

# Stage 1: run a separate hyperparameter search on every fold and keep the
# best run of each. The fold's own held-out part is left untouched (it serves
# as test data); validation data is carved out of the fold's training part.
for train_idx, _ in kfold.split(train):
    # use first 90% as train and last 10% as valid
    fold_train, fold_valid = _inner_split(train.get_subset(train_idx))

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=fold_train,
        eval_dataset=fold_valid
    )

    best_run = trainer.hyperparameter_search(
        direction="minimize",
        hp_space=my_hp_space,
        n_trials = 10
    )
    best_runs.append(best_run)


# Stage 2: score every distinct winning configuration on all folds, using the
# same inner 90/10 split.
hp_performance = {}
for run in best_runs:
    fold_hp = run.hyperparameters
    hp_key = str(fold_hp)

    # Several folds can select the same configuration; evaluate it only once.
    if hp_key in hp_performance:
        continue

    training_args = TrainingArguments("trainer_aspect2_hyper",
                                      evaluation_strategy = 'epoch',
                                      logging_strategy = 'no',
                                      log_on_each_node = False,
                                      save_strategy = 'no',
                                      **fold_hp)

    hp_performance[hp_key] = []
    for train_idx, _ in kfold.split(train):
        fold_train, fold_valid = _inner_split(train.get_subset(train_idx))

        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=fold_train,
            eval_dataset=fold_valid,
        )

        trainer.train()
        # The evaluation loss is multiplied by four before it is stored.
        hp_performance[hp_key].append(4 * trainer.evaluate(fold_valid)['eval_loss'])

print('BEST RUNS', best_runs)
for k, v in hp_performance.items():
    print('HYPERPARAMETER PERFORMANCE:', k, v, np.median(v), np.mean(v))

## Midlayer 96
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'warmup_ratio': 0} [0.05812760815024376, 0.05992165580391884, 0.06937055289745331, 0.05579930171370506, 0.07339829951524734] 0.05992165580391884 0.06332348361611366
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.06785283237695694, 0.04221581295132637, 0.04742591455578804, 0.055907607078552246, 0.05133869871497154] 0.05133869871497154 0.05294817313551903
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.06735347956418991, 0.05680914223194122, 0.05233849585056305, 0.04614022746682167, 0.04406413808465004] 0.05233849585056305 0.05334109663963318
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.11478505283594131, 0.05992785468697548, 0.04924999177455902, 0.04192931950092316, 0.11667902767658234] 0.05992785468697548 0.07651424929499626
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.054569076746702194, 0.054410792887210846, 0.10497128963470459, 0.04388121888041496, 0.046076249331235886] 0.054410792887210846 0.06078172549605369

## Midlayer 192
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.060317911207675934, 0.10273847728967667, 0.06227489933371544, 0.06927239149808884, 0.10685722529888153] 0.06927239149808884 0.08029218092560768
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.05504140630364418, 0.058704353868961334, 0.10132472217082977, 0.04814065620303154, 0.05301426723599434] 0.05504140630364418 0.06324508115649223
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 5e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.07506703585386276, 0.10653567314147949, 0.04743075743317604, 0.061564281582832336, 0.11328081786632538] 0.07506703585386276 0.0807757131755352
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.06125420704483986, 0.05550108104944229, 0.053696099668741226, 0.03821159526705742, 0.07462887465953827] 0.05550108104944229 0.056658371537923816
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.08425376564264297, 0.060196295380592346, 0.04777076095342636, 0.06615117192268372, 0.05669728294014931] 0.060196295380592346 0.06301385536789894

## Midlayer 768
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 4, 'warmup_ratio': 0} [0.0535794273018837, 0.07436546683311462, 0.09868229180574417, 0.038192324340343475, 0.05600828304886818] 0.05600828304886818 0.06416555866599083
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.06118221580982208, 0.047577355057001114, 0.05475705862045288, 0.05595185607671738, 0.05688818544149399] 0.05595185607671738 0.05527133420109749
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'warmup_ratio': 0.1} [0.05960104614496231, 0.07025128602981567, 0.041488077491521835, 0.056888364255428314, 0.051232244819402695] 0.056888364255428314 0.05589220374822616
# HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'warmup_ratio': 0.1} [0.059650544077157974, 0.06368260085582733, 0.047790560871362686, 0.04538740590214729, 0.07965012639760971] 0.059650544077157974 0.059232247620821
## HYPERPARAMETER PERFORMANCE: {'learning_rate': 2e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'warmup_ratio': 0} [0.06114126369357109, 0.05618518963456154, 0.0517575740814209, 0.03321797400712967, 0.052398812025785446] 0.052398812025785446 0.05094016268849373

### Evaluation

In [None]:
# Width of the model's mid layer and the hyperparameters for the evaluation
# run.
MIDLAYER = 96
best_hp = dict(
    learning_rate=2e-05,
    num_train_epochs=4,
    per_device_train_batch_size=4,
    warmup_ratio=0,
)  # midlayer 96 MSE: 0.05490046490686365 R2: 0.6612348585964942


# Normalization switches for the RoBERTa pipeline (casing is kept).
normalize_config = dict(
    make_lower=False,
    remove_url=True,
    remove_twitter_usernames=True,
    remove_hashtags=True,
    remove_numbers=True,
    remove_punctuation=True,
    remove_whitespaces=True,
    remove_dollar=False,
)

# dh = DataHandler(tokenizer_version='bert-base-uncased', tokenizer_model='bert')
dh = DataHandler(tokenizer_version='roberta-base', tokenizer_model='roberta')

dh.normalize_text(normalize_config)

# NOTE(review): the evaluation loop later in this cell splits the existing
# `train` object and does not read `folds` or `df` -- confirm whether this
# call is still needed.
folds, df = dh.get_dataset(
    'sentiment',
    kfolds=5,
    phase='trainvalid',
    classification=False,
)

def model_init():
    """Return a new HierarchFinRoBerta whose mid layer has MIDLAYER units.

    The global seed is reset first, so every call starts from the same RNG
    state.
    """
    set_seed(SEED)
    model = HierarchFinRoBerta(midlayer=MIDLAYER)
    return model

# Training configuration for the cross-validated evaluation: evaluate after
# every epoch, no logging and no checkpoints; everything else comes from
# `best_hp`.
training_args = TrainingArguments("trainer_polarity_hyper",
                                  evaluation_strategy = 'epoch',
                                  logging_strategy = 'no',
                                  log_on_each_node = False,
                                  save_strategy = 'no',
                                  **best_hp)

# Out-of-fold targets and predictions, accumulated over all folds.
trues = np.array([])
predicts = np.array([])

kfold=KFold(n_splits=KFOLDS, shuffle=True, random_state=SEED)

for train_idx, valid_idx in kfold.split(train):
    fold_train = train.get_subset(train_idx)
    fold_valid = train.get_subset(valid_idx)

    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=fold_train,
        eval_dataset=fold_valid
    )

    trainer.train()

    # Run inference once per fold: the prediction output already carries both
    # the predictions and the label ids, so a second predict() call would only
    # repeat the same forward passes.
    fold_output = trainer.predict(fold_valid)
    predicts = np.append(predicts, fold_output.predictions.reshape(-1))
    trues = np.append(trues, fold_output.label_ids)

# Multiply by four because the scale is twice as big in the original and we
# take squares (2^2 = 4).
print("MSE:", 4 * mean_squared_error(trues, predicts), "R2:", r2_score(trues, predicts))

# Normalization research
Researching which normalization techniques have the greatest impact on stock tickers, in order to support the theories.

In [None]:
# Baseline normalization: the dollar sign is kept, so the "$" + letter pattern
# is still present in the text when the labels are derived.
initial_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':False
}

dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-uncased')
# dh = DataHandler(tokenizer_model='bert',  tokenizer_version='bert-base-cased')
# dh = DataHandler(tokenizer_model='roberta',  tokenizer_version='roberta-base')
dh.normalize_text(initial_config)

train, test, df = dh.get_dataset('aspect_2', kfolds=None, phase='traintest')

# Label whether the sentence contains a ticker ("$" directly followed by a
# letter). `str.contains` searches the whole string; `str.match` only accepts
# a match at the very start of the text and would label sentences with a
# ticker further in as 0.
containsticker = df['text'].str.contains(r'[$][a-zA-Z]', regex=True).astype(int)

# The normalization under test; edit these switches to reproduce the
# individual experiments.
normalizing_config = {
    'make_lower':False,
    'remove_url':True,
    'remove_twitter_usernames':True,
    'remove_hashtags':True,
    'remove_numbers':True,
    'remove_punctuation':True,
    'remove_whitespaces':True,
    'remove_dollar':True
}
dh.normalize_text(normalizing_config)

# Attach the labels derived from the baseline text and rebuild the datasets
# with them as the target.
dh.df['containsticker'] = containsticker
train, test, df = dh.get_dataset('containsticker', kfolds=None, phase='traintest')


label_dict = dh.label_dict

# Evaluate and log once per epoch; no checkpoints are written.
training_args = TrainingArguments('trainer_ticker',
                                  evaluation_strategy='epoch',
                                  logging_strategy='epoch',
                                  save_strategy='no')

# F1 metric object used by compute_f1.
metric = load_metric("f1")


def compute_f1(eval_pred):
    """Return the weighted F1 score for a (logits, labels) pair.

    The predicted class is the argmax over the last axis of the logits.
    NOTE(review): the Trainer built in this cell is not given this function
    via ``compute_metrics``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references, average='weighted')

def model_init():
    """Return a new sequence classifier with one output per entry in label_dict.

    The global seed is reset first, so every call starts from the same RNG
    state. The commented lines are the alternative checkpoints for the other
    experiments.
    """
    set_seed(SEED)
    n_labels = len(label_dict)
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=n_labels)
    # model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=n_labels)
    # model = BertForSequenceClassification.from_pretrained("models/language_model/TRC2", num_labels=n_labels)
    # model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=n_labels)
    return model

# Train the ticker classifier on the train split and evaluate on the test
# split.
trainer = Trainer(
    train_dataset=train,
    eval_dataset=test,
    args=training_args,
    model_init=model_init,
)
trainer.train()

preds = trainer.predict(test)
y_true = preds.label_ids
# Predicted class per example: index of the largest value in its prediction row.
y_pred = [np.argmax(row) for row in preds.predictions]
# Last expression of the cell, so the notebook displays the F1 score.
metrics.f1_score(y_true, y_pred)

# BERT
# no normalization: 1
# Remove casing: 1
# Remove dollar: 0.9160305343511451
# Remove dollar and casing: 0.8085106382978724

# RoBERTa
# no normalization: 1
# Remove casing: 1
# Remove dollar: 0.9253731343283583
# Remove dollar and casing: 0.8840579710144927