# 0. Configuration

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_logical_device_configuration(gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=4096)])

In [3]:
import pandas as pd
import numpy as np
import torch
import json

from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import AutoModelForSequenceClassification
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.losses import cosine_similarity
from tensorflow import keras
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from pprint import pprint
from tensorflow.keras.callbacks import ModelCheckpoint

2022-06-15 14:42:58.626935: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-15 14:42:59.316257: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4096 MB memory:  -> device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:17:00.0, compute capability: 6.1


# 1. Data and Model Loading

## 1-1. Data Loading

### FinPhrase

In [4]:
data_fpath = './data/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt'

In [5]:
with open(data_fpath, 'rb') as file:
    data = file.read()
data = data.decode('utf-8', 'ignore')
data = data.split('\r\n')[:-1]
print('Number of total data: %d\n' % len(data))
print('Data examples:')
print(data[:2])

Number of total data: 2264

Data examples:
['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral', "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .@positive"]


In [6]:
titles = [line.split('@')[0] for line in data]
labels = [line.split('@')[1] for line in data]
print('Title examples:')
print(titles[:2])
print('\nLabel examples')
print(labels[:2])

Title examples:
['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .', "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m ."]

Label examples
['neutral', 'positive']


In [7]:
label_list = ['positive', 'negative', 'neutral']
Y = tf.keras.layers.StringLookup(vocabulary=label_list,
                                 num_oov_indices=0, output_mode='one_hot')(labels)

  return bool(asarray(a1 == a2).all())


### SemEval 2017 Task 5

In [8]:
# data_fpath = './data/Headline_Trainingdata.json'

In [9]:
# with open(data_fpath, 'r', encoding='utf-8') as file:
#     data = json.load(file)

# print('Number of total data: %d\n' % len(data))
# print('Data examples:')
# pprint(data[:2])

In [10]:
# ids = []
# companies = []
# titles = []
# sentiments = []
# for i in range(len(data)):
#     ids.append(data[i]['id'])
#     companies.append(data[i]['company'])
#     titles.append(data[i]['title'])
#     sentiments.append(data[i]['sentiment'])

## 1-2. FinBERT Model Loading

In [11]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
config = AutoConfig.from_pretrained('ProsusAI/finbert',
                                    output_hidden_states=True,
                                    output_attentions=True)
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", config=config)

## 1-3. Vectorization to Tensorflow

In [12]:
encoded_input = tokenizer(titles, padding=True, return_tensors='pt')

In [13]:
%%time
with torch.no_grad():
    outputs = model(**encoded_input)

CPU times: user 28min 2s, sys: 13min 1s, total: 41min 3s
Wall time: 1min 13s


# 2. Probing

In [14]:
import pandas as pd

In [15]:
df = pd.DataFrame({'title': titles, 'label': labels})
df

Unnamed: 0,title,label
0,"According to Gran , the company has no plans t...",neutral
1,"For the last quarter of 2010 , Componenta 's n...",positive
2,"In the third quarter of 2010 , net sales incre...",positive
3,Operating profit rose to EUR 13.1 mn from EUR ...,positive
4,"Operating profit totalled EUR 21.1 mn , up fro...",positive
...,...,...
2259,Operating result for the 12-month period decre...,negative
2260,HELSINKI Thomson Financial - Shares in Cargote...,negative
2261,LONDON MarketWatch -- Share prices ended lower...,negative
2262,Operating profit fell to EUR 35.4 mn from EUR ...,negative


In [16]:
n_sample = 100
prob_idx = df.groupby('label').sample(n_sample, random_state=0).index
prob_idx_neg = prob_idx[:n_sample]
prob_idx_neu = prob_idx[n_sample:n_sample*2]
prob_idx_pos = prob_idx[-n_sample:]

In [17]:
Y_pred = tf.math.argmax(tf.convert_to_tensor(outputs.logits), axis=1)
Y_true = tf.math.argmax(Y, axis=1).numpy()

In [18]:
acc_pos = tf.reduce_sum(tf.cast((Y_pred == Y_true).numpy()[prob_idx_pos], tf.float32))
acc_neg = tf.reduce_sum(tf.cast((Y_pred == Y_true).numpy()[prob_idx_neg], tf.float32))
acc_neu = tf.reduce_sum(tf.cast((Y_pred == Y_true).numpy()[prob_idx_neu], tf.float32))
print('Accuracy by labels:')
print('Positive\t| Negative\t| Neutral')
print('%.4f\t\t| %.4f\t| %.4f' % (acc_pos, acc_neg, acc_neu))

Accuracy by labels:
Positive	| Negative	| Neutral
98.0000		| 99.0000	| 96.0000


In [19]:
print('Positive Sample - Error cases:')
for idx in prob_idx_pos[(Y_pred != Y_true).numpy()[prob_idx_pos]]:
    print(idx)
    print(titles[idx])
    print('predicted as: %d' % Y_pred[idx])
    print()
# [titles[idx] for idx in prob_idx[:n_sample][(Y_pred != Y_true)[-n_sample:]]]

print('-'*20)
print('Negative Sample - Error cases:')
for idx in prob_idx_neg[(Y_pred != Y_true).numpy()[prob_idx_neg]]:
    print(idx)
    print(titles[idx])
    print('predicted as: %d' % Y_pred[idx])
    print()

print('-'*20)
print('Neutral Sample - Error cases:')
for idx in prob_idx_neu[(Y_pred != Y_true).numpy()[prob_idx_neu]]:
    print(idx)
    print(titles[idx])
    print('predicted as: %d' % Y_pred[idx])
    print()

Positive Sample - Error cases:
345
Unit costs for flight operations fell by 6.4 percent .
predicted as: 1

849
The company reports a loss for the period of EUR 0.4 mn compared to a loss of EUR 1.9 mn in the corresponding period in 2005 .
predicted as: 1

--------------------
Negative Sample - Error cases:
371
Finnish power supply solutions and systems provider Efore Oyj said its net loss widened to 3.2 mln euro $ 4.2 mln for the first quarter of fiscal 2006-2007 ending October 31 , 2007 from 900,000 euro $ 1.2 mln for the same period of fiscal 2005-06 .
predicted as: 0

--------------------
Neutral Sample - Error cases:
1108
The sale of Savcor FACE to Cencorp will result in a profit or loss which can not yet be determined , owing to factors including the valuation of the consideration shares to be received and prevailing exchange rates .
predicted as: 1

1231
Fortum expects its annual capital expenditure in the next four to five years to be within a range of EUR 0.8-1 .2 billion , as e

In [25]:
from bertviz import model_view

In [26]:
def model_view_idx(idx):
    input_text = titles[idx]
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model(inputs)
    attention = outputs[-1]
    tokens = tokenizer.convert_ids_to_tokens(inputs[0])
    
    model_view(attention, tokens)

In [27]:
def save_model_view_idx(idx, fname):
    input_text = titles[idx]
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    outputs = model(inputs)
    attention = outputs[-1]
    tokens = tokenizer.convert_ids_to_tokens(inputs[0])
    
    html = model_view(attention, tokens, html_action='return')
    
    with open(fname, 'w') as f:
        f.write(html.data)

In [33]:
save_model_view_idx(1467, '1467.html')

In [115]:
results = []
results_ids = []

for idx in prob_idx[(Y_pred != Y_true).numpy()[prob_idx]]:
    results_ids.append(idx)
    case_result = []
    case_input = tokenizer(titles[idx], return_tensors='pt')
    with torch.no_grad():
        case_output = model(**case_input)
    for layer in case_output.attentions:
        head_result = []
        for head in layer[0]:
            cls_attention = head[0]
            token_id = np.argmax(cls_attention)
            token = tokenizer.convert_ids_to_tokens(case_input.input_ids[0])[token_id]
            head_result.append(token)
        case_result.append(head_result)
    results.append(case_result)

In [129]:
from collections import Counter

In [138]:
print(Counter([token for layer in case for token in layer]))

Counter({'[SEP]': 73, '[CLS]': 37, 'loss': 10, '.': 8, 'compared': 7, 'the': 2, 'reports': 2, '##r': 1, 'for': 1, 'company': 1, 'corresponding': 1, 'mn': 1})


In [139]:
for case, idx in zip(results, results_ids):
    print('Case number %d: %s' % (idx, titles[idx]))
    for i, layer in enumerate(case):
        print('Layer %d:\t%s' % (i, '\t'.join(layer)))
    print(Counter([token for layer in case for token in layer]))
    print()

Case number 371: Finnish power supply solutions and systems provider Efore Oyj said its net loss widened to 3.2 mln euro $ 4.2 mln for the first quarter of fiscal 2006-2007 ending October 31 , 2007 from 900,000 euro $ 1.2 mln for the same period of fiscal 2005-06 .
Layer 0:	[SEP]	[CLS]	[CLS]	[CLS]	[CLS]	.	the	[CLS]	[CLS]	##n	[CLS]	[CLS]
Layer 1:	,	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]
Layer 2:	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]	[SEP]	[CLS]	[CLS]	[CLS]	[CLS]	[CLS]
Layer 3:	[SEP]	[SEP]	finnish	[SEP]	[SEP]	[CLS]	finnish	[CLS]	[CLS]	[CLS]	[SEP]	[CLS]
Layer 4:	[SEP]	finnish	finnish	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[CLS]	[SEP]	[SEP]	[SEP]
Layer 5:	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]
Layer 6:	its	same	.	[SEP]	[SEP]	[CLS]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]
Layer 7:	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	.
Layer 8:	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]	[SEP]
Layer 9:	[SEP]	[SEP]

In [31]:
titles_prob = titles[:100]
labels_prob = labels[:100]

In [34]:
import pandas as pd

In [None]:
pd.DataFrame('label')

In [29]:
tf.reduce_sum(tf.convert_to_tensor(outputs.attentions[0][:100]), axis=-1)

<tf.Tensor: shape=(100, 12, 150), dtype=float32, numpy=
array([[[1.0000001 , 1.        , 1.        , ..., 1.0000001 ,
         1.        , 1.        ],
        [0.99999994, 1.        , 1.        , ..., 1.        ,
         1.        , 1.        ],
        [1.        , 1.        , 0.9999999 , ..., 1.0000001 ,
         1.0000001 , 1.        ],
        ...,
        [1.        , 1.0000001 , 0.99999994, ..., 1.        ,
         1.        , 0.99999994],
        [0.9999999 , 0.99999994, 0.99999994, ..., 1.0000001 ,
         1.        , 1.0000001 ],
        [1.        , 1.        , 0.9999999 , ..., 0.9999998 ,
         1.        , 1.0000002 ]],

       [[1.        , 0.99999994, 0.99999994, ..., 1.        ,
         1.        , 0.99999994],
        [1.        , 1.        , 1.        , ..., 1.        ,
         1.        , 1.        ],
        [1.0000001 , 1.0000001 , 1.        , ..., 0.9999999 ,
         1.0000001 , 0.99999994],
        ...,
        [0.99999994, 1.0000001 , 1.        , ..., 1.

In [21]:
tf.convert_to_tensor(outputs.attentions[0])

2022-06-07 13:10:34.915050: W tensorflow/core/common_runtime/bfc_allocator.cc:479] Allocator (GPU_0_bfc) ran out of memory trying to allocate 4.87GiB (rounded to 5233680128)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2022-06-07 13:10:34.915114: I tensorflow/core/common_runtime/bfc_allocator.cc:1027] BFCAllocator dump for GPU_0_bfc
2022-06-07 13:10:34.915136: I tensorflow/core/common_runtime/bfc_allocator.cc:1034] Bin (256): 	Total Chunks: 7, Chunks in use: 5. 1.8KiB allocated for chunks. 1.2KiB in use in bin. 56B client-requested in use in bin.
2022-06-07 13:10:34.915151: I tensorflow/core/common_runtime/bfc_allocator.cc:1034] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2022-06-07 13:10:34.915166: I tensorflow/core/comm

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.

core/common_runtime/bfc_allocator.cc:1063] Next region of size 4294967296
2022-06-07 13:10:34.915477: I tensorflow/core/common_runtime/bfc_allocator.cc:1083] InUse at 7f8578000000 of size 256 next 1
2022-06-07 13:10:34.915491: I tensorflow/core/common_runtime/bfc_allocator.cc:1083] InUse at 7f8578000100 of size 1280 next 2
2022-06-07 13:10:34.915503: I tensorflow/core/common_runtime/bfc_allocator.cc:1083] InUse at 7f8578000600 of size 256 next 3
2022-06-07 13:10:34.915513: I tensorflow/core/common_runtime/bfc_allocator.cc:1083] InUse at 7f8578000700 of size 256 next 4
2022-06-07 13:10:34.915524: I tensorflow/core/common_runtime/bfc_allocator.cc:1083] Free  at 7f8578000800 of size 256 next 5
2022-06-07 13:10:34.915535: I tensorflow/core/common_runtime/bfc_allocator.cc:1083] InUse at 7f8578000900 of size 256 next 6
2022-06-07 13:10:34.915545: I tensorflow/core/common_runtime/bfc_allocator.cc:1083] Free  at 7f8578000a00 of size 39168 next 8
2022-06-07 13:10:34.915556: I tensorflow/core/co

In [18]:
outputs.hidden_states[0].shape

torch.Size([4846, 150, 768])

## 2-1. FinBERT's Classification Performance

In [None]:
Y_pred = tf.math.argmax(tf.convert_to_tensor(outputs.logits), axis=1)
Y_true = tf.math.argmax(Y, axis=1)

In [None]:
acc = tf.math.reduce_sum(tf.cast((Y_pred == Y_true), tf.float32)) / len(Y_true)
print('Number of examples: %d' % len(Y_true))
print('Accuracy of FinBERT on Financial PhraseBank: %.4f' % acc)

## 2-2. Layer-wise Classification Performance

### Model build

In [None]:
probe_model = keras.Sequential()
probe_model.add(keras.Input(shape=(outputs.hidden_states[-1].shape[-1])))
probe_model.add(Dense(3, activation='softmax', name='output_layer'))
print((probe_model.summary()))

In [None]:
probe_model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

In [None]:
probe_model.save_weights('probe_model_init.h5')

### Training Layer-wise Classifiers

In [None]:
exp_name = '20220603_finbert_sentiment_layer_probe'

In [None]:
num_layer = len(outputs.hidden_states)
print('Results:')
print('Layer\t| train_acc\t| epoch\t\t| val_acc\t| epoch')
for i in range(num_layer):
    X = tf.convert_to_tensor(outputs.hidden_states[i])[:, 0, :]
    probe_model.load_weights('probe_model_init.h5')
    
    ckp_path = exp_name + 'layer%2d' % i + '/cp-{epoch:04d}.ckpt'
    cp_callback = ModelCheckpoint(filepath=ckp_path, verbose=False,
                                  save_weights_only=True, save_freq=100)
    probe_model.save_weights(ckp_path.format(epoch=0))
    history = probe_model.fit(X, Y, epochs=1000, batch_size=batch_size, verbose=False,
                              validation_split=0.3, callbacks=[cp_callback])

#     train_loss = np.min(np.array(history.history['loss']))
#     val_loss = np.min(np.array(history.history['val_loss']))
    train_acc = np.max(np.array(history.history['accuracy']))
    train_epoch = np.argmax(np.array(history.history['accuracy']))
    val_acc = np.max(np.array(history.history['val_accuracy']))
    val_epoch = np.argmax(np.array(history.history['val_accuracy']))
    print('%d\t| %.4f\t| %d\t\t| %.4f\t| %d' % (i, train_acc, train_epoch, val_acc, val_epoch))

## 2-1. Layer-wise Classification - Non-linear Classifier

In [None]:
probe_model = keras.Sequential()
probe_model.add(keras.Input(shape=(outputs.hidden_states[-1].shape[-1])))
probe_model.add(Dense(256, activation='gelu', name='dense_layer'))
probe_model.add(Dropout(0.1, name='dropout_layer'))
probe_model.add(Dense(3, activation='softmax', name='output_layer'))
print((probe_model.summary()))

In [None]:
probe_model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

In [None]:
probe_model.save_weights('probe_model_init.h5')

### Training Layer-wise Classifiers

In [None]:
exp_name = '20220603_finbert_sentiment_layer_probe_nonlinear'

In [None]:
num_layer = len(outputs.hidden_states)
print('Results:')
print('Layer\t| train_acc\t| epoch\t\t| val_acc\t| epoch')
for i in range(num_layer):
    X = tf.convert_to_tensor(outputs.hidden_states[i])[:, 0, :]
    probe_model.load_weights('probe_model_init.h5')
    
    ckp_path = exp_name + 'layer%2d' % i + '/cp-{epoch:04d}.ckpt'
    cp_callback = ModelCheckpoint(filepath=ckp_path, verbose=False,
                                  save_weights_only=True, save_freq=100)
    probe_model.save_weights(ckp_path.format(epoch=0))
    history = probe_model.fit(X, Y, epochs=1000, batch_size=batch_size, verbose=False,
                              validation_split=0.3, callbacks=[cp_callback])

#     train_loss = np.min(np.array(history.history['loss']))
#     val_loss = np.min(np.array(history.history['val_loss']))
    train_acc = np.max(np.array(history.history['accuracy']))
    train_epoch = np.argmax(np.array(history.history['accuracy']))
    val_acc = np.max(np.array(history.history['val_accuracy']))
    val_epoch = np.argmax(np.array(history.history['val_accuracy']))
    print('%d\t| %.4f\t| %d\t\t| %.4f\t| %d' % (i, train_acc, train_epoch, val_acc, val_epoch))