# 0. Configuration

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')

In [3]:
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_logical_device_configuration(gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=4096)])

In [4]:
import pandas as pd
import numpy as np
import torch
import json

from transformers import AutoTokenizer, AutoConfig, AutoModel
from transformers import AutoModelForSequenceClassification
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.losses import cosine_similarity
from tensorflow import keras
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from pprint import pprint
from tensorflow.keras.callbacks import ModelCheckpoint

2022-06-07 11:11:55.644155: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-06-07 11:11:56.355648: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4096 MB memory:  -> device: 0, name: NVIDIA TITAN Xp, pci bus id: 0000:a6:00.0, compute capability: 6.1


# 1. Data and Model Loading

## 1-1. Data Loading

### FinPhrase

In [5]:
data_fpath = './data/FinancialPhraseBank-v1.0/Sentences_50Agree.txt'

In [6]:
with open(data_fpath, 'rb') as file:
    data = file.read()
data = data.decode('utf-8', 'ignore')
data = data.split('\r\n')[:-1]
print('Number of total data: %d\n' % len(data))
print('Data examples:')
print(data[:2])

Number of total data: 4846

Data examples:
['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral', 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .@neutral']


In [7]:
titles = [line.split('@')[0] for line in data]
labels = [line.split('@')[1] for line in data]
print('Title examples:')
print(titles[:2])
print('\nLabel examples')
print(labels[:2])

Title examples:
['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .', 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .']

Label examples
['neutral', 'neutral']


In [8]:
label_list = ['positive', 'negative', 'neutral']
Y = tf.keras.layers.StringLookup(vocabulary=label_list,
                                 num_oov_indices=0, output_mode='one_hot')(labels)

  return bool(asarray(a1 == a2).all())


### SemEval 2017 Task 5

In [9]:
# data_fpath = './data/Headline_Trainingdata.json'

In [10]:
# with open(data_fpath, 'r', encoding='utf-8') as file:
#     data = json.load(file)

# print('Number of total data: %d\n' % len(data))
# print('Data examples:')
# pprint(data[:2])

In [11]:
# ids = []
# companies = []
# titles = []
# sentiments = []
# for i in range(len(data)):
#     ids.append(data[i]['id'])
#     companies.append(data[i]['company'])
#     titles.append(data[i]['title'])
#     sentiments.append(data[i]['sentiment'])

## 1-2. FinBERT Model Loading

In [16]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
config = AutoConfig.from_pretrained('ProsusAI/finbert',
                                    output_hidden_states=True,
                                    output_attentions=True)
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert", config=config)

Some weights of the model checkpoint at ProsusAI/finbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 1-3. Vectorization to Tensorflow

In [13]:
encoded_input = tokenizer(titles, padding=True, return_tensors='pt')

In [14]:
%%time
with torch.no_grad():
    outputs = model(**encoded_input)

CPU times: user 1h 3min 5s, sys: 17min 54s, total: 1h 21min
Wall time: 2min 31s


# 2. Experiments

## 2-1. FinBERT's Classification Performance

In [41]:
Y_pred = tf.math.argmax(tf.convert_to_tensor(outputs.logits), axis=1)
Y_true = tf.math.argmax(Y, axis=1)

In [58]:
acc = tf.math.reduce_sum(tf.cast((Y_pred == Y_true), tf.float32)) / len(Y_true)
print('Number of examples: %d' % len(Y_true))
print('Accuracy of FinBERT on Financial PhraseBank: %.4f' % acc)

Number of examples: 4846
Accuracy of FinBERT on Financial PhraseBank: 0.8894


## 2-2. Layer-wise Classification Performance

### Model build

In [86]:
probe_model = keras.Sequential()
probe_model.add(keras.Input(shape=(outputs.hidden_states[-1].shape[-1])))
probe_model.add(Dense(3, activation='softmax', name='output_layer'))
print((probe_model.summary()))

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 output_layer (Dense)        (None, 3)                 2307      
                                                                 
Total params: 2,307
Trainable params: 2,307
Non-trainable params: 0
_________________________________________________________________
None


In [89]:
probe_model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

In [90]:
probe_model.save_weights('probe_model_init.h5')

### Training Layer-wise Classifiers

In [80]:
exp_name = '20220603_finbert_sentiment_layer_probe'

In [91]:
num_layer = len(outputs.hidden_states)
print('Results:')
print('Layer\t| train_acc\t| epoch\t\t| val_acc\t| epoch')
for i in range(num_layer):
    X = tf.convert_to_tensor(outputs.hidden_states[i])[:, 0, :]
    probe_model.load_weights('probe_model_init.h5')
    
    ckp_path = exp_name + 'layer%2d' % i + '/cp-{epoch:04d}.ckpt'
    cp_callback = ModelCheckpoint(filepath=ckp_path, verbose=False,
                                  save_weights_only=True, save_freq=100)
    probe_model.save_weights(ckp_path.format(epoch=0))
    history = probe_model.fit(X, Y, epochs=1000, batch_size=batch_size, verbose=False,
                              validation_split=0.3, callbacks=[cp_callback])

#     train_loss = np.min(np.array(history.history['loss']))
#     val_loss = np.min(np.array(history.history['val_loss']))
    train_acc = np.max(np.array(history.history['accuracy']))
    train_epoch = np.argmax(np.array(history.history['accuracy']))
    val_acc = np.max(np.array(history.history['val_accuracy']))
    val_epoch = np.argmax(np.array(history.history['val_accuracy']))
    print('%d\t| %.4f\t| %d\t\t| %.4f\t| %d' % (i, train_acc, train_epoch, val_acc, val_epoch))

Results:
Layer	| train_acc	| epoch		| val_acc	| epoch
0	| 0.5967	| 2		| 0.5880	| 1
1	| 0.7099	| 988		| 0.5887	| 22
2	| 0.7597	| 990		| 0.5880	| 2
3	| 0.7556	| 996		| 0.5887	| 23
4	| 0.7771	| 981		| 0.5880	| 1
5	| 0.7916	| 998		| 0.5880	| 1
6	| 0.8305	| 998		| 0.5880	| 1
7	| 0.8491	| 974		| 0.5825	| 6
8	| 0.8488	| 973		| 0.5880	| 7
9	| 0.8585	| 975		| 0.6032	| 998
10	| 0.8906	| 993		| 0.7772	| 996
11	| 0.9287	| 995		| 0.8817	| 27
12	| 0.9452	| 997		| 0.9099	| 22


## 2-1. Layer-wise Classification - Non-linear Classifier

In [94]:
probe_model = keras.Sequential()
probe_model.add(keras.Input(shape=(outputs.hidden_states[-1].shape[-1])))
probe_model.add(Dense(256, activation='gelu', name='dense_layer'))
probe_model.add(Dropout(0.1, name='dropout_layer'))
probe_model.add(Dense(3, activation='softmax', name='output_layer'))
print((probe_model.summary()))

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_layer (Dense)         (None, 256)               196864    
                                                                 
 dropout_layer (Dropout)     (None, 256)               0         
                                                                 
 output_layer (Dense)        (None, 3)                 771       
                                                                 
Total params: 197,635
Trainable params: 197,635
Non-trainable params: 0
_________________________________________________________________
None


In [95]:
probe_model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

In [96]:
probe_model.save_weights('probe_model_init.h5')

### Training Layer-wise Classifiers

In [97]:
exp_name = '20220603_finbert_sentiment_layer_probe_nonlinear'

In [98]:
num_layer = len(outputs.hidden_states)
print('Results:')
print('Layer\t| train_acc\t| epoch\t\t| val_acc\t| epoch')
for i in range(num_layer):
    X = tf.convert_to_tensor(outputs.hidden_states[i])[:, 0, :]
    probe_model.load_weights('probe_model_init.h5')
    
    ckp_path = exp_name + 'layer%2d' % i + '/cp-{epoch:04d}.ckpt'
    cp_callback = ModelCheckpoint(filepath=ckp_path, verbose=False,
                                  save_weights_only=True, save_freq=100)
    probe_model.save_weights(ckp_path.format(epoch=0))
    history = probe_model.fit(X, Y, epochs=1000, batch_size=batch_size, verbose=False,
                              validation_split=0.3, callbacks=[cp_callback])

#     train_loss = np.min(np.array(history.history['loss']))
#     val_loss = np.min(np.array(history.history['val_loss']))
    train_acc = np.max(np.array(history.history['accuracy']))
    train_epoch = np.argmax(np.array(history.history['accuracy']))
    val_acc = np.max(np.array(history.history['val_accuracy']))
    val_epoch = np.argmax(np.array(history.history['val_accuracy']))
    print('%d\t| %.4f\t| %d\t\t| %.4f\t| %d' % (i, train_acc, train_epoch, val_acc, val_epoch))

Results:
Layer	| train_acc	| epoch		| val_acc	| epoch
0	| 0.5976	| 73		| 0.5880	| 0
1	| 0.7703	| 996		| 0.5880	| 0
2	| 0.8744	| 998		| 0.5880	| 0
3	| 0.8417	| 991		| 0.5894	| 23
4	| 0.8930	| 997		| 0.5887	| 25
5	| 0.9458	| 997		| 0.5880	| 0
6	| 0.9782	| 977		| 0.6066	| 730
7	| 0.9976	| 932		| 0.6279	| 270
8	| 0.9947	| 987		| 0.6369	| 492
9	| 0.9985	| 967		| 0.6726	| 376
10	| 0.9994	| 858		| 0.7978	| 250
11	| 0.9997	| 641		| 0.8803	| 12
12	| 0.9997	| 691		| 0.9106	| 3
