# FinBERT Notebook

This notebook shows how to train and use the FinBERT pre-trained language model for financial sentiment analysis.

## Modules 

In [1]:
from pathlib import Path
import shutil
import os
import logging
import sys
sys.path.append('..')

from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report

from transformers import AutoModelForSequenceClassification

from finbert.finbert import *
import finbert.utils as tools

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Parent of the current working directory; every model/data path below is built from it.
project_dir = Path.cwd().parent
# Do not truncate long text columns when displaying DataFrames.
# `None` means "no limit"; the previous `-1` sentinel is deprecated by pandas.
pd.set_option('display.max_colwidth', None)

  pd.set_option('max_colwidth', -1)


In [2]:
# Configure the root logger: timestamped records, threshold set to ERROR.
logging.basicConfig(
    level=logging.ERROR,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

## Prepare the model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# The tokenizer and the sequence classifier are loaded from the same pretrained identifier.
finbert_checkpoint = "ProsusAI/finbert"
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)

In [10]:
# Working locations, all relative to project_dir.
# cl_path is later passed to Config as model_dir and cl_data_path as data_dir.
lm_path = project_dir.joinpath('models', 'TRC2')
cl_path = project_dir.joinpath('models', 'sentiment')
cl_data_path = project_dir.joinpath('data', 'sentiment_data')

###  Configuring training parameters

You can find explanations of the training parameters in the class docstrings.

In [13]:
# Clean the cl_path: remove the directory tree left by a previous run, if any.
try:
    shutil.rmtree(cl_path)
except FileNotFoundError:
    # Nothing to remove (e.g. first run) - safe to continue.
    pass
except OSError as err:
    # Stay best-effort, but report the failure instead of silently hiding it.
    logging.getLogger(__name__).error("Could not remove %s: %s", cl_path, err)

# Classifier instance handed to Config as `bert_model`.
bertmodel = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Training parameters; see the Config class docstring for the meaning of each one.
config = Config(
    # locations and model
    data_dir=cl_data_path,
    model_dir=cl_path,
    bert_model=bertmodel,
    # training parameters
    num_train_epochs=4,
    train_batch_size=32,
    max_seq_length=48,
    learning_rate=2e-5,
    warm_up_proportion=0.2,
    output_mode='classification',
    local_rank=-1,
    # fine-tuning switches
    discriminate=True,
    gradual_unfreeze=True,
)

`FinBert` is the main class that encapsulates all the functionality; it is instantiated below as `finbert`. The list of class labels should be passed to the `prepare_model` method through its `label_list` parameter.

In [14]:
# Build the FinBert wrapper from the configuration defined above.
finbert = FinBert(config)

# Re-assert the two fine-tuning switches on the instance's config
# (same values as were passed to Config).
finbert.config.gradual_unfreeze = True
finbert.config.discriminate = True

# Name of the base model recorded on the instance.
finbert.base_model = 'bert-base-uncased'

In [None]:
# Register the class labels with the model wrapper.
# NOTE(review): the position in label_list presumably fixes each class's integer id
# (the logged output shows "neutral (id = 2)") - confirm before reordering.
finbert.prepare_model(label_list=['positive','negative','neutral'])

### Training

In [10]:
# Build the training examples explicitly so this cell does not depend on a
# `train_data` variable left over from an earlier kernel session (no preceding
# cell defines it, so Restart & Run All would raise NameError).
# NOTE(review): assumes FinBert.get_data('train') loads the training split from
# config.data_dir - confirm against the finbert package.
train_data = finbert.get_data('train')

# Fine-tune the classifier loaded earlier; the returned model is used for evaluation below.
trained_model = finbert.train(train_examples = train_data, model = model)

12/24/2020 12:45:23 - INFO - finbert.utils -   *** Example ***
12/24/2020 12:45:23 - INFO - finbert.utils -   guid: train-1
12/24/2020 12:45:23 - INFO - finbert.utils -   tokens: [CLS] after the reporting period , bio ##tie north american licensing partner so ##max ##on pharmaceuticals announced positive results with na ##lm ##efe ##ne in a pilot phase 2 clinical trial for smoking ce ##ssa ##tion [SEP]
12/24/2020 12:45:23 - INFO - finbert.utils -   input_ids: 101 2044 1996 7316 2558 1010 16012 9515 2167 2137 13202 4256 2061 17848 2239 24797 2623 3893 3463 2007 6583 13728 27235 2638 1999 1037 4405 4403 1016 6612 3979 2005 9422 8292 11488 3508 102 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:23 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:23 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=109.0), HTML(value='')))

12/24/2020 12:45:28 - INFO - finbert.utils -   *** Example ***
12/24/2020 12:45:28 - INFO - finbert.utils -   guid: validation-1
12/24/2020 12:45:28 - INFO - finbert.utils -   tokens: [CLS] our in - depth expertise extends to the fields of energy , industry , urban & mobility and water & environment [SEP]
12/24/2020 12:45:28 - INFO - finbert.utils -   input_ids: 101 2256 1999 1011 5995 11532 8908 2000 1996 4249 1997 2943 1010 3068 1010 3923 1004 12969 1998 2300 1004 4044 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:28 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:28 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:28 - INFO - finbert.utils -   label: neutral (id = 2)
12/24/2020 12:45:28 - INFO - finbert.finbert -   ***** Loading data *****
12/24/2




HBox(children=(HTML(value='Validating'), FloatProgress(value=0.0, max=13.0), HTML(value='')))


Validation losses: [0.8303732826159551]
No best model found


Epoch:  25%|██▌       | 1/4 [00:05<00:16,  5.65s/it]

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=109.0), HTML(value='')))

12/24/2020 12:45:37 - INFO - finbert.utils -   *** Example ***
12/24/2020 12:45:37 - INFO - finbert.utils -   guid: validation-1
12/24/2020 12:45:37 - INFO - finbert.utils -   tokens: [CLS] our in - depth expertise extends to the fields of energy , industry , urban & mobility and water & environment [SEP]
12/24/2020 12:45:37 - INFO - finbert.utils -   input_ids: 101 2256 1999 1011 5995 11532 8908 2000 1996 4249 1997 2943 1010 3068 1010 3923 1004 12969 1998 2300 1004 4044 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:37 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:37 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:37 - INFO - finbert.utils -   label: neutral (id = 2)
12/24/2020 12:45:38 - INFO - finbert.finbert -   ***** Loading data *****
12/24/2




HBox(children=(HTML(value='Validating'), FloatProgress(value=0.0, max=13.0), HTML(value='')))


Validation losses: [0.8303732826159551, 0.4003512469621805]


Epoch:  50%|█████     | 2/4 [00:14<00:13,  6.69s/it]

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=109.0), HTML(value='')))

12/24/2020 12:45:50 - INFO - finbert.utils -   *** Example ***
12/24/2020 12:45:50 - INFO - finbert.utils -   guid: validation-1
12/24/2020 12:45:50 - INFO - finbert.utils -   tokens: [CLS] our in - depth expertise extends to the fields of energy , industry , urban & mobility and water & environment [SEP]
12/24/2020 12:45:50 - INFO - finbert.utils -   input_ids: 101 2256 1999 1011 5995 11532 8908 2000 1996 4249 1997 2943 1010 3068 1010 3923 1004 12969 1998 2300 1004 4044 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:50 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:50 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:45:50 - INFO - finbert.utils -   label: neutral (id = 2)
12/24/2020 12:45:50 - INFO - finbert.finbert -   ***** Loading data *****
12/24/2




HBox(children=(HTML(value='Validating'), FloatProgress(value=0.0, max=13.0), HTML(value='')))


Validation losses: [0.8303732826159551, 0.4003512469621805, 0.3436752695303697]


Epoch:  75%|███████▌  | 3/4 [00:27<00:08,  8.47s/it]

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=109.0), HTML(value='')))

12/24/2020 12:46:04 - INFO - finbert.utils -   *** Example ***
12/24/2020 12:46:04 - INFO - finbert.utils -   guid: validation-1
12/24/2020 12:46:04 - INFO - finbert.utils -   tokens: [CLS] our in - depth expertise extends to the fields of energy , industry , urban & mobility and water & environment [SEP]
12/24/2020 12:46:04 - INFO - finbert.utils -   input_ids: 101 2256 1999 1011 5995 11532 8908 2000 1996 4249 1997 2943 1010 3068 1010 3923 1004 12969 1998 2300 1004 4044 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:04 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:04 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:04 - INFO - finbert.utils -   label: neutral (id = 2)
12/24/2020 12:46:04 - INFO - finbert.finbert -   ***** Loading data *****
12/24/2




HBox(children=(HTML(value='Validating'), FloatProgress(value=0.0, max=13.0), HTML(value='')))

Epoch: 100%|██████████| 4/4 [00:40<00:00, 10.20s/it]


Validation losses: [0.8303732826159551, 0.4003512469621805, 0.3436752695303697, 0.34499044028612286]





## Test the model

`finbert.evaluate` returns a DataFrame containing the true label and the logit values for each example.

In [11]:
# Read the evaluation set into a DataFrame.
# The path is relative, so it is resolved against the kernel's current working directory
# (not against project_dir).
test_data = pd.read_csv('Final.csv')

In [12]:
# Score the test set with the fine-tuned model; `results` is used below via its
# `labels` and `predictions` columns.
# NOTE(review): `test_data` is the DataFrame read from 'Final.csv' - confirm that
# `evaluate` accepts a DataFrame for `examples`.
results = finbert.evaluate(examples=test_data, model=trained_model)

12/24/2020 12:46:05 - INFO - finbert.utils -   *** Example ***
12/24/2020 12:46:05 - INFO - finbert.utils -   guid: test-1
12/24/2020 12:46:05 - INFO - finbert.utils -   tokens: [CLS] the bristol port company has sealed a one million pound contract with cooper specialised handling to supply it with four 45 - ton ##ne , custom ##ised reach stack ##ers from ko ##ne ##cr ##ane ##s [SEP]
12/24/2020 12:46:05 - INFO - finbert.utils -   input_ids: 101 1996 7067 3417 2194 2038 10203 1037 2028 2454 9044 3206 2007 6201 17009 8304 2000 4425 2009 2007 2176 3429 1011 10228 2638 1010 7661 5084 3362 9991 2545 2013 12849 2638 26775 7231 2015 102 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:05 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:05 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:05 - IN

HBox(children=(HTML(value='Testing'), FloatProgress(value=0.0, max=31.0), HTML(value='')))




### Prepare the classification report

In [13]:
def report(df, cols=('label', 'prediction', 'logits')):
    """Print the loss, accuracy and classification report for a results frame.

    Parameters
    ----------
    df : DataFrame
        Must contain the three columns named in `cols`.
    cols : sequence of str
        Column names, in order: true labels, predicted labels, and the
        per-example values fed to the loss as its input.

    Returns
    -------
    None. Everything is printed.

    Notes
    -----
    Reads the class weights from the notebook-level `finbert` object, so that
    object must already exist when this function is called.
    """
    # Index rather than unpack so longer sequences are still accepted, as before.
    label_col, pred_col, logit_col = cols[0], cols[1], cols[2]

    loss_fn = CrossEntropyLoss(weight=finbert.class_weights)
    loss = loss_fn(torch.tensor(list(df[logit_col])), torch.tensor(list(df[label_col])))
    print("Loss:{0:.2f}".format(loss))

    # Fraction of rows where the predicted label equals the true label.
    accuracy = (df[label_col] == df[pred_col]).sum() / df.shape[0]
    print("Accuracy:{0:.2f}".format(accuracy))

    print("\nClassification Report:")
    print(classification_report(df[label_col], df[pred_col]))

In [14]:
def _predicted_class(scores):
    """Return the index of the maximum along axis 0 of one row's `predictions` entry."""
    return np.argmax(scores, axis=0)


# Add a `prediction` column derived row-by-row from the `predictions` column.
results['prediction'] = results['predictions'].apply(_predicted_class)

In [15]:
# Map report's (true labels, predicted labels, loss inputs) slots onto the
# corresponding columns of `results`.
report(results,cols=['labels','prediction','predictions'])

Loss:0.39
Accuracy:0.83

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.87      0.78       267
           1       0.78      0.89      0.83       128
           2       0.92      0.80      0.85       575

    accuracy                           0.83       970
   macro avg       0.80      0.85      0.82       970
weighted avg       0.84      0.83      0.83       970



### Get predictions

In [17]:
# Point cl_path at the saved classifier directory and load a 3-label model from it.
# Note: this rebinds both `cl_path` and `model` from their earlier values.
cl_path = project_dir.joinpath('models', 'classifier_model', 'finbert-sentiment')
model = AutoModelForSequenceClassification.from_pretrained(
    cl_path,
    num_labels=3,
    cache_dir=None,
)

In [19]:
# Run inference with the model loaded from cl_path.
# NOTE(review): `test_data` is the DataFrame read from 'Final.csv' - confirm that
# `predict` accepts a DataFrame rather than raw text.
result = predict(test_data,model)

12/24/2020 12:46:11 - INFO - finbert.utils -   *** Example ***
12/24/2020 12:46:11 - INFO - finbert.utils -   guid: 0
12/24/2020 12:46:11 - INFO - finbert.utils -   tokens: [CLS] later that day apple said it was rev ##ising down its earnings expectations in the fourth quarter of 2018 , largely because of lower sales and signs of economic weakness in china . [SEP]
12/24/2020 12:46:11 - INFO - finbert.utils -   input_ids: 101 2101 2008 2154 6207 2056 2009 2001 7065 9355 2091 2049 16565 10908 1999 1996 2959 4284 1997 2760 1010 4321 2138 1997 2896 4341 1998 5751 1997 3171 11251 1999 2859 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:11 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12/24/2020 12:46:11 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 