In [1]:
# import list
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import random

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import BertModel, DistilBertTokenizer, DistilBertModel
from transformers import BertTokenizer
from transformers import AutoTokenizer, AutoModelForMaskedLM

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import AlbertModel

from transformers import ElectraModel, ElectraTokenizer
from transformers import XLNetTokenizer, XLNetModel

from datasets import load_dataset

%matplotlib inline

In [None]:
# Pick one of the available configs: ['sentences_allagree', 'sentences_75agree', 'sentences_66agree', 'sentences_50agree']
# Example usage:
# load_dataset('financial_phrasebank', 'sentences_allagree')

In [49]:
# Load the 'sentences_50agree' configuration of the 'financial_phrasebank' dataset.
dataset = load_dataset('financial_phrasebank', 'sentences_50agree')

Found cached dataset financial_phrasebank (C:/Users/user/.cache/huggingface/datasets/financial_phrasebank/sentences_50agree/1.0.0/550bde12e6c30e2674da973a55f57edde5181d53f5a5a34c1531c53f93b7e141)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Label encoding: 0 = negative, 1 = neutral, 2 = positive

In [50]:
# Materialize the training split as a list of [sentence, label] pairs.
data_train = dataset['train']

# Read each column once and pair the values up. Indexing
# `data_train['sentence'][i]` inside a loop fetches the whole column again on
# every iteration, which is needlessly quadratic.
data_list = [
    [sentence, label_id]
    for sentence, label_id in zip(data_train['sentence'], data_train['label'])
]

In [51]:
len(data_list )

4846

In [6]:
## Preprocessing: adapted from other code / papers

In [52]:
# Tabulate the [text, label] pairs and show the relative class frequencies.
df_ = pd.DataFrame(data_list, columns=['text', 'label'])
df_['label'].value_counts(normalize=True)

1    0.594098
2    0.281263
0    0.124639
Name: label, dtype: float64

In [53]:
## Keep the cleaned sentences and their labels in two parallel lists.
text = []
label = []

for raw_sentence, raw_label in data_list:
    cleaned = text_preprocessing(raw_sentence)
    # Re-join the words, leaving out tokens of length <= 1.
    text.append(text_word_one_limit(cleaned.split(' ')))
    label.append(raw_label)

In [7]:
# Train / validation / test split, currently 6 : 2 : 2 (adjust to the amount of data).
# Step 1: hold out 40% of the samples, stratified by label.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    text, label, test_size=0.4, random_state=42, stratify=label
)

# Step 2: cut the held-out part in half to obtain the validation and test sets.
val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

In [54]:
# Main training run (230313): train / test only.
#
# NOTE(review): this is an 80 : 20 stratified split (test_size=0.2), not
# 6 : 2 : 2, and the "test" set below is the very same object as the
# validation set, so test metrics simply repeat the validation metrics.
train_text, val_text, train_labels, val_labels = train_test_split(text, label, 
                                                                    random_state=42, 
                                                                    test_size=0.2, 
                                                                    stratify=label)

test_text = val_text

test_labels = val_labels 

In [55]:
# Largest number of space-separated words found in any cleaned sentence.
cnt = max((len(sentence.split(' ')) for sentence in text), default=0)
cnt

50

In [None]:
## Modeling section starts here

In [56]:
# Checkpoint used for both the encoder and the tokenizer.
# NOTE(review): BertClassifier loads its own encoder from `model_path`, so the
# `model` instance created here is not the one that gets fine-tuned; `model`
# is rebound to the trained classifier later on.
model_path = 'google/electra-base-discriminator'  # alternative: "monologg/koelectra-base-v3-discriminator"
model =   ElectraModel.from_pretrained(model_path)  # pretrained encoder for `model_path`
tokenizer =  ElectraTokenizer.from_pretrained(model_path )

# Other checkpoints that were tried (loaded through the Auto* classes):
#model_path = "ProsusAI/finbert" 
#model_path = 'yiyanghkust/finbert-pretrain'
#model_path = 'microsoft/deberta-v3-base'
#model_path = "google/bigbird-roberta-base"
#model = AutoModel.from_pretrained(model_path)
#tokenizer = AutoTokenizer.from_pretrained(model_path)

# XLNet variant:
#model_path = 'xlnet-base-cased'
#tokenizer = XLNetTokenizer.from_pretrained(model_path)
#model = XLNetModel.from_pretrained(model_path)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
# Convert the splits to torch tensors and wrap them in DataLoaders.

# Specify `MAX_LEN`: BERT accepts at most 512 tokens; pick a smaller value
# according to how long the news sentences are.
MAX_LEN = 64

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16

X_train = train_text
X_val = val_text
X_test = test_text
y_train = train_labels
y_val = val_labels
y_test = test_labels

# Token ids and attention masks, padded/truncated to MAX_LEN.
train_inputs, train_masks = preprocessing_for_bert(X_train)
val_inputs, val_masks = preprocessing_for_bert(X_val)
test_inputs, test_masks = preprocessing_for_bert(X_test)

# Convert other data types to torch.Tensor
# NOTE(review): this rebinds train_labels / val_labels / test_labels from the
# label lists produced by the split to tensors, so re-running this cell feeds
# tensors (not lists) into torch.tensor().
train_labels = torch.tensor(y_train)
val_labels = torch.tensor(y_val)
test_labels = torch.tensor(y_test)


# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size= batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size= batch_size)

# Create the DataLoader for our test set (inputs and masks only, no labels)
test_dataset = TensorDataset(test_inputs, test_masks)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size= batch_size)



In [11]:
X_train[65], train_inputs[65]

('Basic banking activities continued as normal',
 tensor([ 101, 3937, 8169, 3450, 2506, 2004, 3671,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0]))

In [80]:
## Select the device, configure the loss, build the model and start training.

GPU_NUM = 0   # index of the GPU to use
EPOCHS = 5    # shared by the LR scheduler and the training loop

# Only touch the CUDA runtime when a GPU exists: torch.cuda.set_device() and
# torch.cuda.current_device() fail on CPU-only machines, which made the CPU
# fallback unreachable.
if torch.cuda.is_available():
    torch.cuda.set_device(GPU_NUM)  # make GPU_NUM the current CUDA device
    print ('Current cuda device ', torch.cuda.current_device()) # check
    device = torch.device('cuda')   # plain 'cuda' resolves to the current device
else:
    device = torch.device('cpu')
print(device)

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)
train(bert_classifier, train_dataloader, val_dataloader, epochs=EPOCHS, evaluation=True)

Current cuda device  0
cuda


Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   1.006197   |     -      |     -     |   4.16   
   1    |   40    |   0.927388   |     -      |     -     |   3.90   
   1    |   60    |   0.824820   |     -      |     -     |   3.91   
   1    |   80    |   0.834858   |     -      |     -     |   3.91   
   1    |   100   |   0.689077   |     -      |     -     |   4.02   
   1    |   120   |   0.705308   |     -      |     -     |   4.00   
   1    |   140   |   0.634176   |     -      |     -     |   3.97   
   1    |   160   |   0.613407   |     -      |     -     |   3.99   
   1    |   180   |   0.603552   |     -      |     -     |   3.99   
   1    |   200   |   0.491369   |     -      |     -     |   4.02   
   1    |   220   |   0.480013   |     -      |     -     |   4.12   
   1    |   240   |   0.481866   |     -      |     -     |   4.05   


In [81]:
# Hand the trained classifier over as `model` and report its validation performance.
model = bert_classifier

# The default is the GPU; fall back to the CPU when GPU memory is tight.
device = torch.device('cpu')

# Predicted class probabilities on the validation set, then the arg-max class.
probs = bert_predict(model, val_dataloader)
test_y = val_labels
preds = np.argmax(probs, axis=1)

# model's performance
print(classification_report(test_y, preds, digits=2))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87       121
           1       0.91      0.86      0.89       576
           2       0.81      0.84      0.82       273

    accuracy                           0.86       970
   macro avg       0.84      0.88      0.86       970
weighted avg       0.87      0.86      0.87       970



In [82]:
# Save the fine-tuned weights.
out_dir = './model'

# Create the output folder when it is missing; an existing folder is reused as is.
os.makedirs(out_dir, exist_ok=True)

# Write the state dict exactly once (the file used to be written twice
# whenever the folder had just been created).
torch.save(model.state_dict(), out_dir + '/' + 'saved_weights_2303_acc86(50agree).pt')

In [83]:
# Reload the saved weights into a freshly built classifier.
device = torch.device('cpu')
set_seed(42)    # Set seed for reproducibility
model = BertClassifier()

out_dir = './model'
path = out_dir + '/' + 'saved_weights_2303_acc86(50agree).pt'
print(path)

# map_location remaps tensors that were saved from a GPU, so the checkpoint
# also loads on a machine without CUDA.
model.load_state_dict(torch.load(path, map_location=device))

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


./model/saved_weights_2303_acc86(50agree).pt


<All keys matched successfully>

In [84]:
# Once validation looks good, score the reloaded model on the test set; if
# something is off, go back and check/adjust the model network.
# Predicted class probabilities on the test set, then the arg-max class.
probs = bert_predict(model, test_dataloader)
test_y = test_labels
preds = np.argmax(probs, axis=1)

# model's performance
print(classification_report(test_y, preds, digits=2))

              precision    recall  f1-score   support

           0       0.81      0.93      0.87       121
           1       0.91      0.86      0.89       576
           2       0.81      0.84      0.82       273

    accuracy                           0.86       970
   macro avg       0.84      0.88      0.86       970
weighted avg       0.87      0.86      0.87       970



In [85]:
# Confusion matrix of the true labels against the predictions computed above.
from sklearn.metrics import confusion_matrix
y_pred = preds
y_true = test_y
confusion_matrix(y_true, y_pred)

array([[113,   7,   1],
       [ 24, 498,  54],
       [  3,  42, 228]], dtype=int64)

In [None]:
## 230313: adjust the train / test ratio

In [36]:
n = 123
test_text[n], test_labels[n]

('For Telenor the three and half year contract is worth an estimated YIT has chosen Telenor and Elisa as its principal suppliers of ICT solutions in Norway Sweden Denmark and Finland',
 tensor(2))

In [37]:
## 220707: classify one individual input sentence (to be reworked so arbitrary input is possible).
# Convert the sentence to torch tensors.
# NOTE(review): StandardScaler / MinMaxScaler are imported but not used in this
# cell, and filterwarnings('ignore') silences every warning from here on.
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

MAX_LEN = 64
batch_size = 16

# Sentence to classify: entry `n` of the test texts (or typed in by the user).
X_pred =  test_text[n]
#X_pred =  input()
# Same cleaning as for the training data.
X_pred = text_preprocessing(X_pred)
X_pred_split = X_pred.split(' ')
X_pred = text_word_one_limit(X_pred_split)
#print()
print(X_pred)

# Wrap the single sentence in a one-element batch.
pred_inputs, pred_masks = preprocessing_for_bert([X_pred])
#test_labels  torch.tensor(y_test)
pred_dataset = TensorDataset(pred_inputs, pred_masks)
pred_sampler = SequentialSampler(pred_dataset)
pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size= batch_size)

#test_y = test_labels
# Class probabilities and the predicted class index for the sentence.
probs = bert_predict(model, pred_dataloader)  
preds = np.argmax(probs, axis = 1)
probs, preds[0]

For Telenor the three and half year contract is worth an estimated YIT has chosen Telenor and Elisa as its principal suppliers of ICT solutions in Norway Sweden Denmark and Finland


(array([[5.2618352e-04, 5.4283824e-04, 9.9893099e-01]], dtype=float32), 2)

In [38]:
probs[0][1]
X_pred

'For Telenor the three and half year contract is worth an estimated YIT has chosen Telenor and Elisa as its principal suppliers of ICT solutions in Norway Sweden Denmark and Finland'

In [20]:
model

BertClassifier(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0): BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [13]:
!nvidia-smi

Tue Feb 21 14:55:44 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 456.71       Driver Version: 456.71       CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 166... WDDM  | 00000000:08:00.0  On |                  N/A |
| 31%   36C    P8    14W / 125W |   5682MiB /  6144MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|       

In [78]:
### Functions
def text_word_one_limit(t_split):
    """Re-join tokens with single spaces, leaving out very short ones.

    @param    t_split: iterable of string tokens (e.g. the result of
                  ``str.split(' ')``)
    @return   str: all tokens longer than one character, in their original
                  order, separated by single spaces; surrounding whitespace
                  is stripped. Empty when no token qualifies.
    """
    kept_tokens = (token for token in t_split if len(token) > 1)
    return ' '.join(kept_tokens).strip()

# Preprocessing function; may need additions/adjustments for securities news later.
def text_preprocessing(text):
    """Replace every character that is not an ASCII letter with a space.

    Digits, punctuation, newlines, non-breaking spaces and non-ASCII symbols
    each become one space, so the result has the same length as the input and
    consists only of ``a-z``, ``A-Z`` and spaces. Runs of spaces are not
    collapsed.

    A single substitution is sufficient: once only letters and spaces remain,
    additional passes that strip newlines, ``\\xa0`` or a list of symbols have
    nothing left to match.

    @param    text (str): raw sentence
    @return   str: cleaned sentence
    """
    return re.sub(r'[^a-zA-Z]', ' ', text)


## Head / tail truncation
def truncation_method(df, head, tail, max_len=510):
    """Shorten long documents by keeping only their beginning and their end.

    Documents longer than ``max_len`` elements (characters, for ``str`` text)
    are replaced by their first ``int(max_len * head)`` elements followed by
    their last ``int(max_len * tail)`` elements; shorter documents are kept
    unchanged. The two cut lengths are printed once.

    @param    df: table indexable as ``df[column][i]`` for ``i`` in
                  ``range(len(df))``, with columns 'text', 'label' and 'date'
    @param    head (float): fraction of ``max_len`` kept from the start
    @param    tail (float): fraction of ``max_len`` kept from the end
    @param    max_len (int): length above which a document is truncated
    @return   list: one ``[label, text, date]`` entry per row
    """
    head_n = int(max_len * head)
    tail_n = int(max_len * tail)

    print(head_n, tail_n)

    target_dt = []
    for i in range(len(df)):
        target = df['text'][i]

        if len(target) > max_len:
            # The explicit start index keeps tail_n == 0 meaning "no tail"
            # (target[-0:] would return the whole document).
            tail_start = len(target) - tail_n
            text = target[:head_n] + target[tail_start:]
        else:
            text = target

        target_dt.append([df['label'][i], text, df['date'][i]])

    return target_dt

def preprocessing_for_bert(data):
    """Tokenize sentences into fixed-length id and attention-mask tensors.

    Relies on the notebook-level ``tokenizer`` and ``MAX_LEN``: every sentence
    is encoded with special tokens and truncated or padded to ``MAX_LEN``.

    @param    data: iterable of sentences (str)
    @return   input_ids (torch.Tensor): token ids, one row per sentence
    @return   attention_masks (torch.Tensor): attention mask, one row per
                  sentence
    """
    # Create empty lists to store outputs
    input_ids = []
    attention_masks = []

    # For every sentence...
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,                      # Sentence to encode
            add_special_tokens=True,        # Add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # Max length to truncate/pad
            padding='max_length',           # Replaces the deprecated `pad_to_max_length=True`
            truncation=True,                # Cut sentences longer than `max_length`
            return_attention_mask=True      # Return attention mask
            )

        # Add the outputs to the lists
        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    return input_ids, attention_masks




# Model training / validation / prediction

# Create the BertClassifier class
class BertClassifier(nn.Module):
    """Pretrained encoder followed by a small feed-forward classification head.

    The encoder is loaded from the notebook-level ``model_path``. The head is
    Linear -> ReLU -> Dropout(0.25) -> Linear and produces 3 logits.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    freeze_bert (bool): Set `True` to freeze the encoder weights
                      and train only the head; `False` fine-tunes everything
        """
        super().__init__()
        # Instantiate the pretrained encoder
        self.bert = ElectraModel.from_pretrained(model_path)

        # Head sizes: the input width is read from the encoder config instead
        # of being hard-coded, so checkpoints with another hidden size work
        # without editing this class.
        D_in = self.bert.config.hidden_size
        H, D_out = 256, 3

        # Instantiate a feed-forward classifier with one hidden layer
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(H, D_out)
        )

        # Freeze the encoder
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """
        Feed input to the encoder and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to the encoder
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # Extract the last hidden state of the first token (`[CLS]`) for classification
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits
    
def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.

    Relies on the notebook-level ``device`` and ``train_dataloader``.

    @param    epochs (int): number of training epochs; used to size the
                  linear learning-rate schedule
    @return   (bert_classifier, optimizer, scheduler)
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=False)

    # Move the model to the selected device
    bert_classifier.to(device)

    # Create the optimizer. torch.optim.AdamW replaces the deprecated
    # transformers.AdamW; weight_decay=0.0 keeps that class's default
    # (torch's own default is 0.01).
    optimizer = torch.optim.AdamW(bert_classifier.parameters(),
                                  lr=2e-5,           # Commonly used: 5e-5, 3e-5 or 2e-5
                                  eps=1e-8,
                                  weight_decay=0.0
                                  )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler (linear decay, no warm-up)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

    
def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained model to predict class probabilities.

    The model is switched to evaluation mode and moved to the CPU. Batches are
    sent to the device the model actually lives on, so the result no longer
    depends on the notebook-level ``device`` matching the model.

    @param    model: classifier called as ``model(input_ids, attention_mask)``
                  and returning logits of shape (batch_size, num_labels)
    @param    test_dataloader: iterable of batches whose first two items are
                  the input ids and the attention mask (further items, e.g.
                  labels, are ignored); must yield at least one batch
    @return   probs (np.ndarray): softmax probabilities, shape
                  (num_samples, num_labels)
    """
    # Put the model into the evaluation mode (dropout is disabled at test
    # time) and run inference on the CPU.
    model.eval().cpu()

    # Device of the model's weights; CPU when the model has no parameters.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device('cpu')

    all_logits = []

    # For each batch in our test set...
    with torch.no_grad():
        for batch in test_dataloader:
            # Only ids and mask are needed; move them next to the model
            b_input_ids, b_attn_mask = (t.to(model_device) for t in batch[:2])

            # Compute logits
            all_logits.append(model(b_input_ids, b_attn_mask))

    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

def set_seed(seed_value=42):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).

    @param    seed_value (int): seed handed to every generator
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train the BertClassifier model.

    Uses the notebook-level ``optimizer``, ``scheduler``, ``loss_fn`` and
    ``device``. A progress table is printed every 20 batches and at the end of
    each epoch.

    @param    model: classifier called as ``model(input_ids, attention_mask)``
    @param    train_dataloader: yields (input_ids, attention_mask, labels)
    @param    val_dataloader: same batch layout; required when `evaluation`
                  is set
    @param    epochs (int): number of passes over the training data
    @param    evaluation (bool): evaluate on `val_dataloader` after each epoch
    @raises   ValueError: if `evaluation` is requested without a
                  `val_dataloader`
    """
    # Fail before training starts rather than after the first full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to the training device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed for every 20 batches
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Calculate time elapsed since the previous report
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Print the epoch summary: average training loss and validation metrics
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)

        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.

    Uses the notebook-level ``loss_fn`` and ``device``. Loss and accuracy are
    averaged over samples rather than over batches, so a short final batch is
    not over-weighted.

    @param    model: classifier called as ``model(input_ids, attention_mask)``
    @param    val_dataloader: yields (input_ids, attention_mask, labels)
    @return   val_loss (float): mean loss per sample
    @return   val_accuracy (float): accuracy in percent; both values are NaN
                  when the dataloader yields no samples
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Running totals over all validation samples
    total_loss, total_correct, total_samples = 0.0, 0, 0

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to the evaluation device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits and loss without tracking gradients
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)

        n_batch = b_labels.size(0)

        # assumes loss_fn returns the batch mean (the nn.CrossEntropyLoss
        # default) - TODO confirm if the reduction is ever changed
        total_loss += loss.item() * n_batch

        # Get the predictions and count the correct ones
        preds = torch.argmax(logits, dim=1).flatten()
        total_correct += (preds == b_labels).sum().item()
        total_samples += n_batch

    if total_samples == 0:
        return float('nan'), float('nan')

    # Compute the average accuracy and loss over the validation set.
    val_loss = total_loss / total_samples
    val_accuracy = 100.0 * total_correct / total_samples

    return val_loss, val_accuracy