## Library imports

In [4]:
!pip install transformers
!pip install datasets
!pip install pyLDAvis



You should consider upgrading via the 'c:\users\ghoneimm\appdata\local\continuum\anaconda3\python.exe -m pip install --upgrade pip' command.


In [1]:
#Import the necessary libraries
import pandas as pd
import pickle
import numpy as np
import scipy as sp
import sys
import re
from copy import deepcopy
import random
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
from torch import optim
torch.manual_seed(10)
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import DataLoader
import transformers
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

import warnings
warnings.filterwarnings("ignore")

# import gzip

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Record the interpreter and numpy versions used for this run.
print('Version information')

print(f'python: {sys.version}')
print(f'numpy: {np.__version__}')

C:\Users\ghoneimm\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\ghoneimm\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghoneimm\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Version information
python: 3.7.3 (default, Apr 24 2019, 15:29:51) [MSC v.1915 64 bit (AMD64)]
numpy: 1.21.6


# Load Dataset


We start by loading both data sets already split into an 80/20 train and test set.

In [2]:
# Read the pre-split clickbait train/test files.
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

# Pull headline text and labels out of each frame as plain lists.
x_train = list(df_train['headline'])
y_train = list(df_train['label'])
x_test = list(df_test['headline'])
y_test = list(df_test['label'])

Below are the numbers of headlines in the train and test sets, followed by a sample of article headlines with their binary labels, where 0 means not clickbait and 1 means clickbait.

In [3]:

# Dataset sizes.
print(f'Number of Train Headlines: {len(x_train)}')
print(f'Number of Test Headlines: {len(x_test)}')

# Show five consecutive training examples starting at offset x.
print('\n\nSample Label and Headlines:')
x = 105
sample_window = slice(x, x + 5)
for label, line in zip(y_train[sample_window], x_train[sample_window]):
    print(f'{label}: {line}')

# The bare expression below shows the raw strings (including trailing newlines).
print('\nOutput of Sample Headlines without Print Statement:')
x_train[sample_window]

Number of Train Headlines: 19200
Number of Test Headlines: 4800


Sample Label and Headlines:
1: 27 Breathtaking Alternatives To A Traditional Wedding Bouquet <br>

1: 22 Pictures People Who Aren't Grad Students Will <strong>Never</strong> Understand

0: PepsiCo Profit Falls 43 Percent

0: Website of Bill O'Reilly, FOX News commentator, hacked in retribution

1: The Green Toy Soldiers From Your Childhood Now Come In Baller Yoga Poses A


Output of Sample Headlines without Print Statement:


['27 Breathtaking Alternatives To A Traditional Wedding Bouquet <br>\n',
 "22 Pictures People Who Aren't Grad Students Will <strong>Never</strong> Understand\n",
 'PepsiCo Profit Falls 43 Percent\n',
 "Website of Bill O'Reilly, FOX News commentator, hacked in retribution\n",
 'The Green Toy Soldiers From Your Childhood Now Come In Baller Yoga Poses A\n']

In [4]:
# Load the pre-split Web of Science train/test sets from csv
df_train_wos = pd.read_csv('./data/train_wos.csv')
df_test_wos = pd.read_csv('./data/test_wos.csv')

# Separate dataframes into train and test lists
x_train_wos, y_train_wos = list(df_train_wos['article']), list(df_train_wos['label'])
x_test_wos, y_test_wos = list(df_test_wos['article']), list(df_test_wos['label'])

# Contiguous class index -> domain name
wos_label = {0:'CS', 1:'ECE', 2:'Civil', 3:'Medical'}
# Label as stored in the csv -> contiguous class index (0..3)
label_mapping = {0:0, 1:1, 4:2, 5:3}

# Remap the raw labels; a label missing from label_mapping raises KeyError.
y_train_wos = [label_mapping[label] for label in y_train_wos]
y_test_wos = [label_mapping[label] for label in y_test_wos]

In [5]:
# Dataset sizes.
print(f'Number of Train Articles: {len(x_train_wos)}')
print(f'Number of Test Articles: {len(x_test_wos)}')

print('\nLabel Key:', wos_label)

# Show three consecutive training articles starting at offset x.
print('\nSample Label and Articles:\n')
x = 107
sample_window = slice(x, x + 3)
for label, line in zip(y_train_wos[sample_window], x_train_wos[sample_window]):
    domain = wos_label[label]
    print(f'{label} - {domain}: {line}')

Number of Train Articles: 1600
Number of Test Articles: 400

Label Key: {0: 'CS', 1: 'ECE', 2: 'Civil', 3: 'Medical'}

Sample Label and Articles:

0 - CS: An efficient procedure for calculating the electromagnetic fields in multilayered cylindrical structures is reported in this paper. Using symbolic computation, spectral Green's functions, suitable for numerical implementations are determined in compact and closed forms. Applications are presented for structures with two dielectric layers.

1 - ECE: A multifunctional platform based on the microhotplate was developed for applications including a Pirani vacuum gauge, temperature, and gas sensor. It consisted of a tungsten microhotplate and an on-chip operational amplifier. The platform was fabricated in a standard complementary metal oxide semiconductor (CMOS) process. A tungsten plug in standard CMOS process was specially designed as the serpentine resistor for the microhotplate, acting as both heater and thermister. With the sacrifici

## Classification with BERT


The transformer neural network is a novel architecture that aims to solve sequence-to-sequence tasks while handling long-range dependencies with ease. In a transformer, we can pass all the words of a sentence and determine the word embedding simultaneously.

<p align="center"><img src="https://d2l.ai/_images/bert-one-seq.svg" width="75%" align="center"></p>

We will be using BERT (Bidirectional Encoder Representations from Transformers) pre-trained models for embeddings. BERT architecture consists of several Transformer encoders stacked together. Each Transformer encoder encapsulates two sub-layers: a self-attention layer and a feed-forward layer. 

For details on BERT, refer to the paper: [BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding](https://arxiv.org/pdf/1810.04805.pdf).

We will be using the BERT embeddings and a fully connected linear layer to perform classification.

We will then use this model to classify the Clickbait and Web of Science datasets.


### 3.1 : Initialize Tokenizer



In [2]:
from transformers import BertTokenizer

# Pretrained tokenizer for the 'bert-base-uncased' checkpoint. It is read as a
# global by both `vectorize_batch` collate functions defined in this notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

C:\Users\ghoneimm\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\ghoneimm\AppData\Local\Continuum\anaconda3\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll
  stacklevel=1)


### 3.2 : Classifying Clickbait Dataset using BERT


In [49]:
def vectorize_batch(batch):
    """Collate a list of (label, text) pairs into tensors for classification.

    Every text is tokenized with the global `tokenizer`, truncated/padded to
    exactly 128 token ids, and returned as PyTorch tensors.

    Note: a later cell defines another `vectorize_batch` (for sequence
    labeling) that shadows this one; re-run this cell before rebuilding the
    classification loaders if that cell has been executed.

    Args:
        batch: sequence of (label, text) tuples, as produced by the DataLoader.

    Returns:
        Tuple (input_ids, attention_mask, labels).
    """
    labels, texts = zip(*batch)
    encoded = tokenizer(
        texts,
        padding='max_length',
        max_length=128,
        truncation=True,
        return_tensors="pt",
    )
    return encoded['input_ids'], encoded['attention_mask'], torch.tensor(labels)

In [50]:
# Pair every label with its headline as (label, text), the layout expected by
# the collate function.
train_dataset = list(zip(y_train, x_train))
test_dataset = list(zip(y_test, x_test))

# Only the training loader is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, collate_fn=vectorize_batch)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, collate_fn=vectorize_batch)

In [69]:
from transformers import BertTokenizer, BertModel
# Pretrained BERT encoder, used in the next cells only to inspect its raw
# outputs. NOTE: the name `model` is rebound to a BERTClassifier further down.
model = BertModel.from_pretrained('bert-base-uncased')

In [82]:
# Peek at one collated batch: (input_ids, attention_mask, labels).
next(iter(train_loader))

(tensor([[  101,  2029,  1997,  ...,     0,     0,     0],
         [  101,  2538,  1056,  ...,     0,     0,     0],
         [  101,  2822,  2576,  ...,     0,     0,     0],
         ...,
         [  101,  2064,  2017,  ...,     0,     0,     0],
         [  101,  1999,  2033,  ...,     0,     0,     0],
         [  101,  2149, 10687,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
         0, 1, 0, 1, 1, 1, 0, 0]))

In [83]:
# Draw ONE batch and unpack it. Calling next(iter(train_loader)) twice on a
# shuffled loader yields two different batches, which would pair the
# input_ids of one batch with the attention_mask of another.
probe_input_ids, probe_attention_mask, _ = next(iter(train_loader))

# Inspection only: no gradients are needed for this forward pass.
with torch.no_grad():
    o = model(probe_input_ids, probe_attention_mask, return_dict=True)


In [84]:
# Shape of input_ids for one batch: (batch_size, max_length).
next(iter(train_loader))[0].shape

torch.Size([32, 128])

In [85]:
# Display the output object returned by the BERT forward pass above.
o

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0864,  0.2330, -0.0568,  ..., -0.2137,  0.1613,  0.0918],
         [ 0.4364,  0.6714, -0.1787,  ..., -0.6308,  0.6170,  0.2370],
         [ 0.4506,  0.8987, -0.8459,  ..., -0.7261, -0.5720, -0.0747],
         ...,
         [ 0.2418,  0.2926,  0.0830,  ..., -0.2001,  0.0549, -0.0622],
         [ 0.2477,  0.3359,  0.0261,  ..., -0.2299,  0.0145, -0.1423],
         [ 0.2559,  0.3247,  0.0477,  ..., -0.1549,  0.0032, -0.1408]],

        [[-0.6144, -0.1487,  0.0565,  ..., -0.3425,  0.8226, -0.1402],
         [-0.0905,  0.2448,  0.4034,  ..., -0.6777,  0.5067,  0.0820],
         [-0.8929, -0.6711, -0.5964,  ..., -0.7422,  0.0858, -0.3999],
         ...,
         [-0.0363, -0.0087,  0.7482,  ..., -0.4807,  0.0298, -0.5388],
         [-0.0674,  0.0420,  0.7729,  ..., -0.5360, -0.0586, -0.6428],
         [-0.0557,  0.0021,  0.7605,  ..., -0.5004, -0.0616, -0.7429]],

        [[ 0.0881,  0.0888, -0.0726,  ..., -0.1399,  

In [None]:
from bert import BERTClassifier
from tqdm import tqdm

# Settings for the clickbait (binary) run.
NUM_CLASSES = 2
N_EPOCHS = 3
device = "cpu"

# Model, loss function and optimizer.
model = BERTClassifier(num_classes=NUM_CLASSES)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

model.train()
for epoch in range(N_EPOCHS):
    # Sum of the per-batch losses over the epoch (not an average).
    total_loss = 0.0
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels).
        X, X_mask, Y = (tensor.to(device) for tensor in batch)

        outputs = model(X, X_mask)
        loss = criterion(outputs, Y)

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print("loss on epoch %i: %f" % (epoch, total_loss))

 12%|█████████                                                                      | 69/600 [28:10<3:41:37, 25.04s/it]

In [None]:
from sklearn.metrics import accuracy_score

# The training cell leaves the model in train mode. Switch to eval mode so
# train-only behaviour (e.g. dropout inside the network, if present) is
# disabled and test predictions are deterministic.
model.eval()

# Collect ground-truth labels and raw model outputs over the whole test set.
Y_truth, Y_preds = [], []
with torch.no_grad():
    for X, X_mask, Y in test_loader:
        X = X.to(device)
        X_mask = X_mask.to(device)
        outputs = model(X, X_mask)

        Y_truth.append(Y)
        Y_preds.append(outputs)

# Y_preds keeps the raw outputs; the following cell derives class ids from it.
Y_truth = torch.cat(Y_truth)
Y_preds = torch.cat(Y_preds)

predicted_labels = F.softmax(Y_preds, dim=-1).argmax(dim=-1).cpu().detach().numpy()
print("Test Accuracy on Clickbait Dataset using BERT : {:.3f}".format(accuracy_score(Y_truth.cpu().detach().numpy(), predicted_labels)))

In [None]:
# Turn the collected model outputs into predicted class ids and pickle them.
class_probs = F.softmax(Y_preds, dim=-1)
preds = class_probs.argmax(dim=-1).cpu().detach().numpy()

with open('bert_clickbait.pkl', 'wb') as out_file:
    pickle.dump(preds, out_file)

### Classifying Web of Science Dataset using BERT 

In [None]:
# Pair every label with its article as (label, text), the layout expected by
# the collate function.
train_dataset = list(zip(y_train_wos, x_train_wos))
test_dataset = list(zip(y_test_wos, x_test_wos))

# Only the training loader is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, collate_fn=vectorize_batch)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, collate_fn=vectorize_batch)

In [None]:
from bert import BERTClassifier
from tqdm import tqdm

# Settings for the Web of Science (4-class) run.
# NOTE: `device` is not set here; it comes from an earlier cell.
NUM_CLASSES = 4
N_EPOCHS = 3

# Fresh model, loss function and optimizer.
model = BERTClassifier(num_classes=NUM_CLASSES)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

model.train()
for epoch in range(N_EPOCHS):
    # Sum of the per-batch losses over the epoch (not an average).
    total_loss = 0.0
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels).
        X, X_mask, Y = (tensor.to(device) for tensor in batch)

        outputs = model(X, X_mask)
        loss = criterion(outputs, Y)

        # Standard update: clear old gradients, backpropagate, step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print("loss on epoch %i: %f" % (epoch, total_loss))

In [None]:
from sklearn.metrics import accuracy_score

# The training cell leaves the model in train mode. Switch to eval mode so
# train-only behaviour (e.g. dropout inside the network, if present) is
# disabled and test predictions are deterministic.
model.eval()

# Collect ground-truth labels and raw model outputs over the whole test set.
Y_truth, Y_preds = [], []
with torch.no_grad():
    for X, X_mask, Y in test_loader:
        X = X.to(device)
        X_mask = X_mask.to(device)
        outputs = model(X, X_mask)

        Y_truth.append(Y)
        Y_preds.append(outputs)

# Y_preds keeps the raw outputs; the following cell derives class ids from it.
Y_truth = torch.cat(Y_truth)
Y_preds = torch.cat(Y_preds)

predicted_labels = F.softmax(Y_preds, dim=-1).argmax(dim=-1).cpu().detach().numpy()
print("Test Accuracy on Web of Science Dataset using BERT : {:.3f}".format(accuracy_score(Y_truth.cpu().detach().numpy(), predicted_labels)))

In [None]:
# Turn the collected model outputs into predicted class ids and pickle them.
class_probs = F.softmax(Y_preds, dim=-1)
preds = class_probs.argmax(dim=-1).cpu().detach().numpy()

with open('bert_wos.pkl', 'wb') as out_file:
    pickle.dump(preds, out_file)

## Sequence Labeling

Part-of-speech (POS) tagging is a common Natural Language Processing task in which each word in a text (corpus) is assigned a part of speech, based on the definition of the word and its context.

Named entity recognition (NER) seeks to locate and classify named entities in text into pre-defined categories such as the names of persons, organizations, locations, expressions of times, quantities, monetary values, percentages, etc.

For more details on each task, please refer to the class slides.

<p align="center"><img src="https://media-exp1.licdn.com/dms/image/C5112AQGVAByeLRJlBw/article-inline_image-shrink_400_744/0/1579118062060?e=1674691200&v=beta&t=lpQUVXCxwj-GYb3R_Kz_ys6BB-cgZYgOurOdniGPyrU" width="75%" align="center"></p>

We will be using BERT (Bidirectional Encoder Representations from Transformers) for sequence labeling. The architecture of the model is shown above in the diagram.

We will be using the BERT embeddings and a fully connected linear layer to perform classification.

We will then classify using the conll2003 dataset for this task.


### 4.1 : Loading Dataset

Run the below cell to download the conll2003 dataset. Each word in the dataset has been put on a separate line and there is an empty line after each sentence. The first item on each line is a word, the second a part-of-speech (POS) tag, the third a syntactic chunk tag and the fourth the named entity tag. The chunk tags and the named entity tags have the format I-TYPE which means that the word is inside a phrase of type TYPE.

In [6]:
from datasets import load_dataset

# Fetch the CoNLL-2003 dataset and bind its three splits to short names.
dataset = load_dataset("conll2003")

train, val, test = (dataset[split] for split in ('train', 'validation', 'test'))

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 to C:/Users/ghoneimm/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to C:/Users/ghoneimm/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### Pre-process Dataset


In [17]:
### Collate function used by the data loaders to vectorize each batch.
### For every example in the batch, each word is encoded into token ids; the
### special tokens [CLS] (101) and [SEP] (102) are then added at the beginning
### and at the end of the sequence.
### These tokens are added because the BERT model was pretrained with them, so
### they are needed to get consistent results at inference time.
### Sequences shorter than MAX_LEN are padded with 0 up to MAX_LEN.

MAX_LEN = 128

device = "cpu"
def vectorize_batch(batch):
    """Collate CoNLL-style examples into fixed-length tensors for tagging.

    Each example is a mapping with 'tokens', 'pos_tags' and 'ner_tags'. Every
    word is encoded with the global `tokenizer`; a word's POS/NER tag is
    repeated once per token id the word produces. Sequences are cut to
    MAX_LEN - 2 ids, wrapped in [CLS] (101) / [SEP] (102), and padded with 0
    up to MAX_LEN. Special-token and padding positions get label 0.

    Args:
        batch: list of examples as delivered by the DataLoader.

    Returns:
        Tuple of long tensors
        (input_ids, attention_mask, token_type_ids, pos_labels, ner_labels),
        each with one row of length MAX_LEN per example.
    """
    all_input_ids, all_masks, all_type_ids = [], [], []
    all_pos, all_ner = [], []
    body_len = MAX_LEN - 2  # room left once [CLS] and [SEP] are added

    for example in batch:
        words = example['tokens']
        word_pos = example['pos_tags']
        word_ner = example['ner_tags']

        piece_ids, pos_labels, ner_labels = [], [], []
        for idx, word in enumerate(words):
            word_ids = tokenizer.encode(word, add_special_tokens=False)
            # Repeat the word-level tag for every token id of the word.
            pos_labels += [word_pos[idx]] * len(word_ids)
            ner_labels += [word_ner[idx]] * len(word_ids)
            piece_ids += word_ids

        # Truncate, then wrap in the special tokens (labelled 0).
        piece_ids = [101] + piece_ids[:body_len] + [102]
        pos_labels = [0] + pos_labels[:body_len] + [0]
        ner_labels = [0] + ner_labels[:body_len] + [0]

        seq_len = len(piece_ids)
        padding = [0] * (MAX_LEN - seq_len)

        all_input_ids.append(piece_ids + padding)
        all_masks.append([1] * seq_len + padding)   # 1 = real token, 0 = padding
        all_type_ids.append([0] * seq_len + padding)  # single segment: all zeros
        all_pos.append(pos_labels + padding)
        all_ner.append(ner_labels + padding)

    return (
        torch.tensor(all_input_ids, dtype=torch.long),
        torch.tensor(all_masks, dtype=torch.long),
        torch.tensor(all_type_ids, dtype=torch.long),
        torch.tensor(all_pos, dtype=torch.long),
        torch.tensor(all_ner, dtype=torch.long),
    )

In [18]:
# Training batches hold 8 shuffled sentences; the test loader yields one
# sentence per batch, in dataset order.
train_loader = DataLoader(dataset=train, batch_size=8, shuffle=True, collate_fn=vectorize_batch)
test_loader = DataLoader(dataset=test, batch_size=1, collate_fn=vectorize_batch)

In [19]:
# Peek at one collated batch:
# (input_ids, attention_mask, token_type_ids, pos_labels, ner_labels).
next(iter(train_loader))

(tensor([[  101,  2605,  2006,  ...,     0,     0,     0],
         [  101, 10981,  5443,  ...,     0,     0,     0],
         [  101, 10704, 12385,  ...,     0,     0,     0],
         ...,
         [  101,  1020,  1011,  ...,     0,     0,     0],
         [  101,  3119,  5703,  ...,     0,     0,     0],
         [  101,  5712,   102,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([[ 0, 22, 15,  ...,  0,  0,  0],
         [ 0, 21, 14,  ...,  0,  0,  0],
         [ 0, 22, 22,  ...,  0,  0,  0],
         ...,
         [ 0, 11,  8,  ...,  0,  0,  0],


### POS Tagging and NER using conll2003 dataset

In [20]:

from sequenceLabeling import SequenceLabeling
from tqdm import tqdm

NUM_CLASSES_POS = 47
NUM_CLASSES_NER = 9

# Two independent taggers, one per task, each with its own optimizer.
model_pos = SequenceLabeling(NUM_CLASSES_POS)
model_pos.to(device)

model_ner = SequenceLabeling(NUM_CLASSES_NER)
model_ner.to(device)

criterion = nn.CrossEntropyLoss()
optimizer_pos = optim.Adam(model_pos.parameters(), lr=0.0001)
optimizer_ner = optim.Adam(model_ner.parameters(), lr=0.0001)
N_EPOCHS = 3

model_pos.train()
model_ner.train()

for epoch in range(N_EPOCHS):
    # Sums of the per-batch losses over the epoch (not averages).
    total_loss_pos = 0.0
    total_loss_ner = 0.0
    for input_ids, mask, token_type_ids, target_pos, target_ner in tqdm(train_loader):
        input_ids = input_ids.to(device)
        mask = mask.to(device)
        token_type_ids = token_type_ids.to(device)
        target_pos = target_pos.to(device)
        target_ner = target_ner.to(device)

        outputs_pos = model_pos(input_ids, mask, token_type_ids)
        outputs_ner = model_ner(input_ids, mask, token_type_ids)

        # True at real-token positions, False at padding (same for both tasks).
        active_positions = mask.view(-1) == 1

        # --- POS tagging update ---
        # The labels are flattened to batch * MAX_LEN entries, so the model
        # must emit one logit vector per token for this reshape to line up.
        # If it emits one vector per sequence, CrossEntropyLoss raises
        # "Expected input batch_size ... to match target batch_size ...".
        active_logits_pos = outputs_pos.view(-1, NUM_CLASSES_POS)

        # Padding positions get ignore_index so they do not contribute to the loss.
        active_labels_pos = torch.where(
            active_positions,
            target_pos.view(-1),
            torch.tensor(criterion.ignore_index).type_as(target_pos)
        )

        loss_pos = criterion(active_logits_pos, active_labels_pos)

        optimizer_pos.zero_grad()
        loss_pos.backward()
        optimizer_pos.step()

        # --- NER update (same recipe) ---
        active_logits_ner = outputs_ner.view(-1, NUM_CLASSES_NER)

        active_labels_ner = torch.where(
            active_positions,
            target_ner.view(-1),
            torch.tensor(criterion.ignore_index).type_as(target_ner)
        )

        loss_ner = criterion(active_logits_ner, active_labels_ner)

        optimizer_ner.zero_grad()
        loss_ner.backward()
        optimizer_ner.step()

        total_loss_pos += loss_pos.item()
        total_loss_ner += loss_ner.item()

    print("POS Tagging loss on epoch %i: %f" % (epoch, total_loss_pos))
    # Report the NER accumulator here (previously the POS total was printed twice).
    print("NER loss on epoch %i: %f" % (epoch, total_loss_ner))




  0%|                                                                                         | 0/1756 [00:04<?, ?it/s][A[A[A


ValueError: Expected input batch_size (8) to match target batch_size (1024).

In [None]:
from sklearn.metrics import f1_score

# The training cell leaves both taggers in train mode. Switch to eval mode so
# train-only behaviour (e.g. dropout inside the networks, if present) is
# disabled and test predictions are deterministic.
model_pos.eval()
model_ner.eval()

with torch.no_grad():
    y_true_pos = []
    y_pred_pos = []
    y_true_ner = []
    y_pred_ner = []
    for input_ids, mask, token_type_ids, target_pos, target_ner in tqdm(test_loader):
        input_ids = input_ids.to(device)
        mask = mask.to(device)
        token_type_ids = token_type_ids.to(device)
        target_pos = target_pos.to(device).view(-1)
        target_ner = target_ner.to(device).view(-1)

        outputs_pos = model_pos(input_ids, mask, token_type_ids)
        predicted_pos = torch.argmax(outputs_pos, dim=-1).view(-1)

        outputs_ner = model_ner(input_ids, mask, token_type_ids)
        predicted_ner = torch.argmax(outputs_ner, dim=-1).view(-1)

        # Keep every non-padding position. Boolean-mask selection replaces the
        # element-by-element loop that stopped at the first padded position:
        # the result is the same for batch_size=1, and it stays correct if the
        # loader batch size is ever increased.
        active_positions = mask.view(-1) == 1

        y_true_pos.extend(target_pos[active_positions].cpu().numpy())
        y_pred_pos.extend(predicted_pos[active_positions].cpu().numpy())
        y_true_ner.extend(target_ner[active_positions].cpu().numpy())
        y_pred_ner.extend(predicted_ner[active_positions].cpu().numpy())

print("Test F1-score on Conll2003 Dataset for POS Tagging : {:.3f}".format(f1_score(y_true_pos, y_pred_pos, average='micro')))
print("Test F1-score on Conll2003 Dataset for NER : {:.3f}".format(f1_score(y_true_ner, y_pred_ner, average='micro')))

In [None]:
# Pickle the collected POS and NER predictions as numpy arrays, one file each.
for filename, predictions in (('bert_pos.pkl', y_pred_pos), ('bert_ner.pkl', y_pred_ner)):
    with open(filename, 'wb') as out_file:
        pickle.dump(np.array(predictions), out_file)