# Setup

In [1]:
import csv
import io
import os
#you may need to install the packages by
#!pip install csv
#!pip install io
#!pip install os

In [2]:
#The file is trained on Google Colab and this is to connect to Google Drive
#If you have GPU on you local computer, you may train locally
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Change the path to the training file
file_path = '/content/drive/MyDrive/Colab Notebooks/data/anno_14_tc.csv'

# Load in data

In [4]:
#Read the csv and save it as a file
with open(file_path) as f:
    f.readline()
    reader = csv.reader(f)
    data = list(reader)

In [5]:
#To inspect the file format
print(data[:5])

[['S1', '*'], ['Enter', 'O'], ['email', 'O'], ['address', 'O'], ['to', 'O']]


In [6]:
print(data[:20])

[['S1', '*'], ['Enter', 'O'], ['email', 'O'], ['address', 'O'], ['to', 'O'], ['Email', 'B-location'], ['textbox', 'I-location'], ['admin1@mail.com', 'B-value'], ['S2', '*'], ['Enter', 'O'], ['password', 'O'], ['to', 'O'], ['Password', 'B-location'], ['textbox', 'I-location'], ['Admin@123', 'B-value'], ['S3', '*'], ['Click', 'O'], ['button', 'B-value'], ['Login', 'I-value'], ['S4', '*']]


# Clean the sentences

In [7]:
#The ['S1', '*'] is to indicate the beginning of the steps/sentences 
#We need to remove it and use it to indicate the segmentation of sentences
sents = []
sent = []
for word, tag in data:
  if tag == '*':
    if len(sent) > 0:
        sents.append(sent)
        sent = []
    else:
        continue
  else:
    sent.append((word, tag))

In [8]:
#Make the sentences into text format
texts_sents = []
for sent in sents:
  words = []
  for word, tag in sent:
    words.append(word)
  texts_sents.append(" ".join(words))

In [9]:
#inspect the format
texts_sents[0]

'Enter email address to Email textbox admin1@mail.com'

# Trying out Flair BERT models

In [10]:
!pip install flair
#You may need to install flair libary 
#The documentation https://github.com/flairNLP/flair
#Flair is a framework for state-of-art NLP for:
#named entity recognition, part-of-speech tagging, classification and etc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 7.7 MB/s 
[?25hCollecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 53.5 MB/s 
Collecting sqlitedict>=1.6.0
  Downloading sqlitedict-2.0.0.tar.gz (46 kB)
[K     |████████████████████████████████| 46 kB 4.7 MB/s 
Collecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 210 kB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 14.4 MB/s 
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting transformers>=4.0.0
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 50.3

In [11]:
from flair.models import SequenceTagger
#This is to import the sequence tagger

In [12]:
tagger = SequenceTagger.load('ner')
#To load the named entity recognition function by 'ner'

Downloading:   0%|          | 0.00/432M [00:00<?, ?B/s]

2022-06-17 00:13:29,327 loading file /root/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-06-17 00:13:31,468 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [13]:
from flair.data import Sentence
#To precess data by sentence level, import Sentence

In [14]:
#A demo of using NER(named entity recognition) tagger for sentence
sentence = Sentence("George Washington went to Washington.")

In [15]:
tagger.predict(sentence)

In [16]:
#It can predicts Person: PER, and location: LOC by context because though the two Washingtongs are the same word
#This is the power of Flair, tag the words by context rather than form only
print(sentence.to_tagged_string())

Sentence: "George Washington went to Washington ." → ["George Washington"/PER, "Washington"/LOC]


In [17]:
#To print the entity and its probability 
for entity in sentence.get_spans('ner'):
  print(entity)

Span[0:2]: "George Washington" → PER (0.9989)
Span[4:5]: "Washington" → LOC (0.9942)


In [18]:
print(sentence.to_dict(tag_type='ner'))

{'text': 'George Washington went to Washington.', 'ner': [{'value': 'PER', 'confidence': 0.998886227607727}, {'value': 'LOC', 'confidence': 0.9942097663879395}]}


In [19]:
#For our task, we have different entities to extract
#To extract entities of our interest, we need to build our own corpus to train
from flair.data import Corpus 
from flair.datasets import ColumnCorpus

In [20]:
#The columns should correspond to the format of the file
#The first column is text, the second is NER,
#You may have other columns to assist flair such as part-of-speech tagging, dependency parsing and etc
columns = {0: 'text', 1:'ner'}

In [21]:
data_folder = '/content/drive/MyDrive/Colab Notebooks/data'

In [22]:
#To inspect our data, it corresponds to our defined columns
sents

[[('Enter', 'O'),
  ('email', 'O'),
  ('address', 'O'),
  ('to', 'O'),
  ('Email', 'B-location'),
  ('textbox', 'I-location'),
  ('admin1@mail.com', 'B-value')],
 [('Enter', 'O'),
  ('password', 'O'),
  ('to', 'O'),
  ('Password', 'B-location'),
  ('textbox', 'I-location'),
  ('Admin@123', 'B-value')],
 [('Click', 'O'), ('button', 'B-value'), ('Login', 'I-value')],
 [('Wait', 'O'),
  ('title', 'B-value'),
  ('to', 'O'),
  ('be', 'O'),
  ('present', 'O'),
  ('for', 'O'),
  ('30', 'B-time'),
  ('seconds', 'I-time')],
 [('Enter', 'O'),
  ('email', 'O'),
  ('address', 'O'),
  ('to', 'O'),
  ('Email', 'B-location'),
  ('textbox', 'I-location'),
  ('invalid@wrong', 'B-value')],
 [('Enter', 'O'),
  ('password', 'O'),
  ('to', 'O'),
  ('Password', 'B-location'),
  ('textbox', 'I-location'),
  ('invalidpassword', 'B-value')],
 [('Click', 'O'), ('Login', 'B-value'), ('button', 'I-value')],
 [('Wait', 'O'),
  ('email', 'B-value'),
  ('error', 'I-value'),
  ('message', 'I-value'),
  ('to', 'O'),
 

In [23]:
#It only supports txt file, so we need to convert our processed data into txt file
with open(data_folder+ "/train.txt", "w") as file:
  for sent in sents:
    file.write("\n")
    for item in sent:
      file.write("\n" + " ".join(item))
  print("Done")

Done


In [24]:
#To build the corpus, specify the file path, columns we defined above, load the training file in txt format
corpus: Corpus = ColumnCorpus(data_folder, columns, train_file = 'train.txt')

2022-06-17 00:13:46,091 Reading data from /content/drive/MyDrive/Colab Notebooks/data
2022-06-17 00:13:46,096 Train: /content/drive/MyDrive/Colab Notebooks/data/train.txt
2022-06-17 00:13:46,098 Dev: None
2022-06-17 00:13:46,101 Test: None


In [25]:
#There are 70 training sentences, it will treat the duplicates as one
print(len(corpus.train))

70


In [26]:
#To inspect the first sentence from training
corpus.train[0].to_tagged_string('ner')

'Sentence: "Enter email address to Email textbox admin1@mail.com" → ["Email textbox"/location, "admin1@mail.com"/value]'

In [27]:
#To inspect the last sentence from training
print(corpus.train[-1].to_tagged_string('ner'))

Sentence: "Input update library name to library name textbox" → ["update library name"/value, "library name textbox"/location]


# Training

In [28]:
#specify the labeling type as named entity recognition
label_type = "ner"

In [29]:
#build named entity recognition dictionaries
tag_dictionary = corpus.make_label_dictionary(label_type=label_type)

2022-06-17 00:13:46,185 Computing label dictionary. Progress:


70it [00:00, 21230.84it/s]

2022-06-17 00:13:46,199 Dictionary created for label 'ner' with 4 values: value (seen 67 times), location (seen 15 times), time (seen 10 times)





In [30]:
#There are four tags of our interest
print(tag_dictionary)

Dictionary with 4 tags: <unk>, value, location, time


In [31]:
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
#load the embeddings, models and trainer from Flair

In [32]:
#use transformer to embed the words by using xlm-roberta-large
#for details of this roberta, refer to https://huggingface.co/xlm-roberta-large
#check the documentation https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md
#layer means the layers of the transformer-based model that produce embeddings
#fine-tune means whehter or not embeddings are fine-tunable
#use-context Set to True to include context outside of sentences. 
#This can greatly increase accuracy on some tasks, 
# but slows down embedding generation
embeddings = TransformerWordEmbeddings(model='xlm-roberta-large',
                                       layers="-1",
                                       subtoken_pooling="first",
                                       fine_tune=True,
                                       use_context=True,
                                       )

Downloading:   0%|          | 0.00/616 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.09G [00:00<?, ?B/s]

In [33]:
#for more information of the tagger, check https://github.com/flairNLP/flair/blob/master/flair/models/sequence_tagger_model.py
#hidden_size:  hidden size of RNN layer
#embeddings: embeddings to use during training and prediction
#tag_dictionary: Dictionary containing all tags from corpus which can be predicted
#tag_type: type of tag which is going to be predicted in case a corpus has multiple annotations
#use_crf: If True, use a Conditional Random Field for prediction, else linear map to tag space.
#project_embeddings: If True, add a linear layer on top of embeddings, if you want to imitate
#fine tune non-trainable embeddings.
#use_rnn: If true, use a RNN, else Linear layer.
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=label_type,
                        use_crf=False,
                        use_rnn=False,
                        reproject_embeddings=False)

2022-06-17 00:15:16,565 SequenceTagger predicts: Dictionary with 13 tags: O, S-value, B-value, E-value, I-value, S-location, B-location, E-location, I-location, S-time, B-time, E-time, I-time


In [34]:
#training
trainer = ModelTrainer(tagger, corpus)

In [35]:
#check the details of the trainer https://github.com/flairNLP/flair/blob/master/flair/trainers/trainer.py
#It will print out the model details and the training accuracy and F1 score
#The loss will also be printed out for each epoch
#The path means where the model is saved for future loading
trainer.fine_tune('resources/taggers/sota-ner-flair',
              learning_rate=5.0e-6,
              train_with_dev=True,
              mini_batch_size=1,
              mini_batch_chunk_size=1,
              max_epochs=50)

2022-06-17 00:15:16,604 ----------------------------------------------------------------------------------------------------
2022-06-17 00:15:16,609 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): XLMRobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_feature

100%|██████████| 9/9 [00:00<00:00, 13.06it/s]

2022-06-17 00:39:03,352 Evaluating as a multi-label problem: False
2022-06-17 00:39:03,391 1.0	0.9167	0.9565	0.9167
2022-06-17 00:39:03,392 
Results:
- F-score (micro) 0.9565
- F-score (macro) 0.9804
- Accuracy 0.9167

By class:
              precision    recall  f1-score   support

       value     1.0000    0.8889    0.9412         9
        time     1.0000    1.0000    1.0000         2
    location     1.0000    1.0000    1.0000         1

   micro avg     1.0000    0.9167    0.9565        12
   macro avg     1.0000    0.9630    0.9804        12
weighted avg     1.0000    0.9167    0.9559        12

2022-06-17 00:39:03,401 ----------------------------------------------------------------------------------------------------





{'dev_loss_history': [],
 'dev_score_history': [],
 'test_score': 0.9565217391304348,
 'train_loss_history': [2.722585264856087,
  2.470999058935569,
  2.1364900481636226,
  1.848151976118036,
  1.6379481499128812,
  1.518113443476382,
  1.337381136029366,
  1.1499917338843075,
  1.0001730969029177,
  0.7410793297437388,
  0.6573708765141356,
  0.6162003181606555,
  0.4340319993668261,
  0.4341696045870066,
  0.4007213225404633,
  0.30042234880224067,
  0.2975536773124858,
  0.28709505285382914,
  0.24107371452973927,
  0.20050468981148606,
  0.2401336455337912,
  0.24329374690385036,
  0.21551417591762653,
  0.177431264928449,
  0.1738303332843712,
  0.13900103293242824,
  0.15901276053910965,
  0.16430658266920828,
  0.15309569181420574,
  0.15630163440193107,
  0.17443185864020308,
  0.1220006901083114,
  0.1641782878566627,
  0.20872237410305067,
  0.15372303058118642,
  0.19661230594553275,
  0.1805420212465432,
  0.1418951555182269,
  0.12404439752025533,
  0.16183435700280377,
 

# Load the data

In [40]:
#import the packages
from flair.data import Sentence
from flair.models import SequenceTagger

In [41]:
#To load the trained model directly
model = SequenceTagger.load('resources/taggers/sota-ner-flair/final-model.pt')

2022-06-17 00:41:25,080 loading file resources/taggers/sota-ner-flair/final-model.pt
2022-06-17 00:41:47,997 SequenceTagger predicts: Dictionary with 13 tags: O, S-value, B-value, E-value, I-value, S-location, B-location, E-location, I-location, S-time, B-time, E-time, I-time


In [42]:
#Use the model
#Demo
sentence = Sentence("Enter email address to Email textbox admin@gmail.com")
model.predict(sentence)
print(sentence.to_tagged_string())

Sentence: "Enter email address to Email textbox admin @ gmail.com" → ["Email textbox"/location, "admin"/value, "@"/value]


In [43]:
sentence = Sentence("Wait for email to appear for 20s")
model.predict(sentence)
print(sentence.to_tagged_string())

Sentence: "Wait for email to appear for 20s" → ["email"/value, "20s"/time]
