# Setup

In [1]:
import csv
import io
import os
#you may need to install the packages by
#!pip install csv
#!pip install io
#!pip install os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
file_path = '/content/drive/MyDrive/Colab Notebooks/data/anno_8_tc.csv'

# Load in data

In [4]:
with open(file_path) as f:
    f.readline()
    reader = csv.reader(f)
    data = list(reader)

In [5]:
print(data[:5])

[['S1', '*'], ['Enter', 'O'], ['email', 'O'], ['address', 'O'], ['to', 'O']]


In [6]:
print(data[:20])

[['S1', '*'], ['Enter', 'O'], ['email', 'O'], ['address', 'O'], ['to', 'O'], ['Email', 'B-location'], ['textbox', 'I-location'], ['admin1@mail.com', 'B-value'], ['S2', '*'], ['Enter', 'O'], ['password', 'O'], ['to', 'O'], ['Password', 'B-location'], ['textbox', 'I-location'], ['Admin@123', 'B-value'], ['S3', '*'], ['Click', 'O'], ['button', 'B-value'], ['Login', 'I-value'], ['S4', '*']]


# Clean the sentences

In [7]:
sents = []
sent = []
for word, tag in data:
  if tag == '*':
    if len(sent) > 0:
        sents.append(sent)
        sent = []
    else:
        continue
  else:
    sent.append((word, tag))

In [8]:
texts_sents = []
for sent in sents:
  words = []
  for word, tag in sent:
    words.append(word)
  texts_sents.append(" ".join(words))

In [9]:
texts_sents[0]

'Enter email address to Email textbox admin1@mail.com'

# Trying out Flair models

In [10]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting flair
  Downloading flair-0.11.3-py3-none-any.whl (401 kB)
[K     |████████████████████████████████| 401 kB 27.8 MB/s 
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting wikipedia-api
  Downloading Wikipedia-API-0.5.4.tar.gz (18 kB)
Collecting janome
  Downloading Janome-0.4.2-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 429 kB/s 
[?25hCollecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 61.8 MB/s 
Collecting pptree
  Downloading pptree-3.1.tar.gz (3.0 kB)
Collecting bpemb>=0.3.2
  Downloading bpemb-0.3.3-py3-none-any.whl (19 kB)
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 2.4 MB/s 
[?25hCollecting segtok>=1.5.7
  Downloading segtok-1.5.11-py3-none-any.whl (24 k

In [11]:
from flair.models import SequenceTagger

In [12]:
tagger = SequenceTagger.load('ner')

Downloading:   0%|          | 0.00/432M [00:00<?, ?B/s]

2022-06-12 22:11:40,159 loading file /root/.flair/models/ner-english/4f4cdab26f24cb98b732b389e6cebc646c36f54cfd6e0b7d3b90b25656e4262f.8baa8ae8795f4df80b28e7f7b61d788ecbb057d1dc85aacb316f1bd02837a4a4
2022-06-12 22:11:42,291 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>


In [13]:
from flair.data import Sentence

In [14]:
sentence = Sentence("George Washington went to Washington.")

In [15]:
tagger.predict(sentence)

In [16]:
print(sentence.to_tagged_string())

Sentence: "George Washington went to Washington ." → ["George Washington"/PER, "Washington"/LOC]


In [17]:
for entity in sentence.get_spans('ner'):
  print(entity)

Span[0:2]: "George Washington" → PER (0.9989)
Span[4:5]: "Washington" → LOC (0.9942)


In [18]:
print(sentence.to_dict(tag_type='ner'))

{'text': 'George Washington went to Washington.', 'ner': [{'value': 'PER', 'confidence': 0.998886227607727}, {'value': 'LOC', 'confidence': 0.9942097663879395}]}


In [19]:
from flair.data import Corpus 
from flair.datasets import ColumnCorpus

In [20]:
columns = {0: 'text', 1:'ner'}

In [21]:
data_folder = '/content/drive/MyDrive/Colab Notebooks/data'

In [22]:
sents

[[('Enter', 'O'),
  ('email', 'O'),
  ('address', 'O'),
  ('to', 'O'),
  ('Email', 'B-location'),
  ('textbox', 'I-location'),
  ('admin1@mail.com', 'B-value')],
 [('Enter', 'O'),
  ('password', 'O'),
  ('to', 'O'),
  ('Password', 'B-location'),
  ('textbox', 'I-location'),
  ('Admin@123', 'B-value')],
 [('Click', 'O'), ('button', 'B-value'), ('Login', 'I-value')],
 [('Wait', 'O'),
  ('title', 'B-value'),
  ('to', 'O'),
  ('be', 'O'),
  ('present', 'O'),
  ('for', 'O'),
  ('30', 'B-time'),
  ('seconds', 'I-time')],
 [('Enter', 'O'),
  ('email', 'O'),
  ('address', 'O'),
  ('to', 'O'),
  ('Email', 'B-location'),
  ('textbox', 'I-location'),
  ('invalid@wrong', 'B-value')],
 [('Enter', 'O'),
  ('password', 'O'),
  ('to', 'O'),
  ('Password', 'B-location'),
  ('textbox', 'I-location'),
  ('invalidpassword', 'B-value')],
 [('Click', 'O'), ('Login', 'B-value'), ('button', 'I-value')],
 [('Wait', 'O'),
  ('email', 'B-value'),
  ('error', 'I-value'),
  ('message', 'I-value'),
  ('to', 'O'),
 

In [23]:
with open(data_folder+ "/train.txt", "w") as file:
  for sent in sents:
    file.write("\n")
    for item in sent:
      file.write("\n" + " ".join(item))
  print("Done")

Done


In [24]:
corpus: Corpus = ColumnCorpus(data_folder, columns, train_file = 'train.txt')

2022-06-12 22:11:55,668 Reading data from /content/drive/MyDrive/Colab Notebooks/data
2022-06-12 22:11:55,671 Train: /content/drive/MyDrive/Colab Notebooks/data/train.txt
2022-06-12 22:11:55,672 Dev: None
2022-06-12 22:11:55,674 Test: None


In [25]:
print(len(corpus.train))

34


In [26]:
corpus.train[0].to_tagged_string('ner')

'Sentence: "Enter email address to Email textbox admin1@mail.com" → ["Email textbox"/location, "admin1@mail.com"/value]'

In [27]:
print(corpus.train[-1].to_tagged_string('ner'))

Sentence: "Enter new phone number to phone number text box 12345678" → ["phone number text box"/location, "12345678"/value]


# Training

In [28]:
label_type = "ner"

In [29]:
tag_dictionary = corpus.make_label_dictionary(label_type=label_type)

2022-06-12 22:11:55,745 Computing label dictionary. Progress:


34it [00:00, 16090.08it/s]

2022-06-12 22:11:55,757 Dictionary created for label 'ner' with 4 values: value (seen 32 times), location (seen 12 times), time (seen 8 times)





In [30]:
print(tag_dictionary)

Dictionary with 4 tags: <unk>, value, location, time


In [31]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

In [32]:
embedding_types = [
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
]

2022-06-12 22:11:56,311 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpwjh0_flv


100%|██████████| 160000128/160000128 [00:06<00:00, 25888964.33B/s]


2022-06-12 22:12:02,847 copying /tmp/tmpwjh0_flv to cache at /root/.flair/embeddings/glove.gensim.vectors.npy
2022-06-12 22:12:03,084 removing temp file /tmp/tmpwjh0_flv
2022-06-12 22:12:03,456 https://flair.informatik.hu-berlin.de/resources/embeddings/token/glove.gensim not found in cache, downloading to /tmp/tmpq9vzmwvx


100%|██████████| 21494764/21494764 [00:01<00:00, 15693930.81B/s]

2022-06-12 22:12:05,177 copying /tmp/tmpq9vzmwvx to cache at /root/.flair/embeddings/glove.gensim
2022-06-12 22:12:05,200 removing temp file /tmp/tmpq9vzmwvx





2022-06-12 22:12:06,881 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-forward-0.4.1.pt not found in cache, downloading to /tmp/tmp13_1axn_


100%|██████████| 73034624/73034624 [00:04<00:00, 17839704.28B/s]

2022-06-12 22:12:11,327 copying /tmp/tmp13_1axn_ to cache at /root/.flair/embeddings/news-forward-0.4.1.pt





2022-06-12 22:12:11,433 removing temp file /tmp/tmp13_1axn_
2022-06-12 22:12:12,001 https://flair.informatik.hu-berlin.de/resources/embeddings/flair/news-backward-0.4.1.pt not found in cache, downloading to /tmp/tmpzv6y3i5t


100%|██████████| 73034575/73034575 [00:03<00:00, 23014928.91B/s]

2022-06-12 22:12:15,499 copying /tmp/tmpzv6y3i5t to cache at /root/.flair/embeddings/news-backward-0.4.1.pt





2022-06-12 22:12:15,605 removing temp file /tmp/tmpzv6y3i5t


In [33]:
embeddings = StackedEmbeddings(embeddings=embedding_types)

In [34]:
tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=label_type,
                        use_crf=True)

2022-06-12 22:12:15,820 SequenceTagger predicts: Dictionary with 13 tags: O, S-value, B-value, E-value, I-value, S-location, B-location, E-location, I-location, S-time, B-time, E-time, I-time


In [35]:
trainer = ModelTrainer(tagger, corpus)

In [45]:
trainer.train('resources/taggers/sota-ner-flair',
              learning_rate=0.1,
              train_with_dev=True,
              mini_batch_size=1,
              max_epochs=150)

2022-06-12 22:21:29,885 ----------------------------------------------------------------------------------------------------
2022-06-12 22:21:29,887 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
    (list_embedding_2): FlairEmbeddings(
      (lm): LanguageModel(
        (drop): Dropout(p=0.05, inplace=False)
        (encoder): Embedding(300, 100)
        (rnn): LSTM(100, 2048)
        (decoder): Linear(in_features=2048, out_features=300, bias=True)
      )
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=4196, out_features=4196, b

100%|██████████| 4/4 [00:00<00:00, 23.61it/s]

2022-06-12 22:22:03,021 Evaluating as a multi-label problem: False
2022-06-12 22:22:03,031 0.8	0.5714	0.6667	0.5
2022-06-12 22:22:03,032 
Results:
- F-score (micro) 0.6667
- F-score (macro) 0.8
- Accuracy 0.5

By class:
              precision    recall  f1-score   support

       value     0.7500    0.5000    0.6000         6
        time     1.0000    1.0000    1.0000         1

   micro avg     0.8000    0.5714    0.6667         7
   macro avg     0.8750    0.7500    0.8000         7
weighted avg     0.7857    0.5714    0.6571         7

2022-06-12 22:22:03,039 ----------------------------------------------------------------------------------------------------





{'dev_loss_history': [],
 'dev_score_history': [],
 'test_score': 0.6666666666666666,
 'train_loss_history': [0.07990737120901342,
  0.05931226382911427,
  0.08140684858130699,
  0.09559431395122996,
  0.12887827791689055,
  0.07306244648079004,
  0.10643832479710916,
  0.028233687673802712,
  0.06872195204837615,
  0.058675510732657844,
  0.06544429041639137,
  0.049977075654777894,
  0.055217778372498694,
  0.04750386135285672,
  0.03276113772480904,
  0.02773266831295198,
  0.07347876254510702,
  0.09119680468477724,
  0.03646449971819456,
  0.05400232931938313,
  0.03960466207624811,
  0.03917331057410258,
  0.02922255930847395,
  0.048842394662169275,
  0.03349969289559857,
  0.023432983341713377,
  0.04444575043859092,
  0.05364010325151748,
  0.04112821529345884,
  0.018879557187672442,
  0.031014222637871385,
  0.03349580729317931,
  0.06642650760239385,
  0.022980487922753545,
  0.03232858792556706,
  0.030170305954036216,
  0.03500580344501481,
  0.047110419291102754,
  0.097

# Load the data

In [42]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [43]:
model = SequenceTagger.load('resources/taggers/sota-ner-flair/final-model.pt')

2022-06-12 22:20:41,906 loading file resources/taggers/sota-ner-flair/final-model.pt
2022-06-12 22:20:42,658 SequenceTagger predicts: Dictionary with 15 tags: O, S-value, B-value, E-value, I-value, S-location, B-location, E-location, I-location, S-time, B-time, E-time, I-time, <START>, <STOP>


In [44]:
sentence = Sentence("Enter email address to Email textbox admin@gmail.com")
model.predict(sentence)
print(sentence.to_tagged_string())

Sentence: "Enter email address to Email textbox admin @ gmail.com" → ["Email textbox"/location, "admin"/value, "@"/value, "gmail.com"/value]


In [40]:
sentence = Sentence("Wait for email to appear for 20s")
model.predict(sentence)
print(sentence.to_tagged_string())

Sentence: "Wait for email to appear for 20s" → ["20s"/time]
