In [4]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
import docx2txt
import glob
import re 

import spacy
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words

import nltk
nltk.download('punkt')
from nltk import tokenize

from datasets import Dataset, load_metric

import torch
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer, AutoConfig
from transformers import BertForTokenClassification, DistilBertForTokenClassification, RobertaForTokenClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification
from transformers import pipeline

In [5]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

True


# Data Preprocessing

In [7]:
# Load the CrossNER AI-domain training split: one token per row, two tab-separated
# columns (word, entity tag), no header row.
# Forward slashes are used on purpose: in a normal string literal the sequences
# "\n", "\a" and "\t" of a backslash path are escape characters, not path parts.
ai_train = pd.read_csv(
    '../data/CrossNER/ner_data/ai/train.txt',
    delimiter='\t',
    header=None,
    quoting=3,              # csv.QUOTE_NONE: a bare '"' token must not open a quoted field
    keep_default_na=False,  # keep tokens such as "null" or "NA" as plain strings
)
ai_train = ai_train.rename(columns={0:'word', 1:'entity'})

# One inner list per sentence
entire_word_lst = []
entire_entity_lst = []

# Accumulators for the sentence currently being read
word_lst = []
entity_lst = []

for token, tag in zip(ai_train['word'], ai_train['entity']):
  word_lst.append(token.lower())
  entity_lst.append(tag)

  # A lone "." token closes the current sentence.
  if token == '.':
    entire_word_lst.append(word_lst)
    entire_entity_lst.append(entity_lst)
    word_lst = []
    entity_lst = []

# Keep trailing tokens when the file does not end with a "." token.
if word_lst:
  entire_word_lst.append(word_lst)
  entire_entity_lst.append(entity_lst)
  word_lst = []
  entity_lst = []


ai_train_cleaned = pd.DataFrame({'word':entire_word_lst, 'entity': entire_entity_lst})
ai_train_cleaned

Unnamed: 0,word,entity
0,"[popular, approaches, of, opinion-based, recom...","[O, O, O, B-product, I-product, I-product, O, ..."
1,"[advocates, of, procedural, representations, w...","[O, O, O, O, O, O, O, O, B-university, O, O, O..."
2,"[the, standard, interface, and, calculator, in...","[O, O, O, O, O, O, O, O, O, B-programlang, O]"
3,"[octave, helps, in, solving, linear, and, nonl...","[B-product, O, O, O, O, O, O, O, O, O, O, O, O..."
4,"[variants, of, the, back-propagation, algorith...","[O, O, O, B-algorithm, I-algorithm, O, O, O, B..."
...,...,...
88,"[poggio, is, an, honorary, member, of, the, ne...","[B-researcher, O, O, O, O, O, O, B-organisatio..."
89,"[during, the, 1990s, ,, encouraged, by, succes...","[O, O, O, O, O, O, O, O, B-task, I-task, O, B-..."
90,"[in, 1999, ,, felix, gers, and, his, advisor, ...","[O, O, O, B-researcher, I-researcher, O, O, O,..."
91,"[59, ,, pp., 2547-2553, ,, oct., 2011, in, one...","[O, O, O, O, O, O, O, O, B-misc, I-misc, I-mis..."


In [8]:
# Load the CrossNER AI-domain dev split: one token per row, two tab-separated
# columns (word, entity tag), no header row.
# Forward slashes are used on purpose: in a normal string literal the sequences
# "\n" and "\a" of a backslash path are escape characters, not path parts.
ai_dev = pd.read_csv(
    '../data/CrossNER/ner_data/ai/dev.txt',
    delimiter='\t',
    header=None,
    quoting=3,              # csv.QUOTE_NONE: a bare '"' token must not open a quoted field
    keep_default_na=False,  # keep tokens such as "null" or "NA" as plain strings
)
ai_dev = ai_dev.rename(columns={0:'word', 1:'entity'})

# One inner list per sentence
entire_word_lst = []
entire_entity_lst = []

# Accumulators for the sentence currently being read
word_lst = []
entity_lst = []

for token, tag in zip(ai_dev['word'], ai_dev['entity']):
  word_lst.append(token.lower())
  entity_lst.append(tag)

  # A lone "." token closes the current sentence.
  if token == '.':
    entire_word_lst.append(word_lst)
    entire_entity_lst.append(entity_lst)
    word_lst = []
    entity_lst = []

# Keep trailing tokens when the file does not end with a "." token.
if word_lst:
  entire_word_lst.append(word_lst)
  entire_entity_lst.append(entity_lst)
  word_lst = []
  entity_lst = []

ai_dev_cleaned = pd.DataFrame({'word':entire_word_lst, 'entity': entire_entity_lst})

In [9]:
# Preview the cleaned splits.
# NOTE(review): a cell renders only its final expression, so only the dev preview
# appears below; the result of the train `.head()` call is discarded.
ai_train_cleaned.head()
ai_dev_cleaned.head()

Unnamed: 0,word,entity
0,"[here, ,, accuracy, is, measured, by, error, r...","[O, O, B-metrics, O, O, O, B-metrics, I-metric..."
1,"[brion, james, portrays, leon, kowalski, ,, a,...","[B-person, I-person, O, B-person, I-person, O,..."
2,"[the, first, picture, to, be, scanned, ,, stor...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[segmenting, the, text, into, topics, or, disc...","[B-task, I-task, I-task, I-task, I-task, O, B-..."
4,"[at, indiana, university, in, 1999, he, organi...","[O, B-university, I-university, O, O, O, O, O,..."


In [10]:
# BIO tag inventory for the CrossNER AI domain. The position of a tag in this
# list is its integer class id, so the two lookup tables below are derived from it.
ai_labels = [
    "O",
    "B-field", "I-field",
    "B-task", "I-task",
    "B-product", "I-product",
    "B-algorithm", "I-algorithm",
    "B-researcher", "I-researcher",
    "B-metrics", "I-metrics",
    "B-programlang", "I-programlang",
    "B-conference", "I-conference",
    "B-university", "I-university",
    "B-country", "I-country",
    "B-person", "I-person",
    "B-organisation", "I-organisation",
    "B-location", "I-location",
    "B-misc", "I-misc",
]

# tag -> integer id
ai_label_encoding = {tag: idx for idx, tag in enumerate(ai_labels)}

# integer id -> tag
ai_id2label = dict(enumerate(ai_labels))

# Wrap both cleaned frames as HuggingFace Datasets (train first, then dev).
train_hf, dev_hf = (
    Dataset.from_pandas(frame) for frame in (ai_train_cleaned, ai_dev_cleaned)
)

In [None]:
# Print the first five token lists, then the first five tag lists.
for column in ('word', 'entity'):
    print(train_hf[column][:5])

In [12]:
# Batch sizes handed to TrainingArguments (per device)
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16

# Task tag; it becomes part of the training output directory name
task = 'ner'

# SciBERT checkpoint (pre-trained on scientific papers)
scibert_model_checkpoint = 'allenai/scibert_scivocab_uncased'

# Tokenizer that belongs to the SciBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(scibert_model_checkpoint)

In [13]:
def tokenize_and_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    Uses the module-level `tokenizer` and `ai_label_encoding`.

    Parameters
    ----------
    examples : mapping
        Batch with a "word" entry (one list of words per sentence) and an
        "entity" entry (one list of tags per sentence, one tag per word).
    label_all_tokens : bool, default True
        If True, every sub-word piece of a word receives that word's label id.
        If False, only the first piece is labelled and the remaining pieces
        get -100.

    Returns
    -------
    The tokenizer output for the batch, with an added "labels" entry holding
    one list of label ids per sentence (aligned with the tokenized pieces).

    Raises
    ------
    KeyError
        If a tag is not a key of `ai_label_encoding`.
    """
    tokenized_inputs = tokenizer(examples["word"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["entity"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens have a word id that is None. We set the label to -100
                # so they are automatically ignored in the loss function.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First piece of a word, or a continuation piece when all pieces are labelled.
                # NOTE(review): a continuation piece of a "B-x" word also gets the "B-x" id;
                # consider mapping it to "I-x" - confirm against the evaluation scheme.
                label_ids.append(ai_label_encoding[label[word_idx]])
            else:
                # Continuation piece that should not contribute to the loss.
                label_ids.append(-100)

            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Tokenize and label-align both splits in batches (train first, then dev).
train_tokenized_datasets, dev_tokenized_datasets = (
    split.map(tokenize_and_labels, batched=True) for split in (train_hf, dev_hf)
)

In [15]:
# Call the alignment function directly on the first two training sentences to inspect its output.
tokenize_and_labels(train_hf[:2])

{'input_ids': [[102, 6237, 2688, 131, 9092, 579, 791, 28309, 429, 10766, 1711, 2190, 1471, 3267, 7566, 422, 776, 6606, 422, 18996, 669, 145, 1461, 469, 17755, 18996, 669, 546, 137, 4631, 1904, 412, 205, 316, 205, 19649, 422, 151, 205, 5527, 422, 316, 205, 261, 205, 4552, 422, 118, 205, 151, 205, 28369, 30123, 422, 316, 205, 12653, 422, 316, 205, 115, 205, 20736, 422, 182, 205, 115, 205, 18173, 422, 128, 205, 9524, 422, 145, 497, 30141, 546, 422, 422, 2337, 145, 305, 546, 862, 139, 21842, 5020, 205, 103], [102, 29158, 131, 16336, 6859, 267, 3680, 11853, 235, 1805, 422, 604, 111, 11201, 131, 2206, 8859, 678, 10746, 137, 262, 712, 387, 1203, 30108, 205, 103]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# Model Initialization & Training 

In [None]:
# Token-classification model initialised from the SciBERT checkpoint, with one
# output class per tag in `ai_labels`. Both directions of the label mapping are
# passed so the model config carries a consistent id2label / label2id pair.
model = AutoModelForTokenClassification.from_pretrained(
    scibert_model_checkpoint,
    num_labels=len(ai_labels),
    id2label=ai_id2label,
    label2id=ai_label_encoding,
)

In [26]:
# Checkpoint name without the organisation prefix ("allenai/")
scibert_model_name = scibert_model_checkpoint.split("/")[-1]

args = TrainingArguments(
    # Output directory. Written as a raw f-string so the backslashes are literal
    # characters rather than the invalid escape sequences "\m" and "\{";
    # the resulting path is unchanged.
    rf"..\model\{scibert_model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    seed=42,
    num_train_epochs=15,
    weight_decay=0.001,
    # Evaluate and checkpoint every epoch, then reload the best checkpoint at the end.
    load_best_model_at_end=True,
    save_strategy = 'epoch'
    )

In [27]:
# seqeval metric object; compute_metrics reads its overall_* scores
metric = load_metric("seqeval")

# Batch collator for token classification, built from the SciBERT tokenizer
data_collator = DataCollatorForTokenClassification(tokenizer)

In [28]:
def compute_metrics(p):
    """Convert raw model scores into the seqeval overall metrics.

    `p` unpacks into (predictions, labels). The predicted class is the argmax
    over axis 2 of `predictions`. Positions whose gold label is -100 are
    dropped, the remaining ids are mapped to tag strings through `ai_labels`,
    and the module-level `metric` scores the two tag sequences.

    Returns a dict with "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label (ignored index is -100).
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([ai_labels[pred] for pred, _ in kept])
        true_labels.append([ai_labels[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

In [29]:
# Bundle model, training arguments, tokenized splits, collator and metric function.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_tokenized_datasets,
    eval_dataset=dev_tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train & Evaluate

In [None]:
# Run fine-tuning with the settings given to the Trainer above.
trainer.train()

In [31]:
# Score the model on the Trainer's eval_dataset (the dev split) using compute_metrics.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: entity, word. If entity, word are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 320
  Batch size = 16


{'eval_loss': 0.6172993779182434,
 'eval_precision': 0.525668222316504,
 'eval_recall': 0.5431828145550197,
 'eval_f1': 0.5342820181112549,
 'eval_accuracy': 0.8403590318961373,
 'eval_runtime': 2.3754,
 'eval_samples_per_second': 134.713,
 'eval_steps_per_second': 8.42,
 'epoch': 15.0}

# Save the best model (with the lowest loss)

Keeping the checkpoint with the lowest evaluation loss helps prevent overfitting.

In [None]:
# Raw string: the backslashes are literal path separators, not the invalid
# escape sequences "\m" and "\s". The resulting path is unchanged.
trainer.save_model(r'..\model\scibert_ner_ai.model')

# Load the finetuned model 

In [None]:
# Reload the fine-tuned weights from the saved model directory.
# Raw string: the backslashes are literal path separators, not the invalid
# escape sequences "\m" and "\s". The resulting path is unchanged.
s_model = AutoModelForTokenClassification.from_pretrained(r'..\model\scibert_ner_ai.model')

In [34]:
# Reload the tokenizer from the saved model directory.
# Raw string: the backslashes are literal path separators, not the invalid
# escape sequences "\m" and "\s". The resulting path is unchanged.
s_tokenizer = AutoTokenizer.from_pretrained(r'..\model\scibert_ner_ai.model')

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


# Test on proposals

In [36]:
def parse_doc(file_path):
    """Extract the text of a .docx file and split it into sentences.

    Parameters
    ----------
    file_path : str
        Path of the .docx document to read.

    Returns
    -------
    list of str
        Sentences produced by NLTK's `sent_tokenize` from the document text,
        after joining its paragraphs and collapsing whitespace runs.
    """
    with open(file_path, 'rb') as f:
        doc = docx2txt.process(f)

    # Paragraph-level split on blank lines.
    # NOTE(review): the stop-word test is applied to whole paragraphs, so it only
    # drops a paragraph whose entire text is a single stop word - confirm intent.
    paragraphs = [par for par in doc.rstrip().split('\n\n') if par.lower() not in stopwords]

    # Join paragraphs and collapse every whitespace run (including newlines) to one
    # space. The pattern is a raw string so "\s" reaches the regex engine as written.
    flat_text = re.sub(r'\s+', ' ', " ".join(paragraphs))

    return tokenize.sent_tokenize(flat_text)

In [37]:
# Parse one sponsor proposal into sentences. The path uses forward slashes only:
# the original mixed "/" with backslashes that formed invalid escape sequences.
parsed_p = parse_doc('../data/FY21H1/Sponsor Proposals/Azure Compute/Azure Compute FY21H1 Project Proposal Doc.docx')

In [None]:
# Display the full list of sentences extracted from the proposal.
parsed_p

In [None]:
# NER pipeline over the fine-tuned model; with aggregation_strategy="simple" the
# pipeline returns grouped entities (dicts with 'entity_group', 'word', 'score', ...).
nlp = pipeline("ner", model=s_model, tokenizer=s_tokenizer, aggregation_strategy="simple")

word_lst = []  # surface text of every entity found, across all sentences
for p in parsed_p:
  result = nlp(p)
  print(p)
  # Show the entities found in this sentence; previously `result` was computed
  # but never displayed or stored.
  for ent in result:
    print(f"  {ent['entity_group']}: {ent['word']} ({ent['score']:.3f})")
    word_lst.append(ent['word'])

## Future work

The code above fine-tunes SciBERT on the downstream NER task, using the train/dev sets of CrossNER's AI domain. We also ran the resulting model on our proposals to extract named entities related to AI concepts. 

Although model development is complete, further steps are needed to **aggregate all entities extracted from the proposals in a form that can be visualized in Power BI.** This will be the main future work after this micro-internship. 