In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Robust_SlotFilling

/content/drive/MyDrive/Robust_SlotFilling


In [3]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
Collecting jupyter
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting loguru
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.10.1-py3-none-any.whl (469 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downl

In [4]:
!wandb login 

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
import json
import pandas as pd
import transformers
from typing import List
import re
import matplotlib.pyplot as plt
import numpy as np
import wandb

# Data Pre-processing

In [6]:
train_data = pd.read_json('./data/train_Mar-22-2023.json')
val_data = pd.read_json('./data/val_Mar-22-2023.json')

In [7]:
combined_data = [train_data, val_data]

In [8]:
data = pd.concat(combined_data, ignore_index=True)

In [10]:
def label_with_bio(full_text, spans, template_id):
    """
    Takes full-text utterances and spans (containing entity value and entity type) and creates B-I-O labels

    Args:
        df (pd.DataFrame): DataFrame with utterances, annotations, metadata, and ner-tags.
    Returns:
        id2labels (dict): dictionary with integer ids as keys and associated tags (one per id) as values
        dict (dict): dictionary with tags as keys and associated integer ids (one per tag) as values
        
    """
    words = full_text.split()
    ner_tags = ['O']*len(words)

    SPECIAL_CHAR_REGEX = re.compile(r"(\w+)([\.\?\!\,\:\'])\B")
    
    words = [re.sub(SPECIAL_CHAR_REGEX, r"\g<1>", word) for word in words]
    
    try:

        for span in spans:

        # # Handle rogue IP Address case
        #     if(template_id == 37):
        #         ner_tags[-1] = 'B-' + span['entity_type']
        #         ner_tags[-1] = span['entity_type']
        #         return ner_tags


            entity_words = span['entity_value'].split()
            entity_words = [re.sub(SPECIAL_CHAR_REGEX, r"\g<1>", word) for word in entity_words]

            span_labels = ['I-' + span['entity_type']]*len(entity_words)
            span_labels[0] = 'B-' + span['entity_type']
            # span_labels[-1] = 'E-' + span['entity_type']

                
            start_idx = words.index(entity_words[0])
            end_idx = words.index(entity_words[-1])

        
            if start_idx - end_idx != 0:
                ner_tags[start_idx:end_idx + 1] = span_labels
            else:
                ner_tags[start_idx] = span_labels[0]
        
        return ner_tags
    
    
    except:
        return "Null"
    

    

# Sample Data

In [11]:
full_text = "When they weren't singing about Hobbits, satanic felines and interstellar journeys, they were singing about the verses from Carla Pinto's Cautionary Tales. Is there a better example of unbridled creativity than early Pinto?"

spans = [
            {
                "entity_type": "PERSON",
                "entity_value": "Pinto",
                "start_position": 217,
                "end_position": 222
            },
            {
                "entity_type": "PERSON",
                "entity_value": "Carla Pinto",
                "start_position": 124,
                "end_position": 135
            }
        ]

In [12]:
ner_tags = label_with_bio(full_text, spans, 10)

# Data Labeling

In [13]:
def get_label_ids(df: pd.DataFrame)-> (dict, dict):
    """
    Takes a DataFrame with token labels (O, B-provider, I-specialty) and creates mappings
    from unique integer ids to tokens and vice versa.

    Args:
        df (pd.DataFrame): DataFrame with utterances, annotations, metadata, and ner-tags.
    Returns:
        id2labels (dict): dictionary with integer ids as keys and associated tags (one per id) as values
        dict (dict): dictionary with tags as keys and associated integer ids (one per tag) as values
    """
    unique_tags = list(set([item for sublist in list(df["ner_tags_text"]) for item in sublist]))
    id2label, label2id = {}, {}
    for ind, tag in enumerate(unique_tags):
        id2label[ind], label2id[tag]= tag, ind
    return id2label, label2id

In [14]:
def convert_label_to_id(labels: List[str], label2id) -> List[int]:
    """
    Converts a list of labels (O, O, B-specialty) to a list of ids (0, 0, 5)

    Args:
        labels (List[str]): list of token labels in text
        label2id (dict): dictionary mapping labels to unique ids

    Returns:
        List[int]: mapping of input text token labels to integer token labels
    """
    return[label2id[label] for label in labels]

In [15]:
def populate_df_with_labels(data, id2label, label2id):

    data['ner_tags_text'] = data.apply(lambda x: label_with_bio(x.full_text, x.spans, x.template_id), axis=1)

    # Remove labels 
    data = data[data['ner_tags_text'] != "Null"]
    data["tokens"] = data['full_text'].apply(lambda x: x.split())
     

    data["ner_tags"] = data["ner_tags_text"].apply(lambda x: convert_label_to_id(x, label2id))

    # Removing invalid labels
    data['token_length'] = data.tokens.apply(lambda x: len(x))
    data['tag_length'] = data.ner_tags.apply(lambda x: len(x))
    data["difference"] = data.token_length - data.tag_length
    data = data[data['difference'] == 0]

    return data

In [16]:
data['ner_tags_text'] = data.apply(lambda x: label_with_bio(x.full_text, x.spans, x.template_id), axis=1)
data = data[data['ner_tags_text'] != "Null"]
data["tokens"] = data['full_text'].apply(lambda x: x.split())

id2label, label2id = get_label_ids(data)

data["ner_tags"] = data["ner_tags_text"].apply(lambda x: convert_label_to_id(x, label2id))
data['token_length'] = data.tokens.apply(lambda x: len(x))
data['tag_length'] = data.ner_tags.apply(lambda x: len(x))
data["difference"] = data.token_length - data.tag_length
data = data[data['difference'] == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tokens"] = data['full_text'].apply(lambda x: x.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["ner_tags"] = data["ner_tags_text"].apply(lambda x: convert_label_to_id(x, label2id))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['token_length'] = data.tokens.apply(lambda x: l

## Populate dataframe with BIO labels

In [18]:
train_data = populate_df_with_labels(train_data, id2label, label2id)

In [None]:
val_data = populate_df_with_labels(val_data, id2label, label2id)

In [20]:
len(train_data)

149

In [21]:
len(val_data)

44

# Model Training

In [24]:
from datasets import Dataset, Features, Value, ClassLabel, Sequence
from transformers import AutoTokenizer, AutoModel

In [25]:
class TCDataMaker:
    def __init__(self, df: pd.DataFrame, id2label: dict,  label2id: dict):
        self.id2label = id2label
        self.label2id = label2id
        self.ner_tags = []
        self.tokens = []
        self.ids = []
        
        for ind, row in df.iterrows():
            self.ner_tags.append(row["ner_tags"])
            self.tokens.append(row["tokens"])
            self.ids.append(ind)
            
        self.data = {
            "id": self.ids,
            "ner_tags": self.ner_tags,
            "tokens": self.tokens
        }  
    
    def get_dataset(self):
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=list(id2label.values()))),
            "id": Value("int32")
        })
        return Dataset.from_dict(self.data, features)

In [26]:
train_datamaker = TCDataMaker(train_data,id2label,label2id)
val_datamaker = TCDataMaker(val_data,id2label,label2id)

In [27]:
train_ner_dataset = train_datamaker.get_dataset()
val_ner_dataset = val_datamaker.get_dataset()

In [28]:
from datasets import Dataset
from transformers import AutoTokenizer,DataCollatorForTokenClassification

In [29]:
tokenizer = AutoTokenizer.from_pretrained("StanfordAIMI/stanford-deidentifier-base")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [31]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [32]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            # print(word_idx)
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [33]:
tokenized_train_dataset = train_ner_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_val_dataset = val_ner_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

In [35]:
label_list = list(label2id.keys())

In [37]:
import evaluate
import random
def compute_metrics(eval_preds):
  metric = evaluate.load("f1")

  results = {}

  predictions, labels = eval_preds
  predictions = np.argmax(predictions, axis=2)

  true_predictions = [
      [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)
  ]
  true_labels = [
      [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
      for prediction, label in zip(predictions, labels)
  ]

  true_predictions = [p for pred in true_predictions for p in pred]
  true_labels = [l for label in true_labels for l in label]
  

  true_predictions = [label2id[pred] for pred in true_predictions]
  true_labels = [label2id[label] for label in true_labels]

  for label in set(true_labels):
    binary_predictions = [1 if pred == label else 0 for pred in true_predictions]
    binary_labels = [1 if l == label else 0 for l in true_labels]
    results[id2label[label]] = metric.compute(predictions=binary_predictions, references=binary_labels, average="binary")["f1"]

  return(results)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import BertForTokenClassification


model = BertForTokenClassification.from_pretrained("StanfordAIMI/stanford-deidentifier-base", num_labels=len(id2label), id2label=id2label, label2id=label2id, ignore_mismatched_sizes=True)

training_args = TrainingArguments(
    output_dir="./results/de_identification",
    overwrite_output_dir=True,
    logging_strategy='steps',
    logging_steps=1,
    logging_dir='/content/drive/MyDrive/Robust_SlotFilling/results/runs',
    report_to="wandb",
    run_name="manual-dataset-run-3-195-stanford",
    warmup_steps=0,
    evaluation_strategy="steps",
    eval_steps=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

In [40]:
from transformers import pipeline

## Loading Model

In [None]:
tokenizer = AutoTokenizer.from_pretrained("./results/de_identification/checkpoint-500/")
model = AutoModelForTokenClassification.from_pretrained("./results/de_identification/checkpoint-500/")

# Model Testing

In [41]:
token_classifier = pipeline("token-classification", model=model, tokenizer=tokenizer, aggregation_strategy= "average", device='cuda:0')

In [42]:
token_classifier(val_data.full_text.to_list())

[[{'entity_group': 'ORGANIZATION',
   'score': 0.73633033,
   'word': 'kragen auto parts',
   'start': 0,
   'end': 17}],
 [{'entity_group': 'GPE',
   'score': 0.9432222,
   'word': 'germany',
   'start': 18,
   'end': 25}],
 [{'entity_group': 'EMAIL_ADDRESS',
   'score': 0.9346185,
   'word': 'aleahdocherty',
   'start': 23,
   'end': 36},
  {'entity_group': 'EMAIL_ADDRESS',
   'score': 0.8133048,
   'word': '@',
   'start': 36,
   'end': 37},
  {'entity_group': 'ORGANIZATION',
   'score': 0.48778424,
   'word': 'jourrapide',
   'start': 37,
   'end': 47},
  {'entity_group': 'ORGANIZATION',
   'score': 0.31336486,
   'word': '. com',
   'start': 47,
   'end': 51}],
 [{'entity_group': 'TITLE',
   'score': 0.94536376,
   'word': 'mrs',
   'start': 84,
   'end': 87},
  {'entity_group': 'TITLE',
   'score': 0.5880402,
   'word': '.',
   'start': 87,
   'end': 88},
  {'entity_group': 'PERSON',
   'score': 0.93396324,
   'word': 'thea solberg',
   'start': 89,
   'end': 101}],
 [{'entity_gr

## Model Interpretability Tests

In [None]:
!pip install transformers-interpret

In [44]:
from transformers_interpret import TokenClassificationExplainer


In [None]:
ner_explainer = TokenClassificationExplainer(
    model,
    tokenizer,
)

# sample_text = "We visited Paris last weekend, where Emmanuel Macron lives."
sample_text = val_data.full_text.to_list()

for text in sample_text:
  word_attributions = ner_explainer(text)
  ner_explainer.visualize("bert_ner_viz.html")