In [1]:
!pip install tqdm
!pip install transformers




[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


^C





[notice] A new release of pip is available: 23.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import pandas as pd
from tqdm import tqdm
import json

from transformers import AutoTokenizer

# Tokenizer shared by the `tokenize` helper defined further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Load the raw annotation records. The context manager guarantees the file
# handle is closed once parsing finishes (or fails).
with open("../../data/lighttag_preprocessing/validation_parents.json", "r", encoding="utf-8") as raw_file:
    raw_dataset = json.load(raw_file)

In [13]:
# BIO tag vocabulary. A tag's position in the list below is its integer id:
# "O" is 0, and every category contributes a "B-" tag immediately followed by
# its "I-" tag.
id2label = dict(enumerate([
    "O",
    "B-Artifacts", "I-Artifacts",
    "B-Behavioral", "I-Behavioral",
    "B-Biological Kind", "I-Biological Kind",
    "B-Mental State", "I-Mental State",
    "B-Non-Living Kind", "I-Non-Living Kind",
    "B-Normative Feature", "I-Normative Feature",
    "B-Perceptual", "I-Perceptual",
    "B-Social Kind/Role", "I-Social Kind/Role",
    "B-Anthropomorphized", "I-Anthropomorphized",
]))
# Reverse lookup: tag string -> integer id.
label2id = {tag: tag_id for tag_id, tag in id2label.items()}

In [14]:
# Group the flat annotation records by sentence id. Records labelled
# "Relation" or "Purpose" are skipped; every other record becomes one span
# entry in its sentence's "labels" list.
dataset = {}
for data in raw_dataset:
    sentence_id = data["sentence_id"]
    sentence = data["sentence"]
    label = data["label"]
    if label in ("Relation", "Purpose"):
        continue

    # setdefault creates the per-sentence entry the first time the id is seen
    # and returns the existing entry afterwards.
    dataset.setdefault(
        sentence_id,
        {"sentence": sentence, "sentence_id": sentence_id, "labels": []},
    )["labels"].append({
        "word": data["word"],
        "label": label,
        "start": data["start"],
        "end": data["end"],
    })

# Show one grouped sentence as a sanity check.
dataset["07dfb029-196c-4db6-a7c9-fff69233d4c3"]

{'sentence': 'can you say hi Amanda',
 'sentence_id': '07dfb029-196c-4db6-a7c9-fff69233d4c3',
 'labels': [{'word': 'you', 'label': 'Biological Kind', 'start': 4, 'end': 7},
  {'word': 'Amanda', 'label': 'Biological Kind', 'start': 15, 'end': 21},
  {'word': 'say', 'label': 'Behavioral', 'start': 8, 'end': 11}]}

In [7]:
# Quick interactive check of how the tokenizer splits a single word.
tokenizer.tokenize("looks")

['looks']

In [16]:
def tokenize(data):
    """Tokenize ``data['sentence']`` and describe every resulting token.

    Args:
        data: Mapping with a ``"sentence"`` key holding the text to encode.

    Returns:
        A list with one dict per token returned by the module-level
        ``tokenizer``, in order. Each dict holds:

        * ``"token"``: the token string,
        * ``"start_index"`` / ``"end_index"``: the two values of the token's
          entry in the encoding's offset mapping,
        * ``"id"``: the token's input id.
    """
    encoded = tokenizer.encode_plus(data['sentence'], return_offsets_mapping=True)
    ids = encoded["input_ids"]
    offsets = encoded["offset_mapping"]
    tokens = tokenizer.convert_ids_to_tokens(ids)
    # tokens, offsets and ids are parallel sequences, so walk them together
    # instead of indexing each one by position.
    return [
        {
            "token": token,
            "start_index": offset[0],
            "end_index": offset[1],
            "id": token_id,
        }
        for token, offset, token_id in zip(tokens, offsets, ids)
    ]

def most_frequent(list_of_labels):
    """Return the label that occurs most often in ``list_of_labels``.

    When several labels share the highest count, the one that appears first
    in the list is returned, so the result is the same on every run.

    Args:
        list_of_labels: A list of hashable labels.

    Returns:
        The most common label.

    Raises:
        ValueError: If ``list_of_labels`` is empty.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a
    # set of strings instead would make the winner of a tie depend on hash
    # randomisation, i.e. vary between interpreter sessions.
    candidates = dict.fromkeys(list_of_labels)
    # max() returns the first maximal item it encounters.
    return max(candidates, key=list_of_labels.count)
    
    
def get_clean_label(label):
    """Strip the `` @@@ `` suffix from a raw label.

    ``"O"`` is returned as is; for any other label, everything from the first
    `` @@@ `` separator onwards is dropped.
    """
    return label if label == "O" else label.partition(" @@@ ")[0]


def generate_labeled_tokens(data):
    """Tokenize a sentence and attach raw, clean and BIO labels to each token.

    Args:
        data: Mapping with ``"sentence"`` (the text) and ``"labels"`` (a list
            of span dicts carrying ``"label"``, ``"start"`` and ``"end"``
            character offsets).

    Returns:
        The token dicts produced by ``tokenize(data)``, each extended in place
        with:

        * ``"raw_label"``: ``"O"`` or ``"<label> @@@ #<span index>"``,
        * ``"clean_label"``: the raw label passed through ``get_clean_label``,
        * ``"bio_label"``: ``"O"``, ``"B-<clean>"`` or ``"I-<clean>"``.
    """
    sentence = data["sentence"]
    spans = data["labels"]
    tokens = tokenize(data)

    # Paint every character with the tag of the span covering it. The span
    # index is part of the tag so two neighbouring spans with the same label
    # stay distinguishable; a later span overwrites an earlier one on overlap.
    char_tags = ["O"] * len(sentence)
    for span_no, span in enumerate(spans):
        label, start, end = span["label"], span["start"], span["end"]
        char_tags[start:end] = [f"{label} @@@ #{span_no}"] * (end - start)

    previous_raw = None  # no token precedes the first one
    for token in tokens:
        first, last = token["start_index"], token["end_index"]
        # Tokens with an empty character range get "O"; every other token
        # takes the most frequent tag among the characters it covers.
        raw = "O" if first == last else most_frequent(char_tags[first:last])
        clean = get_clean_label(raw)
        token["raw_label"] = raw
        token["clean_label"] = clean

        if raw == "O":
            token["bio_label"] = "O"
        elif raw == previous_raw:
            # Same span as the preceding token: continuation.
            token["bio_label"] = "I-" + clean
        else:
            # First token of the sequence, or a different span: beginning.
            token["bio_label"] = "B-" + clean
        previous_raw = raw

    return tokens

def tokens_to_list(sentence_id, tokens):
    """Turn labelled token dicts into parallel, column-style lists.

    Args:
        sentence_id: Identifier stored unchanged in the result.
        tokens: Iterable of token dicts with ``"token"``, ``"id"`` and
            ``"bio_label"`` keys.

    Returns:
        A dict with ``"sentence_id"`` plus four equally long lists:
        ``"tokens"``, ``"token_ids"``, ``"ner_tags"`` and ``"label_ids"``.
        Tokens starting with ``"##"`` and the tokens ``[CLS]``, ``[SEP]`` and
        ``[PAD]`` get ``-100`` in both ``"ner_tags"`` and ``"label_ids"``;
        every other token gets its BIO tag and the tag's ``label2id`` id.
    """
    skipped = ('[CLS]', '[SEP]', '[PAD]')
    columns = {
        "sentence_id": sentence_id,
        "tokens": [],
        "token_ids": [],
        "ner_tags": [],
        "label_ids": [],
    }
    for entry in tokens:
        piece = entry["token"]
        columns["tokens"].append(piece)
        columns["token_ids"].append(entry["id"])

        if piece.startswith("##") or piece in skipped:
            tag_id = tag = -100
        else:
            tag = entry["bio_label"]
            tag_id = label2id[tag]
        columns["label_ids"].append(tag_id)
        columns["ner_tags"].append(tag)

    return columns

# Smoke test: run the full pipeline on one known sentence and print each
# output column. 459 is passed as the sentence_id here; it is not the
# sentence's own key, presumably just a placeholder for this check.
example_tokens = generate_labeled_tokens(dataset["07dfb029-196c-4db6-a7c9-fff69233d4c3"])
result = tokens_to_list(459, example_tokens)
for column in ('tokens', 'ner_tags', 'token_ids', 'label_ids'):
    print(result[column])

['[CLS]', 'can', 'you', 'say', 'hi', 'amanda', '[SEP]']
[-100, 'O', 'B-Biological Kind', 'B-Behavioral', 'O', 'B-Biological Kind', -100]
[101, 2064, 2017, 2360, 7632, 8282, 102]
[-100, 0, 5, 3, 0, 5, -100]


In [17]:
# Convert every grouped sentence into its token / label-id columns.
results = [
    tokens_to_list(sentence_id, generate_labeled_tokens(sentence_data))
    for sentence_id, sentence_data in dataset.items()
]
# Preview only the first few records; echoing the whole list floods the
# notebook output.
results[:3]

[{'sentence_id': '207e43bd-33d0-4015-86f5-e3742943fc0a',
  'tokens': ['[CLS]',
   'let',
   'me',
   'just',
   'double',
   'check',
   'there',
   '[SEP]'],
  'token_ids': [101, 2292, 2033, 2074, 3313, 4638, 2045, 102],
  'ner_tags': [-100,
   'O',
   'B-Biological Kind',
   'O',
   'B-Perceptual',
   'B-Behavioral',
   'O',
   -100],
  'label_ids': [-100, 0, 5, 0, 13, 3, 0, -100]},
 {'sentence_id': '677e16cb-3282-45be-b535-9347fb092d08',
  'tokens': ['[CLS]', 'what', 'are', 'you', 'doing', 'huh', '[SEP]'],
  'token_ids': [101, 2054, 2024, 2017, 2725, 9616, 102],
  'ner_tags': [-100, 'O', 'O', 'B-Biological Kind', 'B-Behavioral', 'O', -100],
  'label_ids': [-100, 0, 0, 5, 3, 0, -100]},
 {'sentence_id': '8f873564-bf07-48bb-981e-0508526128c5',
  'tokens': ['[CLS]', 'i', 'gotta', 'put', 'my', 'sister', '[SEP]'],
  'token_ids': [101, 1045, 10657, 2404, 2026, 2905, 102],
  'ner_tags': [-100,
   'B-Biological Kind',
   'B-Behavioral',
   'B-Behavioral',
   'O',
   'B-Social Kind/Role',
   

In [11]:
# Write the converted sentences to disk as indented UTF-8 JSON; non-ASCII
# characters are written as they are rather than as \u escapes.
with open("../../data/validation_parents_ner.json", 'w', encoding="utf-8") as out_file:
    out_file.write(json.dumps(results, ensure_ascii=False, indent=4))