In [2]:
import json
from transformers import AutoTokenizer


# Load the pretrained tokenizer for the "microsoft/deberta-v3-base" checkpoint.
# On first use this downloads the tokenizer files (see the progress bars below).
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [3]:
# Load the training documents. `data` is indexed like a list of per-document
# dicts by the cells below (keys used: "tokens", "labels",
# "trailing_whitespace", "full_text").
# A context manager closes the file handle deterministically, and the explicit
# encoding avoids depending on the platform's default locale.
with open("/drive2/kaggle/pii-dd/data/train.json", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Pull the word-level fields out of the first document for inspection.
d = data[0]
tokens, labels, tw = d["tokens"], d["labels"], d["trailing_whitespace"]

# Preview the first 20 (token, label) pairs.
print(list(zip(tokens[:20], labels[:20])))

[('Design', 'O'), ('Thinking', 'O'), ('for', 'O'), ('innovation', 'O'), ('reflexion', 'O'), ('-', 'O'), ('Avril', 'O'), ('2021', 'O'), ('-', 'O'), ('Nathalie', 'B-NAME_STUDENT'), ('Sylla', 'I-NAME_STUDENT'), ('\n\n', 'O'), ('Challenge', 'O'), ('&', 'O'), ('selection', 'O'), ('\n\n', 'O'), ('The', 'O'), ('tool', 'O'), ('I', 'O'), ('use', 'O')]


In [4]:
# Rebuild the document string: each token, followed by a single space when its
# trailing-whitespace flag is set. `labels` stays in the zip so that the result
# is truncated to the shortest of the three lists, exactly as before.
text = "".join(
    tok + (" " if has_space else "")
    for tok, _label, has_space in zip(tokens, labels, tw)
)

In [7]:
# Split the reconstructed text into windows of at most 384 tokens, with a
# stride of 128 between consecutive windows, keeping character offsets.
# `truncation=True` is passed explicitly (as in `tokenize` below) instead of
# relying on the tokenizer's implicit fallback when only `max_length` is given.
tokenized = tokenizer(
    text,
    return_offsets_mapping=True,
    return_overflowing_tokens=True,
    truncation=True,
    stride=128,
    max_length=384,
)
# Number of windows produced for this document.
len(tokenized.input_ids)

3

In [13]:
# For each window, the index of the input sample it came from. Only one text
# was passed, so every window maps to sample 0 (see output below).
tokenized["overflow_to_sample_mapping"]

[0, 0, 0]

In [14]:
# List the fields returned by the tokenizer call above.
tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [99]:
import numpy as np

def tokenize(example, tokenizer, label2id, max_length, stride):
    """Tokenize one document into fixed-size windows with per-token label ids.

    The document text is rebuilt from its word-level tokens and
    trailing-whitespace flags while a label is recorded for every character.
    The text is then tokenized into windows, and each token receives the label
    of the first non-whitespace character at or after its start offset.

    Parameters
    ----------
    example : mapping
        A batch holding exactly one document: ``example["tokens"]``,
        ``example["provided_labels"]`` and ``example["trailing_whitespace"]``
        are each indexed with ``[0]`` to get that document's per-word lists.
    tokenizer : callable
        Called as ``tokenizer(text, return_offsets_mapping=True, ...)``. Its
        result must expose ``input_ids`` and ``offset_mapping`` attributes and
        be unpackable with ``**``.
    label2id : mapping
        Maps label strings to integer ids; must contain ``"O"``.
    max_length : int
        Window length passed to the tokenizer (with ``padding="max_length"``);
        also the second dimension of the returned label array.
    stride : int
        Stride passed to the tokenizer for overflowing windows.

    Returns
    -------
    dict
        Every field returned by the tokenizer plus ``"labels"``, an int32
        array of shape ``(n_windows, max_length)``.
    """
    tokens = example["tokens"][0]
    provided_labels = example["provided_labels"][0]
    trailing_whitespace = example["trailing_whitespace"][0]

    # Rebuild the text and, in lock-step, one label per character.
    pieces = []
    char_labels = []
    for t, l, ws in zip(tokens, provided_labels, trailing_whitespace):
        pieces.append(t)
        char_labels.extend([l] * len(t))
        if ws:
            # The separating space never carries an entity label.
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)
    text_len = len(text)

    # One row per window; the number of windows depends on the text length,
    # max_length and stride.
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        max_length=max_length,
        truncation=True,
        stride=stride,
        return_overflowing_tokens=True,
        padding="max_length",
    )

    n_windows = len(tokenized.input_ids)
    # Every position defaults to "O"; only tokens with a real span are updated.
    token_labels = np.full((n_windows, max_length), label2id["O"], dtype=np.int32)

    for i in range(n_windows):
        for j, (start_idx, end_idx) in enumerate(tokenized.offset_mapping[i]):
            # Tokens with an empty (0, 0) span (CLS, and presumably the other
            # special/padding tokens) keep the default "O" label.
            if start_idx == 0 and end_idx == 0:
                continue

            # A token span may begin with whitespace: skip forward to the
            # first real character. The bound check prevents running off the
            # end of the text when the span covers only trailing whitespace.
            while start_idx < text_len and text[start_idx].isspace():
                start_idx += 1

            # If the skip reached the end, fall back to the last character.
            start_idx = min(start_idx, len(char_labels) - 1)

            token_labels[i, j] = label2id[char_labels[start_idx]]

    return {**tokenized, "labels": token_labels}

In [6]:
# Entity types annotated in the data set.
base_labels = {
    "EMAIL",
    "ID_NUM",
    "NAME_STUDENT",
    "PHONE_NUM",
    "STREET_ADDRESS",
    "URL_PERSONAL",
    "USERNAME",
}

# BIO tag set: a B- and an I- tag per entity type, plus the outside tag "O".
# Sorting makes the id assignment independent of set iteration order.
all_labels = sorted(
    [f"{prefix}-{name}" for name in base_labels for prefix in ("B", "I")] + ["O"]
)

# Forward and reverse lookups between tag strings and integer ids.
label2id = {tag: idx for idx, tag in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))

In [7]:
from datasets import Dataset


# Build a column-oriented dataset from the list of document dicts.
# Each pair is (dataset column, key in the raw document); the raw "labels"
# field is exposed as "provided_labels". The "document" field is left out.
ds1 = Dataset.from_dict(
    {
        column: [doc[key] for doc in data]
        for column, key in (
            ("full_text", "full_text"),
            ("tokens", "tokens"),
            ("trailing_whitespace", "trailing_whitespace"),
            ("provided_labels", "labels"),
        )
    }
)

In [100]:
# Run `tokenize` over every document. With batched=True and batch_size=1 each
# call receives a one-document batch (hence the `[0]` indexing inside
# `tokenize`) and may return several windows for it. The original columns are
# dropped so only the tokenizer outputs and labels remain.
tds = ds1.map(
    tokenize,
    batched=True,
    batch_size=1,
    remove_columns=ds1.column_names,
    fn_kwargs={
        "tokenizer": tokenizer,
        "label2id": label2id,
        "max_length": 384,
        "stride": 128,
    },
)

Map:   0%|          | 0/6807 [00:00<?, ? examples/s]

In [102]:
# NOTE(review): `x` is only assigned in the alignment-check cell further down
# (`x = tds.shuffle()[0]`), so on a fresh kernel that cell must run first.
# Displays the full list of token ids for that sampled row.
x["input_ids"]

[1,
 8837,
 2969,
 1489,
 262,
 2362,
 261,
 306,
 403,
 2537,
 579,
 262,
 780,
 1017,
 261,
 2128,
 2177,
 360,
 602,
 864,
 265,
 9608,
 267,
 5646,
 263,
 1236,
 3478,
 2588,
 261,
 269,
 449,
 4515,
 262,
 1933,
 296,
 298,
 1067,
 264,
 1348,
 271,
 1492,
 715,
 38217,
 9608,
 1262,
 293,
 262,
 513,
 265,
 16789,
 260,
 7616,
 261,
 401,
 265,
 262,
 1348,
 271,
 1492,
 715,
 38217,
 1074,
 22858,
 263,
 262,
 671,
 276,
 268,
 22423,
 5646,
 1706,
 261,
 2076,
 770,
 278,
 1037,
 1446,
 277,
 462,
 3478,
 263,
 14340,
 260,
 329,
 9482,
 269,
 461,
 38459,
 261,
 304,
 262,
 671,
 296,
 286,
 264,
 3566,
 262,
 2943,
 10980,
 267,
 469,
 384,
 628,
 261,
 263,
 1446,
 283,
 28890,
 283,
 628,
 260,
 325,
 327,
 303,
 264,
 413,
 469,
 1016,
 272,
 295,
 825,
 262,
 1933,
 939,
 497,
 1114,
 2177,
 260,
 545,
 291,
 1956,
 266,
 310,
 9939,
 263,
 1636,
 671,
 260,
 3289,
 261,
 385,
 299,
 2262,
 8909,
 265,
 1506,
 261,
 278,
 1015,
 264,
 286,
 32601,
 352,
 266,
 6152,
 260,

In [116]:
# Sanity-check the token/label alignment on a randomly drawn window.
# Re-run this cell to look at a different row each time.
x = tds.shuffle()[0]

print("*" * 100)
print(tokenizer.decode(x["input_ids"]))

# Print only the tokens that carry an entity tag.
for tok, label_id in zip(tokenizer.convert_ids_to_tokens(x["input_ids"]), x["labels"]):
    tag = id2label[label_id]
    if tag == "O":
        continue
    print((tok, tag))
        

****************************************************************************************************
[CLS] REFLECTION – LEARNING LAUNCH Irfan Khan Reflection – Learning Launch Challenge I was recently promoted as the team leader of a small team consisting of two (2) web developers, one (1) graphic designer, one (1) marketing associate and one (1) lead generation specialist and my/our challenge is to create an effective workflow within the team in order to successfully develop and launch our client’s brand. The project scope includes building a brand, starting a marketing campaign and launch a website. The team of five (5) employees will work on this project with an even distribution of working hours amongst the team members. The completion of this project should not exceed 45 business days. We will focus on trying to determine what is the best workflow that works for our team. Selection This will be the first design thinking activities/tools that our team will try, and we chose the lea

14

In [51]:
# Show the tokenizer's string pieces for the ids of the currently sampled row `x`.
print(tokenizer.convert_ids_to_tokens(x["input_ids"]))

['[CLS]', '’', 's', '▁income', '▁for', '▁livelihood', '.', '▁Because', '▁of', '▁the', '▁sudden', '▁lockdown', ',', '▁Santosh', '▁was', '▁unaware', '▁of', '▁the', '▁scenario', '▁and', '▁he', '▁and', '▁his', '▁family', '▁are', '▁in', '▁Gujrat', '▁and', '▁his', '▁parents', '▁are', '▁in', '▁Odisha', '.', '▁The', '▁production', '▁in', '▁his', '▁factory', '▁is', '▁at', '▁halt', ',', '▁and', '▁since', '▁he', '▁is', '▁a', '▁daily', '▁worker', ',', '▁he', '▁is', '▁left', '▁with', '▁no', '▁source', '▁of', '▁income', '▁and', '▁fails', '▁to', '▁feed', '▁his', '▁family', '.', '▁He', '▁has', '▁requested', '▁the', '▁government', '▁to', '▁allow', '▁him', '▁move', '▁to', '▁his', '▁home', '▁state', '▁where', '▁he', '▁can', '▁at', '▁least', '▁work', '▁as', '▁a', '▁farmer', '.', '▁His', '▁request', '▁has', '▁been', '▁denied', '▁by', '▁the', '▁government', '▁and', '▁now', '▁he', '▁has', '▁no', '▁idea', '▁what', '▁he', '▁should', '▁do', '▁to', '▁feed', '▁his', '▁wife', ',', '▁two', '▁daughters', '▁and', '▁o

In [4]:
import numpy as np

# Scratch check of broadcasting: n is (4, 8), p is (1, 8).
# No seed is set, so the values differ on every run.
n = np.random.rand(4, 8)
p = np.random.rand(1, 8)

# Element-wise maximum; the single row of p is compared against every row of n.
np.maximum(n, p)

array([[0.71691552, 0.64349265, 0.85438092, 0.58456591, 0.10275883,
        0.36860505, 0.51464224, 0.46502847],
       [0.41150537, 0.38086034, 0.36238575, 0.95276757, 0.92933539,
        0.36860505, 0.60820741, 0.47948528],
       [0.75110017, 0.80993238, 0.52904256, 0.58456591, 0.90332888,
        0.48367086, 0.62744279, 0.76127702],
       [0.76630244, 0.42193554, 0.36238575, 0.58456591, 0.70000835,
        0.36860505, 0.68136607, 0.25338795]])

In [5]:
# Display n to compare against the np.maximum result above.
n

array([[0.71691552, 0.64349265, 0.85438092, 0.04943591, 0.10275883,
        0.02191087, 0.51464224, 0.46502847],
       [0.41150537, 0.38086034, 0.34002642, 0.95276757, 0.92933539,
        0.26779906, 0.60820741, 0.47948528],
       [0.75110017, 0.80993238, 0.52904256, 0.3263116 , 0.90332888,
        0.48367086, 0.62744279, 0.76127702],
       [0.76630244, 0.42193554, 0.35928957, 0.25104063, 0.70000835,
        0.10567496, 0.68136607, 0.04068233]])

In [6]:
# Display p, the single row that was broadcast against n.
p

array([[0.01843215, 0.33613288, 0.36238575, 0.58456591, 0.07117521,
        0.36860505, 0.12551771, 0.25338795]])