In [1]:
import os
import json
from dotenv import load_dotenv
from pathlib import Path
import random

dotenv_path = Path("../../.env")
if dotenv_path.exists():
    print("Loaded .env file!")
    load_dotenv(str(dotenv_path))


data = json.load(open(Path(os.environ["PROJECT_HOME_DIR"]) / "data/train.json"))

Loaded .env file!


In [2]:
for d in data:
    temp = ""
    for token, ll, ws in zip(d["tokens"], d["labels"], d["trailing_whitespace"]):
        if "STREET" in ll:
            temp += token
            if ws:
                temp += " "

    if temp != "":
        print(temp)
        print("*"*10)

591 Smith Centers Apt. 656
Joshuamouth, RI 95963 
**********
743 Erika Bypass Apt. 419
Andreahaven, IL 54207
**********


In [3]:
b_tokens = set()
i_tokens = set()

for d in data:
    for token, ll in zip(d["tokens"], d["labels"]):
        if "STREET" in ll:
            if ll.startswith("B-"):
                b_tokens.add(token)
            if ll.startswith("I-"):
                i_tokens.add(token)

print(b_tokens)
print("*"*10)
print(i_tokens)

{'591', '743'}
**********
{'95963', '656', '54207', '.', 'Smith', '\n', 'Bypass', 'Andreahaven', 'IL', 'Apt', ',', '419', 'Joshuamouth', 'Erika', 'Centers', 'RI'}


checking if preprocessing labels newlines correctly

In [4]:
from piidd.processing.pre import strided_tokenize
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")

temp = [d for d in data if any(["STREET" in ll for ll in d["labels"]])]

ds1 = Dataset.from_dict(
        {
            "full_text": [x["full_text"] for x in temp],
            "document": [x["document"] for x in temp],
            "tokens": [x["tokens"] for x in temp],
            "trailing_whitespace": [x["trailing_whitespace"] for x in temp],
            "provided_labels": [x["labels"] for x in temp],
        }
    )

labels = set()
for d in ds1["provided_labels"]:
    for ll in d:
        labels.add(ll)

labels = list(labels)

label2id = {label: i for i, label in enumerate(labels)}



tds = ds1.map(
    strided_tokenize,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "stride": 128, "max_length": 512, "label2id": label2id}, 
    batch_size=1,
    remove_columns=ds1.column_names
)



Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [5]:
idx = 2

ids = tds[idx]["input_ids"]
labels = tds[idx]["labels"]

[(x,y) for x,y in list(zip(tokenizer.convert_ids_to_tokens(ids), [labels[i] for i in range(len(labels))])) if y in {label2id["I-STREET_ADDRESS"], label2id["B-STREET_ADDRESS"]}]

[('▁743', 5),
 ('▁Erika', 0),
 ('▁Bypass', 0),
 ('▁Apt', 0),
 ('.', 0),
 ('▁419', 0),
 ('▁Andrea', 0),
 ('haven', 0),
 (',', 0),
 ('▁IL', 0),
 ('▁54', 0),
 ('207', 0)]

In [6]:
from tokenizers import AddedToken

tokenizer.add_tokens(AddedToken("\n", normalized=False))

tds2 = ds1.map(
    strided_tokenize,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer, "stride": 128, "max_length": 512, "label2id": label2id}, 
    batch_size=1,
    remove_columns=ds1.column_names
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [7]:
idx = 2

ids = tds2[idx]["input_ids"]
labels = tds2[idx]["labels"]

[(x,y) for x,y in list(zip(tokenizer.convert_ids_to_tokens(ids), [labels[i] for i in range(len(labels))])) if y in {label2id["I-STREET_ADDRESS"], label2id["B-STREET_ADDRESS"]}]

[('▁743', 5),
 ('▁Erika', 0),
 ('▁Bypass', 0),
 ('▁Apt', 0),
 ('.', 0),
 ('▁419', 0),
 ('\n', 0),
 ('▁Andrea', 0),
 ('haven', 0),
 (',', 0),
 ('▁IL', 0),
 ('▁54', 0),
 ('207', 0)]

In [8]:
toke_d1 = AutoTokenizer.from_pretrained('microsoft/deberta-large')

tdsd1 = ds1.map(
    strided_tokenize,
    batched=True,
    fn_kwargs={"tokenizer": toke_d1, "stride": 128, "max_length": 512, "label2id": label2id}, 
    batch_size=1,
    remove_columns=ds1.column_names
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [10]:
idx = 2

ids = tdsd1[idx]["input_ids"]
labels = tdsd1[idx]["labels"]

[(x,y) for x,y in list(zip(toke_d1.convert_ids_to_tokens(ids), [labels[i] for i in range(len(labels))])) if y in {label2id["I-STREET_ADDRESS"], label2id["B-STREET_ADDRESS"]}]

[('Ġ7', 5),
 ('43', 5),
 ('ĠE', 0),
 ('rika', 0),
 ('ĠBy', 0),
 ('pass', 0),
 ('ĠA', 0),
 ('pt', 0),
 ('.', 0),
 ('Ġ419', 0),
 ('Ċ', 0),
 ('Andre', 0),
 ('ah', 0),
 ('aven', 0),
 (',', 0),
 ('ĠIL', 0),
 ('Ġ54', 0),
 ('207', 0)]

In [12]:
toke_d2 = AutoTokenizer.from_pretrained('microsoft/deberta-v2-xlarge')

tdsd2 = ds1.map(
    strided_tokenize,
    batched=True,
    fn_kwargs={"tokenizer": toke_d2, "stride": 128, "max_length": 512, "label2id": label2id}, 
    batch_size=1,
    remove_columns=ds1.column_names
)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [13]:
idx = 2

ids = tdsd2[idx]["input_ids"]
labels = tdsd2[idx]["labels"]

[(x,y) for x,y in list(zip(toke_d2.convert_ids_to_tokens(ids), [labels[i] for i in range(len(labels))])) if y in {label2id["I-STREET_ADDRESS"], label2id["B-STREET_ADDRESS"]}]

[('▁743', 5),
 ('▁Erika', 0),
 ('▁Bypass', 0),
 ('▁Apt', 0),
 ('.', 0),
 ('▁419', 0),
 ('▁Andrea', 0),
 ('haven', 0),
 (',', 0),
 ('▁IL', 0),
 ('▁542', 0),
 ('07)', 0)]