In [2]:
import os
import json
import random
import string
from pathlib import Path

from dotenv import load_dotenv
from datasets import Dataset

from piidd.data_generation.utils import (
    first_names,
    last_names,
)

load_dotenv("../../.env")

home_dir = Path(os.getenv("PROJECT_HOME_DIR"))

data = json.load(open(home_dir / "data" / "train.json"))

In [3]:
ds = Dataset.from_dict({"tokens": [x["tokens"] for x in data], "labels": [x["labels"] for x in data]})

In [4]:
first_names = set([x.title() for x in first_names])
last_names = set([x.title() for x in last_names])


def find_candidates(examples):

    all_candidates = []
    token_candidates = []

    for tokens in examples["tokens"]:
        candidates = []
        tc = []
        for i, token in enumerate(tokens):
            if token.istitle() and (token in first_names or token in last_names):
                candidates.append(i)
                tc.append(token)
        all_candidates.append(candidates)
        token_candidates.append(tc)

    return {
        "candidates": all_candidates,   
        "token_candidates": token_candidates,
    }


candidates = ds.map(find_candidates, batched=True, num_proc=8)

Map (num_proc=8):   0%|          | 0/6807 [00:00<?, ? examples/s]

In [5]:
def find_names(example):
    b_names = []
    i_names = []

    for i, (t, ll) in enumerate(zip(example["tokens"], example["labels"])):
        if ll == "B-NAME_STUDENT":
            b_names.append(t)
        if ll == "I-NAME_STUDENT":
            i_names.append(t)

    return {
        "b_names": "||".join(b_names),
        "i_names": "||".join(i_names),
    }

labeled_names = ds.map(find_names, num_proc=8)

Map (num_proc=8):   0%|          | 0/6807 [00:00<?, ? examples/s]

In [6]:
b_names = [x.split("||") for x in labeled_names["b_names"]]
i_names = [x.split("||") for x in labeled_names["i_names"]]

In [7]:
from itertools import chain


label_names = set(list(chain(*(b_names + i_names))))

len(label_names), len(label_names - first_names), len(label_names - last_names)

(1270, 921, 807)

In [8]:
mixtral_first_names = json.load(open("/drive2/kaggle/pii-dd/piidd/data_generation/mixtral_first_names.json"))
mixtral_last_names = json.load(open("/drive2/kaggle/pii-dd/piidd/data_generation/mixtral_last_names.json"))

In [9]:
len(label_names - set(mixtral_first_names)-set(mixtral_last_names))

271

In [14]:
len(label_names - set(mixtral_first_names)-set(mixtral_last_names) - first_names - last_names)

234

In [11]:
label_names - first_names - last_names

{'',
 'Aakash',
 'Aarts',
 'Abdiel',
 'Abdo',
 'Abdullahi',
 'Abidin',
 'Abiodun',
 'Abo',
 'Abul',
 'Achraf',
 'Adebayo',
 'Adem',
 'Adigun',
 'Adri',
 'Afiq',
 'Afridi',
 'Agim',
 'Ahamad',
 'Ajay',
 'Ajayi',
 'Akash',
 'Akhilesh',
 'Akram',
 'Al',
 'Alaa',
 'Alberti',
 'Alejandra',
 'Alina',
 'Alirio',
 'Alvaro',
 'Amira',
 'Amit',
 'Amritpal',
 'Anas',
 'Andreia',
 'Angelik',
 'Anil',
 'Anjali',
 'Ankit',
 'Aoife',
 'Arath',
 'Ari',
 'Arnaldo',
 'Art',
 'Ashok',
 'Ashraf',
 'Ashry',
 'Asia',
 'Asif',
 'Asim',
 'Asiri',
 'Asmaa',
 'Auni',
 'Aylin',
 'Azubuike',
 'Azwan',
 'Bahar',
 'Baloch',
 'Bankole',
 'Bappa',
 'Barbie',
 'Barco',
 'Bas',
 'Basavaraju',
 'Basha',
 'Basmah',
 'Bekim',
 'Belal',
 'Bellafiore',
 'Bennani',
 'Bergamo',
 'Bertolini',
 'Bezuidenhout',
 'Bhai',
 'Bikram',
 'Birra',
 'Bisht',
 'Bk',
 'Blasi',
 'Bomkazi',
 'Boonstra',
 'Borz',
 'Bunga',
 'Buonincontro',
 'Busatta',
 'Busi',
 'Busisiwe',
 'Camilo',
 'Campanelli',
 'Campani',
 'Cara',
 'Cardo',
 'Carella',


In [12]:
b_names

[['Nathalie', 'Nathalie', 'Nathalie'],
 ['Diego', 'Diego'],
 ['Gilberto'],
 ['Sindy'],
 ['Nadine'],
 ['Eladio'],
 ['Silvia'],
 ['Sakir'],
 ['Francisco'],
 ['Stefano'],
 ['Al'],
 ['Pepa'],
 ['Deiby'],
 ['Fareed'],
 ['Claudia'],
 ['Rajinder'],
 ['Maud', 'Maud', 'Maud', 'Maud'],
 [''],
 ['Zia'],
 ['Davide'],
 ['Karan'],
 ['Milton'],
 ['Luis'],
 ['Cesar'],
 ['Dharmendra', 'Dharmendra'],
 ['Daniel'],
 ['Suhag'],
 ['Eina'],
 ['Mauro'],
 ['Gabriel',
  'Hlengiwe',
  'Tino',
  'Tino',
  'Swetha',
  'Alex',
  'Alex',
  'Tino',
  'Hlengiwe'],
 ['Mohd'],
 ['Wilson', 'Wilson'],
 ['Madina'],
 ['Amparo'],
 ['Edgar'],
 ['Ahmed', 'Ahmed', 'Ahmed'],
 ['Joao'],
 ['Manuel'],
 ['Mlungisi'],
 ['Fatima',
  'Fatima',
  'Fatima',
  'Fatima',
  'Fatima',
  'Fatima',
  'Fatima',
  'Fatima'],
 ['Narayan'],
 ['Jose'],
 ['Florian'],
 ['David'],
 ['Simon'],
 ['Asim'],
 ['Vicki', 'Vicki'],
 ['Olivier', 'Olivier'],
 ['Abul'],
 [''],
 ['Sjoerd'],
 ['Medo', 'Medo'],
 ['Diadie'],
 ['Sandra'],
 ['Valdecir'],
 ['Saeed'],
 

In [13]:
candidates["token_candidates"]

[['Paris'],
 ['Diego', 'Estrada', 'Diego', 'Estrada'],
 ['Gamboa', 'Given'],
 ['Sindy', 'My', 'George'],
 ['Nadine', 'Born', 'Trail', 'To', 'An', 'Canada', 'Good'],
 ['Amaya', 'Spain'],
 ['Silvia', 'Villalobos', 'An'],
 ['Ahmad', 'Will'],
 ['Francisco', 'Ferreira', 'Carlos', 'Brazil', 'So', 'To', 'So'],
 ['Gandhi',
  'Stefano',
  'Lovato',
  'April',
  'Adam',
  'Smith',
  'John',
  'Stuart',
  'Mill',
  'So',
  'Igor',
  'Smith',
  'So',
  'Alexander',
  'Treat',
  'Player',
  'Player',
  'To',
  'To',
  'So',
  'Charity',
  'So',
  'So',
  'Ernst',
  'Fehr',
  'Smith',
  'Fehr',
  'Schmidt'],
 ['Dear', 'My', 'So', 'Virginia', 'Virginia'],
 ['Medrano', 'April', 'My', 'You', 'My'],
 ['My', 'Angela', 'Meyer', 'So'],
 ['Ponce', 'Story', 'So', 'Post'],
 ['To', 'Claudia'],
 ['Santos', 'My'],
 ['Maud',
  'Dias',
  'To',
  'Maud',
  'Dias',
  'Maud',
  'Dias',
  'Maud',
  'Dias',
  'Hunt',
  'Hunt'],
 ['You', 'Apple', 'So', 'So', 'So', 'So', 'Good', 'Good', 'Good', 'Good'],
 ['Virginia',
  '