This notebook uses Mixtral (`mistralai/Mixtral-8x7B-Instruct-v0.1`) to generate first and last names. The goal is to obtain a diverse range of names.

It uses the ISO-3166 country CSV found [here](https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv) to insert each country name into the prompt, so that the generated names vary from country to country.

In [2]:
import pandas as pd
import os
import string
import random
import json


from huggingface_hub import InferenceClient
from dotenv import load_dotenv
from datasets import Dataset

# Load environment variables (HF_TOKEN is read below) from the .env file two directories up.
load_dotenv(dotenv_path="../../.env")

# Country table; its "name" column supplies the country names used for prompting.
countries = pd.read_csv(filepath_or_buffer="../../data/countries.csv")

# Hugging Face inference client, authenticated with the token from the environment.
client = InferenceClient(token=os.environ["HF_TOKEN"])

In [16]:
def generate(example, first_name=True):
    """Ask the Mixtral endpoint for 20 names of people living in one country.

    Args:
        example: Mapping with a "country" key holding the country name that
            is inserted into the prompt.
        first_name: If True request first names, otherwise last names.

    Returns:
        dict with two keys:
            "generated_text": the raw completion, or "<|Error|>" if the
                request or the decoding of its response failed.
            "rarity": the randomly chosen rarity adjective exactly as it was
                inserted into the prompt (with a trailing space), or "" when
                no adjective was used.
    """
    country = example["country"]

    name_type = "first" if first_name else "last"

    rarity = random.choice(["common", "uncommon", "rare", "unique", "very rare", ""])

    # Pad only a non-empty adjective. The previous check compared an int with
    # a str (`len(rarity) != ""`), which is always true, so the "no adjective"
    # case produced a stray double space in the prompt.
    if rarity:
        rarity = f"{rarity} "

    prompt = f"""
<s> [INST] Generate 5 different {rarity}{name_type} names of people who live in the United States. Only provide the names and no explanation.

Names:

John
Mary
Nick
Miles
Sarah

Generate 20 different {rarity}{name_type} names of people who live in {country}. Only provide the names and no explanation. [/INST]

Names:

""".lstrip()

    try:
        r = client.post(
            json={
                "inputs": prompt,
                "parameters": {
                    "max_new_tokens": 300,
                    "top_k": 50,
                    "temperature": 1.0,
                    "return_full_text": False,
                    "stop": ["\n\n"]
                },
                # Every call should sample a fresh completion.
                "options": {"use_cache": False},
            },
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
        )

        text = json.loads(r.decode())[0]["generated_text"]

    except Exception as e:
        # Best effort: a failed request must not abort the whole map() run.
        # The sentinel lets failed rows be identified afterwards.
        print(e)
        text = "<|Error|>"

    return {
        "generated_text": text,
        "rarity": rarity,
    }

In [17]:
def random_string(length):
    """Return a string of `length` randomly chosen lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return "".join(picked)

In [21]:
# Make sure the output folder exists before the first shard is written.
os.makedirs("names", exist_ok=True)

# 49 passes over every country for first names, then 49 passes for last names.
# Each pass is written to its own parquet shard as soon as it finishes.
for shard_prefix, want_first_name in (("first_names", True), ("last_names", False)):
    for i in range(1, 50):
        ds = Dataset.from_dict(
            {
                "id": [random_string(10) for _ in range(len(countries))],
                "country": countries["name"],
            }
        )
        ds = ds.map(generate, num_proc=10, fn_kwargs={"first_name": want_first_name})

        ds.to_parquet(f"names/{shard_prefix}{i}.parquet")

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

502 Server Error: Bad Gateway for url: https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

504 Server Error: Gateway Time-out for url: https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

502 Server Error: Bad Gateway for url: https://api-inference.huggingface.co/models/mistralai/Mixtral-8x7B-Instruct-v0.1


Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map (num_proc=10):   0%|          | 0/249 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [3]:
from pathlib import Path
from datasets import concatenate_datasets, Dataset
import os

# Folder holding the generated parquet shards.
names_dir = Path(os.environ["PROJECT_HOME_DIR"]) / "piidd/data_generation/names"

# glob() yields paths in filesystem-dependent order; sorting makes the row
# order of the concatenated datasets (and any positional lookup into them)
# reproducible from run to run.
files = sorted(names_dir.glob("first_names*"))
first_names_ds = concatenate_datasets([Dataset.from_parquet(str(f)) for f in files])

files = sorted(names_dir.glob("last_names*"))
last_names_ds = concatenate_datasets([Dataset.from_parquet(str(f)) for f in files])

In [4]:
# Raw, uncleaned completions: the "generated_text" column of each dataset.
first_names, last_names = (
    first_names_ds["generated_text"],
    last_names_ds["generated_text"],
)

In [50]:
# from unidecode import unidecode

# original_text = "Próspero"
# normalized_text = unidecode(original_text)
# print(normalized_text)  # Output: Prospero


def clean(raw):
    """Extract a list of candidate names from one raw model completion.

    Steps:
      1. Keep only the text before the first blank line and before any
         trailing "Note:", "(Note:" or "User:" line.
      2. Treat every remaining line as one candidate and strip bullet
         characters ("*", "-"), list numbering ("1.", "12.") and a leading
         "Female: " / "Male: " label.
      3. Drop candidates that are shorter than 2 or longer than 29
         characters, or that contain a comma or a parenthesis.

    Args:
        raw: The completion text returned by the model.

    Returns:
        list[str]: cleaned names in their original order (duplicates kept).
    """
    # Cut off everything after the list of names.
    body = raw.split("\n\n")[0]
    for marker in ("\nNote:", "\n(Note:", "\nUser:"):
        body = body.split(marker)[0]

    names = []
    for line in body.strip().split("\n"):
        name = line.lstrip("*-").strip()

        # Strip list numbering only when the text before the first dot really
        # is a number. Testing for any "." near the start mangled names such
        # as "St. John" and truncated names containing a second dot.
        head, dot, tail = name.partition(".")
        if dot and head.strip().isdigit():
            name = tail.strip()

        # Remove the gender label before the length check so the check sees
        # the actual name.
        for label in ("Female: ", "Male: "):
            if name.startswith(label):
                name = name[len(label):].strip()
                break

        if not 1 < len(name) < 30:
            continue
        # Commas and parentheses indicate an explanation rather than a name.
        if any(ch in name for ch in ",()"):
            continue

        names.append(name)

    return names
    

# One list of cleaned names per raw completion, in the same order as the input.
clean_first_names = list(map(clean, first_names))
clean_last_names = list(map(clean, last_names))

In [51]:
# Inspect one raw completion (before cleaning).
first_names[3]

'Djamila\nMohand\nKahina\nTahar\nZinedine\nMalika\nFarid\nMeriem\nLila\nNoureddine\nAssia\nYasmine\nHacene\nFethi\nSabrina\nSofiane\nAmira\nHouari\nFatima\nAli\n\nPlease note that while these names are less common in the United States, they are relatively common in Algeria, a country with a unique cultural and linguistic heritage.'

In [52]:
from itertools import chain

# Flatten the per-completion lists into one long list of names each.
flat_clean_first_names = list(chain.from_iterable(clean_first_names))
flat_clean_last_names = list(chain.from_iterable(clean_last_names))

In [39]:
# Raw completions that contain a "(Note" remark but no blank line anywhere.
[raw for raw in first_names if "\n\n" not in raw and "(Note" in raw]

['Ieng\nWeng\nKeong\nWai\nMei\nLiang\nSou\nCheok\nWeng\nHin\nFong\nMing\nLei\nKa\nMan\nTin\nHei\nWan\nLok\nIok\n(Note: Macao is a special administrative region of China, and many residents have Chinese names. The above names are common Chinese given names for people in Macao.)',
 '1. Jung\n2. Kim\n3. Park\n4. Lee\n5. Choi\n6. Jo\n7. Han\n8. Ryu\n9. An\n10. Son\n11. Yun\n12. Jang\n13. Bae\n14. Sim\n15. Jung\n16. Kang\n17. Go\n18. Cho\n19. Chae\n20. Kim (Note: There are several common surnames in Korea. I have provided some of the most common ones. However, the order of surnames and given names is reversed in Korea, and individuals are usually referred to by their given names. I have listed them here as they would be listed in the United States for clarity.)']

In [53]:
# Sanity check: cleaned first names that still contain a comma.
list(filter(lambda name: "," in name, flat_clean_first_names))

[]

In [54]:
# (total number of cleaned first names, number of distinct ones)
len(flat_clean_first_names), len(set(flat_clean_first_names))

(245571, 51478)

In [55]:
# Length of the longest cleaned first name.
max(map(len, flat_clean_first_names))

23

In [56]:
# Cleaned first names longer than 10 characters, for a visual plausibility check.
[name for name in flat_clean_first_names if 10 < len(name)]

['Leatualevao',
 'Bonaventura',
 'Britneyanne',
 'Jean-Baptiste',
 'Ndayishimiye',
 'Ntakarutimana',
 'Ntibazonkiza',
 'Christopher',
 'Koudoukatchi',
 'JoséIgnacio',
 'Nuutafaravai',
 'Luis Fernando',
 'Laura Sofía',
 'Jean-Pierre',
 'Hadjipavlos',
 'Maria Eugenia',
 'Jose Antonio',
 'Francisco José',
 'Jose Manuel',
 'Maria Teresa',
 'Víctor Manuel',
 'Nkosingiphile',
 'Ntombizodwa',
 'Makhosazana',
 'Jean-Pierre',
 'Marie-Paule',
 'Jacques-Yves',
 'Jean-Michel',
 'Pierre-Henri',
 'Agnes-Maria',
 'Aristarchus',
 'Dionysodorus',
 'Christopher',
 'Bonaventura',
 'Vivienne-Rose',
 'Khonesavanh',
 'Soulinthone',
 'Louis-Ducruet',
 'Munkh-Erdene',
 'Narantsetseg',
 'Erdenebayar',
 'Naranbaatar',
 'Oyun-Erdene',
 'Demmiedemmie',
 'Jean-Philippe',
 'Halakilangi',
 'Muhammadali',
 'Muhammadusama',
 'Ngarcheongs',
 'Guillermina',
 'Bogdanowica',
 'Constantino',
 'Luisantonio',
 'Habyarimana',
 'Mfashingabo',
 'Elisha-Marie',
 'Nhlakanipho',
 'Abdughaffor',
 'Pattarapong',
 'Chanidaporn',
 'Wa

In [57]:
# Cleaned last names longer than 10 characters, for a visual plausibility check.
[name for name in flat_clean_last_names if 10 < len(name)]

['Hammarström',
 'Johansson-Karlsson',
 'Tualaulelei',
 'Harutyunyan',
 'Hovhannisyan',
 'Haroutunian',
 'Atakishiyev',
 'Braithwaite',
 'Liashkevich',
 'Tsishkevich',
 'Vabishchevich',
 'Vanadzinskaya',
 'Yermalovich',
 'Zyryanovich',
 'Van den Berghe',
 'Van den Bossche',
 'Schoolmeester',
 'Van De Putte',
 'Donoumassou',
 'Goutonridana',
 'Johansmeijer',
 'Ibrahimović',
 'Muharemagić',
 'Garebatshobelw',
 'Xenophontos',
 'Sharifuddin',
 'Abdul Rahim',
 'Ndabashinze',
 'Nyaruhirira',
 'Nzohabonayo',
 'Ruberintwari',
 'Ruzirabwoba',
 'Sabushimike',
 'Saganahunze',
 'Thankasaray',
 'Ebanks-Petrie',
 'Mboula-Kamas',
 'Lengue-Zimana',
 'Ouandja-Malekou',
 'Mbenguere-Dokolo',
 'Rochecouste',
 'Rajakulendran',
 'Diarrassouba',
 'Alvarez-Quiñones',
 'Cárdenas-López',
 'Echemendía-Santana',
 'Fuentes-Fernández',
 'Herrera-Rodríguez',
 'Izquierdo-Ramírez',
 'Jiménez-Guzmán',
 'La Rosa-Rivero',
 'Milián-Morales',
 'Núñez-Valdés',
 "O'Reilly-García",
 'Pérez-Duarte',
 'Quintana-Fonseca',
 'Ramí

In [60]:
# Stored under its own name: rebinding `countries` to a plain list would
# clobber the DataFrame of the same name used by the generation cell.
unique_countries = first_names_ds.unique("country")

# country -> {"first_name": set of names, "last_name": set of names}.
# Both keys are created for every country, so later code can rely on them
# even when a country occurs in only one of the two datasets.
country2names = {}

for name_kind, names_ds, cleaned_lists in (
    ("first_name", first_names_ds, clean_first_names),
    ("last_name", last_names_ds, clean_last_names),
):
    # cleaned_lists[i] holds the names extracted from row i of names_ds.
    for c, n in zip(names_ds["country"], cleaned_lists):
        entry = country2names.setdefault(c, {"first_name": set(), "last_name": set()})
        entry[name_kind].update(n)

In [64]:
for c, d in country2names.items():
    # Sets are not JSON-serialisable, so convert them. Sorting (rather than
    # list(set), whose order changes between interpreter runs) keeps the
    # exported file stable. .get tolerates a country that lacks one name kind.
    d["first_name"] = sorted(d.get("first_name", ()))
    d["last_name"] = sorted(d.get("last_name", ()))

    print(c, len(d["first_name"]), len(d["last_name"]))

Afghanistan 352 393
Åland Islands 408 279
Albania 338 419
Algeria 206 471
American Samoa 464 446
Andorra 316 348
Angola 365 595
Anguilla 725 451
Antarctica 418 556
Antigua and Barbuda 615 437
Argentina 268 443
Armenia 225 207
Aruba 679 505
Australia 440 623
Austria 363 449
Azerbaijan 273 226
Bahamas 586 228
Bahrain 222 303
Bangladesh 427 249
Barbados 513 260
Belarus 169 480
Belgium 353 539
Belize 589 390
Benin 577 486
Bermuda 579 319
Bhutan 86 165
Bolivia (Plurinational State of) 548 313
Bonaire, Sint Eustatius and Saba 709 537
Bosnia and Herzegovina 274 461
Botswana 246 503
Bouvet Island 492 399
Brazil 385 151
British Indian Ocean Territory 674 652
Brunei Darussalam 426 480
Bulgaria 256 293
Burkina Faso 375 183
Burundi 390 267
Cabo Verde 612 225
Cambodia 284 347
Cameroon 672 550
Canada 417 431
Cayman Islands 507 309
Central African Republic 543 720
Chad 337 490
Chile 262 394
China 447 331
Christmas Island 605 688
Cocos (Keeling) Islands 632 694
Colombia 192 376
Comoros 394 388
Congo 5

In [65]:
# Write the country -> names mapping as indented JSON into the working directory.
serialized = json.dumps(country2names, indent=4)
with open("country2names.json", "w") as out_file:
    out_file.write(serialized)