In [19]:
from names_dataset import NameDataset

# Build the dataset with its default constructor arguments; `nd` is the
# handle passed to get_top_names() in the cells below.
# NOTE(review): construction presumably loads the full name tables and may be
# slow / memory hungry - confirm against the names_dataset documentation.
nd = NameDataset()

In [17]:
from typing import Dict, List, Optional
from collections import defaultdict

def get_top_names(
        names_data: 'NameDataset',
        n: int = 100,
        use_first_names: bool = True,
        gender: Optional[str] = None,
        ascii_only: bool = True
) -> Dict[str, List[str]]:
    """Return the globally best-ranked names, grouped by dominant gender.

    Each name is scored with the sum of ``1 / rank`` over every entry of its
    ``'rank'`` mapping, so a name that ranks well in many countries beats a
    name that ranks well in only one. Names are then ordered by descending
    score and the first ``n`` of each gender bucket are kept.

    Args:
        names_data: Object exposing ``first_names`` and ``last_names``
            mappings of ``name -> {'gender': {...}, 'rank': {...}}``.
        n: Maximum number of names returned per gender bucket.
        use_first_names: Rank first names when True, last names otherwise.
        gender: Optional filter; accepts 'M'/'Male'/'F'/'Female' in any
            letter case. Only valid together with ``use_first_names=True``.
        ascii_only: Skip names containing non-ASCII characters.

    Returns:
        For first names, a dict keyed by 'M' and/or 'F' (only buckets that
        received at least one name are present). For last names, a dict with
        the single key 'N/A'. Values are lists of names, best first.

    Raises:
        ValueError: If ``n`` is not positive, the requested table is not
            loaded, ``gender`` is not recognised, or ``gender`` is combined
            with last names.
    """
    if n <= 0:
        raise ValueError('[n] has to be positive.')
    if use_first_names and names_data.first_names is None:
        raise ValueError('First names data is not loaded.')
    if not use_first_names and names_data.last_names is None:
        raise ValueError('Last names data is not loaded.')
    if gender is not None:
        if gender.title() in ['M', 'Male']:
            gender = 'M'
        elif gender.title() in ['F', 'Female']:
            gender = 'F'
        else:
            raise ValueError('Invalid gender value.')
    if not use_first_names and gender is not None:
        raise ValueError('Selecting a gender for last names is invalid.')

    # gender bucket -> name -> accumulated inverse-rank score
    global_ranks = defaultdict(lambda: defaultdict(float))
    lookup_table = names_data.first_names if use_first_names else names_data.last_names

    for name, name_info in lookup_table.items():
        if ascii_only and not name.isascii():
            continue  # Skip non-ASCII names if ascii_only is True

        gender_counts = name_info['gender']
        if len(gender_counts) == 0:
            gender_ = 'N/A'  # no gender information recorded for this name
        elif len(gender_counts) == 1:
            gender_ = next(iter(gender_counts))
        else:
            # Ties go to 'F', because the comparison is strict.
            gender_ = 'M' if gender_counts['M'] > gender_counts['F'] else 'F'

        if gender is not None and gender != gender_:
            continue

        for rank in name_info['rank'].values():
            # A missing (None) or non-positive rank cannot be inverted, so it
            # contributes nothing instead of crashing the whole scan.
            # NOTE(review): assumes valid ranks are positive numbers - confirm.
            if not rank or rank <= 0:
                continue
            global_ranks[gender_][name] += 1 / rank  # inverse rank as a score

    top_names = {}
    for gender_, scores in global_ranks.items():
        best_first = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        top_names[gender_] = [name for name, _ in best_first[:n]]

    if not use_first_names:
        # The bucket is absent when no name was scored at all; return an
        # empty list rather than raising KeyError.
        return {'N/A': top_names.get('N/A', [])}

    # Names without gender information are not reported for first names.
    top_names.pop('N/A', None)
    return top_names

In [23]:
# Rank the dataset once and reuse the result for both genders; calling
# get_top_names() twice with identical arguments repeats the full scan and
# sort of every name for no benefit.
top_names_by_gender = get_top_names(nd, n=10000)
# Up to 10000 male names followed by up to 10000 female names.
top_20000_names = top_names_by_gender["M"] + top_names_by_gender["F"]

In [24]:
# Save in a yaml file
import yaml

# Open (and truncate) the target file, render the name list to YAML text and
# write that text out in a single call.
with open('top_names.yaml', 'w') as f:
    f.write(yaml.dump(top_20000_names))