# Get the CSO data for baby names in Ireland

Import required modules and initalise data session with CSO API.

In [29]:
# get imports
import pandas as pd
from cso_ireland_data import CSODataSession
from datetime import timedelta

# prepare data session using CSO API
cso = CSODataSession(
    cached_session_params={
        "use_cache_dir": True,  # Save files in the default user cache dir
        "cache_control": True,  # Use Cache-Control response headers for expiration, if available
        "expire_after": timedelta(
            days=1
        ),  # Otherwise expire responses after one day
    }
)

Download data for boy and girl names datasets using CSO API.

In [30]:
# prepare initial unmodified dataframes
df_boy = cso.get_table("VSA50")
df_girl = cso.get_table("VSA60")

## Reshape the downloaded data

Combine the male and female tables, keeping only the counts and discarding the rank.

In [31]:
# combine dataframes
df = pd.concat(objs=[df_boy, df_girl]).fillna(0)

# drop rank column
keep_cols = [c for c in df.columns if "Rank" not in c]
df = df[keep_cols]
df.columns = ["Male", "Female"]
df.reset_index(inplace=True, level=["Year"])
df["Year"] = df["Year"].astype(int)
print(f"\nSample of data from 'df' ({df.shape[0]} entries):")
df.sample(5)


Sample of data from 'df' (375960 entries):


Unnamed: 0_level_0,Year,Male,Female
Names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Andi,1967,0.0,0.0
Arav,1998,0.0,0.0
Saif,2009,0.0,0.0
Magella,2019,0.0,0.0
Kailen,2004,0.0,0.0


Reorganise the table by year.

In [32]:
# Reshape data under years
pivot_df = df.pivot_table(
    index="Names",
    columns="Year",
    values=["Male", "Female"],
    aggfunc="sum",
)
pivot_df = pivot_df.swaplevel(axis=1).sort_index(
    axis=1, level=0, ascending=False
)
print(f"\nSample of data from 'pivot_df' ({pivot_df.shape[0]} entries):")
pivot_df.sample(5)


Sample of data from 'pivot_df' (6027 entries):


Year,2023,2023,2022,2022,2021,2021,2020,2020,2019,2019,...,1968,1968,1967,1967,1966,1966,1965,1965,1964,1964
Unnamed: 0_level_1,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,...,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female
Names,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Skaiste,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cornelia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0
Sion,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Shantel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mercedes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


Only keep years after 2014.

In [33]:
# only keep years after a certain cutoff
year_cutoff = 2014
cols_to_keep = [(year, gender) for year, gender in pivot_df.columns if year > year_cutoff]  # type: ignore
cso_df = pivot_df.loc[:, cols_to_keep]
cso_df["Total"] = cso_df.sum(axis=1)  # type: ignore
cso_df.sort_values(by="Total", ascending=False, inplace=True)  # type: ignore
print(f"\nSample of data from 'cso_df' ({cso_df.shape[0]} entries):")
cso_df.head(10)


Sample of data from 'cso_df' (6027 entries):


Year,2023,2023,2022,2022,2021,2021,2020,2020,2019,2019,2018,2018,2017,2017,2016,2016,2015,2015,Total
Unnamed: 0_level_1,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Unnamed: 19_level_1
Names,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Jack,561.0,0.0,641.0,0.0,667.0,0.0,597.0,0.0,677.0,0.0,686.0,0.0,685.0,0.0,684.0,0.0,752.0,0.0,5950.0
James,369.0,0.0,412.0,0.0,442.0,0.0,495.0,0.0,534.0,0.0,589.0,0.0,619.0,0.0,688.0,0.0,697.0,0.0,4845.0
Noah,473.0,0.0,485.0,0.0,475.0,0.0,447.0,0.0,502.0,0.0,494.0,0.0,442.0,0.0,446.0,0.0,438.0,0.0,4202.0
Emily,0.0,297.0,0.0,349.0,0.0,388.0,0.0,329.0,0.0,452.0,0.0,460.0,0.0,459.0,0.0,490.0,0.0,626.0,3850.0
Daniel,256.0,0.0,303.0,0.0,325.0,0.0,359.0,0.0,399.0,0.0,433.0,0.0,536.0,0.0,558.0,0.0,617.0,0.0,3786.0
Conor,244.0,0.0,275.0,0.0,360.0,0.0,345.0,0.0,427.0,0.0,489.0,0.0,472.0,0.0,558.0,0.0,558.0,0.0,3728.0
Grace,0.0,339.0,0.0,342.0,0.0,412.0,0.0,410.0,0.0,426.0,0.0,423.0,0.0,371.0,0.0,452.0,0.0,367.0,3542.0
Charlie,244.0,7.0,348.0,22.0,345.0,16.0,305.0,13.0,316.0,6.0,347.0,14.0,338.0,17.0,369.0,16.0,399.0,15.0,3137.0
Sophie,0.0,283.0,0.0,292.0,0.0,336.0,0.0,328.0,0.0,330.0,0.0,344.0,0.0,364.0,0.0,369.0,0.0,407.0,3053.0
Michael,239.0,0.0,278.0,0.0,310.0,0.0,275.0,0.0,302.0,0.0,354.0,0.0,359.0,0.0,394.0,0.0,434.0,0.0,2945.0


# Getting a list of native Irish names from Wikipedia
The Wikipedia article "List of Irish-language given names" has several tables for native Irish names as well as those of foreign origin. We take the first 4 tables from the page here, merge them into a single dataframe, and rename the columns (dropping the reference column).

In [34]:
# get a list of Irish names from Wikipedia
from io import StringIO
import wikipedia as wp


wiki_title = "List of Irish-language given names"


html = StringIO(wp.page(wiki_title).html())


# get first 4 tables on page
name_dfs = list()


i = 0


while i <= 3:

    try:

        this_df = pd.read_html(html)[i]
        this_df.columns = [
            "Name",
            "Foreign Version",
            "Notes",
            "Ref",
        ]
        this_df.drop(columns="Ref", inplace=True)
        name_dfs.append(this_df)

        i += 1

    except IndexError:

        break


# combine into one dataframe


tables_df = pd.concat(name_dfs).sort_values(by="Name")


print(f"Sample from 'tables_df' ({tables_df.shape[0]} entries)")


tables_df.sample(5)

Sample from 'tables_df' (325 entries)


Unnamed: 0,Name,Foreign Version,Notes
61,Déaglán,Declan (anglicisation),
32,Fionn(gh)uala,"Finola, Finuala, Nola (anglicisations)",
7,Amhlaoibh,"Auliffe (anglicisation) Olaf, Humphrey (Englis...",From Old Norse Óláfr.
107,Lomán,Loman (anglicisation),
51,Cosnamhach,,


For each name, we then separate the alternatives into separate rows, i.e.:
- those separated by whitespace
- each version of a name with parenthesised alternatives
- a version for each combination of letters with a fada
- a version with no letters with a fada

For example, "Éigneach(án)" becomes: Éigneachán, Éigneachan, Eigneachan, Éigneach, and Eigneach.
This takes unconventional fada use into consideration.

In [35]:
# Organise the downloaded names and get alternate versions
import itertools
import re
import unicodedata


def paren_alts(name: str) -> list[str]:
    """Generate all possible alternatives for a given name using regex."""
    parts = re.split(r"\s+", name)  # split by whitespace
    all_alternatives = []

    for part in parts:
        # Find all parts with parentheses
        regex = re.compile(r"\((.*?)\)")
        base_parts = regex.split(part)

        # Generate alternatives by including and excluding the parenthesised parts
        alternatives = [""]
        for i, base_part in enumerate(base_parts):
            if i % 2 == 0:
                # This is a non-parenthesised part
                alternatives = [alt + base_part for alt in alternatives]
            else:
                # This is a parenthesised part
                new_alternatives = []
                for alt in alternatives:
                    new_alternatives.append(alt + base_part)
                    new_alternatives.append(alt)
                alternatives = new_alternatives

        all_alternatives.append(alternatives)

    # Generate all combinations of the alternatives
    return [
        " ".join(alternative)
        for alternative in itertools.product(*all_alternatives)
    ]


def remove_fadas(input_str: str) -> str:
    """Remove fadas from a given string."""
    nfkd_form = unicodedata.normalize("NFKD", input_str)
    return "".join([c for c in nfkd_form if not unicodedata.combining(c)])


def vowel_fada_alts(name: str) -> list[str]:
    """Generate all possible combinations of names without vowel fadas."""
    alternatives = set([name])  # Use a set to avoid duplicates
    fadas_removed = remove_fadas(name)

    if fadas_removed != name:
        alternatives.add(fadas_removed)

    return list(alternatives)


def generate_all_alts(name: str) -> list[str]:
    """Generate all alternatives including parentheses and vowel fadas."""
    paren_alternatives = paren_alts(name)
    all_alternatives = set()  # Use a set to avoid duplicates

    for alt in paren_alternatives:
        all_alternatives.update(vowel_fada_alts(alt))

    return list(all_alternatives)


# Initialize a new DataFrame to store the results
new_data = tables_df.copy()

# split entries based on whitespace
new_data["Original Entry"] = new_data["Name"].str.split()
new_data = new_data.explode("Original Entry")

new_list = list()
# Iterate through the DataFrame and generate alternatives
for index, row in new_data.iterrows():
    name_alternatives = generate_all_alts(row["Original Entry"])  # type: ignore
    for alternative in name_alternatives:
        new_row = row.copy()
        new_row["Name"] = alternative
        new_list.append(new_row)

# Create a new DataFrame with the new data
wiki_name_df = pd.DataFrame(new_list)

print(
    f"\nSample of data from 'wiki_name_df' ({wiki_name_df.shape[0]} entries found):"
)
wiki_name_df.sample(5)


Sample of data from 'wiki_name_df' (659 entries found):


Unnamed: 0,Name,Foreign Version,Notes,Original Entry
126,Neasan,Nessan (anglicisation),,Neasán
127,Niallán,"Neil, Neal(e) (anglicisations)",,Niall(án)
100,Garbhán,Garvan (anglicisation),,Garbhán
80,Eimhin,Evin (anglicisation),,Éimhín
53,Órla,Orla (anglicisation),,Ór(fh)la(ith)


# Combine the Irish names list with the CSO statistics

In [36]:
# Create a boolean mask where the Name index in cso_df is in wiki_name_df["Name"]
mask = cso_df.index.isin(wiki_name_df["Name"])
# Use the mask to filter cso_df
irish_names_df = cso_df[mask]
print(f"Sample of 'irish_names_df' ({irish_names_df.shape[0]} entries):")
irish_names_df.head(5)

Sample of 'irish_names_df' (277 entries):


Year,2023,2023,2022,2022,2021,2021,2020,2020,2019,2019,2018,2018,2017,2017,2016,2016,2015,2015,Total
Unnamed: 0_level_1,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,Female,Unnamed: 19_level_1
Names,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Liam,289.0,0.0,323.0,0.0,353.0,0.0,329.0,0.0,334.0,0.0,338.0,0.0,312.0,0.0,333.0,0.0,324.0,0.0,2935.0
Fionn,306.0,0.0,287.0,0.0,289.0,0.0,323.0,0.0,291.0,0.0,309.0,0.0,314.0,0.0,308.0,0.0,325.0,0.0,2752.0
Fiadh,0.0,300.0,0.0,320.0,0.0,424.0,0.0,366.0,0.0,334.0,0.0,306.0,0.0,242.0,0.0,235.0,0.0,186.0,2713.0
Cillian,275.0,0.0,316.0,0.0,322.0,0.0,302.0,0.0,289.0,0.0,289.0,0.0,302.0,0.0,298.0,0.0,299.0,0.0,2692.0
Tadhg,301.0,0.0,324.0,0.0,318.0,0.0,272.0,0.0,318.0,0.0,280.0,0.0,254.0,0.0,228.0,0.0,230.0,0.0,2525.0


# Split the lists by gender

In [59]:
def split_by_gender(source_df: pd.DataFrame, gender: str) -> pd.DataFrame:
    df = source_df.loc[:, (slice(None), gender)]  # type: ignore
    df = df.swaplevel(axis=1)
    df.columns = df.columns.droplevel(0)
    df = df.loc[~(df == 0).all(axis=1)]
    df.sort_values(by=2023, ascending=False, inplace=True)
    return df


male_df = split_by_gender(irish_names_df, "Male")
female_df = split_by_gender(irish_names_df, "Female")

## Top 50 Boy Names

In [60]:
male_df.head(50)

Year,2023,2022,2021,2020,2019,2018,2017,2016,2015
Names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Oisín,330.0,340.0,302.0,277.0,310.0,303.0,0.0,0.0,0.0
Fionn,306.0,287.0,289.0,323.0,291.0,309.0,314.0,308.0,325.0
Tadhg,301.0,324.0,318.0,272.0,318.0,280.0,254.0,228.0,230.0
Liam,289.0,323.0,353.0,329.0,334.0,338.0,312.0,333.0,324.0
Cillian,275.0,316.0,322.0,302.0,289.0,289.0,302.0,298.0,299.0
Seán,240.0,239.0,274.0,266.0,255.0,317.0,0.0,0.0,0.0
Cian,156.0,185.0,217.0,203.0,200.0,241.0,291.0,344.0,358.0
Oscar,154.0,156.0,189.0,178.0,156.0,185.0,168.0,172.0,170.0
Eoin,78.0,55.0,80.0,87.0,88.0,98.0,120.0,135.0,162.0
Cathal,78.0,92.0,71.0,98.0,107.0,113.0,114.0,116.0,118.0


## Top 50 Girl Names

In [61]:
female_df.head(50)

Year,2023,2022,2021,2020,2019,2018,2017,2016,2015
Names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Fiadh,300.0,320.0,424.0,366.0,334.0,306.0,242.0,235.0,186.0
Éabha,241.0,271.0,288.0,234.0,178.0,121.0,0.0,0.0,0.0
Caoimhe,158.0,190.0,183.0,163.0,226.0,190.0,223.0,256.0,252.0
Saoirse,157.0,212.0,190.0,214.0,205.0,225.0,281.0,324.0,282.0
Aoife,148.0,158.0,159.0,179.0,229.0,240.0,256.0,300.0,321.0
Róisín,127.0,157.0,152.0,146.0,142.0,138.0,0.0,0.0,0.0
Sadhbh,120.0,153.0,150.0,128.0,145.0,114.0,104.0,108.0,123.0
Ailbhe,118.0,78.0,98.0,90.0,78.0,67.0,62.0,64.0,58.0
Méabh,118.0,82.0,100.0,68.0,66.0,58.0,0.0,0.0,0.0
Niamh,75.0,70.0,82.0,61.0,106.0,104.0,132.0,157.0,143.0


## Export to Excel

In [62]:
# Define the file name
file_name = "name_exploration.xlsx"

# Write DataFrames to separate sheets
with pd.ExcelWriter(file_name, engine="openpyxl") as writer:
    male_df.to_excel(writer, sheet_name="Boy")
    female_df.to_excel(writer, sheet_name="Girl")