# Stats computation

This notebook computes some statistics to help estimate the probability of a name occurring.


In [None]:
import json
import math

import pandas as pd
from tqdm.notebook import trange

from marvolo import atomize, join_groups

In [None]:
# Base name of the input CSV (no extension); later cells reuse it to pick
# column names and to name the output stats file.
csv_name = "names-male"

# Load the table and discard every row that has a missing value.
csv_path = f"{csv_name}.csv"
names_df = pd.read_csv(csv_path).dropna()
names_df


In [None]:
# The surname files and the other files use different column headers,
# so choose both headers in one place.
if csv_name.startswith("surname"):
    name_column, count_column = "Nazwisko aktualne", "Liczba"
else:
    name_column, count_column = "IMIĘ PIERWSZE", "LICZBA WYSTĄPIEŃ"

names = names_df[name_column]
name_counts = names_df[count_column]

# Share of all counted occurrences whose name ends with "SKI".
ends_with_ski = names.str.endswith("SKI")
name_counts[ends_with_ski].sum() / name_counts.sum()


## Byte pair encoding

To efficiently find popular groups of characters, we use byte pair encoding on the names.


In [None]:
def iterate_group_pairs(grouped_name):
    """Yield the concatenation of every two adjacent groups of a name.

    Args:
        grouped_name: Sliceable/indexable sequence of groups that support
            ``+`` (e.g. a list of strings).

    Yields:
        ``grouped_name[k] + grouped_name[k + 1]`` for each adjacent pair, in
        order. Sequences with fewer than two groups yield nothing.
    """
    for position in range(len(grouped_name) - 1):
        yield grouped_name[position] + grouped_name[position + 1]


# Quick illustration: adjacent pairs produced for one atomized sample name.
sample_groups = atomize("KOWALSKI")
list(iterate_group_pairs(sample_groups))


In [None]:
def popularities(grouped_names, name_counts, name_to_iterable):
    """Accumulate count-weighted totals of the items extracted from each name.

    Args:
        grouped_names: Iterable of grouped names.
        name_counts: Iterable of counts, paired positionally with
            ``grouped_names``.
        name_to_iterable: Callable mapping one grouped name to an iterable of
            hashable items to be counted.

    Returns:
        A dict mapping each item to the sum of the counts of the names it was
        extracted from; an item extracted several times from one name adds
        that name's count once per extraction.
    """
    totals = {}
    for grouped_name, weight in zip(grouped_names, name_counts):
        for item in name_to_iterable(grouped_name):
            totals[item] = totals.get(item, 0) + weight
    return totals


# Number of merge rounds: 100 for the surname files, 50 otherwise.
merge_count = 100 if csv_name.startswith("surname") else 50

# Start from the atomized form of every name, then repeatedly merge the
# adjacent pair with the highest count-weighted popularity.
grouped_names = [atomize(name) for name in names]
popular_groups = []
for merge_round in trange(merge_count):
    pair_totals = popularities(
        grouped_names,
        name_counts=name_counts,
        name_to_iterable=iterate_group_pairs,
    )
    # On ties, the first pair encountered in dict order wins.
    top_pair = max(pair_totals, key=pair_totals.get)
    popular_groups.append(top_pair)
    grouped_names = [join_groups(groups, top_pair) for groups in grouped_names]

popular_groups[:10]


## Probability computation

To be able to compute the probability of a name later, we need to compute a few probability distributions.


In [None]:
def log_probabilities(popularities):
    """Convert a table of popularities into natural-log probabilities.

    Args:
        popularities: Mapping from key to a positive numeric popularity.

    Returns:
        A dict with the same keys, where each value is
        ``log(popularity) - log(sum of all popularities)``. An empty mapping
        yields an empty dict.

    Raises:
        ValueError: If a popularity (or the total) is not positive, because
            ``math.log`` rejects it.
    """
    # Nothing to normalise; also avoids taking the log of an empty (zero) total.
    if not popularities:
        return {}

    # The normaliser is the same for every key, so compute its log only once.
    log_total = math.log(sum(popularities.values()))
    return {
        key: math.log(popularity) - log_total
        for key, popularity in popularities.items()
    }


# Count-weighted popularity of each individual group after all merges;
# every group of a grouped name is counted as-is.
group_popularities = popularities(
    grouped_names,
    name_counts,
    lambda grouped_name: grouped_name,
)
log_group_probabilities = log_probabilities(group_popularities)

# Spot check: log-probability of the "^" group
# (presumably the start-of-name marker emitted by atomize; confirm in marvolo).
log_group_probabilities["^"]


In [None]:
# Count-weighted popularity of each concatenated pair of adjacent groups.
pair_popularities = popularities(
    grouped_names,
    name_counts,
    iterate_group_pairs,
)
log_pair_probabilities = log_probabilities(pair_popularities)

# Number of distinct adjacent-pair keys observed.
len(log_pair_probabilities)


In [None]:
# Count-weighted distribution of name lengths, measured in groups:
# each grouped name contributes exactly one item, its number of groups.
length_popularities = popularities(
    grouped_names,
    name_counts,
    lambda grouped_name: (len(grouped_name),),
)
log_length_probabilities = log_probabilities(length_popularities)

# Spot check: log-probability of a name consisting of 8 groups.
log_length_probabilities[8]


## Saving the stats


In [None]:
# Collect everything computed above into one JSON-serialisable payload.
stats = {
    "popular_groups": popular_groups,
    "log_group_probabilities": log_group_probabilities,
    "log_pair_probabilities": log_pair_probabilities,
    "log_length_probabilities": log_length_probabilities,
}

# The output file is named after the input CSV.
stats_path = f"stats/{csv_name}.json"
with open(stats_path, "w") as stats_file:
    json.dump(stats, stats_file, indent=4)
