# Stats computation

This notebook computes some statistics to help estimate the probability of a name occurring.


In [None]:
import json
import math

import pandas as pd
from tqdm.notebook import trange

from marvolo import atomize, join_groups, iterate_groups, iterate_group_pairs


In [None]:
# Base name shared by the input CSV and the output stats JSON.
csv_name = "surnames-male"

# Read the table and discard every row that has a missing value.
names_df = (
    pd.read_csv(f"{csv_name}.csv")
    .dropna()
)
names_df


In [None]:
# Pull out the name column and the count column used to weight each name.
names = names_df["Nawisko aktualne"]
name_counts = names_df["Liczba"]

# Share of the total count carried by names ending in "SKI".
ends_with_ski = names.str.endswith("SKI")
name_counts.loc[ends_with_ski].sum() / name_counts.sum()


## Byte pair encoding

To efficiently find popular groups of characters, we use byte pair encoding on the names.


In [None]:
def popularities(grouped_names, name_counts, iterator_maker):
    """Accumulate count-weighted totals for the items found in each name.

    Args:
        grouped_names: Iterable of names, each in whatever form
            ``iterator_maker`` accepts.
        name_counts: Iterable of weights, paired positionally with
            ``grouped_names``.
        iterator_maker: Callable taking one grouped name and returning an
            iterable of hashable items to tally.

    Returns:
        dict mapping each item to the sum of the counts of the names that
        produced it. An item produced several times by one name adds that
        name's count once per occurrence.
    """
    totals = {}
    for grouped_name, count in zip(grouped_names, name_counts):
        for occurrence in iterator_maker(grouped_name):
            totals[occurrence] = totals.get(occurrence, 0) + count
    return totals


# Upper bound on merge rounds; each completed round adds one entry to
# popular_groups.
NUM_MERGES = 256

popular_groups = []
# NOTE(review): atomize presumably splits a name into its smallest groups --
# confirm in marvolo.
grouped_names = [atomize(name) for name in names]
for _ in trange(NUM_MERGES):
    group_pair_popularities = popularities(
        grouped_names,
        name_counts=name_counts,
        iterator_maker=iterate_group_pairs,
    )
    if not group_pair_popularities:
        # Nothing left to merge: max() raises ValueError on an empty dict,
        # so stop early instead of crashing on small inputs.
        break
    # Pair with the highest weighted count; on ties the first one seen wins.
    most_popular_group_pair = max(
        group_pair_popularities,
        key=group_pair_popularities.get,
    )
    # Apply the chosen merge to every name before the next round.
    grouped_names = [
        join_groups(grouped_name, most_popular_group_pair)
        for grouped_name in grouped_names
    ]
    popular_groups.append(most_popular_group_pair)

popular_groups[:10]


We can now find the log probabilities of each group occurring in a randomly selected name.


In [None]:
def log_probabilities(popularities):
    """Convert popularity totals into natural-log probabilities.

    Args:
        popularities: Mapping from key to a positive popularity total.

    Returns:
        dict with the same keys in the same order, each mapped to
        ``log(popularity) - log(sum of all popularities)``. An empty mapping
        yields an empty dict.

    Raises:
        ValueError: If a popularity, or the overall total, is not positive
            (raised by ``math.log``).
    """
    if not popularities:
        # Guard first: log(0) on the empty total would raise below.
        return {}
    # The total is the same for every key, so take its log only once.
    log_total = math.log(sum(popularities.values()))
    return {
        key: math.log(popularity) - log_total
        for key, popularity in popularities.items()
    }


# Tally, weighted by name count, the items iterate_groups yields for each name.
group_popularities = popularities(
    grouped_names,
    name_counts=name_counts,
    iterator_maker=iterate_groups,
)
log_group_probabilities = log_probabilities(group_popularities)

# Spot check on a single key.
log_group_probabilities["SKI$"]


...and the probabilities of a grouped name having a given length. Together, these let us estimate the probability of a name.


In [None]:
def group_count(grouped_name):
    """Return the length of a grouped name wrapped in a one-item list."""
    return [len(grouped_name)]


# Tally, weighted by name count, how often each grouped-name length appears.
length_popularities = popularities(
    grouped_names=grouped_names,
    name_counts=name_counts,
    iterator_maker=group_count,
)
log_length_probabilities = log_probabilities(length_popularities)

# Spot check on a single length.
log_length_probabilities[8]


## Saving the stats


In [None]:
# Collect the three results into one JSON-serialisable payload.
stats = {
    "popular_groups": popular_groups,
    "log_group_probabilities": log_group_probabilities,
    "log_length_probabilities": log_length_probabilities,
}

# Write the payload next to the other stats files, named after the input CSV.
with open(f"stats/{csv_name}.json", "w") as file:
    json.dump(stats, file, indent=4)
