# Stats computation

This notebook computes statistics that help estimate the probability of a name occurring.


In [1]:
import json
import math

import pandas as pd
from tqdm.notebook import trange

from marvolo import atomize, join_groups

In [2]:
# Base name shared by the input CSV and the stats JSON written at the end.
csv_name = "surnames-male"
# Read the surname table and discard every row that has a missing value.
csv_path = f"{csv_name}.csv"
names_df = pd.read_csv(csv_path).dropna()
names_df


Unnamed: 0,Nawisko aktualne,Liczba
0,NOWAK,146263
1,KOWALSKI,94540
2,WIŚNIEWSKI,74268
3,WÓJCIK,67268
4,KOWALCZYK,66159
...,...,...
382090,ŻYWUSZCZAK,2
382091,ŻYWUTSKI,2
382092,ŻYŻELEWSKI,2
382093,ŻYŻKOWSKI,2


In [3]:
# Surname column and the occurrence count recorded for each surname.
names = names_df["Nawisko aktualne"]
name_counts = names_df["Liczba"]
# Share of the total count carried by surnames that end in "SKI".
ends_with_ski = names.str.endswith("SKI")
name_counts[ends_with_ski].sum() / name_counts.sum()


0.2639712195759763

## Byte pair encoding

To efficiently find popular groups of characters, we use byte pair encoding on the names.


In [4]:
def iterate_group_pairs(grouped_name):
    """Yield the concatenation of every two adjacent groups of a name.

    Args:
        grouped_name: Sliceable sequence of groups (e.g. a list of strings).

    Yields:
        ``group[i] + group[i + 1]`` for each adjacent pair, in order.
        Sequences with fewer than two groups yield nothing.
    """
    following_groups = grouped_name[1:]
    # zip stops at the shorter input, so the last group never starts a pair.
    for left_group, right_group in zip(grouped_name, following_groups):
        yield left_group + right_group


# Demo on one surname: atomize it, then list its adjacent-group pairs.
example_groups = atomize("KOWALSKI")
list(iterate_group_pairs(example_groups))


['^K', 'KO', 'OW', 'WA', 'AL', 'LS', 'SK', 'KI', 'I$']

In [5]:
def popularities(grouped_names, name_counts, name_to_iterable):
    """Build a count-weighted tally of the items extracted from each name.

    Args:
        grouped_names: Iterable of names; each one is handed to
            ``name_to_iterable``.
        name_counts: Iterable of weights, paired positionally with
            ``grouped_names``.
        name_to_iterable: Callable that maps one name to an iterable of
            hashable items.

    Returns:
        Dict mapping each item to the summed weight of the names it came
        from. An item appearing several times in one name is counted once
        per appearance.
    """
    totals = {}
    for grouped_name, weight in zip(grouped_names, name_counts):
        for item in name_to_iterable(grouped_name):
            # Unseen items start from zero before the weight is added.
            totals[item] = totals.get(item, 0) + weight
    return totals


# Byte-pair-encoding style merge loop: every round finds the adjacent pair
# with the highest count-weighted frequency and records it as a new group.
grouped_names = [atomize(name) for name in names]
popular_groups = []
for merge_round in trange(100):
    pair_counts = popularities(
        grouped_names,
        name_counts=name_counts,
        name_to_iterable=iterate_group_pairs,
    )
    # On ties, max keeps the pair that was inserted into the dict first.
    best_pair = max(pair_counts, key=pair_counts.get)
    # join_groups comes from marvolo; presumably it fuses adjacent groups
    # that concatenate to best_pair -- verify against its implementation.
    grouped_names = [join_groups(groups, best_pair) for groups in grouped_names]
    popular_groups.append(best_pair)

popular_groups[:10]


  0%|          | 0/100 [00:00<?, ?it/s]

['KI', 'KI$', 'SKI$', 'K$', 'CZ', 'OW', '^K', 'A$', '^S', 'OWSKI$']

## Probability computation

To be able to compute the probability of a name later, we need to compute a few probability distributions.


In [6]:
def log_probabilities(popularities):
    """Convert a tally of popularities into natural-log probabilities.

    Args:
        popularities: Dict mapping keys to positive numeric popularities.

    Returns:
        Dict with the same keys, in the same order, whose values are
        ``log(popularity) - log(sum of all popularities)``. An empty input
        gives an empty dict.

    Raises:
        ValueError: If any popularity is not positive (from ``math.log``).
    """
    normaliser = sum(popularities.values())
    log_probs = {}
    for key in popularities:
        # Subtracting logs instead of dividing keeps the original arithmetic.
        log_probs[key] = math.log(popularities[key]) - math.log(normaliser)
    return log_probs


# Distribution over single groups: each grouped name is already an iterable
# of its groups, so the identity function is enough as the extractor.
group_popularities = popularities(
    grouped_names=grouped_names,
    name_counts=name_counts,
    name_to_iterable=lambda grouped_name: grouped_name,
)
log_group_probabilities = log_probabilities(group_popularities)
log_group_probabilities["^"]


-3.791530765328309

In [7]:
# Distribution over adjacent group pairs, taken from the final merged grouping.
pair_popularities = popularities(
    grouped_names=grouped_names,
    name_counts=name_counts,
    name_to_iterable=iterate_group_pairs,
)
log_pair_probabilities = log_probabilities(pair_popularities)
log_pair_probabilities["CHA"]


-8.071827249576565

In [8]:
# Distribution over name lengths, measured in groups: every name contributes
# exactly one item, namely the number of groups it was split into.
length_popularities = popularities(
    grouped_names=grouped_names,
    name_counts=name_counts,
    name_to_iterable=lambda grouped_name: [len(grouped_name)],
)
log_length_probabilities = log_probabilities(length_popularities)
log_length_probabilities[8]


-4.0135490143022245

## Saving the stats


In [9]:
# Write every computed statistic to stats/<csv_name>.json as indented JSON.
# Note: JSON object keys are strings, so the integer keys of
# log_length_probabilities are stored as strings in the file.
with open(f"stats/{csv_name}.json", "w") as stats_file:
    stats = {
        "popular_groups": popular_groups,
        "log_group_probabilities": log_group_probabilities,
        "log_pair_probabilities": log_pair_probabilities,
        "log_length_probabilities": log_length_probabilities,
    }
    json.dump(stats, stats_file, indent=4)
