In [1]:
import polars as pl
import warnings
from pathlib import Path

In [2]:
def filename2dataframe(f: str) -> pl.DataFrame:
    colnames = ["name", "municipality", "coordinates", "province"]
    records = []
    province = Path(f).stem
    num_warnings = 0
    with open(f, "r") as fin:
        for idx, line in enumerate(fin):
            try:
                name, municip, coord = map(lambda s: s.strip(), line.split("\t")[:3])
                records.append([name, municip, coord, province])
            except Exception as e:
                warnings.warn(f"{e} occured while reading {f}")
                num_warnings += 1
    warnings.warn(f"{f} produced {num_warnings} warnings")
    return pl.DataFrame(
        {colname: [row[i] for row in records] for i, colname in enumerate(colnames)}
    )

In [3]:
raw_data = filename2dataframe("data/Groningen.txt")



In [4]:
raw_data = pl.concat([filename2dataframe(f) for f in Path("data").glob("*.txt")])



In [5]:
raw_data

name,municipality,coordinates,province
str,str,str,str
"""Name""","""Municipality""","""Coordinates""","""Friesland"""
"""Augustinusga""","""Achtkarspelen""","""53°13′N 6°10′E""","""Friesland"""
"""Augsbuurt-Lutjewoude""","""Kollumerland en Nieuwkruisland""","""53°16′N 6°10′E""","""Friesland"""
"""Atzeburen""","""Súdwest-Fryslân""","""53°01′N 5°30′E""","""Friesland"""
"""Arum""","""Súdwest-Fryslân""","""53°08′N 5°29′E""","""Friesland"""
…,…,…,…
"""Zwarteweg""","""Oldebroek""","""52°28′N 5°55′E""","""Gelderland"""
"""Zweekhorst""","""Zevenaar""","""51°58′N 6°05′E""","""Gelderland"""
"""Zwiep""","""Lochem""","""52°09′N 6°27′E""","""Gelderland"""
"""Zwilbroek""","""Oost Gelre""","""52°03′N 6°41′E""","""Gelderland"""


In [6]:
%load_ext autoreload
%autoreload 2

In [7]:
from utils import Tokenizer

alphabet = "".join(set("".join(raw_data.select("name").get_column("name").to_list())))

t = Tokenizer(
    alphabet=alphabet,
    max_len=16,
)

In [8]:
raw_data.with_columns(pl.col("name").str.len_bytes().alias("len")).get_column(
    "len"
).value_counts().sort("len").filter(pl.col("count") > 100)

len,count
u32,u32
4,261
5,423
6,648
7,719
8,874
…,…
11,578
12,352
13,229
14,162


So we'll select only names between 4 and 15 tokens👌

In [25]:
max_length = 15
min_length = 4

df = (
    raw_data.rename({"name": "sequence"})
    .select("sequence")
    .filter(
        pl.col("sequence").str.len_chars() >= 4,
        pl.col("sequence").str.len_chars() <= 14,
        pl.col("sequence").str.starts_with("Name").not_(),
    )
)

In [30]:
start_token = "<"
end_token = ">"
pad_token = "."
max_word_len = 20
colname = "sequence"

(
    df.with_columns(
        pl.concat_str(
            pl.lit(start_token),
            pl.col(colname),
            pl.lit(end_token),
        )
        .str.pad_end(max_word_len, pad_token)
        .alias("split")
        # .str.split(by="")
    )
)

sequence,split
str,str
"""Augustinusga""","""<Augustinusga>......"""
"""Atzeburen""","""<Atzeburen>........."""
"""Arum""","""<Arum>.............."""
"""Arkum""","""<Arkum>............."""
"""Arkens""","""<Arkens>............"""
…,…
"""Zwarteweg""","""<Zwarteweg>........."""
"""Zweekhorst""","""<Zweekhorst>........"""
"""Zwiep""","""<Zwiep>............."""
"""Zwilbroek""","""<Zwilbroek>........."""
