In [None]:
# 7_vocab_analysis_visualizer.ipynb
"""
Analyze token frequency, character n-grams, and estimated morphemes
from a Kanien'kéha or other polysynthetic language corpus.

Jupyter version of ../datasets/vocab_builder.py
"""


In [None]:

# 📦 Step 1: Install if needed
!pip install matplotlib pandas

In [None]:

# 🧠 Step 2: Import modules
import re
import unicodedata
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [None]:

# 📂 Step 3: Set file paths
# Both locations are plain strings, relative to the notebook's working
# directory. output_dir keeps its trailing slash so that file names can be
# appended to it directly with string concatenation.
output_dir = "../datasets/vocab_analysis/"
input_file = "../datasets/sample_corpus.txt"

# Create the results folder (and any missing parent folders) up front;
# this is a no-op when the folder already exists.
Path(output_dir).mkdir(exist_ok=True, parents=True)

In [None]:

# 🧾 Step 4: Load and tokenize corpus
# A token is a run of word characters, optionally continued across
# word-internal hyphens, colons and apostrophes (straight ' or curly ’).
# Kanien'kéha orthography writes the glottal stop as an apostrophe and vowel
# length as a colon, so a plain \w+ pattern would cut "kanien'kéha" into
# "kanien" + "kéha". Keeping hyphens also lets hyphen-segmented forms survive
# as single tokens.
# Known limitation: a word-FINAL apostrophe is still dropped, because at this
# level it cannot be told apart from a closing quotation mark.
TOKEN_PATTERN = re.compile(r"\w+(?:[-:'’]+\w+)*")

with open(input_file, "r", encoding="utf-8") as f:
    # Normalize to NFC before matching: \w does not match combining marks, so
    # a decomposed accent ("e" followed by U+0301) would otherwise end the
    # token. NFC folds such pairs into one precomposed letter where one exists.
    lines = [
        unicodedata.normalize("NFC", line).strip().lower()
        for line in f
        if line.strip()  # skip blank / whitespace-only lines
    ]

tokens = []
for line in lines:
    tokens.extend(TOKEN_PATTERN.findall(line))

In [None]:

# 📊 Step 5: Token frequency
# Tally every token occurrence, then tabulate the 50 most frequent tokens
# as (token, count) rows, highest count first.
token_counts = Counter()
token_counts.update(tokens)

df_tokens = pd.DataFrame(
    data=token_counts.most_common(n=50),
    columns=["token", "count"],
)

In [None]:

# 🔍 Step 6: Character n-gram analysis
# Each token is wrapped in "_" markers so that n-grams touching the start or
# end of a word are distinguishable from word-internal ones. Every substring
# of length 2, 3, 4 and 5 of the wrapped token is counted.
char_ngrams = Counter()
for word in tokens:
    padded = "_" + word + "_"
    for width in (2, 3, 4, 5):
        # Slide a window of `width` characters across the padded token;
        # the range is empty when the token is shorter than the window.
        char_ngrams.update(
            padded[start:start + width]
            for start in range(len(padded) - width + 1)
        )

# Keep only the 50 most frequent n-grams for reporting.
df_ngrams = pd.DataFrame(
    data=char_ngrams.most_common(n=50),
    columns=["ngram", "count"],
)

In [None]:

# 🔤 Step 7: Estimated morphemes (basic heuristic)
# A token is kept as a candidate when it contains a hyphen or is longer than
# 8 characters. Candidates are whole tokens, not segmented morphemes. The
# hyphen test can only fire if the tokenizer leaves hyphens inside tokens.
morpheme_like = []
for candidate in tokens:
    if len(candidate) <= 8 and "-" not in candidate:
        continue  # short and unhyphenated: not a candidate
    morpheme_like.append(candidate)

# Tabulate the 30 most frequent candidates.
df_morphemes = pd.DataFrame(
    data=Counter(morpheme_like).most_common(n=30),
    columns=["morpheme_candidate", "count"],
)

In [None]:

# 📈 Step 8: Plot results
def save_bar_chart(frame, label_col, title, out_path):
    """Draw the "count" column of `frame` as a bar chart and save it as an image.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a "count" column and a label column named `label_col`.
    label_col : str
        Column whose values label the bars on the x axis.
    title : str
        Chart title.
    out_path : str or pathlib.Path
        Destination image file.

    Returns
    -------
    None
        The chart is written to `out_path`; nothing is written when `frame`
        has no rows.
    """
    if frame.empty:
        # pandas raises when asked to plot a frame with no numeric data,
        # e.g. when the corpus produced no tokens.
        print(f"⚠️ No data for '{title}'; skipping {out_path}")
        return

    # Create the figure explicitly and pass its axes to pandas. Without ax=,
    # DataFrame.plot opens a new figure of its own, so a preceding
    # plt.figure(figsize=...) would stay empty and its size would be ignored.
    fig, ax = plt.subplots(figsize=(12, 6))
    frame.plot(kind="bar", x=label_col, y="count", legend=False, rot=45, ax=ax)
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)  # close exactly this figure so none are left open


# Path joining works whether or not output_dir ends with a slash.
save_bar_chart(df_tokens, "token", "Top Tokens", Path(output_dir) / "top_tokens.png")
save_bar_chart(
    df_ngrams, "ngram", "Top Character N-Grams", Path(output_dir) / "top_char_ngrams.png"
)

In [None]:

# 📦 Step 9: Save to CSV
# Write each result table next to the plots, without the DataFrame index.
# File names are appended to output_dir as-is, so output_dir must end with
# a path separator.
for csv_name, frame in (
    ("token_frequencies.csv", df_tokens),
    ("char_ngrams.csv", df_ngrams),
    ("estimated_morphemes.csv", df_morphemes),
):
    frame.to_csv(output_dir + csv_name, index=False)

print("✅ Vocab analysis complete. Results saved to:", output_dir)