In [1]:
# 7_vocab_analysis_visualizer.ipynb
"""
Analyze token frequency, character n-grams, and estimated morphemes
from a Kanien'kéha or other polysynthetic language corpus.

Jupyter version of ../datasets/vocab_builder.py
"""


"\nAnalyze token frequency, character n-grams, and estimated morphemes\nfrom a Kanien'kéha or other polysynthetic language corpus.\n\nJupyter version of ../datasets/vocab_builder.py\n"

In [2]:

# 📦 Step 1: Install if needed
!pip install matplotlib pandas

Collecting matplotlib
  Using cached matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.59.0-cp313-cp313-win_amd64.whl.metadata (110 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.3.0-cp313-cp313-win_amd64.whl.metadata (9.2 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.3-cp313-cp313-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   ---------------------------------------- 8.1/8.1 MB 44.4 MB/s eta 0:0

In [3]:

# 🧠 Step 2: Import modules
import re
import unicodedata
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [4]:

# 📂 Step 3: Set file paths
# Both locations are relative paths, resolved against the current working directory.
output_dir = "../datasets/vocab_analysis/"  # keep the trailing slash: later cells append file names with `+`
input_file = "../datasets/sample_corpus.txt"

# Create the output folder (and any missing parents); a no-op if it already exists.
analysis_dir = Path(output_dir)
analysis_dir.mkdir(parents=True, exist_ok=True)

In [5]:

# 🧾 Step 4: Load and tokenize corpus
# A token is a run of word characters that may contain word-internal
# apostrophes, hyphens, or colons. The previous pattern r"\b\w+\b" split on
# all of these, so "kanien'kéha" became "kanien" + "kéha" and no token could
# ever contain the "-" that the Step 7 heuristic looks for.
TOKEN_PATTERN = re.compile(r"\w+(?:[-'’ʼ:꞉]\w+)*")

with open(input_file, "r", encoding="utf-8") as f:
    # NFC-normalize before matching: `\w` does not match combining marks, so
    # a decomposed "é" (e + U+0301) would otherwise cut a token in two.
    # Blank lines are dropped; the remaining lines are stripped and lowercased.
    lines = [
        unicodedata.normalize("NFC", line).strip().lower()
        for line in f
        if line.strip()
    ]

# Flat list of every token in corpus order (duplicates kept for counting).
tokens = []
for line in lines:
    tokens.extend(TOKEN_PATTERN.findall(line))

In [6]:

# 📊 Step 5: Token frequency
# Count every token, then keep the 50 most frequent as a two-column table.
token_counts = Counter(tokens)
top_tokens = token_counts.most_common(50)
df_tokens = pd.DataFrame(data=top_tokens, columns=["token", "count"])

In [7]:

# 🔍 Step 6: Character n-gram analysis
# Each token is wrapped in "_" on both sides, then every substring of
# length 2 through 5 of the wrapped token is counted.
char_ngrams = Counter()
for word in tokens:
    padded = "_" + word + "_"
    char_ngrams.update(
        padded[start:start + size]
        for size in range(2, 6)
        for start in range(len(padded) - size + 1)
    )

# Keep the 50 most frequent n-grams as a two-column table.
top_ngrams = char_ngrams.most_common(50)
df_ngrams = pd.DataFrame(top_ngrams, columns=["ngram", "count"])

In [8]:

# 🔤 Step 7: Estimated morphemes (basic heuristic)
# A token is a candidate when it is longer than 8 characters or contains a
# hyphen. NOTE(review): the hyphen test can only match if the tokenizer that
# built `tokens` keeps hyphens inside tokens; verify against Step 4.
morpheme_like = list(filter(lambda tok: len(tok) > 8 or "-" in tok, tokens))

# Keep the 30 most frequent candidates as a two-column table.
candidate_counts = Counter(morpheme_like)
df_morphemes = pd.DataFrame(
    candidate_counts.most_common(30),
    columns=["morpheme_candidate", "count"],
)

In [9]:

# 📈 Step 8: Plot results
def save_bar_chart(df, x, title, filename):
    """Draw `df` as a 12x6 bar chart of `count` per `x` and save it as a PNG.

    Args:
        df: DataFrame with a label column named `x` and a numeric `count` column.
        x: Name of the column used for the bar labels.
        title: Figure title.
        filename: File name of the PNG, written inside `output_dir`.
    """
    # Create the axes explicitly and hand them to pandas. Calling
    # plt.figure() and then df.plot() without `ax` makes pandas open a second
    # figure: the sized one stays empty ("Figure ... with 0 Axes") and is
    # never closed, and the saved image ignores the requested size.
    fig, ax = plt.subplots(figsize=(12, 6))
    df.plot(kind="bar", x=x, y="count", legend=False, ax=ax)
    ax.set_title(title)
    ax.set_ylabel("count")
    ax.tick_params(axis="x", labelrotation=45)
    fig.tight_layout()
    # Path join works whether or not `output_dir` ends with a separator.
    fig.savefig(Path(output_dir) / filename)
    plt.close(fig)  # close this exact figure so nothing is left open


save_bar_chart(df_tokens, "token", "Top Tokens", "top_tokens.png")
save_bar_chart(df_ngrams, "ngram", "Top Character N-Grams", "top_char_ngrams.png")

<Figure size 1200x600 with 0 Axes>

<Figure size 1200x600 with 0 Axes>

In [10]:

# 📦 Step 9: Save to CSV
# Map each output file name to the table it stores, then write them all
# without the index column.
csv_exports = {
    "token_frequencies.csv": df_tokens,
    "char_ngrams.csv": df_ngrams,
    "estimated_morphemes.csv": df_morphemes,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(output_dir + csv_name, index=False)

print("✅ Vocab analysis complete. Results saved to:", output_dir)

✅ Vocab analysis complete. Results saved to: ../datasets/vocab_analysis/
