In [22]:
from vocabulous import Vocabulous

# Load the Vocabulous model used by the scoring cells in this notebook.
# The path is relative, so it resolves against the kernel's current working directory.
model = Vocabulous.load("../notebooks/ultrabulous")

In [10]:
# pip install datasets pandas numpy tqdm blingfire

from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
import os, sys, traceback

try:
    # Preferred splitter: BlingFire. Its output is broken on newlines below.
    from blingfire import text_to_sentences as split_sentences_fast

    def split_into_sentences(text: str):
        """Return the non-empty, whitespace-trimmed sentences of ``text`` (BlingFire path)."""
        trimmed = map(str.strip, split_sentences_fast(text).split("\n"))
        return [piece for piece in trimmed if piece]
except Exception:
    # Fallback when BlingFire cannot be imported: split on whitespace that
    # directly follows '.', '!' or '?'.
    import re
    SENT_RE = re.compile(r"(?<=[.!?])\s+")

    def split_into_sentences(text: str):
        """Return the non-empty, whitespace-trimmed sentences of ``text`` (regex path)."""
        trimmed = map(str.strip, SENT_RE.split(text))
        return [piece for piece in trimmed if piece]

def process_stream(
    model,
    dataset_name="HuggingFaceFW/fineweb-2",
    dataset_config="zsm_Latn",
    split="train",
    streaming=True,
    target_lang="dtp_Latn",
    negative_lang="zsm_Latn",
    output_path="outputs/dtp_vs_zsm_results.csv",
    save_every_rows=1000,           # save after every N dataset rows
    buffer_sentence_threshold=5000, # also save when buffer reaches this many sentences
    max_rows=None,                  # optional dataset row limit
    min_sentence_chars=5,           # relaxed to avoid over-filtering
    max_sentence_chars=2000,
    text_key="text",                # change if your dataset uses a different column
    verbose=True
):
    """Stream a dataset, score every sentence with ``model`` and append results to a CSV.

    Each dataset row's text is split into sentences, sentences outside the
    character-length bounds are dropped, and the remaining ones are scored via
    ``model._score_sentence``. One CSV record is produced per scored sentence,
    holding the top-3 languages/scores plus the scores of ``target_lang``
    (``dtp_*`` columns) and ``negative_lang`` (``zsm_*`` columns). Records are
    buffered in memory and appended to ``output_path`` periodically.

    Args:
        model: Object exposing ``_score_sentence(sentence)``; a non-empty dict
            mapping language code to numeric score is treated as a valid result.
        dataset_name, dataset_config, split, streaming: Forwarded to
            ``load_dataset``.
        target_lang: Language whose score/confidence fill the ``dtp_*`` columns.
        negative_lang: Language whose score fills the ``zsm_*`` columns.
        output_path: CSV file to append to; its parent directory is created.
        save_every_rows: Flush after every N processed rows. ``0``/``None``
            disables the row-based trigger.
        buffer_sentence_threshold: Flush once this many records are buffered.
            ``None`` disables the size-based trigger.
        max_rows: Maximum number of dataset rows to consume (``None`` = all;
            ``0`` or negative = none).
        min_sentence_chars, max_sentence_chars: Inclusive sentence length bounds.
        text_key: Row key holding the document text.
        verbose: Print per-flush messages and scoring errors.

    Returns:
        None. Summary counters are printed when the run ends normally or is
        interrupted from the keyboard.

    Raises:
        Any exception raised while streaming or processing rows is re-raised
        after the buffered records have been flushed to disk.
    """
    # Local import so the function does not depend on another cell's imports.
    from itertools import islice

    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    # A header is needed when the file is missing *or* exists but is empty;
    # otherwise appended rows would end up in a header-less CSV.
    write_header = (not os.path.exists(output_path)) or os.path.getsize(output_path) == 0

    ds = load_dataset(dataset_name, dataset_config, streaming=streaming, split=split)

    def score_sentence(sent):
        """Return the model's score dict for ``sent``, or None if unusable/failed."""
        try:
            scores = model._score_sentence(sent)
            return scores if isinstance(scores, dict) and len(scores) > 0 else None
        except Exception as e:
            # Best effort: one bad sentence must not abort a long streaming run.
            if verbose:
                print("Score error:", e)
            return None

    def row_to_records(row_idx, row_text):
        """Build CSV records for one row.

        Returns ``(records, n_sentences_seen, n_sentences_within_length_bounds)``.
        """
        records = []
        kept = 0
        sentences = split_into_sentences(row_text or "")
        for sent_idx, sent in enumerate(sentences):
            L = len(sent)
            if L < min_sentence_chars or L > max_sentence_chars:
                continue
            kept += 1  # passed the length filter (may still fail scoring)

            scores = score_sentence(sent)
            if not scores:
                continue

            # Rank languages by descending score; pad missing ranks with ("", 0.0).
            ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
            top1_lang, top1_score = ranked[0]
            if len(ranked) > 1:
                top2_lang, top2_score = ranked[1]
            else:
                top2_lang, top2_score = "", 0.0
            if len(ranked) > 2:
                top3_lang, top3_score = ranked[2]
            else:
                top3_lang, top3_score = "", 0.0

            top_confidence = float(top1_score - top2_score)
            dtp_score = float(scores.get(target_lang, 0.0))
            zsm_score = float(scores.get(negative_lang, 0.0))

            # Margin of the target language: against the runner-up when it wins,
            # against the winner (so <= 0) when it does not.
            if top1_lang == target_lang:
                dtp_confidence = float(dtp_score - max(top2_score, top3_score))
            else:
                dtp_confidence = float(dtp_score - top1_score)

            records.append({
                "row_index": row_idx,
                "sentence_index": sent_idx,
                "sentence": sent,
                "char_len": L,

                "predicted_lang": top1_lang,
                "top1_lang": top1_lang, "top1_score": float(top1_score),
                "top2_lang": top2_lang, "top2_score": float(top2_score),
                "top3_lang": top3_lang, "top3_score": float(top3_score),
                "top_confidence": float(top_confidence),

                "dtp_lang": target_lang,
                "dtp_score": float(dtp_score),
                "dtp_confidence": float(dtp_confidence),

                "zsm_lang": negative_lang,
                "zsm_score": float(zsm_score),
            })
        return records, len(sentences), kept

    buffer = []
    rows_processed = 0
    rows_with_text = 0
    sentences_seen = 0
    sentences_kept = 0
    sentences_scored = 0

    def flush(reason):
        """Append the buffered records to the CSV and empty the buffer."""
        nonlocal write_header
        if not buffer:
            if verbose:
                print(f"[flush:{reason}] buffer empty, nothing to write.")
            return
        try:
            df = pd.DataFrame(buffer)
            mode = "a"
            df.to_csv(output_path, mode=mode, header=write_header, index=False)
            write_header = False
            if verbose:
                print(f"[flush:{reason}] wrote {len(df)} rows to {output_path} (total rows_processed={rows_processed})")
        except Exception as e:
            print(f"[flush:{reason}] ERROR writing CSV:", e)
            traceback.print_exc()
        finally:
            # NOTE: the buffer is emptied even when the write failed, so those
            # records are dropped (and reported above) rather than retried.
            buffer.clear()
            sys.stdout.flush()

    # islice enforces the row cap without fetching an extra row and makes
    # max_rows=0 consume nothing.
    row_iter = ds if max_rows is None else islice(ds, max(int(max_rows), 0))

    try:
        for row_idx, row in tqdm(enumerate(row_iter), desc="Streaming rows", unit="rows"):
            text = row.get(text_key, None)
            if isinstance(text, str) and text.strip():
                rows_with_text += 1
                recs, seen_count, kept_count = row_to_records(row_idx, text)
                sentences_seen += seen_count
                sentences_kept += kept_count      # passed the length filter
                sentences_scored += len(recs)     # additionally scored successfully
                buffer.extend(recs)

            rows_processed += 1

            # Guard against save_every_rows=0/None (would raise on modulo) and
            # buffer_sentence_threshold=None (would raise on comparison).
            row_interval_hit = (
                bool(save_every_rows)
                and save_every_rows > 0
                and rows_processed % save_every_rows == 0
            )
            buffer_full = (
                buffer_sentence_threshold is not None
                and len(buffer) >= buffer_sentence_threshold
            )
            if row_interval_hit or buffer_full:
                flush("periodic")

        flush("final")

    except KeyboardInterrupt:
        flush("interrupt")
        print("Interrupted. Partial results saved to:", output_path)
    except Exception:
        # Streaming can fail mid-run (e.g. network errors); persist what is
        # already buffered before propagating the original exception.
        flush("error")
        raise

    print(f"Done. rows_processed={rows_processed}, rows_with_text={rows_with_text}, "
          f"sentences_seen={sentences_seen}, sentences_kept={sentences_kept}, sentences_scored={sentences_scored}")
    print(f"Output: {output_path}")

In [None]:
# Example run over the stream (adjust output_path; results are appended to that CSV).
# Lower max_rows for a quick sanity check of the I/O path.
process_stream(
    model=model,                                   # your model instance
    output_path="outputs/dtp_vs_zsm_results.csv",
    max_rows=100000000,                            # row cap (100M): effectively the whole stream
    save_every_rows=10000,                         # row-count flush interval
    buffer_sentence_threshold=200,                 # flush once 200 scored sentences are buffered
    verbose=False
)

Streaming rows: 9999rows [00:51, 195.99rows/s]

Done. rows_processed=10000, rows_with_text=10000, sentences_seen=410244, sentences_kept=386180, sentences_scored=386180
Output: outputs/dtp_vs_zsm_results.csv





In [23]:
# Ad-hoc probe: score a single Malay sample sentence and display the raw
# language -> score mapping returned by the model.
model._score_sentence("KAMAL AMIR MENULIS MINGGU lalu negara kehilangan dua orang tokoh bergelar Tan Sri yang kembali kepangkuan Ilahi")

{'bjn_Latn': 0.4117647058823529,
 'zsm_Latn': 0.5294117647058824,
 'min_Latn': 0.29411764705882354,
 'pam_Latn': 0.17647058823529413,
 'ind_Latn': 0.5294117647058824,
 'ces_Latn': 0.17647058823529413,
 'jav_Latn': 0.35294117647058826,
 'lav_Latn': 0.17647058823529413,
 'ace_Latn': 0.23529411764705882,
 'ban_Latn': 0.29411764705882354,
 'bug_Latn': 0.17647058823529413,
 'wol_Latn': 0.058823529411764705,
 'btm_Latn': 0.11764705882352941,
 'gor_Latn': 0.29411764705882354,
 'bew_Latn': 0.29411764705882354,
 'dtp_Latn': 0.23529411764705882,
 'kin_Latn': 0.17647058823529413,
 'sun_Latn': 0.29411764705882354,
 'grn_Latn': 0.058823529411764705,
 'nld_Latn': 0.058823529411764705,
 'frr_Latn': 0.058823529411764705,
 'kaa_Latn': 0.17647058823529413,
 'mwl_Latn': 0.17647058823529413,
 'ami_Latn': 0.11764705882352941,
 'twi_Latn': 0.058823529411764705,
 'ilo_Latn': 0.11764705882352941,
 'epo_Latn': 0.11764705882352941,
 'bbc_Latn': 0.17647058823529413,
 'sqi_Latn': 0.17647058823529413,
 'crh_Latn':

In [13]:
from datasets import load_dataset
from itertools import islice

# Peek at the first three streamed rows: list their keys and preview any
# string-valued column that could plausibly hold the document text.
dataset_name = "HuggingFaceFW/fineweb-2"
dataset_config = "zsm_Latn"
split = "train"

ds = load_dataset(dataset_name, dataset_config, split=split, streaming=True)
rows = list(islice(ds, 3))
for i, r in enumerate(rows):
    print(f"Row {i} keys:", list(r.keys()))
    for cand in ("text", "content", "raw_content", "document", "body"):
        if not isinstance(r.get(cand), str):
            continue
        preview = r[cand][:160].replace("\n", " ")
        print(f"  {cand}[:160]:", preview)

Row 0 keys: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'language_script', 'minhash_cluster_size', 'top_langs']
  text[:160]: KAMAL AMIR MENULIS MINGGU lalu negara kehilangan dua orang tokoh bergelar Tan Sri yang kembali kepangkuan Ilahi. Pemergian kedua-dua tokoh tersebut bagaikan suk
Row 1 keys: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'language_script', 'minhash_cluster_size', 'top_langs']
  text[:160]: Bagi ramai penganut Islam di Malaysia gambar mental yang dikaitkan dengan perkataan ‘bala’ ialah satu malapetaka – khususnya bencana alam -- yang menimpa seseor
Row 2 keys: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'language_script', 'minhash_cluster_size', 'top_langs']
  text[:160]: Dunia dunia . Semakin lama ia berputar . Semakin banyak manusia terpesong . Wow . Is this good ? ini ke yang patut ditunjukkan oleh kite untuk generasi akan dat


In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the per-sentence results CSV.
# NOTE: the path is relative to the kernel's working directory and the file must
# already exist; otherwise read_csv raises FileNotFoundError.
results_path = "outputs/dtp_vs_zsm_results.csv"  # must match the output_path used when writing the CSV
df = pd.read_csv(results_path)

target_lang = "dtp_Latn"
negative_lang = "zsm_Latn"

# Basic sanity
print(f"Total sentences scored: {len(df):,}")
print("Columns:", list(df.columns))

# Focus: sentences where the model's top prediction is the target language (dtp)
# and the negative language (zsm) scores low. Selection criteria:
#  - predicted_lang == dtp
#  - large dtp_confidence (dtp well ahead of the next best language)
#  - small zsm_score (model doesn't think it is zsm)
#  - high dtp_score
dtp_pred = df[df["predicted_lang"] == target_lang].copy()

# Thresholds (tune as needed): quartiles of the dtp-predicted subset when it has
# more than 50 rows, otherwise fixed fallback values.
min_dtp_score = dtp_pred["dtp_score"].quantile(0.75) if len(dtp_pred) > 50 else 0.5
max_zsm_score = dtp_pred["zsm_score"].quantile(0.25) if len(dtp_pred) > 50 else 0.2
min_dtp_conf = dtp_pred["dtp_confidence"].quantile(0.75) if len(dtp_pred) > 50 else 0.2

# Keep only rows that satisfy all three thresholds at once.
high_conf_dtp_not_zsm = dtp_pred[
    (dtp_pred["dtp_score"] >= min_dtp_score) &
    (dtp_pred["zsm_score"] <= max_zsm_score) &
    (dtp_pred["dtp_confidence"] >= min_dtp_conf)
].copy()

print("\nCounts")
print(f"- Predicted {target_lang}: {len(dtp_pred):,}")
print(f"- High-conf {target_lang} and low {negative_lang}: {len(high_conf_dtp_not_zsm):,}")

# Show top-N high-confidence examples (by dtp_confidence desc, then dtp_score desc, then zsm_score asc)
topN = high_conf_dtp_not_zsm.sort_values(
    by=["dtp_confidence", "dtp_score", "zsm_score"], ascending=[False, False, True]
).head(25)

print("\nTop 25 high-confidence dtp vs not zsm examples:")
display_cols = [
    "row_index","sentence_index","char_len",
    "dtp_score","dtp_confidence","zsm_score",
    "top1_lang","top1_score","top2_lang","top2_score","top3_lang","top3_score",
    "sentence"
]
print(topN[display_cols].to_string(index=False, max_colwidth=120))

# Summary statistics of the key score columns for the dtp-predicted subset
print("\nDistributions (describe):")
print(dtp_pred[["dtp_score","dtp_confidence","zsm_score","top_confidence"]].describe())

# Scatter: dtp_score vs zsm_score (only where model predicted dtp).
# At most 5000 points are drawn, sampled with a fixed seed for repeatability.
plt.figure(figsize=(6,5))
sns.scatterplot(
    data=dtp_pred.sample(min(len(dtp_pred), 5000), random_state=42),
    x="zsm_score", y="dtp_score", hue="dtp_confidence", palette="viridis", size="dtp_confidence", sizes=(10,80), alpha=0.6
)
# Dashed guide lines mark the selection thresholds computed above.
plt.axvline(max_zsm_score, color="red", linestyle="--", label=f"zsm_score <= {max_zsm_score:.3f}")
plt.axhline(min_dtp_score, color="green", linestyle="--", label=f"dtp_score >= {min_dtp_score:.3f}")
plt.title(f"{target_lang} vs {negative_lang} scores (only where predicted {target_lang})")
plt.xlabel(f"{negative_lang} score")
plt.ylabel(f"{target_lang} score")
plt.legend()
plt.tight_layout()
plt.show()

# Histogram of dtp_confidence (only where predicted dtp), with the confidence threshold marked
plt.figure(figsize=(6,4))
sns.histplot(dtp_pred["dtp_confidence"], bins=40, kde=True, color="steelblue")
plt.axvline(min_dtp_conf, color="orange", linestyle="--", label=f"dtp_conf >= {min_dtp_conf:.3f}")
plt.title(f"Distribution of dtp_confidence (predicted {target_lang})")
plt.legend()
plt.tight_layout()
plt.show()

# Top competing languages against dtp (second-best when dtp is top1), 15 most frequent
if "top2_lang" in dtp_pred.columns:
    comp = dtp_pred["top2_lang"].value_counts().head(15)
    plt.figure(figsize=(7,4))
    sns.barplot(x=comp.values, y=comp.index, orient="h", color="slateblue")
    plt.title(f"Most common competitors (2nd best) when predicted {target_lang}")
    plt.xlabel("Count")
    plt.tight_layout()
    plt.show()

# Error analysis: when model does NOT predict dtp, but dtp_score is still high (near-miss).
# "High" here means at or above the 90th percentile of dtp_score over ALL rows of df.
not_dtp = df[df["predicted_lang"] != target_lang].copy()
near_miss = not_dtp[not_dtp["dtp_score"] >= df["dtp_score"].quantile(0.9)].sort_values(
    by=["dtp_score","dtp_confidence"], ascending=[False, False]
).head(20)

print("\nNear-miss cases (model didn't predict dtp but dtp_score is high):")
cols2 = [
    "row_index","sentence_index","char_len",
    "predicted_lang","top1_score","top2_lang","top2_score",
    "dtp_score","dtp_confidence","zsm_score","sentence"
]
print(near_miss[cols2].to_string(index=False, max_colwidth=120))

# Summary metric for the specific "misclassified as zsm" concern:
# percentage of dtp-predicted sentences that also pass all three thresholds.
# max(..., 1) avoids division by zero when nothing was predicted as dtp.
ratio = (len(high_conf_dtp_not_zsm) / max(len(dtp_pred), 1)) * 100
print(f"\nShare of predicted-{target_lang} sentences that strongly reject {negative_lang}: {ratio:.2f}%")

FileNotFoundError: [Errno 2] No such file or directory: 'outputs/dtp_vs_zsm_results.csv'