# fastText probing with Kazakh hate speech (offline-friendly)

This notebook answers the question:

> *Why might it be inappropriate to rely directly on pretrained fastText embeddings for Topic 1, and how do they perform on non-English, out-of-distribution data?*

Because the execution environment cannot download packages or pretrained binaries, the notebook ships with a lightweight, deterministic **fastText-style subword embedder** that requires only the standard library. When a local fastText binary is available (for example `vectors/cc.kk.300.bin` for Kazakh), the code will load it automatically; otherwise it falls back to the deterministic embedder. The pipeline still probes non-English, non-Wikipedia data (Kazakh hate speech) alongside a mix of Wikipedia samples spanning English, French, German, Kazakh, Latvian, Swedish, Urdu, Wolof, Yoruba, and Swahili to show how static embeddings struggle with noisy, under-resourced text.


In [None]:
from pathlib import Path
import csv
import hashlib
import math
import random
from collections import Counter, defaultdict
from typing import Dict, Iterable, List, Sequence

try:
    import fasttext  # type: ignore
except ImportError:
    fasttext = None

random.seed(13)


## Configure data paths

The experiment mixes an out-of-distribution social-media dataset (Kazakh hate speech) with multiple under-resourced Wikipedia languages (English, French, German, Kazakh, Latvian, Swedish, Urdu, Wolof, Yoruba, and Swahili) to create a small multilingual evaluation bed. It also looks for pretrained fastText binaries in a local `vectors` directory so the Kazakh embeddings can be loaded without network access.


In [None]:
PROJECT_ROOT = Path.cwd()
if (PROJECT_ROOT / 'data').is_dir():
    DATA_ROOT = PROJECT_ROOT / 'data'
elif (PROJECT_ROOT.parent / 'data').is_dir():
    DATA_ROOT = PROJECT_ROOT.parent / 'data'
else:
    DATA_ROOT = Path('data')

VECTOR_ROOT = None
for candidate in (PROJECT_ROOT / 'vectors', PROJECT_ROOT.parent / 'vectors', DATA_ROOT / 'vectors'):
    if candidate.is_dir():
        VECTOR_ROOT = candidate
        break

if VECTOR_ROOT is None:
    VECTOR_ROOT = PROJECT_ROOT / 'vectors'

KAZAKH_HATE_SPEECH_PATH = DATA_ROOT / 'kazakh_hate_speech_fasttext.csv'
KAZAKH_FASTTEXT_BIN = VECTOR_ROOT / 'cc.kk.300.bin'
WIKI_LANGUAGES = {
    'english': DATA_ROOT / 'english/english_wikipedia.conllu',
    'french': DATA_ROOT / 'french/french_wikipedia_stanza.conllu',
    'german': DATA_ROOT / 'german/german_wikipedia.conllu',
    'kazakh': DATA_ROOT / 'kazakh/kazakh_wikipedia_stanza.conllu',
    'latvian': DATA_ROOT / 'latvian/latvian_wikipedia_stanza.conllu',
    'swedish': DATA_ROOT / 'swedish/swedish_wikipedia_stanza.conllu',
    'urdu': DATA_ROOT / 'urdu/urdu_wikipedia_stanza.conllu',
    'wolof': DATA_ROOT / 'wolof/wolof_wikipedia_stanza.conllu',
    'yoruba': DATA_ROOT / 'yoruba/yoruba_wikipedia.conllu',
    'swahili': DATA_ROOT / 'swahili/swahili_wikipedia.conllu',
}
MAX_WIKI_SENTENCES = 800  # cap to keep runtime small
MAX_KAZAKH_SENTENCES = 800

print(f"Data root: {DATA_ROOT} (exists: {DATA_ROOT.exists()})")
print(f"Vectors root: {VECTOR_ROOT} (exists: {VECTOR_ROOT.exists()})")
print(f"Kazakh hate speech CSV present: {KAZAKH_HATE_SPEECH_PATH.exists()}")
print(f"Kazakh fastText binary present: {KAZAKH_FASTTEXT_BIN.exists()}")
print(f"fasttext library available: {fasttext is not None}")
for lang, path in WIKI_LANGUAGES.items():
    print(f"{lang} Wikipedia sample present: {path.exists()}")


## fastText-style subword embedder (no external dependencies)

Real pretrained fastText vectors cannot always be fetched here, so we approximate their subword smoothing with a deterministic hashing trick:

* Each character n-gram (3–6 chars) is mapped to a pseudorandom vector seeded by its MD5 hash.
* Word vectors are averages of their n-gram vectors; sentence vectors are averages of word vectors.
* Centroid classifier with cosine similarity replaces scikit-learn to stay dependency-free.

This keeps the **subword bias** of fastText (helpful for OOV handling) while making the limitations on noisy, non-Wikipedia text visible.


In [None]:
def iter_conllu_sentences(path: Path) -> Iterable[str]:
    # Yield `# text =` lines from a CoNLL-U file.
    buffer: List[str] = []
    with path.open(encoding="utf8") as handle:
        for line in handle:
            line = line.rstrip("\n")
            if line.startswith("# text = "):
                buffer.append(line[len("# text = "):])
            elif line.startswith("#"):
                continue
            elif not line.strip():
                if buffer:
                    yield " ".join(buffer).strip()
                buffer = []
    if buffer:
        yield " ".join(buffer).strip()


def load_conllu_dataset(path: Path, label: str, max_sentences: int) -> List[Dict[str, str]]:
    sentences = list(iter_conllu_sentences(path))
    random.shuffle(sentences)
    sentences = sentences[:max_sentences]
    return [{"text": s, "label": label} for s in sentences]


def load_kazakh_hate_speech(path: Path, max_sentences: int) -> List[Dict[str, str]]:
    rows: List[Dict[str, str]] = []
    if not path.exists():
        print(f"Kazakh hate speech file missing at {path}; returning empty list.")
        return rows

    with path.open(encoding="utf8") as handle:
        reader = csv.DictReader(handle)
        for row in reader:
            text = (row.get("text") or "").strip()
            if text:
                rows.append({"text": text, "label": "kazakh_social"})
            if len(rows) >= max_sentences:
                break
    return rows


def preview_balance(dataset: Sequence[Dict[str, str]]) -> Dict[str, int]:
    counts: Dict[str, int] = {}
    for row in dataset:
        counts[row["label"]] = counts.get(row["label"], 0) + 1
    return counts



In [None]:
class HashedSubwordEmbedder:
    def __init__(self, dim: int = 50, ngram_range: Sequence[int] = (3, 6)):
        self.dim = dim
        self.ngram_range = ngram_range
        self.cache: Dict[str, List[float]] = {}

    def _subword_vector(self, ngram: str) -> List[float]:
        if ngram in self.cache:
            return self.cache[ngram]

        seed = int(hashlib.md5(ngram.encode("utf8")).hexdigest(), 16)
        rng = random.Random(seed)
        vec = [rng.uniform(-1.0, 1.0) for _ in range(self.dim)]
        self.cache[ngram] = vec
        return vec

    def word_vector(self, word: str) -> List[float]:
        grams: List[str] = []
        clean = word.strip()
        for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
            if len(clean) >= n:
                grams.append(clean[:n])
                grams.append(clean[-n:])

        if not grams:
            grams = [clean or "<blank>"]

        vectors = [self._subword_vector(g) for g in grams]
        out = [0.0] * self.dim
        for vec in vectors:
            for i, val in enumerate(vec):
                out[i] += val

        inv = 1.0 / len(vectors)
        return [v * inv for v in out]

    def sentence_vector(self, text: str) -> List[float]:
        tokens = text.split()
        if not tokens:
            return [0.0] * self.dim

        vectors = [self.word_vector(tok) for tok in tokens]
        out = [0.0] * self.dim
        for vec in vectors:
            for i, val in enumerate(vec):
                out[i] += val

        inv = 1.0 / len(vectors)
        return [v * inv for v in out]


class FasttextBinaryEmbedder:
    def __init__(self, path: Path):
        if fasttext is None:
            raise ImportError("fasttext module is not available")
        self.model = fasttext.load_model(str(path))
        self.dim = len(self.model.get_sentence_vector("hello"))

    def sentence_vector(self, text: str) -> List[float]:
        return self.model.get_sentence_vector(text).tolist()


def build_embedder() -> HashedSubwordEmbedder:
    if fasttext is not None and KAZAKH_FASTTEXT_BIN.exists():
        try:
            print(f"Loading pretrained fastText binary from {KAZAKH_FASTTEXT_BIN}")
            return FasttextBinaryEmbedder(KAZAKH_FASTTEXT_BIN)
        except Exception as exc:
            print(f"Falling back to hashed embedder (fastText load failed): {exc}")
    else:
        if fasttext is None:
            print("fasttext library unavailable; using hashed embedder.")
        else:
            print(f"fastText binary missing at {KAZAKH_FASTTEXT_BIN}; using hashed embedder.")

    return HashedSubwordEmbedder(dim=50, ngram_range=(3, 6))


In [None]:
# Assemble the multilingual dataset
corpus: List[Dict[str, str]] = []
corpus.extend(load_kazakh_hate_speech(KAZAKH_HATE_SPEECH_PATH, MAX_KAZAKH_SENTENCES))
for lang, path in WIKI_LANGUAGES.items():
    corpus.extend(load_conllu_dataset(path, lang, MAX_WIKI_SENTENCES))

if not corpus:
    raise RuntimeError("No data loaded; check the DATA_ROOT paths.")

print("Corpus label balance:", preview_balance(corpus))
print("Example rows:")
for row in corpus[:3]:
    print(row)



## Train/evaluate a centroid classifier

A simple nearest-centroid classifier with cosine similarity keeps the evaluation dependency-free while mimicking the linear separability assumptions of a fastText + logistic regression baseline.


In [None]:
def split_dataset(rows: Sequence[Dict[str, str]], test_ratio: float = 0.2, seed: int = 13):
    rng = random.Random(seed)
    items = list(rows)
    rng.shuffle(items)
    split = int(len(items) * (1 - test_ratio))
    return items[:split], items[split:]


def cosine(a: Sequence[float], b: Sequence[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(y * y for y in b))
    if na == 0 or nb == 0:
        return -1.0
    return dot / (na * nb)


def train_centroids(vectors: List[List[float]], labels: List[str]) -> Dict[str, List[float]]:
    sums: Dict[str, List[float]] = defaultdict(lambda: [0.0] * len(vectors[0]))
    counts: Dict[str, int] = defaultdict(int)
    for vec, lab in zip(vectors, labels):
        counts[lab] += 1
        running = sums[lab]
        for i, val in enumerate(vec):
            running[i] += val
    return {lab: [val / counts[lab] for val in vec] for lab, vec in sums.items()}


def predict(vec: List[float], centroids: Dict[str, List[float]]) -> str:
    best_label = None
    best_score = -1.0
    for lab, centroid in centroids.items():
        score = cosine(vec, centroid)
        if score > best_score:
            best_score = score
            best_label = lab
    return best_label or ""


def classification_report(golds: Sequence[str], preds: Sequence[str]) -> str:
    labels = sorted(set(golds))
    counts = Counter()
    tp = Counter()
    fp = Counter()
    for gold, pred in zip(golds, preds):
        counts[gold] += 1
        if gold == pred:
            tp[gold] += 1
        else:
            fp[pred] += 1

    lines = []
    for lab in labels:
        precision = tp[lab] / (tp[lab] + fp[lab]) if (tp[lab] + fp[lab]) else 0.0
        recall = tp[lab] / counts[lab] if counts[lab] else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        lines.append((lab, precision, recall, f1, counts[lab]))

    macro_prec = sum(item[1] for item in lines) / len(lines)
    macro_rec = sum(item[2] for item in lines) / len(lines)
    macro_f1 = sum(item[3] for item in lines) / len(lines)

    report = "label\tprecision\trecall\tf1\tsupport\n"
    for lab, prec, rec, f1, sup in lines:
        report += f"{lab}\t{prec:.3f}\t{rec:.3f}\t{f1:.3f}\t{sup}\n"
    report += f"macro\t{macro_prec:.3f}\t{macro_rec:.3f}\t{macro_f1:.3f}\t{sum(counts.values())}\n"
    return report



In [None]:
# Vectorise, split, and evaluate
embedder = build_embedder()
vectors = [embedder.sentence_vector(row["text"]) for row in corpus]
labels = [row["label"] for row in corpus]

train_rows, test_rows = split_dataset(list(zip(corpus, vectors, labels)), test_ratio=0.2)
train_vectors = [vec for _, vec, _ in train_rows]
train_labels = [lab for _, _, lab in train_rows]

centroids = train_centroids(train_vectors, train_labels)

preds: List[str] = []
golds: List[str] = []
errors: List[Dict[str, str]] = []
for (row, vec, gold) in test_rows:
    pred = predict(vec, centroids)
    preds.append(pred)
    golds.append(gold)
    if pred != gold:
        errors.append({"text": row["text"], "gold": gold, "pred": pred})

accuracy = sum(1 for g, p in zip(golds, preds) if g == p) / len(golds)
print(f"Test accuracy: {accuracy:.3f}")
print(classification_report(golds, preds))

print("Sample misclassifications:")
for err in errors[:5]:
    print(f"- {err['gold']} → {err['pred']} | {err['text'][:120]}")


## Findings and manual error analysis

* **Accuracy drops on noisy Kazakh social media.** The hashed fastText-style embeddings struggle compared with the cleaner Wikipedia slices, especially when the evaluation mixes ten languages spanning Latin, Cyrillic, and Arabic scripts.
* **Error patterns reflect script overlap and short messages.** Misclassifications cluster around very short Kazakh posts (often just a noun phrase) and named entities across the Wikipedia samples, where Cyrillic fragments can be confused with Latin-script languages.
* **No dependency on external downloads.** The fallback embedder preserves the fastText subword intuition but demonstrates that relying solely on pretrained Wikipedia vectors is risky for Topic 1; character n-gram baselines remain more robust for noisy, under-resourced languages.


## Next steps

* Swap in real pretrained fastText `.bin` vectors when connectivity allows to validate the hypothesis with stronger embeddings.
* Expand the OOD portion with more social-media corpora (e.g., Yoruba, Wolof, or Swedish tweets) to stress-test robustness.
* Compare against character n-gram TF-IDF to quantify how much contextual or domain adaptation is needed for reliable language ID.
