# Language Confidence Score

Cel: mając tylko `word_counts` (słownik `word -> count`) oraz listę najczęstszych słów w języku (`word -> frequency`), wyznaczyć wynik dopasowania tekstu do języka.

Wymagane dane do eksperymentu (5 tekstów):
- `wiki_long` (z wiki, 5000+ słów)
- `wiki_short_bad` (z wiki, 20+ słów, jak najgorzej dopasowany do języka wiki)
- `ext_<lang>` (dłuższy tekst spoza wiki dla każdego z 3 języków)

Wymagane języki: 3 (język wybranej wiki + 2 inne).

Źródło listy najczęstszych słów: `wordfreq` (min. 1000 słów na język).


In [None]:
import sys
from pathlib import Path

# Walk upwards from the current working directory and pick the first
# directory that contains the `wiki_scraper` package (a folder with an
# `__init__.py` inside it).
_cwd = Path.cwd().resolve()
_root = next(
    (
        _p
        for _p in (_cwd, *_cwd.parents)
        if (_p / 'wiki_scraper').is_dir() and (_p / 'wiki_scraper' / '__init__.py').exists()
    ),
    None,
)
if _root is None:
    raise RuntimeError('Could not find project root containing wiki_scraper/')

# Put the project root first on the import path so `wiki_scraper` can be
# imported from this notebook; skipped when it is already listed.
_root_str = str(_root)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)

# All input files and generated charts live under <root>/data.
DATA_DIR = _root / 'data'
print(f'cwd: {_cwd}')
print(f'root: {_root}')
print(f'DATA_DIR: {DATA_DIR}')


In [None]:
import json
import math
from collections import Counter
from pathlib import Path

import pandas as pd

from wiki_scraper.words import tokenize_words


## Konfiguracja

1. Pliki:
- `data/wiki_long.json`
- `data/wiki_short_bad.json`

2. Teksty zewnętrzne:
- `data/ext_en.txt`
- `data/ext_pl.txt`
- `data/ext_de.txt`

Języki w tym notatniku: `en`, `pl`, `de`


In [None]:
# Language codes compared in this notebook and the top-k cut-offs that the
# score is evaluated for.
LANGUAGES = ['en', 'pl', 'de']
K_VALUES = [3, 10, 100, 1000]

# JSON inputs for the two wiki texts.
PATH_WIKI_LONG = DATA_DIR / 'wiki_long.json'
PATH_WIKI_SHORT_BAD = DATA_DIR / 'wiki_short_bad.json'

# One external plain-text file per language: data/ext_<lang>.txt
PATH_EXT = {code: DATA_DIR / f'ext_{code}.txt' for code in LANGUAGES}

# Show which of the expected input files are present.
print(f'wiki_long exists: {PATH_WIKI_LONG.exists()}')
print(f'wiki_short_bad exists: {PATH_WIKI_SHORT_BAD.exists()}')
for code, ext_path in PATH_EXT.items():
    print(f'ext {code} -> {ext_path} exists: {ext_path.exists()}')


## Ładowanie danych


In [None]:
def load_word_counts_json(path: Path) -> dict[str, int]:
    """Load a ``word -> count`` mapping from a UTF-8 JSON file.

    Entries whose key is not a string or whose value is not an integer are
    skipped. JSON booleans are skipped as well: ``bool`` is a subclass of
    ``int`` in Python, so a plain ``isinstance(value, int)`` check would
    accept ``true``/``false`` as word counts.

    Args:
        path: Location of the JSON file.

    Returns:
        A new dict containing only the valid ``str -> int`` entries, in the
        order they appear in the file.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the top-level JSON value is not an object. Malformed
            JSON raises ``json.JSONDecodeError``, a ``ValueError`` subclass.
    """
    if not path.exists():
        raise FileNotFoundError(f'Missing file: {path}. Create it from wiki_scraper.py --count-words.')
    data = json.loads(path.read_text(encoding='utf-8'))
    if not isinstance(data, dict):
        raise ValueError(f'Invalid JSON object in {path}')
    return {
        word: count
        for word, count in data.items()
        # bool must be excluded explicitly because isinstance(True, int) is True.
        if isinstance(word, str) and isinstance(count, int) and not isinstance(count, bool)
    }

def counts_from_text_file(path: Path) -> dict[str, int]:
    """Count token occurrences in a UTF-8 text file.

    The file is decoded with ``errors='replace'``, so undecodable bytes do
    not raise. The text is split with ``tokenize_words`` and the tokens are
    tallied into a plain dict.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if path.exists():
        raw_text = path.read_text(encoding='utf-8', errors='replace')
        tally = Counter()
        tally.update(tokenize_words(raw_text))
        return dict(tally)
    raise FileNotFoundError(f'Missing file: {path}. Provide a longer external text.')

def total_words(counts: dict[str, int]) -> int:
    """Return the sum of all counts in ``counts`` as an ``int``."""
    running_total = 0
    for occurrences in counts.values():
        running_total += occurrences
    return int(running_total)

# Word counts of the two wiki texts, read from their JSON files.
wiki_long = load_word_counts_json(PATH_WIKI_LONG)
wiki_short_bad = load_word_counts_json(PATH_WIKI_SHORT_BAD)

# Word counts of the external texts, one per language code.
ext_counts: dict[str, dict[str, int]] = {
    lang: counts_from_text_file(PATH_EXT[lang]) for lang in LANGUAGES
}

# Every text used in the experiment, keyed by dataset name; the external
# texts are registered as 'ext_<lang>'.
datasets: dict[str, dict[str, int]] = {
    'wiki_long': wiki_long,
    'wiki_short_bad': wiki_short_bad,
    **{f'ext_{lang}': wc for lang, wc in ext_counts.items()},
}

# Size overview of each dataset (displayed as the cell output).
summary_rows = [
    {'dataset': name, 'total_words': total_words(wc), 'unique_words': len(wc)}
    for name, wc in datasets.items()
]
pd.DataFrame(summary_rows).sort_values('dataset')


In [None]:
# Minimum total word count each wiki dataset must reach, with the message
# reported when it does not.
for _name, _minimum, _message in (
    ('wiki_long', 5000, 'wiki_long must be 5000+ words'),
    ('wiki_short_bad', 20, 'wiki_short_bad must be 20+ words'),
):
    assert total_words(datasets[_name]) >= _minimum, _message
'OK'


## Dane językowe (wordfreq)


In [None]:
from wordfreq import top_n_list, word_frequency

def get_language_frequency_list(language_code: str, n: int) -> list[tuple[str, float]]:
    """Return ``(word, frequency)`` pairs for a language, highest frequency first.

    The words are those returned by ``top_n_list(language_code, n)``; each
    is paired with ``word_frequency(word, language_code)`` converted to
    ``float``. Pairs with equal frequency keep the order ``top_n_list``
    gave them, because the sort is stable.
    """
    scored = (
        (word, float(word_frequency(word, language_code)))
        for word in top_n_list(language_code, n)
    )
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# Number of words requested per language; the largest k evaluated later
# is sliced from these lists.
LANG_LIST_SIZE = 5000

# Frequency-sorted (word, frequency) list for every language code.
language_lists: dict[str, list[tuple[str, float]]] = {}
for lang in LANGUAGES:
    language_lists[lang] = get_language_frequency_list(lang, LANG_LIST_SIZE)

# Cell output: how many entries were obtained per language.
{code: len(entries) for code, entries in language_lists.items()}


## Funkcja lang_confidence_score

Podobieństwo cosinusowe (cosine similarity) między rozkładem słów tekstu a rozkładem słów języka (top-k); oba wektory są znormalizowane i ograniczone do wspólnego słownika (top-k słów danego języka).


In [None]:
def lang_confidence_score(
    word_counts: dict[str, int],
    language_words_with_frequency: list[tuple[str, float]],
) -> float:
    """Score how closely a text's word distribution matches a language.

    The score is the cosine similarity between two vectors indexed by the
    words of ``language_words_with_frequency``: the text's counts for those
    words and the language frequencies, each divided by its own sum. Words
    of the text that are not in the language list are ignored.

    Args:
        word_counts: Mapping ``word -> number of occurrences`` in the text.
        language_words_with_frequency: ``(word, frequency)`` pairs for the
            language. Pairs with an empty word are skipped. If a word is
            listed more than once it is treated as a single entry whose
            frequency is the last one listed.

    Returns:
        The cosine similarity as a float, or ``0.0`` when the language list
        is empty, its frequencies do not sum to a positive value, or the
        text contains none of the listed words.
    """
    if not language_words_with_frequency:
        return 0.0

    # Collapse the list into a dict so each word is exactly one vector
    # dimension. Iterating the raw list instead would add a repeated word
    # twice to the text total, the dot product and both norms.
    lang_freq = {w: float(f) for (w, f) in language_words_with_frequency if w}
    lang_sum = sum(lang_freq.values())
    if lang_sum <= 0:
        return 0.0

    # Text counts restricted to the language vocabulary; only positive
    # counts are kept.
    text_raw: dict[str, int] = {}
    for w in lang_freq:
        c = int(word_counts.get(w, 0))
        if c > 0:
            text_raw[w] = c
    text_sum = sum(text_raw.values())
    if text_sum <= 0:
        return 0.0

    dot = 0.0
    a2 = 0.0
    b2 = 0.0
    for w, f in lang_freq.items():
        a = text_raw.get(w, 0) / text_sum  # share of the word in the text
        b = f / lang_sum  # share of the word in the language list
        dot += a * b
        a2 += a * a
        b2 += b * b
    denom = math.sqrt(a2) * math.sqrt(b2)
    return 0.0 if denom <= 0 else (dot / denom)


## Wyniki dla k = 3, 10, 100, 1000


In [None]:
# One record per (k, language, dataset): the score of the dataset against
# the k most frequent words of that language.
rows = []
for k in K_VALUES:
    for lang in LANGUAGES:
        topk = language_lists[lang][:k]
        rows.extend(
            {
                'k': k,
                'language': lang,
                'dataset': dataset_name,
                'score': lang_confidence_score(wc, topk),
            }
            for dataset_name, wc in datasets.items()
        )

results = pd.DataFrame(rows)

# Cell output: the first 20 rows ordered by dataset, then k, then language.
results.sort_values(['dataset', 'k', 'language']).head(20)


In [None]:
# Wide view of the scores: one row per (dataset, k), one column per language.
pivot = results.pivot_table(
    values='score',
    index=['dataset', 'k'],
    columns='language',
)
pivot


## Wykresy


In [None]:
import matplotlib.pyplot as plt

# Charts are written next to the input data, under data/charts.
charts_dir = DATA_DIR / 'charts'
charts_dir.mkdir(parents=True, exist_ok=True)

# One figure per dataset: score as a function of k, one line per language.
for dataset_name in sorted(datasets):
    per_dataset = results.loc[results['dataset'] == dataset_name]
    fig, ax = plt.subplots(figsize=(10, 4.5))
    for lang in LANGUAGES:
        curve = per_dataset.loc[per_dataset['language'] == lang].sort_values('k')
        ax.plot(curve['k'], curve['score'], marker='o', label=lang)
    # The k values span several orders of magnitude, hence the log axis.
    ax.set_xscale('log')
    ax.set(
        title=f'lang_confidence_score vs k ({dataset_name})',
        xlabel='k (log scale)',
        ylabel='score',
    )
    ax.grid(alpha=0.25)
    ax.legend()

    # Save before showing, then close the figure explicitly.
    fig.savefig(charts_dir / f'{dataset_name}.png', dpi=200, bbox_inches='tight')
    plt.show()
    plt.close(fig)

print('saved charts to:', charts_dir)
