# WEEK 37


In [1]:
import math
from collections import Counter
from typing import List, Dict, Iterable

import pandas as pd

# NLTK
from nltk.tokenize import wordpunct_tokenize
from nltk.util import ngrams
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import MLE, Laplace, KneserNeyInterpolated,  WittenBellInterpolated


In [2]:
## K = 3

#DOWNLOAD DATASET

# Parquet file name of each split inside the Hugging Face dataset repository.
splits = {"train": "train.parquet", "validation": "validation.parquet"}

# Read both splits (train first, then validation) straight from the Hub.
df_train, df_val = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits[split_name])
    for split_name in ("train", "validation")
)

# Quick visual sanity check of the first rows of each split.
print(df_train.head(), df_val.head(), sep="\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


                                            question  \
0  উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...   
1           দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয় ?   
2  মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...   
3  আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...   
4          বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে ?   

                                             context lang  answerable  \
0  WikiLeaks () is an international non-profit or...   bn        True   
1  The war in Europe concluded with an invasion o...   bn        True   
2  Same-sex marriage in the United States expande...   bn       False   
3  The exact number of Arab casualties is unknown...   bn        True   
4  As Thomas Hall (2000) notes, "The Sung Empire ...   bn        True   

   answer_start        answer answer_inlang  
0           182          2006          None  
1            48       Germany          None  
2            -1            no          None  
3            39       unknown          N

In [3]:

# Dataframe column that holds the question text.
QUESTION_FIELD = "question"

# Dataframe column that holds the context passage.
CONTEXT_FIELD = "context"

# Dataframe column that holds each row's language code.
LANG_FIELD = "lang"

# Language codes to model. "en" is special-cased downstream: it reads CONTEXT_FIELD
# from every row instead of filtering QUESTION_FIELD by language.
LANGUAGES = ["ar", "ko", "te", "en"]

# n-gram orders to train (bigram and trigram).
N_ORDERS = [2, 3]

# Minimum training-set frequency a token needs to stay in the vocabulary (passed to build_vocab).
MIN_FREQ = 3

# Smoothing names handed to train_lm: Laplace, Kneser-Ney ("kn"), Witten-Bell ("wb").
SMOOTHINGS= ["laplace", "kn", "wb"]


In [None]:
import re
import nltk
# Fetch the tokenizer resources NLTK needs (a no-op when they are already up to date).
nltk.download('punkt')
nltk.download('punkt_tab')  # presumably the resource newer NLTK releases load for word_tokenize — TODO confirm
from collections import Counter
from typing import List, Dict

from camel_tools.tokenizers.word import simple_word_tokenize      # Arabic
from konlpy.tag import Okt                                        # Korean
from indicnlp.tokenize import indic_tokenize                      # Telugu

# One shared Okt instance; get_tokenizer returns its `morphs` method for Korean.
okt = Okt()

def get_tokenizer(lang: str):
    """Pick the tokenizer callable for a language code.

    Args:
        lang: Language code ("ar", "ko", "te", "en", or anything else).

    Returns:
        A callable that takes a string and yields its tokens:
        ``simple_word_tokenize`` for Arabic, ``okt.morphs`` for Korean, a
        list-returning wrapper around ``indic_tokenize.trivial_tokenize`` for
        Telugu, and ``nltk.word_tokenize`` for English and every other code.
    """
    if lang == "ar":
        return simple_word_tokenize
    if lang == "ko":
        return okt.morphs
    if lang == "te":
        # trivial_tokenize's result is materialised into a list.
        return lambda text: list(indic_tokenize.trivial_tokenize(text))
    # "en" and any unrecognised code share the NLTK word tokenizer.
    return nltk.word_tokenize

def tokenize_column(series, lang: str):
    """Lower-case and tokenize every entry of ``series``.

    Args:
        series: Iterable of texts. Non-string entries are coerced first:
            ``None`` becomes ``""``, anything else goes through ``str()``.
        lang: Language code used to select the tokenizer via ``get_tokenizer``.

    Returns:
        A list with one tokenizer result per entry, in input order.
    """
    tokenize = get_tokenizer(lang)

    def _as_text(value):
        # Strings pass through untouched; everything else is stringified.
        if isinstance(value, str):
            return value
        return "" if value is None else str(value)

    return [tokenize(_as_text(entry).lower()) for entry in series]

def build_vocab(tokenized: List[List[str]], min_freq: int = 1) -> Dict[str, int]:
    """Build a token -> id vocabulary from tokenized sentences.

    Tokens are kept when they occur at least ``min_freq`` times. Ids are
    assigned contiguously (0..k-1) in first-occurrence order, and the
    out-of-vocabulary marker ``"<OOV>"`` always receives the last id ``k``.

    Args:
        tokenized: List of sentences, each a list of token strings.
        min_freq: Minimum corpus frequency for a token to be kept.

    Returns:
        Dict mapping each kept token, plus ``"<OOV>"``, to a unique integer id.
    """
    counts = Counter(token for sentence in tokenized for token in sentence)
    # Filter BEFORE numbering. Numbering first leaves gaps in the id space, and
    # then "<OOV>" = len(vocab) can collide with the id of a surviving token.
    # A literal "<OOV>" token in the data is skipped here so the marker gets
    # exactly one id, the final one.
    kept = [tok for tok, freq in counts.items() if freq >= min_freq and tok != "<OOV>"]
    vocab = {tok: idx for idx, tok in enumerate(kept)}
    vocab["<OOV>"] = len(vocab)
    return vocab

def apply_oov(tokenized: List[List[str]], vocab: Dict[str, int]) -> List[List[str]]:
    """Replace every token that is not a key of ``vocab`` with ``"<OOV>"``.

    Args:
        tokenized: List of sentences, each a list of token strings.
        vocab: Vocabulary mapping; only key membership is consulted.

    Returns:
        A new list of sentences with the same shape as the input.
    """
    mapped = []
    for sentence in tokenized:
        replaced = []
        for token in sentence:
            replaced.append(token if token in vocab else "<OOV>")
        mapped.append(replaced)
    return mapped


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
def train_lm(tokenized: List[List[str]], order: int = 2, smoothing: str = "laplace"):
    """Fit an NLTK n-gram language model on tokenized sentences.

    Args:
        tokenized: Training sentences, each a list of token strings.
        order: n-gram order of the model.
        smoothing: Case-insensitive smoothing name. ``""``/``None``/``"mle"``
            selects MLE; ``"laplace"``/``"addone"``/``"add-one"`` selects
            Laplace; ``"kn"``/``"kneserney"``/``"kneser-ney"`` selects
            interpolated Kneser-Ney; ``"wb"`` selects interpolated Witten-Bell.

    Returns:
        The fitted model instance.

    Raises:
        ValueError: If ``smoothing`` is not one of the names above.
    """
    smoothing = smoothing.lower() if smoothing else ""

    # Resolve the model class first; it is instantiated once below.
    if smoothing in ("", "mle"):
        lm_class = MLE
    elif smoothing in ("laplace", "addone", "add-one"):
        lm_class = Laplace
    elif smoothing in ("kn", "kneserney", "kneser-ney"):
        lm_class = KneserNeyInterpolated
    elif smoothing == "wb":
        lm_class = WittenBellInterpolated
    else:
        raise ValueError(f"Unsupported smoothing: {smoothing}")

    language_model = lm_class(order)
    ngram_stream, padded_vocab = padded_everygram_pipeline(order, tokenized)
    language_model.fit(ngram_stream, padded_vocab)
    return language_model


In [6]:
def corpus_perplexity(model, tokenized: List[List[str]], order: int) -> dict:
    """Score a tokenized corpus with an n-gram model.

    Each sentence is padded on both ends for ``order`` and every n-gram of
    that order is scored as P(last token | preceding tokens) through
    ``model.score``. Non-positive probabilities are floored to 1e-12 so the
    logarithm stays finite.

    Args:
        model: Object exposing ``score(word, context)``.
        tokenized: Sentences to evaluate, each a list of token strings.
        order: n-gram order used for padding and scoring.

    Returns:
        Dict with ``"cross_entropy"`` (mean negative natural-log probability),
        ``"perplexity"`` (``exp`` of the cross-entropy) and
        ``"tokens_counted"`` (number of n-grams scored).
    """
    log_prob_sum = 0.0
    n_scored = 0
    for sentence in tokenized:
        padded_sentence = list(pad_both_ends(sentence, n=order))
        for gram in ngrams(padded_sentence, order):
            history, target = gram[:-1], gram[-1]
            prob = model.score(target, history)
            # Floor zero/negative probabilities so log() is defined.
            log_prob_sum += math.log(1e-12 if prob <= 0.0 else prob)
            n_scored += 1

    # max(..., 1) avoids a division by zero on an empty corpus.
    cross_entropy = -log_prob_sum / max(n_scored, 1)
    return {
        "cross_entropy": cross_entropy,
        "perplexity": math.exp(cross_entropy),
        "tokens_counted": n_scored,
    }


In [7]:
# Accumulates one metrics record per (language, n-gram order, smoothing) run.
results = []

for lang in LANGUAGES:
    # pick the text columns
    if lang == "en":
        # English uses the context passages of ALL rows (no language filter).
        tr_series = df_train[CONTEXT_FIELD].dropna().astype(str)
        va_series = df_val[CONTEXT_FIELD].dropna().astype(str)
        # exclude KN for English
        smoothings_here = [s for s in SMOOTHINGS
                           if s.lower() not in {"kn"}]
    else:
        # Other languages use the questions of the rows tagged with that language.
        tr_series = df_train[df_train[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str)
        va_series = df_val[df_val[LANG_FIELD] == lang][QUESTION_FIELD].dropna().astype(str)
        # use all listed smoothings
        smoothings_here = SMOOTHINGS[:]

    print(f"\n Language: {lang} | train={len(tr_series)} | valid={len(va_series)}")

    # tokenize
    tr_tok = tokenize_column(tr_series, lang)
    va_tok = tokenize_column(va_series, lang)

    # Build the vocabulary from training tokens only, then map out-of-vocabulary
    # tokens in both splits to "<OOV>" (the marker build_vocab/apply_oov use).
    vocab     = build_vocab(tr_tok, min_freq=MIN_FREQ)
    tr_tok_u  = apply_oov(tr_tok, vocab)
    va_tok_u  = apply_oov(va_tok, vocab)

    # Train and evaluate n-gram LMs
    for n in N_ORDERS:
        for s in smoothings_here:
            lm = train_lm(tr_tok_u, order=n, smoothing=s)
            metrics = corpus_perplexity(lm, va_tok_u, order=n)
            # vocab_size counts the "<OOV>" entry as well (it is a key of `vocab`).
            results.append({
                "language": lang,
                "vocab_size" : len(vocab),
                "order": n,
                "smoothing": s,
                **metrics
            })
            print(f"n={n} {s}: PPL={round(metrics['perplexity'],2)} CE={round(metrics['cross_entropy'],4)} tokens={metrics['tokens_counted']}")



 Language: ar , train=2558 , valid=415


NameError: name 'tokenize_column' is not defined

In [None]:
# One row per run collected in `results`, ordered by language and then n-gram order.
res_df = pd.DataFrame(results)
res_df = res_df.sort_values(by=["language", "order"])
res_df


Unnamed: 0,language,vocab_size,order,smoothing,cross_entropy,perplexity,tokens_counted
0,ar,965,2,laplace,3.542808,34.563823,3457
1,ar,965,2,kn,2.582526,13.230522,3457
2,ar,965,2,wb,2.55188,12.831205,3457
3,ar,965,3,laplace,4.00228,54.722794,3872
4,ar,965,3,kn,2.48778,12.034535,3872
5,ar,965,3,wb,2.280074,9.777403,3872
18,en,29119,2,laplace,7.319152,1508.923553,353455
19,en,29119,2,wb,5.346028,209.773333,353455
20,en,29119,3,laplace,8.995849,8069.521425,356466
21,en,29119,3,wb,4.988482,146.713584,356466


### Exploratory neural model: LSTM language model on BPEmb subword embeddings

In [None]:
# If needed (Colab/new env), uncomment:
# !pip install -q bpemb torch

import math, re, random
from typing import List, Dict, Tuple

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from bpemb import BPEmb


In [None]:
# Seed used by set_seed() and by the English row sampling in get_series().
SEED      = 42
# Run on the GPU when one is visible to torch, otherwise on the CPU.
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"

# Training knobs (start small, raise them once the pipeline works).
EPOCHS    = 10      # training epochs; also T_max of the cosine LR schedule
BATCH     = 128     # sequences per batch
SEQ_LEN   = 120     # subword ids per training sequence
EMB_DIM   = 100     # embedding width; BP_DIM below is tied to it
HIDDEN    = 256     # LSTM hidden size
LAYERS    = 2       # number of stacked LSTM layers
DROPOUT   = 0.2     # dropout inside the LSTM and before the output layer
LR        = 2e-3    # AdamW learning rate
CLIP      = 1.0     # max gradient norm passed to clip_grad_norm_

# BPEmb options.
# True: load a separate BPEmb model per language (ar/ko/te/en); False: load the "multi" model.
BPE_PER_LANGUAGE = True
BP_VS   = 5000    # BPEmb subword vocabulary size
BP_DIM  = EMB_DIM   # BPEmb vector dimension, kept equal to EMB_DIM

# Light text normalization applied by normalize_text().
LOWERCASE    = True
FOLD_NUMBERS = True   # replace each digit run with "<num>"

# Throttle the English contexts (they are huge).
EN_MAX_ROWS_TRAIN    = 30000
EN_MAX_ROWS_VALID    = 5000
EN_MAX_TOK_PER_DOC   = 400 # keep at most this many subword pieces per doc (for speed)
FREEZE_EMBEDDINGS    = True # True keeps the pretrained embeddings fixed; False fine-tunes them


In [None]:
def normalize_text(s: str) -> str:
    """Apply the notebook's light text normalization.

    Non-strings are coerced (``None`` -> ``""``, otherwise ``str()``). The text
    is lower-cased when ``LOWERCASE`` is set, digit runs become ``"<num>"`` when
    ``FOLD_NUMBERS`` is set, and whitespace runs are always collapsed to a
    single space with leading/trailing whitespace removed.

    Args:
        s: Raw text (or any object to be stringified).

    Returns:
        The normalized string.
    """
    text = s if isinstance(s, str) else ("" if s is None else str(s))
    text = text.lower() if LOWERCASE else text
    text = re.sub(r"\d+", "<num>", text) if FOLD_NUMBERS else text
    return re.sub(r"\s+", " ", text).strip()

def get_series(lang):
    """Return normalized train/validation text series for one language.

    For ``"en"`` the context passages of all rows are used, each split is
    down-sampled (seeded with ``SEED``) when it exceeds its row budget, and a
    per-document token cap is returned. For any other code the questions of
    the rows tagged with that language are used and no cap is returned.

    Args:
        lang: Language code.

    Returns:
        Tuple ``(train_series, valid_series, max_tokens_per_doc_or_None)``.
    """
    if lang != "en":
        train_rows = df_train[df_train[LANG_FIELD] == lang]
        train_q = train_rows[QUESTION_FIELD].dropna().astype(str)
        valid_rows = df_val[df_val[LANG_FIELD] == lang]
        valid_q = valid_rows[QUESTION_FIELD].dropna().astype(str)
        return train_q.apply(normalize_text), valid_q.apply(normalize_text), None

    train_ctx = df_train[CONTEXT_FIELD].dropna().astype(str)
    valid_ctx = df_val[CONTEXT_FIELD].dropna().astype(str)
    # Sample only when a split is larger than its configured budget.
    if len(train_ctx) > EN_MAX_ROWS_TRAIN:
        train_ctx = train_ctx.sample(EN_MAX_ROWS_TRAIN, random_state=SEED)
    if len(valid_ctx) > EN_MAX_ROWS_VALID:
        valid_ctx = valid_ctx.sample(EN_MAX_ROWS_VALID, random_state=SEED)
    return train_ctx.apply(normalize_text), valid_ctx.apply(normalize_text), EN_MAX_TOK_PER_DOC


In [None]:
# NOTE(review): this cell repeats the normalize_text/get_series definitions of the
# preceding cell; whichever cell runs last is the one in effect.
def normalize_text(s: str) -> str:
    """Apply the notebook's light text normalization.

    Non-strings are coerced (``None`` -> ``""``, otherwise ``str()``). The text
    is lower-cased when ``LOWERCASE`` is set, digit runs become ``"<num>"`` when
    ``FOLD_NUMBERS`` is set, and whitespace runs are always collapsed to a
    single space with leading/trailing whitespace removed.

    Args:
        s: Raw text (or any object to be stringified).

    Returns:
        The normalized string.
    """
    text = s if isinstance(s, str) else ("" if s is None else str(s))
    text = text.lower() if LOWERCASE else text
    text = re.sub(r"\d+", "<num>", text) if FOLD_NUMBERS else text
    return re.sub(r"\s+", " ", text).strip()

def get_series(lang):
    """Return normalized train/validation text series for one language.

    For ``"en"`` the context passages of all rows are used, each split is
    down-sampled (seeded with ``SEED``) when it exceeds its row budget, and a
    per-document token cap is returned. For any other code the questions of
    the rows tagged with that language are used and no cap is returned.

    Args:
        lang: Language code.

    Returns:
        Tuple ``(train_series, valid_series, max_tokens_per_doc_or_None)``.
    """
    if lang != "en":
        train_rows = df_train[df_train[LANG_FIELD] == lang]
        train_q = train_rows[QUESTION_FIELD].dropna().astype(str)
        valid_rows = df_val[df_val[LANG_FIELD] == lang]
        valid_q = valid_rows[QUESTION_FIELD].dropna().astype(str)
        return train_q.apply(normalize_text), valid_q.apply(normalize_text), None

    train_ctx = df_train[CONTEXT_FIELD].dropna().astype(str)
    valid_ctx = df_val[CONTEXT_FIELD].dropna().astype(str)
    # Sample only when a split is larger than its configured budget.
    if len(train_ctx) > EN_MAX_ROWS_TRAIN:
        train_ctx = train_ctx.sample(EN_MAX_ROWS_TRAIN, random_state=SEED)
    if len(valid_ctx) > EN_MAX_ROWS_VALID:
        valid_ctx = valid_ctx.sample(EN_MAX_ROWS_VALID, random_state=SEED)
    return train_ctx.apply(normalize_text), valid_ctx.apply(normalize_text), EN_MAX_TOK_PER_DOC


In [None]:
# Loaded BPEmb models, keyed by "<lang>:<vocab size>:<dim>".
_bp_cache = {}

def load_bpemb_for_lang(lang: str):
    """Return a cached BPEmb model for ``lang``.

    When ``BPE_PER_LANGUAGE`` is false the ``"multi"`` model is loaded
    regardless of ``lang``. Each distinct (language, ``BP_VS``, ``BP_DIM``)
    combination is constructed once and then reused from ``_bp_cache``.

    Args:
        lang: Language code requested by the caller.

    Returns:
        The ``BPEmb`` instance for the resolved language.
    """
    model_lang = lang if BPE_PER_LANGUAGE else "multi"
    cache_key = f"{model_lang}:{BP_VS}:{BP_DIM}"
    if cache_key not in _bp_cache:
        _bp_cache[cache_key] = BPEmb(lang=model_lang, vs=BP_VS, dim=BP_DIM)
    return _bp_cache[cache_key]

def build_embedding_matrix(bp: BPEmb):
    """Extend the BPEmb vectors with two zero rows for PAD and EOS.

    Args:
        bp: Loaded BPEmb model; ``bp.emb.vectors`` supplies the pretrained rows.

    Returns:
        Tuple ``(matrix, pad_id, eos_id)`` where ``matrix`` is a float32
        tensor made of the pretrained rows followed by the PAD row and then
        the EOS row, and the ids are the row indices of those two extras.
    """
    pretrained = bp.emb.vectors
    n_pretrained = pretrained.shape[0]
    # Row n_pretrained is PAD, row n_pretrained + 1 is EOS; both start as zeros.
    special_rows = np.zeros((2, BP_DIM), dtype=np.float32)
    full = np.concatenate([pretrained, special_rows], axis=0).astype(np.float32)
    return torch.tensor(full), n_pretrained, n_pretrained + 1


In [None]:
def encode_docs_to_stream(bp: BPEmb, docs: List[str], eos_id: int, max_len_per_doc: int = None):
    """Encode documents into one flat stream of subword ids.

    Each document is encoded with ``bp.encode_ids``, optionally truncated to
    ``max_len_per_doc`` ids, and followed by ``eos_id``.

    Args:
        bp: BPEmb model used for encoding.
        docs: Documents to encode, in order.
        eos_id: Id appended after every document.
        max_len_per_doc: Optional cap on ids kept per document; a falsy value
            (``None`` or 0) disables truncation.

    Returns:
        1-D ``np.int64`` array containing all documents back to back.
    """
    def _doc_ids(doc):
        pieces = bp.encode_ids(doc)  # list of subword ids
        if max_len_per_doc:
            pieces = pieces[:max_len_per_doc]
        return pieces + [eos_id]

    stream = [piece for doc in docs for piece in _doc_ids(doc)]
    return np.array(stream, dtype=np.int64)

class WPStreamDataset(Dataset):
    """Fixed-length next-token windows over a flat id stream.

    Item ``i`` is the pair ``(x, y)`` where ``x`` covers ids
    ``[i*seq_len, (i+1)*seq_len)`` and ``y`` is the same window shifted one
    position to the right.
    """

    def __init__(self, id_stream: np.ndarray, seq_len: int):
        self.data = torch.tensor(id_stream, dtype=torch.long)
        self.seq_len = seq_len
        # One id is held back so the last window still has a full shifted target.
        usable = len(self.data) - 1
        self.num_seq = max(0, usable // seq_len)

    def __len__(self):
        return self.num_seq

    def __getitem__(self, i):
        start = i * self.seq_len
        stop = start + self.seq_len
        return self.data[start:stop], self.data[start + 1:stop + 1]

def make_loaders(id_train: np.ndarray, id_valid: np.ndarray, seq_len: int, batch: int):
    """Wrap the train/validation id streams in DataLoaders.

    Args:
        id_train: Flat id stream for training.
        id_valid: Flat id stream for validation.
        seq_len: Window length handed to ``WPStreamDataset``.
        batch: Batch size for both loaders.

    Returns:
        Tuple ``(train_loader, valid_loader)``. The training loader shuffles
        and drops the last incomplete batch; the validation loader does neither.
    """
    train_loader = DataLoader(
        WPStreamDataset(id_train, seq_len),
        batch_size=batch,
        shuffle=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        WPStreamDataset(id_valid, seq_len),
        batch_size=batch,
        shuffle=False,
        drop_last=False,
    )
    return train_loader, valid_loader


In [None]:
# NOTE(review): this cell repeats the encode_docs_to_stream / WPStreamDataset /
# make_loaders definitions of the preceding cell; the cell run last is in effect.
def encode_docs_to_stream(bp: BPEmb, docs: List[str], eos_id: int, max_len_per_doc: int = None):
    """Encode documents into one flat stream of subword ids.

    Each document is encoded with ``bp.encode_ids``, optionally truncated to
    ``max_len_per_doc`` ids, and followed by ``eos_id``.

    Args:
        bp: BPEmb model used for encoding.
        docs: Documents to encode, in order.
        eos_id: Id appended after every document.
        max_len_per_doc: Optional cap on ids kept per document; a falsy value
            (``None`` or 0) disables truncation.

    Returns:
        1-D ``np.int64`` array containing all documents back to back.
    """
    def _doc_ids(doc):
        pieces = bp.encode_ids(doc)  # list of subword ids
        if max_len_per_doc:
            pieces = pieces[:max_len_per_doc]
        return pieces + [eos_id]

    stream = [piece for doc in docs for piece in _doc_ids(doc)]
    return np.array(stream, dtype=np.int64)

class WPStreamDataset(Dataset):
    """Fixed-length next-token windows over a flat id stream.

    Item ``i`` is the pair ``(x, y)`` where ``x`` covers ids
    ``[i*seq_len, (i+1)*seq_len)`` and ``y`` is the same window shifted one
    position to the right.
    """

    def __init__(self, id_stream: np.ndarray, seq_len: int):
        self.data = torch.tensor(id_stream, dtype=torch.long)
        self.seq_len = seq_len
        # One id is held back so the last window still has a full shifted target.
        usable = len(self.data) - 1
        self.num_seq = max(0, usable // seq_len)

    def __len__(self):
        return self.num_seq

    def __getitem__(self, i):
        start = i * self.seq_len
        stop = start + self.seq_len
        return self.data[start:stop], self.data[start + 1:stop + 1]

def make_loaders(id_train: np.ndarray, id_valid: np.ndarray, seq_len: int, batch: int):
    """Wrap the train/validation id streams in DataLoaders.

    Args:
        id_train: Flat id stream for training.
        id_valid: Flat id stream for validation.
        seq_len: Window length handed to ``WPStreamDataset``.
        batch: Batch size for both loaders.

    Returns:
        Tuple ``(train_loader, valid_loader)``. The training loader shuffles
        and drops the last incomplete batch; the validation loader does neither.
    """
    train_loader = DataLoader(
        WPStreamDataset(id_train, seq_len),
        batch_size=batch,
        shuffle=True,
        drop_last=True,
    )
    valid_loader = DataLoader(
        WPStreamDataset(id_valid, seq_len),
        batch_size=batch,
        shuffle=False,
        drop_last=False,
    )
    return train_loader, valid_loader


In [None]:
class WPLSTMLM(nn.Module):
    """LSTM language model over subword ids with pretrained input embeddings.

    Args:
        emb_matrix: ``(vocab, dim)`` tensor of initial embedding weights.
        pad_id: Row index registered as the embedding's padding index.
        hidden: LSTM hidden size.
        layers: Number of stacked LSTM layers.
        dropout: Dropout rate used between LSTM layers and before the output layer.
        freeze: When true the embedding weights are not trained.
    """

    def __init__(self, emb_matrix: torch.Tensor, pad_id: int, hidden=256, layers=2, dropout=0.2, freeze=True):
        super().__init__()
        vocab_size, emb_dim = emb_matrix.shape
        # Sub-modules are created in a fixed order so seeded initialisation is reproducible.
        self.embed = nn.Embedding.from_pretrained(
            emb_matrix, freeze=freeze, padding_idx=pad_id
        )
        self.lstm = nn.LSTM(
            emb_dim, hidden, num_layers=layers, dropout=dropout, batch_first=True
        )
        self.drop = nn.Dropout(dropout)
        self.head = nn.Linear(hidden, vocab_size)

    def forward(self, x, h=None):
        """Return ``(logits, state)`` for a batch of id sequences.

        ``x`` is a batch-first tensor of ids, ``h`` an optional LSTM state; the
        logits carry one score per vocabulary row for every position.
        """
        embedded = self.embed(x)
        hidden_seq, state = self.lstm(embedded, h)
        logits = self.head(self.drop(hidden_seq))
        return logits, state


In [None]:
def set_seed(s=SEED):
    """Seed Python's ``random``, NumPy and torch (CPU and all CUDA devices) with ``s``."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(s)

@torch.no_grad()
def evaluate(model, loader):
    """Compute token-level cross-entropy and perplexity over a loader.

    The model is switched to eval mode; gradients are disabled by the decorator.

    Args:
        model: Callable returning ``(logits, state)`` for a batch of inputs.
        loader: Iterable of ``(inputs, targets)`` batches.

    Returns:
        Tuple ``(cross_entropy, perplexity, n_target_tokens)`` where the
        cross-entropy is the summed loss divided by the number of targets.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss(reduction="sum")
    loss_sum = 0.0
    n_targets = 0
    for inputs, targets in loader:
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)
        logits, _ = model(inputs)
        flat_logits = logits.reshape(-1, logits.size(-1))
        batch_loss = criterion(flat_logits, targets.reshape(-1))
        loss_sum += batch_loss.item()
        n_targets += targets.numel()

    # max(..., 1) keeps the division defined for an empty loader.
    cross_entropy = loss_sum / max(n_targets, 1)
    return cross_entropy, math.exp(cross_entropy), n_targets

def train_bpemb_lm(lang, train_series, valid_series, max_tok_per_doc=None):
    """Train the BPEmb-initialised LSTM language model for one language.

    Seeds the RNGs, loads the BPEmb model, encodes both splits into id
    streams, trains for ``EPOCHS`` epochs with AdamW, a cosine learning-rate
    schedule and gradient clipping, and evaluates on the validation split
    after every epoch (printing one progress line each time).

    Args:
        lang: Language code (also used as the log prefix).
        train_series: Iterable of training texts.
        valid_series: Iterable of validation texts.
        max_tok_per_doc: Optional per-document cap on subword ids.

    Returns:
        The lowest validation perplexity observed across the epochs.
    """
    set_seed()
    bpemb_lang = lang if BPE_PER_LANGUAGE else "multi"
    bp = load_bpemb_for_lang(bpemb_lang)
    embM, pad_id, eos_id = build_embedding_matrix(bp)

    train_ids = encode_docs_to_stream(bp, list(train_series), eos_id, max_tok_per_doc)
    valid_ids = encode_docs_to_stream(bp, list(valid_series), eos_id, max_tok_per_doc)
    tr_dl, va_dl = make_loaders(train_ids, valid_ids, SEQ_LEN, BATCH)

    model = WPLSTMLM(
        embM, pad_id, hidden=HIDDEN, layers=LAYERS, dropout=DROPOUT, freeze=FREEZE_EMBEDDINGS
    )
    model = model.to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)
    # Stateless mean-reduced loss, shared by every epoch.
    criterion = nn.CrossEntropyLoss()

    print(f"[{lang}] subword_vocab={embM.size(0)} train_batches={len(tr_dl)} valid_batches={len(va_dl)} device={DEVICE}")

    best_ppl = float("inf")
    for ep in range(1, EPOCHS+1):
        model.train()
        total, steps = 0.0, 0
        for inputs, targets in tr_dl:
            inputs = inputs.to(DEVICE)
            targets = targets.to(DEVICE)
            optimizer.zero_grad(set_to_none=True)
            logits, _ = model(inputs)
            batch_loss = criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
            batch_loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), CLIP)
            optimizer.step()
            total += batch_loss.item()
            steps += 1
        # The learning-rate schedule advances once per epoch.
        scheduler.step()
        ce, ppl, tok = evaluate(model, va_dl)
        if ppl < best_ppl:
            best_ppl = ppl
        print(f"epoch {str(ep).zfill(2)} train_ce={round(total/max(steps,1),4)} val_ce={round(ce,4)} val_ppl={round(ppl,2)} tokens={tok}")
    return best_ppl


In [None]:
# One record per language holding the best validation perplexity of the BPEmb LSTM.
bpemb_results = []
for lang in LANGUAGES:
    tr, va, max_len = get_series(lang)
    print(f"\n BPEmb LSTM on {lang}: train_rows={len(tr)} valid_rows={len(va)}")
    # The language code is passed through unchanged for every language.
    best_ppl = train_bpemb_lm(lang, tr, va, max_tok_per_doc=max_len)
    bpemb_results.append(dict(language=lang, model="bpemb_lstm", best_ppl=best_ppl))

bpemb_results



▶ BPEmb LSTM on ar: train_rows=2558 valid_rows=415
downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs5000.model


100%|██████████| 327600/327600 [00:00<00:00, 690565.45B/s]


downloading https://nlp.h-its.org/bpemb/ar/ar.wiki.bpe.vs5000.d100.w2v.bin.tar.gz


100%|██████████| 1891414/1891414 [00:00<00:00, 2023257.75B/s]


[ar] subword_vocab=5002 train_batches=2 valid_batches=1 device=cuda
  epoch 01  train_ce=8.4895  val_ce=8.3945  val_ppl=4422.85  tokens=5520
  epoch 02  train_ce=8.2661  val_ce=7.4366  val_ppl=1696.89  tokens=5520
  epoch 03  train_ce=7.1892  val_ce=6.6884  val_ppl=803.03  tokens=5520
  epoch 04  train_ce=6.5913  val_ce=6.4668  val_ppl=643.40  tokens=5520
  epoch 05  train_ce=6.4039  val_ce=6.4193  val_ppl=613.57  tokens=5520
  epoch 06  train_ce=6.3435  val_ce=6.4001  val_ppl=601.91  tokens=5520
  epoch 07  train_ce=6.3064  val_ce=6.3890  val_ppl=595.29  tokens=5520
  epoch 08  train_ce=6.2916  val_ce=6.3832  val_ppl=591.83  tokens=5520
  epoch 09  train_ce=6.2834  val_ce=6.3798  val_ppl=589.84  tokens=5520
  epoch 10  train_ce=6.2718  val_ce=6.3786  val_ppl=589.12  tokens=5520

▶ BPEmb LSTM on ko: train_rows=2422 valid_rows=356
downloading https://nlp.h-its.org/bpemb/ko/ko.wiki.bpe.vs5000.model


100%|██████████| 298782/298782 [00:00<00:00, 634951.68B/s]


downloading https://nlp.h-its.org/bpemb/ko/ko.wiki.bpe.vs5000.d100.w2v.bin.tar.gz


100%|██████████| 1875484/1875484 [00:00<00:00, 2009551.21B/s]


[ko] subword_vocab=5002 train_batches=2 valid_batches=1 device=cuda
  epoch 01  train_ce=8.4980  val_ce=8.3483  val_ppl=4223.04  tokens=6360
  epoch 02  train_ce=8.1097  val_ce=7.0354  val_ppl=1136.15  tokens=6360
  epoch 03  train_ce=6.7482  val_ce=6.0519  val_ppl=424.93  tokens=6360
  epoch 04  train_ce=5.9002  val_ce=5.5998  val_ppl=270.38  tokens=6360
  epoch 05  train_ce=5.5192  val_ce=5.4357  val_ppl=229.46  tokens=6360
  epoch 06  train_ce=5.3762  val_ce=5.4165  val_ppl=225.09  tokens=6360
  epoch 07  train_ce=5.3617  val_ce=5.4164  val_ppl=225.08  tokens=6360
  epoch 08  train_ce=5.3457  val_ce=5.4094  val_ppl=223.50  tokens=6360
  epoch 09  train_ce=5.3397  val_ce=5.4024  val_ppl=221.95  tokens=6360
  epoch 10  train_ce=5.3428  val_ce=5.3999  val_ppl=221.39  tokens=6360

▶ BPEmb LSTM on te: train_rows=1355 valid_rows=384
downloading https://nlp.h-its.org/bpemb/te/te.wiki.bpe.vs5000.model


100%|██████████| 348667/348667 [00:00<00:00, 741674.58B/s]


downloading https://nlp.h-its.org/bpemb/te/te.wiki.bpe.vs5000.d100.w2v.bin.tar.gz


100%|██████████| 1896297/1896297 [00:00<00:00, 2023109.37B/s]


[te] subword_vocab=5002 train_batches=1 valid_batches=1 device=cuda
  epoch 01  train_ce=8.5142  val_ce=8.4695  val_ppl=4766.94  tokens=4800
  epoch 02  train_ce=8.4707  val_ce=8.3996  val_ppl=4445.14  tokens=4800
  epoch 03  train_ce=8.3974  val_ce=8.2085  val_ppl=3672.07  tokens=4800
  epoch 04  train_ce=8.1985  val_ce=7.6921  val_ppl=2191.04  tokens=4800
  epoch 05  train_ce=7.6548  val_ce=7.2525  val_ppl=1411.62  tokens=4800
  epoch 06  train_ce=7.1784  val_ce=6.9869  val_ppl=1082.32  tokens=4800
  epoch 07  train_ce=6.8820  val_ce=6.8540  val_ppl=947.70  tokens=4800
  epoch 08  train_ce=6.7316  val_ce=6.7848  val_ppl=884.27  tokens=4800
  epoch 09  train_ce=6.6538  val_ce=6.7536  val_ppl=857.10  tokens=4800
  epoch 10  train_ce=6.6274  val_ce=6.7455  val_ppl=850.23  tokens=4800

▶ BPEmb LSTM on en: train_rows=15343 valid_rows=3011
downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs5000.model


100%|██████████| 315918/315918 [00:00<00:00, 679171.15B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs5000.d100.w2v.bin.tar.gz


100%|██████████| 1888515/1888515 [00:00<00:00, 1984814.10B/s]


[en] subword_vocab=5002 train_batches=173 valid_batches=35 device=cuda
  epoch 01  train_ce=6.4081  val_ce=5.9306  val_ppl=376.39  tokens=533520
  epoch 02  train_ce=5.7721  val_ce=5.5547  val_ppl=258.46  tokens=533520
  epoch 03  train_ce=5.4890  val_ce=5.3378  val_ppl=208.06  tokens=533520
  epoch 04  train_ce=5.3074  val_ce=5.1824  val_ppl=178.11  tokens=533520
  epoch 05  train_ce=5.1819  val_ce=5.0780  val_ppl=160.45  tokens=533520
  epoch 06  train_ce=5.0938  val_ce=5.0059  val_ppl=149.29  tokens=533520
  epoch 07  train_ce=5.0333  val_ce=4.9567  val_ppl=142.12  tokens=533520
  epoch 08  train_ce=4.9940  val_ce=4.9276  val_ppl=138.05  tokens=533520
  epoch 09  train_ce=4.9709  val_ce=4.9133  val_ppl=136.09  tokens=533520
  epoch 10  train_ce=4.9605  val_ce=4.9092  val_ppl=135.54  tokens=533520


[{'language': 'ar', 'model': 'bpemb_lstm', 'best_ppl': 589.1164641212491},
 {'language': 'ko', 'model': 'bpemb_lstm', 'best_ppl': 221.3915942631238},
 {'language': 'te', 'model': 'bpemb_lstm', 'best_ppl': 850.2307748852978},
 {'language': 'en', 'model': 'bpemb_lstm', 'best_ppl': 135.5374765242528}]