In [29]:
# ==== 0) Setup: imports, config ====
import os, io, gzip, math, random, unicodedata, urllib.request, zipfile
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# cc.* are larger and robust; you can switch to wiki.* if you prefer
EN_URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz"
HI_URL = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz"

# MUSE-style bilingual dictionary (one word pair per line: "english hindi")
DICT_URL = "https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt"

#DATA_DIR = "./data_xling"
#os.makedirs(DATA_DIR, exist_ok=True)

# Keep vocab sizes modest for speed/RAM; adjust if you want stronger results
MAX_VOCAB_EN = 120_000
MAX_VOCAB_HI = 120_000

# CSLS params
CSLS_K = 10
N_SRC_FOR_CSLS_R_T = 5000   # subsample of mapped source vectors to compute r_T

# Batch size for similarity computations
BATCH = 2048


In [30]:
# ==== 1) Download files  ====
import os, urllib.request

def download(url, out_path):
    if not os.path.exists(out_path):
        print(f"Downloading {url} -> {out_path}")
        urllib.request.urlretrieve(url, out_path)
    else:
        print(f"Found {out_path}")



BASE_DATA = "./data"
EXTERNAL_DIR = os.path.join(BASE_DATA, "external")
os.makedirs(EXTERNAL_DIR, exist_ok=True)

# Write downloads into ./data/external
en_path   = os.path.join(EXTERNAL_DIR, os.path.basename(EN_URL))
hi_path   = os.path.join(EXTERNAL_DIR, os.path.basename(HI_URL))
dict_path = os.path.join(EXTERNAL_DIR, os.path.basename(DICT_URL))

download(EN_URL, en_path)
download(HI_URL, hi_path)
download(DICT_URL, dict_path)

print("Saved to:", EXTERNAL_DIR)


Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz -> ./data/external/cc.en.300.vec.gz
Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz -> ./data/external/cc.hi.300.vec.gz
Downloading https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt -> ./data/external/en-hi.txt
Saved to: ./data/external


In [2]:
# ==== 1) Download files ====
'''
def download(url, out_path):
    if not os.path.exists(out_path):
        print(f"Downloading {url} -> {out_path}")
        urllib.request.urlretrieve(url, out_path)
    else:
        print(f"Found {out_path}")

en_path = os.path.join(DATA_DIR, os.path.basename(EN_URL))
hi_path = os.path.join(DATA_DIR, os.path.basename(HI_URL))
dict_path = os.path.join(DATA_DIR, os.path.basename(DICT_URL))

download(EN_URL, en_path)
download(HI_URL, hi_path)
download(DICT_URL, dict_path)
'''

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz -> ./data_xling/cc.en.300.vec.gz
Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.hi.300.vec.gz -> ./data_xling/cc.hi.300.vec.gz
Downloading https://dl.fbaipublicfiles.com/arrival/dictionaries/en-hi.txt -> ./data_xling/en-hi.txt


In [3]:
# ==== 2) Load fastText .vec(.gz) into numpy (top-N) ====
def l2_normalize(M, eps=1e-8):
    nrm = np.linalg.norm(M, axis=1, keepdims=True)
    nrm = np.maximum(nrm, eps)
    return M / nrm

def try_parse_header(line):
    # Some .vec have "vocab dim" in first line
    parts = line.strip().split()
    if len(parts) == 2:
        try:
            return int(parts[0]), int(parts[1])
        except:
            return None
    return None

def load_vec_gz(path, max_vocab=None, dtype=np.float32, lowercase=False, normalize_unicode=True):
    words, vecs = [], []
    with gzip.open(path, "rt", encoding="utf-8", errors="ignore") as f:
        first = f.readline()
        # If first line is header, skip it
        header = try_parse_header(first)
        if header is None:
            # first line is actually data; process it
            parts = first.rstrip("\n").split(" ")
            w, vals = parts[0], parts[1:]
            if lowercase: w = w.lower()
            if normalize_unicode: w = unicodedata.normalize("NFC", w)
            words.append(w); vecs.append(np.array(vals, dtype=dtype))
        # now rest
        for i, line in enumerate(f, start=2):
            if max_vocab is not None and len(words) >= max_vocab:
                break
            parts = line.rstrip("\n").split(" ")
            if len(parts) < 10:  # skip broken lines
                continue
            w, vals = parts[0], parts[1:]
            if lowercase: w = w.lower()
            if normalize_unicode: w = unicodedata.normalize("NFC", w)
            words.append(w); vecs.append(np.array(vals, dtype=dtype))
    W = np.vstack(vecs)
    return words, W

print("Loading EN vectors...")
en_words, en_vecs = load_vec_gz(en_path, max_vocab=MAX_VOCAB_EN, lowercase=True)
print("Loading HI vectors...")
hi_words, hi_vecs = load_vec_gz(hi_path, max_vocab=MAX_VOCAB_HI, lowercase=False)

print("Shapes:", en_vecs.shape, hi_vecs.shape)


Loading EN vectors...
Loading HI vectors...
Shapes: (120000, 300) (120000, 300)


In [4]:
# ==== 3) Build word-to-index maps + normalize once ====
en2i = {w:i for i,w in enumerate(en_words)}
hi2i = {w:i for i,w in enumerate(hi_words)}

# Pre-normalize to unit length (we'll also renormalize after mapping)
en_vecs = l2_normalize(en_vecs)
hi_vecs = l2_normalize(hi_vecs)


In [5]:
# ==== 4) Load dictionary and filter to words we actually have ====
pairs = []
with io.open(dict_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip(): continue
        src, tgt = line.strip().split()
        src = unicodedata.normalize("NFC", src).lower()
        tgt = unicodedata.normalize("NFC", tgt)
        if src in en2i and tgt in hi2i:
            pairs.append((src, tgt))

print("Total dictionary pairs present in our vocabs:", len(pairs))

# Shuffle and train/dev/test split (80/10/10)
rng = np.random.default_rng(SEED)
rng.shuffle(pairs)
n = len(pairs)
n_train = int(0.8 * n)
n_dev   = int(0.1 * n)
train_pairs = pairs[:n_train]
dev_pairs   = pairs[n_train:n_train+n_dev]
test_pairs  = pairs[n_train+n_dev:]

print(len(train_pairs), len(dev_pairs), len(test_pairs))


Total dictionary pairs present in our vocabs: 24697
19757 2469 2471


In [6]:
# ==== 5) Build aligned matrices for seed pairs ====
def pairs_to_mats(pairs, src2i, tgt2i, src_vecs, tgt_vecs):
    xs = np.stack([src_vecs[src2i[s]] for s,t in pairs], axis=0)
    ys = np.stack([tgt_vecs[tgt2i[t]] for s,t in pairs], axis=0)
    return xs, ys

Xtr, Ytr = pairs_to_mats(train_pairs, en2i, hi2i, en_vecs, hi_vecs)
Xdv, Ydv = pairs_to_mats(dev_pairs,   en2i, hi2i, en_vecs, hi_vecs)
Xte, Yte = pairs_to_mats(test_pairs,  en2i, hi2i, en_vecs, hi_vecs)

print("Train/dev/test matrices:", Xtr.shape, Xdv.shape, Xte.shape)


Train/dev/test matrices: (19757, 300) (2469, 300) (2471, 300)


In [7]:
# ==== 6) Simple mappings ====
def fit_lls(X, Y):
    # Solve min_W ||XW - Y||_F via least squares
    # lstsq handles rank issues gracefully
    W, *_ = np.linalg.lstsq(X, Y, rcond=None)
    return W  # shape (d, d)

def fit_procrustes(X, Y):
    # Solve min_W ||XW - Y|| with W orthonormal; W = U V^T, SVD of X^T Y
    M = X.T @ Y
    U, _, Vt = np.linalg.svd(M, full_matrices=False)
    W = U @ Vt
    return W

# Learn mappings on train
W_lls  = fit_lls(Xtr, Ytr)
W_proc = fit_procrustes(Xtr, Ytr)


In [8]:
# ==== 7) Retrieval helpers (cosine + CSLS) ====
# Note: All vectors assumed L2-normalized.

def cosine_topk(queries, targets, k=10, batch=BATCH):
    # returns (indices, sims) for top-k targets per query
    nQ = queries.shape[0]
    top_idx = np.empty((nQ, k), dtype=np.int32)
    top_sim = np.empty((nQ, k), dtype=np.float32)
    for i in range(0, nQ, batch):
        q = queries[i:i+batch]
        S = q @ targets.T  # (b, Nt)
        # partial top-k
        idx = np.argpartition(-S, kth=k-1, axis=1)[:, :k]
        part = np.take_along_axis(S, idx, axis=1)
        # sort within the k
        ordk = np.argsort(-part, axis=1)
        idx = np.take_along_axis(idx, ordk, axis=1)
        part = np.take_along_axis(part, ordk, axis=1)
        top_idx[i:i+q.shape[0]] = idx
        top_sim[i:i+q.shape[0]] = part
    return top_idx, top_sim

def mean_topk_cos_to_A_per_target(B, A, k=CSLS_K, batch=BATCH):
    # r_T(y) = mean of top-k cos(y, A) for each y in B
    Nt = B.shape[0]
    rT = np.empty(Nt, dtype=np.float32)
    for i in range(0, Nt, batch):
        b = B[i:i+batch]
        S = b @ A.T  # (b, Na)
        idx = np.argpartition(-S, kth=k-1, axis=1)[:, :k]
        part = np.take_along_axis(S, idx, axis=1)
        rT[i:i+b.shape[0]] = part.mean(axis=1)
    return rT

def csls_scores(queries, targets, rT, k=CSLS_K, batch=BATCH):
    # returns full CSLS score matrices in batches (to pick argmax/argtopk)
    # For memory, we compute per-batch and return top-k like cosine_topk
    nQ = queries.shape[0]
    k_out = 10  # we want at least up to P@10
    top_idx = np.empty((nQ, k_out), dtype=np.int32)
    top_scr = np.empty((nQ, k_out), dtype=np.float32)
    for i in range(0, nQ, batch):
        q = queries[i:i+batch]
        # r_S(q): mean of top-k cos(q, targets)
        S = q @ targets.T  # (b, Nt)
        idx_q = np.argpartition(-S, kth=k-1, axis=1)[:, :k]
        part_q = np.take_along_axis(S, idx_q, axis=1)
        rS = part_q.mean(axis=1, keepdims=True)  # (b,1)

        # CSLS: 2*cos - rT - rS
        CS = 2.0 * S - rT[None, :] - rS  # broadcast
        idx = np.argpartition(-CS, kth=k_out-1, axis=1)[:, :k_out]
        part = np.take_along_axis(CS, idx, axis=1)
        ordk = np.argsort(-part, axis=1)
        idx = np.take_along_axis(idx, ordk, axis=1)
        part = np.take_along_axis(part, ordk, axis=1)
        top_idx[i:i+q.shape[0]] = idx
        top_scr[i:i+q.shape[0]] = part
    return top_idx, top_scr


In [9]:
# ==== 8) BLI evaluation (P@1/5/10, MRR) ====
def compute_metrics(topk_idx, gold_tgt_idx, k_list=(1,5,10)):
    # gold_tgt_idx: shape (nQ,), each the index in the target vocab
    nQ = len(gold_tgt_idx)
    hits = {k:0 for k in k_list}
    rr_sum = 0.0
    for i in range(nQ):
        pred = topk_idx[i]
        # find rank of gold
        # where does gold appear?
        where = np.where(pred == gold_tgt_idx[i])[0]
        if where.size > 0:
            rank = int(where[0]) + 1
        else:
            rank = None
        if rank is not None:
            for k in k_list:
                if rank <= k:
                    hits[k] += 1
            rr_sum += 1.0 / rank
        else:
            # no hit within returned top-k; MRR contributes 0
            pass
    res = {f"P@{k}": hits[k]/nQ for k in k_list}
    res["MRR"] = rr_sum / nQ
    return res

# Helper to prepare test queries and gold target indices
def test_queries_and_gold(test_pairs, src2i, tgt2i, src_vecs, tgt_vecs, W=None):
    # map EN -> HI
    Xq = np.stack([src_vecs[src2i[s]] for s,t in test_pairs], axis=0)
    if W is not None:
        Xq = Xq @ W
        Xq = l2_normalize(Xq)
    gold = np.array([tgt2i[t] for s,t in test_pairs], dtype=np.int32)
    return Xq, gold


In [10]:
# ==== 9) Run BLI for the methods and scorers ====

# Target matrix for retrieval
TGT = hi_vecs  # (Nt, d) already normalized

# Prepare test queries per method
methods = {
    "NoMap": None,               # use EN as-is (will perform poorly; baseline)
    "LLS": W_lls,                # learned linear map
    "Procrustes": W_proc,        # orthogonal map
}

# Optional "NormOnly" baseline is essentially the same as "NoMap" here,
# since we already normalized both spaces. If you want it explicitly:
# methods["NormOnly"] = None

results = []

# Precompute r_T for CSLS once per method (depends on mapped source distribution)
def compute_rT_for_method(W):
    # Use a subset of mapped train EN vectors to define neighbor density
    Xmap = Xtr if W is None else l2_normalize(Xtr @ W)
    if len(Xmap) > N_SRC_FOR_CSLS_R_T:
        Xmap = Xmap[:N_SRC_FOR_CSLS_R_T]
    return mean_topk_cos_to_A_per_target(TGT, Xmap, k=CSLS_K, batch=BATCH)

for name, W in methods.items():
    print(f"\n=== Method: {name} ===")
    # Build test queries
    Xq, gold = test_queries_and_gold(test_pairs, en2i, hi2i, en_vecs, hi_vecs, W=W)

    # Cosine retrieval
    top_idx_cos, _ = cosine_topk(Xq, TGT, k=10, batch=BATCH)
    m_cos = compute_metrics(top_idx_cos, gold)
    print("Cosine:", m_cos)

    # CSLS retrieval
    rT = compute_rT_for_method(W)
    top_idx_csls, _ = csls_scores(Xq, TGT, rT, k=CSLS_K, batch=BATCH)
    m_csls = compute_metrics(top_idx_csls, gold)
    print("CSLS:", m_csls)

    results.append((name, m_cos, m_csls))

# Print compact table
print("\n=== Summary (P@1 / P@5 / P@10 / MRR) ===")
for name, m_cos, m_csls in results:
    def fmt(m): return f"{m['P@1']:.3f} / {m['P@5']:.3f} / {m['P@10']:.3f} / {m['MRR']:.3f}"
    print(f"{name:12s}  COS  {fmt(m_cos)}    |   CSLS {fmt(m_csls)}")

# Coverage =
# fraction of dictionary pairs present is shown earlier ("Total dictionary pairs present...").



=== Method: NoMap ===
Cosine: {'P@1': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'MRR': 0.0}
CSLS: {'P@1': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'MRR': 0.0}

=== Method: LLS ===
Cosine: {'P@1': 0.11493322541481182, 'P@5': 0.2630513961958721, 'P@10': 0.34075273168757586, 'MRR': 0.17958782190874453}
CSLS: {'P@1': 0.20518008903278026, 'P@5': 0.4196681505463375, 'P@10': 0.49575070821529743, 'MRR': 0.2947640245900063}

=== Method: Procrustes ===
Cosine: {'P@1': 0.17280453257790368, 'P@5': 0.3597733711048159, 'P@10': 0.43261837312828816, 'MRR': 0.2524013476967745}
CSLS: {'P@1': 0.21448806151355726, 'P@5': 0.41278834479967624, 'P@10': 0.4925131525698098, 'MRR': 0.30003276097974635}

=== Summary (P@1 / P@5 / P@10 / MRR) ===
NoMap         COS  0.000 / 0.000 / 0.000 / 0.000    |   CSLS 0.000 / 0.000 / 0.000 / 0.000
LLS           COS  0.115 / 0.263 / 0.341 / 0.180    |   CSLS 0.205 / 0.420 / 0.496 / 0.295
Procrustes    COS  0.173 / 0.360 / 0.433 / 0.252    |   CSLS 0.214 / 0.413 / 0.493 / 0.300


In [12]:
# A) Coverage & quick stats
total_pairs_raw = sum(1 for _ in open(dict_path, encoding="utf-8"))
total_pairs_kept = len(train_pairs) + len(dev_pairs) + len(test_pairs)
print(f"Dictionary lines (raw): {total_pairs_raw}")
print(f"Pairs kept (in-vocab): {total_pairs_kept} ({100*total_pairs_kept/max(1,total_pairs_raw):.1f}%)")
print(f"Train/Dev/Test sizes: {len(train_pairs)}/{len(dev_pairs)}/{len(test_pairs)}")
print(f"EN vocab used: {len(en_words)}  |  HI vocab used: {len(hi_words)}")


Dictionary lines (raw): 38221
Pairs kept (in-vocab): 24697 (64.6%)
Train/Dev/Test sizes: 19757/2469/2471
EN vocab used: 120000  |  HI vocab used: 120000


In [14]:
# B) Nearest neighbors (EN→HI) with COS or CSLS
from collections import defaultdict
gold_hi_by_en = defaultdict(list)
for s,t in (train_pairs + dev_pairs + test_pairs):
    gold_hi_by_en[s].append(t)

def _compute_rT_for(method_name):
    W = methods[method_name]
    Xmap = Xtr if W is None else l2_normalize(Xtr @ W)
    Xsub = Xmap[:min(5000, len(Xmap))]
    return mean_topk_cos_to_A_per_target(hi_vecs, Xsub, k=CSLS_K, batch=BATCH)

_rT_cache = {}

def show_neighbors_en2hi(word_en, method="Procrustes", scorer="csls", k=10):
    w = word_en.strip().lower()
    if w not in en2i:
        print(f"'{word_en}' not in EN vocab."); return
    x = en_vecs[en2i[w]][None,:]
    W = methods[method]
    if W is not None:
        x = l2_normalize(x @ W)
    if scorer.lower() == "cos":
        idx, sims = cosine_topk(x, hi_vecs, k=k, batch=BATCH)
        scores = sims[0]
    else:
        if method not in _rT_cache:
            _rT_cache[method] = _compute_rT_for(method)
        idx, scr = csls_scores(x, hi_vecs, _rT_cache[method], k=CSLS_K, batch=BATCH)
        scores = scr[0]
    preds = [hi_words[j] for j in idx[0]]
    print(f"\nEN: {w}")
    print("Gold HI:", gold_hi_by_en[w] if gold_hi_by_en[w] else "(none)")
    print(f"Top-{k} ({method}, {scorer.upper()}):")
    for r,(tok,sc) in enumerate(zip(preds, scores), 1):
        mark = " <- GOLD" if tok in gold_hi_by_en[w] else ""
        print(f"{r:2d}. {tok:20s}  {sc:.4f}{mark}")

# Examples:
show_neighbors_en2hi("river", method="Procrustes", scorer="csls", k=10)
show_neighbors_en2hi("computer", method="LLS", scorer="cos", k=10)



EN: river
Gold HI: ['नदी']
Top-10 (Procrustes, CSLS):
 1. नदी                   0.1804 <- GOLD
 2. किनारे                0.1739
 3. समीप                  0.1043
 4. घाट                   0.0303
 5. गंगा                  0.0288
 6. रिवर                  0.0275
 7. नदिया                 0.0222
 8. झील                   0.0186
 9. जलप्रपात              0.0181
10. ज्वारनदीमुख           0.0044

EN: computer
Gold HI: ['कंप्यूटर', 'कम्प्यूटर', 'संगणक']
Top-10 (LLS, COS):
 1. कंप्यूटर              0.7457 <- GOLD
 2. सॉफ्टवेयर             0.6296
 3. आभूषणपुस्तकेंकंप्यूटर  0.5972
 4. मोबाइल                0.5827
 5. कम्प्यूटर             0.5810 <- GOLD
 6. कम्पयूटर              0.5748
 7. तकनीकी                0.5732
 8. कंप्यूट               0.5695
 9. ख़बरट्रैवलफोटोशब्दकोशज्योतिषस्वास्थ्यगानेसिनेमाकूपनहिन्दी  0.5590
10. इंटरनेट               0.5557


In [21]:
# C) Inspect a test word: where does gold land?
def _rank_of(pred_indices, gold_index):
    pos = np.where(pred_indices == gold_index)[0]
    return int(pos[0])+1 if pos.size else None

def inspect_test_word(src_en, method="Procrustes", scorer="csls", k=10):
    w = src_en.strip().lower()
    if (w, ) not in [(s,) for s,_ in test_pairs]:
        print(f"'{w}' not in test set."); return
    gold_hi = [t for s,t in test_pairs if s==w][0]
    x = en_vecs[en2i[w]][None,:]
    W = methods[method]
    if W is not None:
        x = l2_normalize(x @ W)
    if scorer.lower()=="cos":
        idx, sims = cosine_topk(x, hi_vecs, k=k, batch=BATCH)
    else:
        if method not in _rT_cache:
            _rT_cache[method] = _compute_rT_for(method)
        idx, sims = csls_scores(x, hi_vecs, _rT_cache[method], k=CSLS_K, batch=BATCH)
    preds = [hi_words[j] for j in idx[0]]
    r = _rank_of(idx[0], hi2i[gold_hi])
    print(f"\nTest word EN: {w}  | GOLD HI: {gold_hi}  | Rank: {r}")
    for i,(tok,sc) in enumerate(zip(preds, sims[0]), 1):
        print(f"{i:2d}. {tok:20s} {sc:.4f}{' <- GOLD' if tok==gold_hi else ''}")

# Example:
inspect_test_word("River", method="Procrustes", scorer="csls", k=10)


'river' not in test set.


In [None]:
# Enable a font that supports Devanagari (Hindi) in Colab
!wget -q https://github.com/google/fonts/raw/main/ofl/notosansdevanagari/NotoSansDevanagari-Regular.ttf -O /usr/share/fonts/truetype/NotoSansDevanagari-Regular.ttf

import matplotlib
matplotlib.rcParams['font.family'] = 'Noto Sans Devanagari'


In [25]:
# E) HI→EN mapping + evaluation (minimal)
def fit_lls(X, Y):  # (re)define tiny helper here for clarity
    W, *_ = np.linalg.lstsq(X, Y, rcond=None); return W

def fit_procrustes(X, Y):
    U, _, Vt = np.linalg.svd(X.T @ Y, full_matrices=False); return U @ Vt

# Train HI->EN mappings on the same pairs
W_lls_hi2en  = fit_lls(Ytr, Xtr)
W_proc_hi2en = fit_procrustes(Ytr, Xtr)
methods_hi2en = {"NoMap": None, "LLS": W_lls_hi2en, "Procrustes": W_proc_hi2en}

def test_queries_and_gold_hi2en(pairs_):
    Xq = np.stack([hi_vecs[hi2i[t]] for s,t in pairs_], axis=0)
    gold = np.array([en2i[s] for s,t in pairs_], dtype=np.int32)
    return Xq, gold

def run_bli_hi2en():
    TGT = en_vecs
    for name, W in methods_hi2en.items():
        Xq, gold = test_queries_and_gold_hi2en(test_pairs)
        if W is not None: Xq = l2_normalize(Xq @ W)
        # cosine
        top_idx_cos, _ = cosine_topk(Xq, TGT, k=10, batch=BATCH)
        m_cos = compute_metrics(top_idx_cos, gold)
        # csls
        Xmap = Ytr if W is None else l2_normalize(Ytr @ W)
        rT = mean_topk_cos_to_A_per_target(TGT, Xmap[:min(5000,len(Xmap))], k=CSLS_K, batch=BATCH)
        top_idx_csls, _ = csls_scores(Xq, TGT, rT, k=CSLS_K, batch=BATCH)
        m_csls = compute_metrics(top_idx_csls, gold)
        print(f"{name:11s}  COS {m_cos}  |  CSLS {m_csls}")

# Run:
run_bli_hi2en()


NoMap        COS {'P@1': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'MRR': 0.0}  |  CSLS {'P@1': 0.0, 'P@5': 0.0, 'P@10': 0.0, 'MRR': 0.0}
LLS          COS {'P@1': 0.17280453257790368, 'P@5': 0.35127478753541075, 'P@10': 0.4087414002428167, 'MRR': 0.24750342063170902}  |  CSLS {'P@1': 0.2237960339943343, 'P@5': 0.44071226224200727, 'P@10': 0.5240793201133145, 'MRR': 0.3157580312578293}
Procrustes   COS {'P@1': 0.18251719951436665, 'P@5': 0.3986240388506678, 'P@10': 0.48725212464589235, 'MRR': 0.2741069421158456}  |  CSLS {'P@1': 0.17280453257790368, 'P@5': 0.3925536220153784, 'P@10': 0.48927559692432215, 'MRR': 0.26594672807744457}


In [28]:
# F) Tiny sentence retrieval (toy parallel)
RUN_SENT_RETRIEVAL = True
if RUN_SENT_RETRIEVAL:
    en_sents = [
        "the river flows near the village",
        "students study computer science",
        "a green forest surrounds the lake",
        "she bought fresh vegetables from the market",
        "the weather is pleasant today",
    ]
    hi_sents = [
        "नदी गाँव के पास बहती है",
        "छात्र कंप्यूटर विज्ञान पढ़ते हैं",
        "एक हरा जंगल झील को घेरता है",
        "उसने बाजार से ताज़ी सब्जियाँ खरीदीं",
        "आज मौसम सुहावना है",
    ]
    def sent_vec_en(s):
        idx = [en2i.get(w, None) for w in s.lower().split()]
        idx = [i for i in idx if i is not None]
        return np.mean(en_vecs[idx], axis=0) if idx else np.zeros(en_vecs.shape[1])
    def sent_vec_hi(s):
        idx = [hi2i.get(w, None) for w in s.split()]
        idx = [i for i in idx if i is not None]
        return np.mean(hi_vecs[idx], axis=0) if idx else np.zeros(hi_vecs.shape[1])

    EN = np.stack([sent_vec_en(s) for s in en_sents])
    HI = np.stack([sent_vec_hi(s) for s in hi_sents])
    EN_map = l2_normalize(EN @ W_proc)   # map EN→HI using Procrustes you learned

    idx, _ = cosine_topk(EN_map, HI, k=1, batch=64)
    hits1 = (idx[:,0] == np.arange(len(hi_sents))).mean()
    print(f"Sentence retrieval Recall@1 (toy): {hits1:.2f}")


Sentence retrieval Recall@1 (toy): 0.80
