# Week 38: Answerability Classification with Machine Learning



---

## Setup & Dependencies

In [2]:
# Environment detection
import os
import sys

# The notebook counts as running in Colab when the `google.colab` package is
# already present among the loaded modules.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    print("Running in local environment")
else:
    print("Running in Google Colab")
    # Colab only: mount Google Drive at /content/drive.
    from google.colab import drive
    drive.mount('/content/drive')

Running in Google Colab
Mounted at /content/drive


### Library Imports

In [None]:
# Core Python libraries
import os
import re
import pickle
from pathlib import Path
from collections import Counter, defaultdict
from typing import List, Tuple, Dict, Any

# Data manipulation
import pandas as pd
import numpy as np

# Machine Learning & Deep Learning
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# NLP & Transformers
import nltk
from transformers import (
    pipeline, 
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments,
    DataCollatorWithPadding
)

# Scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score, 
    confusion_matrix, 
    classification_report,
    accuracy_score,
    precision_recall_fscore_support
)
from sklearn.utils.class_weight import compute_class_weight

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Pick the CUDA device when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"PyTorch device: {device}")

# Seed the NumPy and PyTorch global RNGs with the same fixed value (42).
np.random.seed(42)
torch.manual_seed(42)

print("✓ All libraries imported successfully")

PyTorch device: cuda
Libraries imported successfully


### Dataset Configuration & Loading

In [None]:
# Language codes kept from the dataset's "lang" column, plus display names.
LANGUAGES = ["ar", "ko", "te"]
LANGUAGE_NAMES = {"ar": "Arabic", "ko": "Korean", "te": "Telugu"}

# Dataset root: the mounted Drive folder in Colab, ../tydi_xor_rc otherwise.
BASE_DIR = (
    Path("/content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc")
    if IN_COLAB
    else Path("../tydi_xor_rc")
)

TRAIN_PATH = BASE_DIR / "train.parquet"
VAL_PATH = BASE_DIR / "validation.parquet"

print(f"Dataset directory: {BASE_DIR.resolve()}")

if not TRAIN_PATH.exists() or not VAL_PATH.exists():
    # Either split is missing: report where the files were expected and leave
    # both frames as None.
    print("⚠ Dataset files not found. Please ensure the data is downloaded.")
    print(f"  Expected location: {BASE_DIR.resolve()}")
    df_train = None
    df_val = None
else:
    df_train = pd.read_parquet(TRAIN_PATH)
    df_val = pd.read_parquet(VAL_PATH)

    # Keep only rows whose language is one of LANGUAGES; copy so the filtered
    # frames are independent of the frames they were sliced from.
    df_train = df_train.loc[df_train["lang"].isin(LANGUAGES)].copy()
    df_val = df_val.loc[df_val["lang"].isin(LANGUAGES)].copy()

    print(f"Training examples: {len(df_train):,}")
    print(f"Validation examples: {len(df_val):,}")
    print("✓ Dataset loaded successfully")

Dataset directory: /content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc
Training examples: 6,335
Validation examples: 1,155
Dataset loaded successfully


In [None]:
# Fetch the NLTK 'stopwords' and 'punkt_tab' resources without download logs.
print("Downloading NLTK resources...")
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Translation pipeline (multilingual questions → English).
# device_id follows the pipeline convention used here: 0 = first GPU, -1 = CPU.
device_id = -1 if not torch.cuda.is_available() else 0
print("\nLoading translation model (facebook/nllb-200-distilled-600M)...")
translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    device=device_id,
    # Half precision only when the model is placed on the GPU.
    torch_dtype=torch.float32 if device_id != 0 else torch.float16
)

print("✓ Translation model loaded successfully")

In [None]:
# Data caps (applied per language) and context truncation length
N_TRAIN_MAX = 20_000       # Maximum training examples per language
N_VAL_MAX = 5_000          # Maximum validation examples per language
MAX_CONTEXT_CHARS = 1_500  # Truncate contexts to this many characters

# BiLSTM hyperparameters
LR_LSTM = 2e-3   # Learning rate
EPOCHS_LSTM = 8  # Training epochs

# XLM-RoBERTa hyperparameters
LR_XLMR = 2e-5                 # Learning rate
EPOCHS_XLMR = 3                # Training epochs
MAX_SEQ_LEN_TRANSFORMER = 512  # Maximum sequence length for the transformer

# Single print call; each argument lands on its own line via sep="\n".
print(
    "Configuration loaded:",
    f"  Training: {N_TRAIN_MAX:,} max samples per language",
    f"  Validation: {N_VAL_MAX:,} max samples per language",
    f"  Context truncation: {MAX_CONTEXT_CHARS} chars",
    "✓ Setup complete",
    sep="\n",
)

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6
Week 38 using existing data: Train=6,335, Val=1,155
Week 38 setup complete


---

## Data Preparation Utilities

In [8]:
# Simple utility functions
def prep_lang_df(df, lang):
    """Build the (text, label) frame for one language.

    Rows whose ``lang`` column equals ``lang`` are selected. ``text`` is the
    question, the literal separator " [SEP] ", and the context cut to its
    first ``MAX_CONTEXT_CHARS`` characters. ``label`` is ``answerable`` cast
    to int. The input frame is not modified.
    """
    rows = df.loc[df["lang"] == lang]
    prepared = rows.assign(context_trunc=rows["context"].str[:MAX_CONTEXT_CHARS])
    prepared = prepared.assign(
        text=prepared["question"] + " [SEP] " + prepared["context_trunc"],
        label=prepared["answerable"].astype(int),
    )
    return prepared.loc[:, ["text", "label"]]

def maybe_cap(df, nmax):
    """Return ``df`` down-sampled to ``nmax`` rows when it is larger than that.

    A falsy ``nmax`` (``None`` or ``0``) disables capping, and frames with at
    most ``nmax`` rows are returned as-is (the same object). Sampling uses a
    fixed ``random_state=42`` so the chosen subset is repeatable.
    """
    if not nmax:
        return df
    if len(df) <= nmax:
        return df
    return df.sample(n=nmax, random_state=42)

---

## Model A: TF-IDF + Logistic Regression


In [None]:
# Model A: TF-IDF + Logistic Regression

def train_eval_model_a(train_df, val_df, lang):
    """Fit and score the TF-IDF + logistic-regression baseline for one language.

    Args:
        train_df: frame with ``text`` and ``label`` columns used for fitting.
        val_df: frame with ``text`` and ``label`` columns used for scoring.
        lang: language code; only used to look up the display name in
            ``LANGUAGE_NAMES`` for the printed header.

    Returns:
        ``(metrics, pred, prob)`` where ``metrics`` has keys ``acc``, ``prec``,
        ``rec`` and ``f1`` (binary averaging), ``pred`` are the predicted
        labels for ``val_df`` and ``prob`` is column 1 of ``predict_proba``.
    """
    print(f"\n[Model A | {LANGUAGE_NAMES[lang]}] TF-IDF char 2–5 + LogisticRegression")

    # Character 2-5-gram TF-IDF features feeding a class-balanced logistic regression.
    vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 5), min_df=3, max_features=50000)
    classifier = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
    pipe = Pipeline(steps=[("tfidf", vectorizer), ("clf", classifier)])
    pipe.fit(train_df["text"], train_df["label"])

    val_texts = val_df["text"]
    val_labels = val_df["label"]
    pred = pipe.predict(val_texts)
    prob = pipe.predict_proba(val_texts)[:, 1]

    acc = accuracy_score(val_labels, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(
        val_labels, pred, average="binary", zero_division=0
    )
    metrics = {"acc": acc, "prec": prec, "rec": rec, "f1": f1}

    print(classification_report(val_labels, pred, digits=3))
    return metrics, pred, prob

# Run Model A once per language and collect one metrics row per run.
print("=" * 50, "MODEL A: TF-IDF + LOGISTIC REGRESSION", "=" * 50, sep="\n")

model_a_results = []
for lang in LANGUAGES:
    print(f"\n=== {LANGUAGE_NAMES[lang]} ===")

    # Per-language (text, label) frames, capped to the configured maxima.
    tr = maybe_cap(prep_lang_df(df_train, lang), N_TRAIN_MAX)
    va = maybe_cap(prep_lang_df(df_val, lang), N_VAL_MAX)
    print(f"Train: {len(tr):,}, Val: {len(va):,}")

    metrics, preds, probs = train_eval_model_a(tr, va, lang)
    model_a_results.append({"lang": lang, "model": "A_TFIDF", **metrics})
    print(f"→ F1: {metrics['f1']:.3f}, Acc: {metrics['acc']:.3f}")

print("\nModel A Summary:")
print(pd.DataFrame(model_a_results))

MODEL A: TF-IDF + LOGISTIC REGRESSION

=== Arabic ===
Train: 2,558, Val: 415

[Model A | Arabic] TF-IDF char 2–5 + LogisticRegression
              precision    recall  f1-score   support

           0      0.481     0.500     0.491        52
           1      0.928     0.923     0.925       363

    accuracy                          0.870       415
   macro avg      0.705     0.711     0.708       415
weighted avg      0.872     0.870     0.871       415

→ F1: 0.925, Acc: 0.870

=== Korean ===
Train: 2,422, Val: 356

[Model A | Korean] TF-IDF char 2–5 + LogisticRegression
              precision    recall  f1-score   support

           0      0.200     0.053     0.083        19
           1      0.949     0.988     0.968       337

    accuracy                          0.938       356
   macro avg      0.574     0.520     0.526       356
weighted avg      0.909     0.938     0.921       356

→ F1: 0.968, Acc: 0.938

=== Telugu ===
Train: 1,355, Val: 384

[Model A | Telugu] TF-IDF ch

---

## Model B: BiLSTM + Mean Pooling



In [None]:
# Model B: BiLSTM + Mean Pooling

# Simple tokenization
def simple_tokenize(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens."""
    tokens = text.split()
    return tokens

class SimpleVocab:
    """Frequency-ranked token vocabulary built with ``simple_tokenize``.

    Index 0 is reserved for ``<pad>`` and index 1 for ``<unk>``; the remaining
    ids are handed out to the most frequent tokens in ``texts``.

    Attributes are plain dicts: ``stoi`` maps token -> id, ``itos`` maps id -> token.
    """

    def __init__(self, texts, max_size=50000):
        """Count tokens over ``texts`` and keep at most ``max_size`` entries
        (including the two reserved ones)."""
        words = Counter()
        for text in texts:
            words.update(simple_tokenize(text))

        self.stoi = {"<pad>": 0, "<unk>": 1}
        for word, _ in words.most_common(max_size - 2):
            # A corpus token that is literally "<pad>" or "<unk>" must not
            # overwrite the reserved id: id 0 is what the padding/masking code
            # treats as "no token".
            if word in self.stoi:
                continue
            self.stoi[word] = len(self.stoi)
        self.itos = {i: w for w, i in self.stoi.items()}

    def encode(self, text, max_len=300):
        """Return the ids of the first ``max_len`` tokens of ``text``.

        Tokens missing from the vocabulary map to 1 (``<unk>``).
        """
        tokens = simple_tokenize(text)[:max_len]
        return [self.stoi.get(t, 1) for t in tokens]

class SimpleLSTMDataset(Dataset):
    """Map-style dataset of (token-id tensor, label tensor) pairs.

    Args:
        texts: iterable of raw strings; each is encoded once, up front, with
            ``vocab.encode``.
        labels: indexable collection of integer class labels, aligned with
            ``texts``.
        vocab: any object exposing ``encode(text) -> list[int]``.
    """

    def __init__(self, texts, labels, vocab):
        self.X = [vocab.encode(t) for t in texts]
        self.y = labels

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        # Force int64 for both tensors: without an explicit dtype an empty
        # token list becomes a float32 tensor, and the label dtype would
        # follow whatever Python/NumPy type the caller happened to pass.
        return (
            torch.tensor(self.X[i], dtype=torch.long),
            torch.tensor(self.y[i], dtype=torch.long),
        )

def collate_fn(batch):
    """Pad a batch of (token-id tensor, label) pairs to a common length.

    Returns ``(padded, labels)``: ``padded`` is an int64 tensor with one row
    per example, right-padded with 0 up to the longest sequence in the batch;
    ``labels`` is a 1-D tensor built from the per-example labels.
    """
    sequences, targets = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    padded = torch.zeros(len(sequences), max(lengths), dtype=torch.long)
    for row, (seq, n_tokens) in enumerate(zip(sequences, lengths)):
        padded[row, :n_tokens] = seq
    return padded, torch.tensor(targets)

class SimpleBiLSTM(nn.Module):
    """Embedding -> bidirectional LSTM -> masked mean pooling -> 2-way linear head.

    Args:
        vocab_size: number of rows in the embedding table; id 0 is the padding id.
        emb_dim: embedding width.
        hidden: LSTM hidden size per direction (the pooled vector is ``2 * hidden``).
        dropout: dropout probability applied to the pooled vector before the
            classifier (default 0.3).
    """

    def __init__(self, vocab_size, emb_dim=128, hidden=128, dropout=0.3):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden * 2, 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return class logits of shape ``(batch, 2)`` for padded id tensor ``x``
        of shape ``(batch, seq)``."""
        emb = self.emb(x)
        out, _ = self.lstm(emb)
        # Mean pooling over real (non-padding) positions only.
        mask = (x != 0).float().unsqueeze(-1)
        # Clamp the token count so a row made entirely of padding yields a
        # zero vector instead of 0/0 = NaN logits.
        n_tokens = mask.sum(1).clamp(min=1.0)
        pooled = (out * mask).sum(1) / n_tokens
        return self.fc(self.dropout(pooled))

def train_eval_model_b(train_df, val_df, lang):
    """Train the BiLSTM classifier on one language and score it on validation.

    Args:
        train_df: frame with ``text`` and ``label`` columns; also the only
            source for the vocabulary.
        val_df: frame with ``text`` and ``label`` columns used for evaluation.
        lang: language code; only used to look up the display name in
            ``LANGUAGE_NAMES`` for the printed header.

    Returns:
        ``(metrics, all_preds, all_probs)``: ``metrics`` has keys ``acc``,
        ``prec``, ``rec`` and ``f1`` (binary averaging); ``all_preds`` is the
        list of argmax predictions and ``all_probs`` the list of softmax
        probabilities for class 1, both in validation order.
    """
    print(f"\n[Model B | {LANGUAGE_NAMES[lang]}] BiLSTM + mean pooling")

    # Build vocab and datasets
    # The vocabulary comes from training text only; validation tokens that are
    # not in it are encoded through the vocab's unknown-token fallback.
    vocab = SimpleVocab(train_df["text"].tolist())
    train_ds = SimpleLSTMDataset(train_df["text"].tolist(), train_df["label"].tolist(), vocab)
    val_ds = SimpleLSTMDataset(val_df["text"].tolist(), val_df["label"].tolist(), vocab)

    # Training batches are reshuffled every epoch; validation order is fixed so
    # the returned predictions line up with val_df rows.
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, collate_fn=collate_fn)

    # Model and training
    # The embedding table is sized by the number of entries in vocab.stoi.
    model = SimpleBiLSTM(len(vocab.stoi)).to(device)
    # NOTE(review): the loss is unweighted, whereas Model A uses
    # class_weight="balanced" — confirm this asymmetry is intended.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LR_LSTM)

    # Train
    for epoch in range(EPOCHS_LSTM):
        model.train()
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Report the mean per-batch loss on even-numbered epochs only.
        if epoch % 2 == 0:
            print(f"Epoch {epoch}: loss={total_loss/len(train_loader):.3f}")

    # Evaluate
    model.eval()
    all_preds, all_labels, all_probs = [], [], []
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            probs = torch.softmax(logits, dim=1)
            preds = logits.argmax(1)

            # Move results back to the CPU and accumulate them example by example.
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y.cpu().numpy())
            all_probs.extend(probs[:, 1].cpu().numpy())

    # Metrics
    acc = accuracy_score(all_labels, all_preds)
    # zero_division=0 reports 0 instead of warning when a metric's denominator is empty.
    prec, rec, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="binary", zero_division=0)

    print(classification_report(all_labels, all_preds, digits=3))
    return {"acc": acc, "prec": prec, "rec": rec, "f1": f1}, all_preds, all_probs

# Run Model B once per language and collect one metrics row per run.
print("=" * 50, "MODEL B: BiLSTM + MEAN POOLING", "=" * 50, sep="\n")

model_b_results = []
for lang in LANGUAGES:
    print(f"\n=== {LANGUAGE_NAMES[lang]} ===")

    # Per-language (text, label) frames, capped to the configured maxima.
    tr = maybe_cap(prep_lang_df(df_train, lang), N_TRAIN_MAX)
    va = maybe_cap(prep_lang_df(df_val, lang), N_VAL_MAX)
    print(f"Train: {len(tr):,}, Val: {len(va):,}")

    metrics, preds, probs = train_eval_model_b(tr, va, lang)
    model_b_results.append({"lang": lang, "model": "B_BiLSTM", **metrics})
    print(f"→ F1: {metrics['f1']:.3f}, Acc: {metrics['acc']:.3f}")

print("\nModel B Summary:")
print(pd.DataFrame(model_b_results))

MODEL B: BiLSTM + ATTENTION

=== Arabic ===
Train: 2,558, Val: 415

[Model B | Arabic] BiLSTM + attention
Epoch 0: loss=0.320
Epoch 2: loss=0.078
Epoch 4: loss=0.015
Epoch 6: loss=0.005
              precision    recall  f1-score   support

           0      1.000     0.115     0.207        52
           1      0.888     1.000     0.940       363

    accuracy                          0.889       415
   macro avg      0.944     0.558     0.574       415
weighted avg      0.902     0.889     0.849       415

→ F1: 0.940, Acc: 0.889

=== Korean ===
Train: 2,422, Val: 356

[Model B | Korean] BiLSTM + attention
Epoch 0: loss=0.160
Epoch 2: loss=0.052
Epoch 4: loss=0.006
Epoch 6: loss=0.002


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.000     0.000     0.000        19
           1      0.947     1.000     0.973       337

    accuracy                          0.947       356
   macro avg      0.473     0.500     0.486       356
weighted avg      0.896     0.947     0.921       356

→ F1: 0.973, Acc: 0.947

=== Telugu ===
Train: 1,355, Val: 384

[Model B | Telugu] BiLSTM + attention
Epoch 0: loss=0.215
Epoch 2: loss=0.105
Epoch 4: loss=0.006
Epoch 6: loss=0.002
              precision    recall  f1-score   support

           0      0.417     0.054     0.095        93
           1      0.763     0.976     0.857       291

    accuracy                          0.753       384
   macro avg      0.590     0.515     0.476       384
weighted avg      0.679     0.753     0.672       384

→ F1: 0.857, Acc: 0.753

Model B Summary:
  lang     model       acc      prec       rec        f1
0   ar  B_BiLSTM  0.889157  0.887531  1.000000  0.940415
1   ko  

---

## Model C: XLM-RoBERTa Fine-tuning


In [None]:
# Model C: XLM-RoBERTa Fine-tuning

def train_eval_model_c(train_df, val_df, lang):
    """Fine-tune ``xlm-roberta-base`` on one language and score it on validation.

    Args:
        train_df: frame with ``text`` and ``label`` columns used for training.
        val_df: frame with ``text`` and ``label`` columns used for the
            per-epoch evaluation and the final predictions.
        lang: language code; used for the printed header and for the
            ``./temp_{lang}`` output directory.

    Returns:
        ``(metrics, preds, probs)``: ``metrics`` has keys ``acc``, ``prec``,
        ``rec`` and ``f1`` computed on ``val_df`` after the last epoch;
        ``preds`` are the argmax labels and ``probs`` the softmax probability
        of class 1, both as NumPy arrays in validation order.
    """
    print(f"\n[Model C | {LANGUAGE_NAMES[lang]}] XLM-RoBERTa fine-tuning")

    model_name = "xlm-roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_fn(examples):
        # No padding here: DataCollatorWithPadding pads each batch dynamically.
        return tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LEN_TRANSFORMER)

    # Convert to HF datasets
    import datasets as ds
    tr_ds = ds.Dataset.from_pandas(train_df.reset_index(drop=True))
    va_ds = ds.Dataset.from_pandas(val_df.reset_index(drop=True))

    tr_ds = tr_ds.map(tokenize_fn, batched=True)
    va_ds = va_ds.map(tokenize_fn, batched=True)

    # Rename the target column to "labels" for the Trainer.
    tr_ds = tr_ds.rename_column("label", "labels")
    va_ds = va_ds.rename_column("label", "labels")

    # Model and trainer
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        acc = (predictions == labels).mean()
        prec, rec, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary", zero_division=0)
        return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

    training_args = TrainingArguments(
        output_dir=f"./temp_{lang}",
        learning_rate=LR_XLMR,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        num_train_epochs=EPOCHS_XLMR,
        eval_strategy="epoch",
        save_strategy="no",
        logging_steps=100,
        report_to="none",
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tr_ds,
        eval_dataset=va_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics
    )

    trainer.train()

    # One inference pass over validation yields both the logits and the
    # metrics. metric_key_prefix="eval" keeps the familiar "eval_*" key names,
    # so a separate trainer.evaluate() call (a second full pass) is not needed.
    predictions = trainer.predict(va_ds, metric_key_prefix="eval")
    results = predictions.metrics

    preds = np.argmax(predictions.predictions, axis=1)
    probs = torch.softmax(torch.tensor(predictions.predictions), dim=1)[:, 1].numpy()

    return {
        "acc": results["eval_accuracy"],
        "prec": results["eval_precision"],
        "rec": results["eval_recall"],
        "f1": results["eval_f1"]
    }, preds, probs

# Run Model C once per language and collect one metrics row per run.
print("=" * 50)
print("MODEL C: XLM-RoBERTa FINE-TUNING")
print("=" * 50)

model_c_results = []
for lang in LANGUAGES:
    print(f"\n=== {LANGUAGE_NAMES[lang]} ===")

    # Prepare data
    tr = prep_lang_df(df_train, lang)
    va = prep_lang_df(df_val, lang)
    tr = maybe_cap(tr, N_TRAIN_MAX)
    va = maybe_cap(va, N_VAL_MAX)

    print(f"Train: {len(tr):,}, Val: {len(va):,}")

    try:
        # Train and evaluate
        metrics, preds, probs = train_eval_model_c(tr, va, lang)
        model_c_results.append({"lang": lang, "model": "C_XLM-R", **metrics})

        print(f"→ F1: {metrics['f1']:.3f}, Acc: {metrics['acc']:.3f}")

    except Exception as e:
        # Best effort: keep going with the remaining languages, but record the
        # failed run as NaN rather than 0.0 so it cannot be mistaken for a
        # real (very bad) score in the summary or in later aggregates.
        print(f"Error: {type(e).__name__}: {e}")
        missing = float("nan")
        model_c_results.append(
            {"lang": lang, "model": "C_XLM-R", "acc": missing, "prec": missing, "rec": missing, "f1": missing}
        )

print(f"\nModel C Summary:\n{pd.DataFrame(model_c_results)}")

MODEL C: XLM-RoBERTa FINE-TUNING

=== Arabic ===
Train: 2,558, Val: 415

[Model C | Arabic] XLM-RoBERTa fine-tuning


Map:   0%|          | 0/2558 [00:00<?, ? examples/s]

Map:   0%|          | 0/415 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.339,0.20478,0.949398,0.954787,0.988981,0.971583
2,0.1307,0.085843,0.978313,0.994413,0.980716,0.987517
3,0.1015,0.082244,0.980723,0.994429,0.983471,0.98892


→ F1: 0.989, Acc: 0.981

=== Korean ===
Train: 2,422, Val: 356

[Model C | Korean] XLM-RoBERTa fine-tuning


Map:   0%|          | 0/2422 [00:00<?, ? examples/s]

Map:   0%|          | 0/356 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1329,0.315325,0.946629,0.946629,1.0,0.972583
2,0.1169,0.27961,0.946629,0.946629,1.0,0.972583
3,0.0871,0.268853,0.946629,0.946629,1.0,0.972583


→ F1: 0.973, Acc: 0.947

=== Telugu ===
Train: 1,355, Val: 384

[Model C | Telugu] XLM-RoBERTa fine-tuning


Map:   0%|          | 0/1355 [00:00<?, ? examples/s]

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.235,1.465331,0.757812,0.757812,1.0,0.862222
2,0.1812,1.263596,0.757812,0.757812,1.0,0.862222
3,0.1627,1.228007,0.757812,0.757812,1.0,0.862222


→ F1: 0.862, Acc: 0.758

Model C Summary:
  lang    model       acc      prec       rec        f1
0   ar  C_XLM-R  0.980723  0.994429  0.983471  0.988920
1   ko  C_XLM-R  0.946629  0.946629  1.000000  0.972583
2   te  C_XLM-R  0.757812  0.757812  1.000000  0.862222
