# Week 36: Multilingual Statistical Analysis & Rule-Based Classification


## Setup & Dependencies

In [3]:
# Work out whether this notebook is executing on Google Colab or locally.
import os
import sys

# The flag is True when the `google.colab` module is already loaded in this interpreter.
IN_COLAB = 'google.colab' in sys.modules

if not IN_COLAB:
    print("Running in local environment")
else:
    print("Running in Google Colab")
    # Mount Google Drive under /content/drive (the dataset paths used later live there).
    from google.colab import drive
    drive.mount('/content/drive')

Running in Google Colab
Mounted at /content/drive


## Package Installation

In [4]:
# Core packages
!pip install -q transformers torch nltk pyarrow fastparquet
!pip install -q matplotlib seaborn pandas scikit-learn
!pip install -q datasets accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m6.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m29.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25h

## Library Imports

In [None]:
# Core Python libraries
import os
import re
import sys
import unicodedata
from collections import Counter, defaultdict
from pathlib import Path
from typing import List, Tuple, Dict, Any

# Data manipulation and analysis
import numpy as np
import pandas as pd

# Machine Learning and NLP
import nltk
import torch
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from transformers import pipeline

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Reproducibility: seed NumPy and PyTorch with the same fixed value
np.random.seed(42)
torch.manual_seed(42)

# NLTK resources required by the cells below
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# English stopword set (used when filtering tokens)
stop_words = set(stopwords.words('english'))

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"PyTorch device: {device}")

print("Libraries imported successfully")

PyTorch device: cuda
Libraries imported successfully


## Dataset Configuration & Loading

In [None]:
# Target languages: display names and the list of codes used for filtering
LANGUAGE_NAMES = {"ar": "Arabic", "ko": "Korean", "te": "Telugu"}
LANGUAGES = ["ar", "ko", "te"]

# Dataset folder: Google Drive when on Colab, a relative folder otherwise
BASE_DIR = (
    Path("/content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc")
    if IN_COLAB
    else Path("./tydi_xor_rc")
)

TRAIN_PATH = BASE_DIR / "train.parquet"
VAL_PATH = BASE_DIR / "validation.parquet"

print(f"Dataset directory: {BASE_DIR}")

# Both parquet files must exist; otherwise the frames are set to None
if not (TRAIN_PATH.exists() and VAL_PATH.exists()):
    print("⚠ Dataset files not found. Please ensure the data is downloaded.")
    df_train = None
    df_val = None
else:
    df_train = pd.read_parquet(TRAIN_PATH)
    df_val = pd.read_parquet(VAL_PATH)

    # Keep only the rows written in one of the target languages
    df_train = df_train.loc[df_train["lang"].isin(LANGUAGES)].copy()
    df_val = df_val.loc[df_val["lang"].isin(LANGUAGES)].copy()

    print(f"Training examples: {len(df_train):,}")
    print(f"Validation examples: {len(df_val):,}")
    print("Dataset loaded successfully")

Dataset directory: /content/drive/MyDrive/Colab_Notebooks/NLP/tydi_xor_rc
Training examples: 6,335
Validation examples: 1,155
Dataset loaded successfully


## Translation Model Setup

In [None]:
# Build the NLLB translation pipeline used by the multilingual analyses
TARGET_LANG = 'eng_Latn'

# Device index handed to the pipeline: 0 when CUDA is available, -1 otherwise
if torch.cuda.is_available():
    device_id = 0
else:
    device_id = -1

print("Loading NLLB translation model...")
translator = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    device=device_id,
    # float16 on device 0, float32 in every other case
    torch_dtype=torch.float32 if device_id != 0 else torch.float16
)

print("✓ Translation model loaded successfully")
print("✓ Setup complete - Ready for Week 36 tasks")

---

## Tasks

### Task (a): Dataset Statistics

Summarize data statistics (size, word count, etc.) for training and validation data in Arabic, Korean, and Telugu.

In [None]:
def compute_language_statistics(df_train_lang, df_val_lang, language_name):
    """Summarise the training and validation splits of one language.

    Args:
        df_train_lang: Training rows of the language. The columns
            'answerable', 'question' and 'context' are read.
        df_val_lang: Validation rows of the language (same columns).
        language_name: Label stored under the 'language' key of the result.

    Returns:
        dict holding, for each split, the number of rows, the number of rows
        whose 'answerable' value equals True and that share in percent
        (0 for an empty split), plus the mean character length of the
        questions and of the contexts.
    """

    def _answerability(frame):
        # Row count, count of answerable rows, and their percentage.
        total = len(frame)
        answerable = int((frame['answerable'] == True).sum())
        percentage = (answerable / total * 100) if total > 0 else 0
        return total, answerable, percentage

    def _mean_chars(frame, column):
        # Mean length, in characters, of the strings in `column`.
        return frame[column].str.len().mean()

    train_total, train_answerable, train_ans_pct = _answerability(df_train_lang)
    val_total, val_answerable, val_ans_pct = _answerability(df_val_lang)

    train_q_mean = _mean_chars(df_train_lang, 'question')
    train_c_mean = _mean_chars(df_train_lang, 'context')
    val_q_mean = _mean_chars(df_val_lang, 'question')
    val_c_mean = _mean_chars(df_val_lang, 'context')

    return {
        'language': language_name,
        'train_total': train_total,
        'train_answerable': train_answerable,
        'train_ans_pct': train_ans_pct,
        'val_total': val_total,
        'val_answerable': val_answerable,
        'val_ans_pct': val_ans_pct,
        'train_q_mean': train_q_mean,
        'train_c_mean': train_c_mean,
        'val_q_mean': val_q_mean,
        'val_c_mean': val_c_mean
    }

# One statistics record per target language, in LANGUAGES order
all_statistics = [
    compute_language_statistics(
        df_train[df_train['lang'] == lang_code],
        df_val[df_val['lang'] == lang_code],
        LANGUAGE_NAMES[lang_code],
    )
    for lang_code in LANGUAGES
]

# Report header
print("DATASET STATISTICS")
print("=" * 60)

# Per-language report (question/context lengths come from the training split)
for stats in all_statistics:
    print(
        f"\n{stats['language']}:",
        f"  Training: {stats['train_total']:,} ({stats['train_answerable']:,} answerable, {stats['train_ans_pct']:.1f}%)",
        f"  Validation: {stats['val_total']:,} ({stats['val_answerable']:,} answerable, {stats['val_ans_pct']:.1f}%)",
        f"  Avg question length: {stats['train_q_mean']:.0f} chars",
        f"  Avg context length: {stats['train_c_mean']:.0f} chars",
        sep="\n",
    )

# Totals across languages; the answerability rate is the unweighted mean of
# the per-language training percentages
total_train = sum(record['train_total'] for record in all_statistics)
total_val = sum(record['val_total'] for record in all_statistics)
avg_ans_rate = sum(record['train_ans_pct'] for record in all_statistics) / len(all_statistics)

print(f"\nSUMMARY:")
print(f"Total examples: {total_train:,} train, {total_val:,} validation")
print(f"Average answerability rate: {avg_ans_rate:.1f}%")

DATASET STATISTICS

Arabic:
  Training: 2,558 (2,303 answerable, 90.0%)
  Validation: 415 (363 answerable, 87.5%)
  Avg question length: 35 chars
  Avg context length: 644 chars

Korean:
  Training: 2,422 (2,359 answerable, 97.4%)
  Validation: 356 (337 answerable, 94.7%)
  Avg question length: 21 chars
  Avg context length: 600 chars

Telugu:
  Training: 1,355 (1,310 answerable, 96.7%)
  Validation: 384 (291 answerable, 75.8%)
  Avg question length: 42 chars
  Avg context length: 540 chars

SUMMARY:
Total examples: 6,335 train, 1,155 validation
Average answerability rate: 94.7%


### Task (b): Multilingual Word Frequency Analysis


In [9]:
# =============================================================================
# TASK (B): SETUP FOR MULTILINGUAL WORD FREQUENCY ANALYSIS
# =============================================================================

# Install language-specific tokenizers
import subprocess
import sys


def _pip_install(package):
    """Quietly pip-install `package` using the interpreter that runs this notebook."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package, "-q"])


# Every tokenizer is optional. If installing, importing or constructing one
# fails, the tokenizer is set to None and the reason is printed.
# `except Exception` is used instead of a bare `except:` so that
# KeyboardInterrupt and SystemExit still propagate.
try:
    # Korean tokenizer
    _pip_install("konlpy")
    from konlpy.tag import Okt
    korean_tokenizer = Okt()
    print("Korean tokenizer (KoNLPy) loaded successfully")
except Exception as exc:
    korean_tokenizer = None
    print("Korean tokenizer not available - will use basic tokenization")
    print(f"  Reason: {exc!r}")

try:
    # Arabic tokenizer
    _pip_install("camel-tools")
    from camel_tools.tokenizers.word import simple_word_tokenize
    arabic_tokenizer = simple_word_tokenize
    print("Arabic tokenizer (CAMeL Tools) loaded successfully")
except Exception as exc:
    arabic_tokenizer = None
    print("Arabic tokenizer not available - will use basic tokenization")
    print(f"  Reason: {exc!r}")

try:
    # Telugu tokenizer
    _pip_install("indic-nlp-library")
    from indicnlp.tokenize import indic_tokenize

    def telugu_tokenizer(text):
        """Tokenize Telugu text with the Indic NLP trivial tokenizer."""
        return indic_tokenize.trivial_tokenize(text, lang='te')

    print("Telugu tokenizer (Indic NLP) loaded successfully")
except Exception as exc:
    telugu_tokenizer = None
    print("Telugu tokenizer not available - will use basic tokenization")
    print(f"  Reason: {exc!r}")

# Language-specific tokenizer mapping
NATIVE_TOKENIZERS = {
    'ar': arabic_tokenizer,
    'ko': korean_tokenizer,
    'te': telugu_tokenizer
}

# Translation source language codes for NLLB
NLLB_LANG_CODES = {
    'ar': 'arb_Arab',
    'ko': 'kor_Hang',
    'te': 'tel_Telu'
}

print("Multilingual tokenization setup complete")

Korean tokenizer (KoNLPy) loaded successfully
Arabic tokenizer (CAMeL Tools) loaded successfully
Telugu tokenizer (Indic NLP) loaded successfully
Multilingual tokenization setup complete


#### Sub-version 1: Translate-then-Tokenize Approach


In [None]:
def analyze_frequencies_translate_first(questions, language_code, language_name, top_k=5,
                                        sample_size=1000, batch_size=50):
    """
    Approach 1: Translate all questions first, then tokenize and count in English.
    Returns both top words (all) and top words (no stopwords).

    Args:
        questions: List of question strings in the source language.
        language_code: Key looked up in NLLB_LANG_CODES; when the code is not
            in that mapping the questions are used untranslated.
        language_name: Display name used in progress output and in the result.
        top_k: Number of most frequent words to report.
        sample_size: Maximum number of questions analysed. The FIRST
            `sample_size` questions are taken (no random sampling).
        batch_size: Number of questions sent to the translator per call.

    Returns:
        None when `questions` is empty or no token survives filtering.
        Otherwise a dict with the keys 'approach', 'language_name',
        'total_questions', 'total_words_all', 'unique_words_all',
        'total_words_no_stop', 'unique_words_no_stop', 'top_words_all' and
        'top_words_no_stop'; the last two are lists of
        {'rank', 'word', 'count'} dicts.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    print(f"\nSub-version 1 - {language_name} ({language_code})")
    print("Method: Translate-then-Tokenize")

    if not questions:
        return None

    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Take the leading questions only (keeps the run time bounded)
    sampled_questions = questions[:max(sample_size, 0)]

    # Translate all questions to English
    print(f"  Translating {len(sampled_questions)} questions...")
    translated_questions = []

    try:
        # The source language does not change between batches: look it up once.
        # None means "no NLLB code known", in which case text is kept as-is.
        src_lang = NLLB_LANG_CODES.get(language_code)

        for i in range(0, len(sampled_questions), batch_size):
            batch = sampled_questions[i:i + batch_size]

            if i % 200 == 0:
                print(f"    Progress: {i}/{len(sampled_questions)}")

            if src_lang is None:
                batch_translated = batch
            else:
                translations = translator(
                    batch,
                    src_lang=src_lang,
                    tgt_lang=TARGET_LANG
                )
                batch_translated = [t['translation_text'].strip() for t in translations]

            translated_questions.extend(batch_translated)

    except Exception as e:
        # Best effort: on any translation failure, fall back to the
        # untranslated questions for the whole sample.
        print(f"    Translation error: {e}")
        translated_questions = sampled_questions

    # Tokenize translated text using NLTK English tokenization
    print("  Tokenizing translated questions...")
    all_words = []
    all_words_no_stop = []
    for translated_text in translated_questions:
        tokens = nltk.word_tokenize(translated_text.lower())
        # Keep alphanumeric tokens of at least three characters
        words_all = [token for token in tokens if token.isalnum() and len(token) > 2]
        all_words.extend(words_all)
        all_words_no_stop.extend(word for word in words_all if word not in stop_words)

    # Count word frequencies
    word_counts_all = Counter(all_words)
    word_counts_no_stop = Counter(all_words_no_stop)
    top_words_all = word_counts_all.most_common(top_k)
    top_words_no_stop = word_counts_no_stop.most_common(top_k)

    if not top_words_all:
        return None

    def _ranked(pairs):
        # Turn (word, count) pairs into 1-based ranked records.
        return [
            {'rank': rank, 'word': word, 'count': count}
            for rank, (word, count) in enumerate(pairs, start=1)
        ]

    return {
        'approach': 'Translate-then-Tokenize',
        'language_name': language_name,
        'total_questions': len(sampled_questions),
        'total_words_all': len(all_words),
        'unique_words_all': len(word_counts_all),
        'total_words_no_stop': len(all_words_no_stop),
        'unique_words_no_stop': len(word_counts_no_stop),
        'top_words_all': _ranked(top_words_all),
        'top_words_no_stop': _ranked(top_words_no_stop)
    }

# Run the translate-then-tokenize analysis on the training questions of every language
print("=" * 70)
print("SUB-VERSION 1: TRANSLATE-THEN-TOKENIZE APPROACH")
print("=" * 70)

subversion1_results = []

for lang_code in LANGUAGES:
    results = analyze_frequencies_translate_first(
        df_train.loc[df_train['lang'] == lang_code, 'question'].tolist(),
        lang_code,
        LANGUAGE_NAMES[lang_code],
    )
    # Languages for which the analysis produced nothing are skipped
    if not results:
        continue
    subversion1_results.append(results)

# Report: one multi-line block per language, with both top-word lists
for results in subversion1_results:
    print(
        f"\n{results['language_name']} - {results['approach']}:",
        f"  Questions analyzed: {results['total_questions']:,}",
        f"  Total words (all): {results['total_words_all']:,}, Unique: {results['unique_words_all']:,}",
        "  Top 5 English words (all):",
        *(
            f"    {word_info['rank']}. {word_info['word']} - {word_info['count']:,}"
            for word_info in results['top_words_all']
        ),
        f"  Total words (no stopwords): {results['total_words_no_stop']:,}, Unique: {results['unique_words_no_stop']:,}",
        "  Top 5 English words (no stopwords):",
        *(
            f"    {word_info['rank']}. {word_info['word']} - {word_info['count']:,}"
            for word_info in results['top_words_no_stop']
        ),
        sep="\n",
    )

SUB-VERSION 1: TRANSLATE-THEN-TOKENIZE APPROACH

Sub-version 1 - Arabic (ar)
Method: Translate-then-Tokenize
  Translating 1000 questions...
    Progress: 0/1000
    Progress: 200/1000
    Progress: 400/1000


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


    Progress: 600/1000
    Progress: 800/1000
  Tokenizing translated questions...

Sub-version 1 - Korean (ko)
Method: Translate-then-Tokenize
  Translating 1000 questions...
    Progress: 0/1000
    Progress: 200/1000
    Progress: 400/1000
    Progress: 600/1000
    Progress: 800/1000
  Tokenizing translated questions...

Sub-version 1 - Telugu (te)
Method: Translate-then-Tokenize
  Translating 1000 questions...
    Progress: 0/1000
    Progress: 200/1000
    Progress: 400/1000
    Progress: 600/1000
    Progress: 800/1000
  Tokenizing translated questions...

Arabic - Translate-then-Tokenize:
  Questions analyzed: 1,000
  Total words (all): 7,055, Unique: 1,920
  Top 5 English words (all):
    1. the - 989
    2. what - 256
    3. who - 214
    4. when - 209
    5. was - 209
  Total words (no stopwords): 4,266, Unique: 1,860
  Top 5 English words (no stopwords):
    1. first - 138
    2. many - 70
    3. president - 52
    4. world - 49
    5. united - 40

Korean - Translate-then-T

#### Sub-version 2: Tokenize-Native-then-Translate-Top Approach


In [None]:
# Zero-width non-joiner / joiner: formatting characters that may occur inside words
_WORD_JOINERS = ('\u200c', '\u200d')


def _is_word_token(token):
    """Return True when `token` consists only of letters, digits and combining marks.

    `str.isalnum()` is False for any string that contains a combining mark
    (Unicode categories Mn/Mc/Me). Telugu vowel signs and Arabic diacritics are
    combining marks, so `isalnum()` rejects ordinary words in those scripts.
    This predicate accepts marks (and zero-width joiners) as long as the token
    also contains at least one letter or number.
    """
    has_base_char = False
    for char in token:
        if char in _WORD_JOINERS:
            continue
        major_category = unicodedata.category(char)[0]
        if major_category in ('L', 'N'):
            has_base_char = True
        elif major_category != 'M':
            # Punctuation, symbols, whitespace, ... disqualify the token
            return False
    return has_base_char


def _filter_tokens(tokens, min_length):
    """Keep the word-like tokens that have at least `min_length` characters."""
    return [
        token for token in tokens
        if len(token) >= min_length and _is_word_token(token)
    ]


def tokenize_native_language(text, language_code):
    """Tokenize text using language-specific tokenizers.

    Args:
        text: The string to tokenize.
        language_code: 'ko', 'ar' or 'te' select the matching native tokenizer
            when it was loaded; any other code (or a tokenizer that is None)
            uses NLTK word tokenization.

    Returns:
        List of word tokens. Tokens must contain only letters, digits and
        combining marks; Arabic tokens need at least 3 characters, all other
        tokens at least 2.

    Any exception raised while tokenizing falls back to whitespace splitting
    with the same word filter (minimum 2 characters).
    """
    try:
        if language_code == 'ko' and korean_tokenizer:
            # Korean: extract morphemes
            return _filter_tokens(korean_tokenizer.morphs(text), 2)

        elif language_code == 'ar' and arabic_tokenizer:
            # Arabic: word tokenization
            return _filter_tokens(arabic_tokenizer(text), 3)

        elif language_code == 'te' and telugu_tokenizer:
            # Telugu: basic tokenization
            return _filter_tokens(telugu_tokenizer(text), 2)

        else:
            # Fallback: basic NLTK tokenization
            return _filter_tokens(nltk.word_tokenize(text), 2)

    except Exception:
        # Ultimate fallback: simple split
        return _filter_tokens(text.split(), 2)

def analyze_frequencies_native_first(questions, language_code, language_name, top_k=5,
                                     sample_size=1000):
    """
    Approach 2: Tokenize in native language first, then translate only top words.

    Args:
        questions: List of question strings in the source language.
        language_code: Passed to tokenize_native_language and looked up in
            NLLB_LANG_CODES; when the code is not in that mapping the top
            words are left untranslated.
        language_name: Display name used in progress output and in the result.
        top_k: Number of most frequent words to report and translate.
        sample_size: Maximum number of questions analysed. The FIRST
            `sample_size` questions are taken (no random sampling).

    Returns:
        None when `questions` is empty or tokenization yields no words.
        Otherwise a dict with the keys 'approach', 'language_name',
        'total_questions', 'total_words', 'unique_words' and 'top_words';
        the latter is a list of {'rank', 'original', 'translated', 'count'}
        dicts.
    """
    print(f"\nSub-version 2 - {language_name} ({language_code})")
    print("Method: Tokenize-Native-then-Translate-Top")

    if not questions:
        return None

    # Take the leading questions only (keeps the run time bounded)
    sampled_questions = questions[:max(sample_size, 0)]

    # Tokenize in native language
    print(f"  Tokenizing {len(sampled_questions)} questions in {language_name}...")
    all_words = []

    for i, question in enumerate(sampled_questions):
        if i % 200 == 0:
            print(f"    Progress: {i}/{len(sampled_questions)}")

        all_words.extend(tokenize_native_language(question, language_code))

    # Count word frequencies in native language
    word_counts = Counter(all_words)
    top_words_native = word_counts.most_common(top_k)

    if not top_words_native:
        return None

    # Translate only the top words
    print(f"  Translating top {top_k} words...")
    words_to_translate = [word for word, _ in top_words_native]

    try:
        if language_code in NLLB_LANG_CODES:
            translations = translator(
                words_to_translate,
                src_lang=NLLB_LANG_CODES[language_code],
                tgt_lang=TARGET_LANG
            )
            translated_words = [t['translation_text'].strip() for t in translations]
        else:
            translated_words = words_to_translate

    except Exception as e:
        # Best effort: report the failure and show the native words instead
        print(f"    Translation error: {e}")
        translated_words = words_to_translate

    return {
        'approach': 'Tokenize-Native-then-Translate-Top',
        'language_name': language_name,
        'total_questions': len(sampled_questions),
        'total_words': len(all_words),
        'unique_words': len(word_counts),
        # Pair each (word, count) with its translation and a 1-based rank
        'top_words': [
            {
                'rank': rank,
                'original': original,
                'translated': translated,
                'count': count
            }
            for rank, ((original, count), translated)
            in enumerate(zip(top_words_native, translated_words), start=1)
        ]
    }

# Run the tokenize-native-then-translate-top analysis on the training questions of every language
print("\n" + "=" * 70)
print("SUB-VERSION 2: TOKENIZE-NATIVE-THEN-TRANSLATE-TOP APPROACH")
print("=" * 70)

subversion2_results = []

for lang_code in LANGUAGES:
    results = analyze_frequencies_native_first(
        df_train.loc[df_train['lang'] == lang_code, 'question'].tolist(),
        lang_code,
        LANGUAGE_NAMES[lang_code],
    )
    # Languages for which the analysis produced nothing are skipped
    if not results:
        continue
    subversion2_results.append(results)

# Report: one multi-line block per language with the top words and their translations
for results in subversion2_results:
    print(
        f"\n{results['language_name']} - {results['approach']}:",
        f"  Questions analyzed: {results['total_questions']:,}",
        f"  Total words: {results['total_words']:,}, Unique: {results['unique_words']:,}",
        "  Top 5 words (Original → English):",
        *(
            f"    {word_info['rank']}. {word_info['original']} → {word_info['translated']} - {word_info['count']:,}"
            for word_info in results['top_words']
        ),
        sep="\n",
    )


SUB-VERSION 2: TOKENIZE-NATIVE-THEN-TRANSLATE-TOP APPROACH

Sub-version 2 - Arabic (ar)
Method: Tokenize-Native-then-Translate-Top
  Tokenizing 1000 questions in Arabic...
    Progress: 0/1000
    Progress: 200/1000
    Progress: 400/1000
    Progress: 600/1000
    Progress: 800/1000
  Translating top 5 words...

Sub-version 2 - Korean (ko)
Method: Tokenize-Native-then-Translate-Top
  Tokenizing 1000 questions in Korean...
    Progress: 0/1000
    Progress: 200/1000
    Progress: 400/1000
    Progress: 600/1000
    Progress: 800/1000
  Translating top 5 words...

Sub-version 2 - Telugu (te)
Method: Tokenize-Native-then-Translate-Top
  Tokenizing 1000 questions in Telugu...
    Progress: 0/1000
    Progress: 200/1000
    Progress: 400/1000
    Progress: 600/1000
    Progress: 800/1000
  Translating top 5 words...

Arabic - Tokenize-Native-then-Translate-Top:
  Questions analyzed: 1,000
  Total words: 5,436, Unique: 2,705
  Top 5 words (Original → English):
    1. متى → When ? - 189
   

### Task (c): Rule-Based Answerability Classifier


In [None]:
# =============================================================================
# TASK (C): RULE-BASED ANSWERABILITY CLASSIFIER
# =============================================================================

class CrossLingualAnswerabilityClassifier:
    """Rule-based classifier using TF-IDF weighted keyword overlap.

    A question is translated to English, keywords are extracted from the
    translation, and the summed TF-IDF weight of the keywords that also occur
    in the context is compared against a threshold.
    """

    def __init__(self, threshold=0.1, min_keyword_length=3):
        """
        Args:
            threshold: A pair is predicted answerable when its overlap score
                is strictly greater than this value.
            min_keyword_length: Keywords must be strictly LONGER than this
                many characters (the default 3 keeps words of 4+ characters).
        """
        self.threshold = threshold
        self.min_keyword_length = min_keyword_length
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)

    def extract_keywords(self, question_translated):
        """Extract meaningful keywords from translated question.

        Returns the lower-cased alphanumeric tokens that are longer than
        `min_keyword_length` characters and are not English stopwords.
        """
        words = nltk.word_tokenize(question_translated.lower())
        return [
            word for word in words
            if (word.isalnum() and
                len(word) > self.min_keyword_length and
                word not in stop_words)
        ]

    def compute_overlap_score(self, keywords, context):
        """Compute TF-IDF weighted overlap score.

        The vectorizer is re-fitted on just two documents (the joined keywords
        and the lower-cased context). The score is the sum of the question-row
        TF-IDF weights of the keywords that also appear among the context
        tokens. If the TF-IDF computation fails, the fraction of keywords
        found in the context is returned instead.

        Returns:
            float score; 0.0 when `keywords` is empty.
        """
        if not keywords:
            return 0.0

        context_lower = context.lower()
        # Tokenize the context once; both the TF-IDF path and the fallback use it
        context_words = set(nltk.word_tokenize(context_lower))

        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform(
                [' '.join(keywords), context_lower]
            )
            vocab = self.tfidf_vectorizer.vocabulary_

            # Row 0 of the matrix is the question document
            overlap_score = sum(
                tfidf_matrix[0, vocab[keyword]]
                for keyword in keywords
                if keyword in vocab and keyword in context_words
            )
            return float(overlap_score)

        except Exception:
            # Fallback: simple overlap
            matched = sum(1 for keyword in keywords if keyword in context_words)
            return matched / len(keywords)

    def predict_single(self, question, context, language_code):
        """Predict answerability for a single question-context pair.

        Returns:
            (prediction, score, translated_question, keywords). On any
            exception the result is (False, 0.0, question, []).
        """
        try:
            # Translate question
            if language_code == 'en':
                translated = question
            else:
                src = NLLB_LANG_CODES.get(language_code, language_code)
                translation = translator(question, src_lang=src, tgt_lang=TARGET_LANG)
                translated = translation[0]['translation_text'] if isinstance(translation, list) else translation['translation_text']

            # Extract keywords and compute score
            keywords = self.extract_keywords(translated)
            score = self.compute_overlap_score(keywords, context)
            # Plain bool so every prediction has the same type
            prediction = bool(score > self.threshold)

            return prediction, score, translated, keywords

        except Exception:
            # NOTE(review): deliberate best-effort. Failures are counted as
            # "not answerable" without being reported, which can hide a
            # systematic problem (e.g. a missing NLTK resource).
            return False, 0.0, question, []

    def evaluate_language(self, df_lang, language_code, language_name):
        """Evaluate classifier on language-specific dataset.

        Args:
            df_lang: Rows to evaluate; the columns 'question', 'context' and
                'answerable' are read.
            language_code: Language code passed to predict_single.
            language_name: Display name used in output and in the result.

        Returns:
            dict with 'language_name', 'accuracy', 'precision', 'recall',
            'f1_score' and the four confusion-matrix counts. All metrics are
            zero when `df_lang` has no rows.
        """
        print(f"\nEvaluating {language_name}")

        true_labels = df_lang['answerable'].tolist()
        total = len(df_lang)
        predictions = []

        for idx, (question, context) in enumerate(zip(df_lang['question'], df_lang['context'])):
            if idx % 200 == 0:
                print(f"  Progress: {idx}/{total}")

            pred, _, _, _ = self.predict_single(question, context, language_code)
            predictions.append(pred)

        if not true_labels:
            # Nothing to score: avoid dividing by zero and calling the metric
            # functions with empty input
            return {
                'language_name': language_name,
                'accuracy': 0.0,
                'precision': 0.0,
                'recall': 0.0,
                'f1_score': 0.0,
                'true_positives': 0,
                'true_negatives': 0,
                'false_positives': 0,
                'false_negatives': 0
            }

        # Compute metrics
        accuracy = sum(p == t for p, t in zip(predictions, true_labels)) / len(true_labels)
        precision = precision_score(true_labels, predictions, zero_division=0)
        recall = recall_score(true_labels, predictions, zero_division=0)
        f1 = f1_score(true_labels, predictions, zero_division=0)

        # Fixing the label order guarantees a 2x2 matrix even when only one
        # class occurs, so the four-way unpacking below cannot fail
        tn, fp, fn, tp = confusion_matrix(
            true_labels, predictions, labels=[False, True]
        ).ravel()

        return {
            'language_name': language_name,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'true_positives': int(tp),
            'true_negatives': int(tn),
            'false_positives': int(fp),
            'false_negatives': int(fn)
        }

# Build the classifier and score it on the validation split of every language
print("RULE-BASED ANSWERABILITY CLASSIFIER")
print("=" * 60)

classifier = CrossLingualAnswerabilityClassifier(threshold=0.1)
evaluation_results = [
    classifier.evaluate_language(
        df_val[df_val['lang'] == lang_code].copy(),
        lang_code,
        LANGUAGE_NAMES[lang_code],
    )
    for lang_code in LANGUAGES
]

# Per-language metrics and confusion-matrix counts
print("\nCLASSIFIER RESULTS:")
for results in evaluation_results:
    print(
        f"\n{results['language_name']}:",
        f"  Accuracy: {results['accuracy']:.3f}",
        f"  Precision: {results['precision']:.3f}",
        f"  Recall: {results['recall']:.3f}",
        f"  F1-Score: {results['f1_score']:.3f}",
        f"  TP: {results['true_positives']}, TN: {results['true_negatives']}",
        f"  FP: {results['false_positives']}, FN: {results['false_negatives']}",
        sep="\n",
    )

# Unweighted averages over the evaluated languages
avg_accuracy = sum(entry['accuracy'] for entry in evaluation_results) / len(evaluation_results)
avg_f1 = sum(entry['f1_score'] for entry in evaluation_results) / len(evaluation_results)

print(f"\nOVERALL PERFORMANCE:")
print(f"Average accuracy: {avg_accuracy:.3f}")
print(f"Average F1-score: {avg_f1:.3f}")

RULE-BASED ANSWERABILITY CLASSIFIER

Evaluating Arabic
  Progress: 0/415
  Progress: 200/415
  Progress: 400/415

Evaluating Korean
  Progress: 0/356
  Progress: 200/356

Evaluating Telugu
  Progress: 0/384
  Progress: 200/384

CLASSIFIER RESULTS:

Arabic:
  Accuracy: 0.764
  Precision: 0.875
  Recall: 0.851
  F1-Score: 0.863
  TP: 309, TN: 8
  FP: 44, FN: 54

Korean:
  Accuracy: 0.806
  Precision: 0.947
  Recall: 0.843
  F1-Score: 0.892
  TP: 284, TN: 3
  FP: 16, FN: 53

Telugu:
  Accuracy: 0.792
  Precision: 0.813
  Recall: 0.942
  F1-Score: 0.873
  TP: 274, TN: 30
  FP: 63, FN: 17

OVERALL PERFORMANCE:
Average accuracy: 0.787
Average F1-score: 0.876
