In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:0

In [None]:
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack

# Fetch the dataset from the Hugging Face Hub and pull out the two splits
# used below ("train" for fitting, "validation" for scoring).
dataset = load_dataset("coastalcph/tydi_xor_rc")
train_set, validation_set = dataset["train"], dataset["validation"]

# Keep only the examples written in one language
def filter_by_language(dataset, lang):
    """Return the examples of ``dataset`` whose ``'lang'`` field equals ``lang``.

    Args:
        dataset: Any object exposing ``filter(predicate)``; the predicate is
            called with one example and must be able to read ``example['lang']``.
        lang: Language code to keep (compared with ``==``).

    Returns:
        Whatever ``dataset.filter`` returns for the matching examples.
    """
    def has_language(example):
        return example['lang'] == lang

    return dataset.filter(has_language)

# Languages under study, and one filtered view of each split per language.
# All training subsets are built first, then all validation subsets.
languages = ['fi', 'ja', 'ru']
train_sets = dict((code, filter_by_language(train_set, code)) for code in languages)
val_sets = dict((code, filter_by_language(validation_set, code)) for code in languages)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/9.85k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/6.87M [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15326 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3028 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15326 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15326 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15326 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3028 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3028 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3028 [00:00<?, ? examples/s]

In [None]:
# Two independent Bag-of-Words vectorizers: one for the context passage
# (English) and one for the question (language-specific). They live at module
# level and are re-fitted by every call to `prepare_data`.
# NOTE(review): both use CountVectorizer's default settings; the default
# word-level tokenization may segment Japanese questions poorly — confirm
# whether a character-level analyzer is wanted for 'ja'.
context_vectorizer, question_vectorizer = CountVectorizer(), CountVectorizer()

def prepare_data(train_dataset, val_dataset):
    """Build Bag-of-Words feature matrices and label lists for both splits.

    The module-level ``context_vectorizer`` and ``question_vectorizer`` are
    re-fitted on the training split on every call (so after the call they hold
    the vocabulary of the most recent ``train_dataset``); the validation split
    is only transformed with those fitted vectorizers.

    Args:
        train_dataset: Iterable of examples providing ``'context'``,
            ``'question'`` and ``'answerable'`` entries.
        val_dataset: Iterable of examples with the same three entries.

    Returns:
        ``((X_train, y_train), (X_val, y_val))`` where each ``X`` is the
        horizontal stack of the context and question count matrices and each
        ``y`` is the list of ``'answerable'`` values.
    """
    fields = ('context', 'question', 'answerable')

    def column(rows, key):
        # One pass over the examples, collecting a single field.
        return [row[key] for row in rows]

    # Training split: learn both vocabularies here and vectorize.
    train_contexts, train_questions, y_train = (
        column(train_dataset, key) for key in fields
    )
    X_train = hstack([
        context_vectorizer.fit_transform(train_contexts),
        question_vectorizer.fit_transform(train_questions),
    ])

    # Validation split: reuse the vocabularies learned above (transform only).
    val_contexts, val_questions, y_val = (
        column(val_dataset, key) for key in fields
    )
    X_val = hstack([
        context_vectorizer.transform(val_contexts),
        question_vectorizer.transform(val_questions),
    ])

    return (X_train, y_train), (X_val, y_val)


In [None]:
# Training and evaluation function
def train_and_evaluate(train_data, val_data):
    """Train a logistic-regression classifier and score it on the validation data.

    Args:
        train_data: Training examples, passed unchanged to ``prepare_data``.
        val_data: Validation examples, passed unchanged to ``prepare_data``.

    Returns:
        dict: The ``classification_report`` dictionary (entries for
        ``'Unanswerable'`` and ``'Answerable'`` plus the aggregate rows), with
        two extra top-level keys: ``'TPR'`` (true-positive rate) and ``'FPR'``
        (false-positive rate), each ``0`` when its denominator is zero.
    """
    # Vectorize both splits (this refits the shared vectorizers on train_data).
    (X_train, y_train), (X_val, y_val) = prepare_data(train_data, val_data)

    # Fit the classifier and predict the validation labels.
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_val)

    # Pin the class order explicitly instead of letting scikit-learn infer it
    # from the data. Without this, a validation split in which only one class
    # occurs (in both y_val and y_pred) yields a 1x1 confusion matrix, so the
    # four-way unpacking below raises, and classification_report rejects the
    # two target_names. False/0 -> 'Unanswerable', True/1 -> 'Answerable',
    # which is the same sorted order scikit-learn infers when both are present.
    # NOTE(review): assumes 'answerable' is boolean or 0/1 — confirm.
    class_labels = [False, True]

    # Per-class precision / recall / F1 as a plain dictionary.
    report = classification_report(
        y_val,
        y_pred,
        labels=class_labels,
        target_names=['Unanswerable', 'Answerable'],
        output_dict=True,
    )

    # Always a 2x2 matrix thanks to the explicit labels: [[tn, fp], [fn, tp]].
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=class_labels).ravel()

    # Rates over the actual positives / actual negatives; guard empty classes.
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0

    # Add TPR and FPR to the report
    report['TPR'] = tpr
    report['FPR'] = fpr

    return report


In [None]:
# Run the train/evaluate cycle once per language, printing each report and
# collecting them in `results` keyed by language code.
results = {}
for lang in languages:
    print(f"Training and evaluating for {lang.capitalize()}:")
    train_data, val_data = train_sets[lang], val_sets[lang]

    report = results[lang] = train_and_evaluate(train_data, val_data)
    print(report)


Training and evaluating for Fi:
{'Unanswerable': {'precision': 0.7804878048780488, 'recall': 0.21621621621621623, 'f1-score': 0.3386243386243386, 'support': 148.0}, 'Answerable': {'precision': 0.7618069815195072, 'recall': 0.9763157894736842, 'f1-score': 0.8558246828143022, 'support': 380.0}, 'accuracy': 0.7632575757575758, 'macro avg': {'precision': 0.771147393198778, 'recall': 0.5962660028449502, 'f1-score': 0.5972245107193204, 'support': 528.0}, 'weighted avg': {'precision': 0.767043272915462, 'recall': 0.7632575757575758, 'f1-score': 0.7108518590640852, 'support': 528.0}, 'TPR': 0.9763157894736842, 'FPR': 0.7837837837837838}
Training and evaluating for Ja:
{'Unanswerable': {'precision': 0.6086956521739131, 'recall': 0.08284023668639054, 'f1-score': 0.14583333333333334, 'support': 169.0}, 'Answerable': {'precision': 0.6420323325635104, 'recall': 0.9686411149825784, 'f1-score': 0.7722222222222223, 'support': 287.0}, 'accuracy': 0.6403508771929824, 'macro avg': {'precision': 0.6253639