In [None]:
!pip install tqdm
from google.colab import files
import torch
from transformers import DistilBertTokenizer, DistilBertModel
import pandas as pd
from tqdm import tqdm  # Import tqdm for the progress bar




In [None]:

## Embedding and Encoding Functions using distilBERT


def get_mean_embedding(text, tokenizer, model):
    """Return the mean-pooled last-layer embedding of `text` as a NumPy array.

    The text is tokenized (with truncation and padding enabled), run through
    `model` with gradient tracking disabled, and the token vectors in
    `last_hidden_state` are averaged over the sequence dimension (dim 1).

    When the tokenizer output contains an `attention_mask`, positions whose
    mask is 0 (padding) are excluded from the average, so a batch of texts of
    different lengths is not biased by its padding tokens. For a single
    string, where every mask entry is 1, this equals the plain mean.

    Args:
        text: The input passed straight to `tokenizer` (a string in this
            notebook).
        tokenizer: Callable invoked as
            `tokenizer(text, return_tensors="pt", truncation=True, padding=True)`
            that returns a mapping of PyTorch tensors.
        model: Callable invoked as `model(**inputs)` whose result exposes a
            `last_hidden_state` tensor.

    Returns:
        The pooled embedding after `squeeze()`, converted with `.numpy()`.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state

    mask = inputs.get("attention_mask")
    if mask is None:
        # No mask available: fall back to the plain mean over all positions.
        embeddings = hidden.mean(dim=1)
    else:
        # Weight each token by its mask value so padded positions contribute 0.
        weights = mask.unsqueeze(-1).to(hidden.dtype)
        # clamp avoids a division by zero for a row whose mask is all zeros.
        embeddings = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)
    return embeddings.squeeze().numpy()


# Returns a features dataframe (X): for every row, X_i = concat(mean_embedding(question), mean_embedding(context))
def featurize(df, tokenizer=None, model=None):
  """Build one feature vector per row of `df` from its question and context.

  For every row, the mean embedding of `row['question']` and of
  `row['context']` are computed with `get_mean_embedding` and concatenated
  (question first) into a single vector.

  Args:
    df: DataFrame with `question` and `context` columns.
    tokenizer: Optional tokenizer to reuse. When None, the pre-trained
      'distilbert-base-multilingual-cased' tokenizer is loaded.
    model: Optional model to reuse. When None, the pre-trained
      'distilbert-base-multilingual-cased' model is loaded. Passing both
      avoids reloading the weights on every call.

  Returns:
    A new DataFrame with one row per input row, in iteration order, and one
    column per embedding dimension. It has a default index, not `df`'s index.
  """
  # Only load the pre-trained multilingual DistilBERT pieces the caller did not supply
  if tokenizer is None:
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
  if model is None:
    model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

  feature_vectors = []
  for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    question_embedding = get_mean_embedding(row['question'], tokenizer, model)
    context_embedding = get_mean_embedding(row['context'], tokenizer, model)
    # Concatenate the question and context embeddings
    feature_vector = torch.cat((torch.tensor(question_embedding), torch.tensor(context_embedding)), dim=0)
    feature_vectors.append(feature_vector.numpy())

  # Convert the list of feature vectors into a dataframe
  return pd.DataFrame(feature_vectors)




In [None]:
def get_train_df():
  """Read the 'train' parquet file of the coastalcph/tydi_xor_rc dataset into a DataFrame."""
  split_files = {'train': 'train.parquet', 'validation': 'validation.parquet'}
  dataset_root = "hf://datasets/coastalcph/tydi_xor_rc/"
  parquet_path = dataset_root + split_files["train"]
  return pd.read_parquet(parquet_path)



def get_validation_df():
  """Read the 'validation' parquet file of the coastalcph/tydi_xor_rc dataset into a DataFrame."""
  split_files = {'train': 'train.parquet', 'validation': 'validation.parquet'}
  dataset_root = "hf://datasets/coastalcph/tydi_xor_rc/"
  parquet_path = dataset_root + split_files["validation"]
  return pd.read_parquet(parquet_path)

In [None]:
# Load the training split and slice out the Russian and Japanese rows.
# Requires the cell that defines get_train_df() to have been run first.
df = get_train_df()
df_ru = df[df["lang"] == "ru"]

df_ja = df[df["lang"] == "ja"]
# NOTE(review): featurize() is called on the full `df`, not on `df_ru` / `df_ja`,
# and neither subset is used again in this cell — confirm this is intended.
feature_df = featurize(df)

NameError: name 'get_train_df' is not defined

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report


from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import confusion_matrix
import random

import tensorflow as tf
from tensorflow.keras import layers, models

def train_model_keras(X_df, y_df, epochs=10, batch_size=32, hidden_units=128):
    """Train a small feed-forward binary classifier on tabular features.

    The network is: input -> Dense(hidden_units, relu) -> Dense(1, sigmoid),
    compiled with the Adam optimizer, binary cross-entropy loss and an
    accuracy metric.

    Args:
        X_df: DataFrame of numeric features, one row per sample.
        y_df: Series of binary labels, paired with the rows of `X_df` by
            position (the index is ignored because `.values` is used).
        epochs: Number of training epochs (default 10).
        batch_size: Mini-batch size (default 32).
        hidden_units: Width of the single hidden layer (default 128).

    Returns:
        The fitted Keras model.

    Raises:
        ValueError: If `X_df` and `y_df` do not have the same number of rows.
    """
    # Features and labels are matched by position, so a length mismatch means
    # they come from different row sets; fail early with a clear message.
    if len(X_df) != len(y_df):
        raise ValueError(
            f"X_df has {len(X_df)} rows but y_df has {len(y_df)} labels"
        )

    # Process data: explicit float32 arrays (boolean labels become 0.0 / 1.0)
    features = X_df.values.astype("float32")
    labels = y_df.values.astype("float32")

    # Create a Keras Input Layer
    input_layer = layers.Input(shape=(features.shape[1],))

    # Hidden layer
    hidden_layer = layers.Dense(hidden_units, activation='relu')(input_layer)

    # Output layer (binary classification, use a single unit with sigmoid activation)
    output = layers.Dense(1, activation='sigmoid')(hidden_layer)

    # Define the model
    model = models.Model(inputs=input_layer, outputs=output)

    # Compile the model (using binary crossentropy loss for binary classification)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(features, labels, epochs=epochs, batch_size=batch_size)

    return model


def evaluate_model_performance(model, X_test, y_test, threshold = 0.5):
    """Print TPR, FPR, accuracy and a classification report for a binary model.

    Args:
        model: Callable invoked as `model(X_test)` whose result has a
            `.numpy()` method returning predicted probabilities (e.g. the
            single-sigmoid-unit model built by `train_model_keras`).
        X_test: Features passed unchanged to `model`.
        y_test: True binary labels (booleans or 0/1), one per sample.
        threshold: Probabilities strictly greater than this value are
            predicted as the positive class.

    Returns:
        None. All results are printed.
    """
    import numpy as np  # local import: numpy is not imported at file level

    # Forward pass to get the predicted probabilities
    test_outputs = model(X_test)
    predicted_probs = test_outputs.numpy()
    # Convert probabilities to binary predictions; flatten a column-vector
    # output so the metric functions receive a 1-D label array.
    predicted = (predicted_probs > threshold).astype(int).ravel()

    # Raw counts. labels=[0, 1] keeps the matrix 2x2 even when one class is
    # absent from the data, so the four-way unpacking below cannot fail.
    y_true = np.asarray(y_test).astype(int).ravel()
    counts = confusion_matrix(y_true, predicted, labels=[0, 1])
    tn, fp, fn, tp = (int(value) for value in counts.ravel())
    accuracy = accuracy_score(y_test, predicted)

    # Calculate TPR and FPR; a rate is undefined (nan) when its class has no samples
    tpr = tp / (tp + fn) if (tp + fn) > 0 else float("nan")  # True Positive Rate (Recall)
    fpr = fp / (fp + tn) if (fp + tn) > 0 else float("nan")  # False Positive Rate

    print(f"TPR: {tpr}")
    print(f"FPR: {fpr}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, predicted))
    return



In [None]:
# Reload the training split and keep only the Russian rows (used for the labels).
train_df = get_train_df()
train_df_ru = train_df[train_df["lang"] == "ru"]

# Features are read from a CSV file instead of being recomputed.
# NOTE(review): presumably this file holds featurize() output for the Russian
# training rows, in the same row order as train_df_ru — confirm, because
# train_model_keras pairs X and y by position.
X = pd.read_csv("train_ru_dBERT.csv")

# NOTE(review): `input_tensor` is not used by any later line of this cell.
input_tensor = torch.tensor(X.values, dtype=torch.float32)


# Not the last expression of the cell, so this shape is evaluated but not displayed.
X.iloc[0].shape
y = train_df_ru['answerable']
model = train_model_keras(X,y)


Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8204 - loss: 0.4210
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8847 - loss: 0.2882
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8986 - loss: 0.2547
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9218 - loss: 0.2063
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9007 - loss: 0.2292
Epoch 6/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9264 - loss: 0.1961
Epoch 7/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9363 - loss: 0.1686
Epoch 8/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9219 - loss: 0.1849
Epoch 9/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [None]:
# X and y are the data the model was fitted on, so these are training-set
# metrics, not a held-out evaluation.
evaluate_model_performance(model, X, y, threshold = 0.5)

[[0.77253219 0.22746781]
 [0.01142857 0.98857143]]
              precision    recall  f1-score   support

       False       0.90      0.77      0.83       233
        True       0.97      0.99      0.98      1750

    accuracy                           0.96      1983
   macro avg       0.94      0.88      0.91      1983
weighted avg       0.96      0.96      0.96      1983



In [None]:
# Evaluate on test set:
# The dataset's validation split (get_validation_df) serves as the test data here.

test_df = get_validation_df()
test_df_ru = test_df[test_df["lang"] == "ru"]
# print(test_df_ru)
# NOTE(review): presumably the CSV rows are in the same order as test_df_ru —
# confirm, since features and labels are matched by position.
X_test = pd.read_csv("test_ru_dBERT.csv")
y_test = test_df_ru['answerable']
evaluate_model_performance(model, X_test, y_test, threshold = 0.5)

[[0.3125     0.6875    ]
 [0.04577465 0.95422535]]
              precision    recall  f1-score   support

       False       0.73      0.31      0.44       112
        True       0.78      0.95      0.86       284

    accuracy                           0.77       396
   macro avg       0.75      0.63      0.65       396
weighted avg       0.76      0.77      0.74       396



In [None]:
# Main:
# For each language in {fi, ja}:
#   1. Embed both the train and the test data and save the features
#   2. Train the network and evaluate its performance on the test data


def main_with_embedding():
  """Embed, train and evaluate one classifier per language (Finnish, Japanese).

  For each language the training rows are featurized, written to
  `train_<lang>_dBERT.csv` and offered for download; a model is trained on
  them; then the validation rows are featurized, written to
  `test_<lang>_dBERT.csv`, downloaded, and used to evaluate the model.
  Progress messages are printed along the way. Returns None.
  """
  full_train = get_train_df()
  full_test = get_validation_df()

  display_names = {"fi": "Finnish", "ja":"Japanese"}
  for lang in ["fi", "ja"]:
    print("Starting Proccessing on ", display_names[lang])
    print("Starting Embedding")

    # Training data: embed, persist, download
    train_rows = full_train[full_train["lang"] == lang]
    train_csv = f"train_{lang}_dBERT.csv"
    train_features = featurize(train_rows)
    train_features.to_csv(train_csv, index = False)
    files.download(train_csv)

    print("Finished Embedding and saving, beggining training")

    classifier = train_model_keras(train_features, train_rows['answerable'])

    print("Finished training, now embedding test data:")
    # Test data: embed, persist, download, then evaluate
    test_rows = full_test[full_test["lang"] == lang]
    test_csv = f"test_{lang}_dBERT.csv"
    test_features = featurize(test_rows)
    test_features.to_csv(test_csv, index = False)
    files.download(test_csv)

    print("Finished Embedding and saving, now evaluating:")
    evaluate_model_performance(classifier, test_features, test_rows['answerable'], threshold = 0.5)

  return


Starting Proccessing on  Finnish
Starting Embedding


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Processing rows: 100%|██████████| 2126/2126 [12:33<00:00,  2.82it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finished Embedding and saving, beggining training
Epoch 1/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8755 - loss: 0.3860
Epoch 2/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8854 - loss: 0.2791
Epoch 3/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9155 - loss: 0.2194
Epoch 4/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9317 - loss: 0.1932
Epoch 5/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9397 - loss: 0.1714
Epoch 6/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9371 - loss: 0.1819
Epoch 7/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9431 - loss: 0.1542
Epoch 8/10
[1m67/67[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9415 - loss: 0.1409
Epoch 

Processing rows: 100%|██████████| 528/528 [03:17<00:00,  2.68it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finished Embedding and saving, now evaluating:
[[0.60135135 0.39864865]
 [0.05263158 0.94736842]]
              precision    recall  f1-score   support

       False       0.82      0.60      0.69       148
        True       0.86      0.95      0.90       380

    accuracy                           0.85       528
   macro avg       0.84      0.77      0.80       528
weighted avg       0.85      0.85      0.84       528

Starting Proccessing on  Japanese
Starting Embedding


Processing rows: 100%|██████████| 2301/2301 [13:53<00:00,  2.76it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finished Embedding and saving, beggining training
Epoch 1/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8180 - loss: 0.4430
Epoch 2/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8671 - loss: 0.3256
Epoch 3/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8878 - loss: 0.2764
Epoch 4/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8787 - loss: 0.2685
Epoch 5/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9040 - loss: 0.2358
Epoch 6/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9111 - loss: 0.2302
Epoch 7/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8977 - loss: 0.2492
Epoch 8/10
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9100 - loss: 0.2268
Epoch 

Processing rows: 100%|██████████| 456/456 [02:36<00:00,  2.91it/s]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Finished Embedding and saving, now evaluating:
[[0.5443787  0.4556213 ]
 [0.09407666 0.90592334]]
              precision    recall  f1-score   support

       False       0.77      0.54      0.64       169
        True       0.77      0.91      0.83       287

    accuracy                           0.77       456
   macro avg       0.77      0.73      0.74       456
weighted avg       0.77      0.77      0.76       456



In [None]:
def main_without_embedding():
  """Train one classifier per language from previously saved feature files.

  For each language in ru, fi, ja the features are read from
  `train_<lang>_dBERT.csv` and the labels are taken from the `answerable`
  column of that language's rows in the training split. Each model is
  trained on its own language's features and labels.

  Returns:
    Dict mapping language code ("ru", "fi", "ja") to its trained model.
  """
  train_df = get_train_df()

  languages = ["ru", "fi", "ja"]
  lang_dict = {"ru": "Russian", "fi": "Finnish", "ja":"Japanese"}

  models_dict = {}
  for lang in languages:
    print("Starting Proccessing on ", lang_dict[lang])

    train_df_lang = train_df[train_df["lang"] == lang]

    # Training data: features come from the saved CSV, labels from the split
    X_train = pd.read_csv(f"train_{lang}_dBERT.csv")
    y_train = train_df_lang['answerable']

    # Train on this language's data. The previous code passed the notebook
    # globals X and y here, so every model was fitted on the same stale data.
    models_dict[lang] = train_model_keras(X_train, y_train)
  return models_dict

# Train all per-language models; the resulting dict (language code -> model)
# is consumed by evaluate_models(models_dict) below.
models_dict = main_without_embedding()

Starting Proccessing on  Russian
Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.8294 - loss: 0.4200
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8839 - loss: 0.3040
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8911 - loss: 0.2574
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8951 - loss: 0.2533
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9162 - loss: 0.2136
Epoch 6/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9382 - loss: 0.1882
Epoch 7/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9370 - loss: 0.1664
Epoch 8/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9434 - loss: 0.1577
Epoch 9/10
[1m62/62[0

In [None]:
# Evaluation:
lang_dict = {"ru": "Russian", "fi": "Finnish", "ja":"Japanese"}
def evaluate_models(models_dict):
  """Evaluate each trained model on its language's saved test features.

  For every (language code, model) entry, features are read from
  `test_<lang>_dBERT.csv`, labels are the `answerable` column of that
  language's rows in the validation split, and the metrics are printed by
  `evaluate_model_performance` with a 0.5 threshold. Returns None.
  """
  validation_rows = get_validation_df()
  for lang, classifier in models_dict.items():
    print("Evaluating: ", lang_dict[lang])

    is_lang = validation_rows["lang"] == lang
    features = pd.read_csv(f"test_{lang}_dBERT.csv")
    labels = validation_rows[is_lang]['answerable']
    evaluate_model_performance(classifier, features, labels, threshold = 0.5)
  return

# Print the evaluation for every model in models_dict (built by main_without_embedding()).
evaluate_models(models_dict)

Evaluating:  Russian
TPR: 0.9471830985915493
FPR: 0.6964285714285714
Accuracy: 0.7651515151515151
              precision    recall  f1-score   support

       False       0.69      0.30      0.42       112
        True       0.78      0.95      0.85       284

    accuracy                           0.77       396
   macro avg       0.73      0.63      0.64       396
weighted avg       0.75      0.77      0.73       396

Evaluating:  Finnish
TPR: 0.9578947368421052
FPR: 0.9054054054054054
Accuracy: 0.7159090909090909
              precision    recall  f1-score   support

       False       0.47      0.09      0.16       148
        True       0.73      0.96      0.83       380

    accuracy                           0.72       528
   macro avg       0.60      0.53      0.49       528
weighted avg       0.66      0.72      0.64       528

Evaluating:  Japanese
TPR: 0.7073170731707317
FPR: 0.5207100591715976
Accuracy: 0.6228070175438597
              precision    recall  f1-score   suppo