In [6]:
!pip install tqdm
from google.colab import files
import pandas as pd
from tqdm import tqdm  # Import tqdm for the progress bar




In [23]:
# Language codes (compared against the dataset's `lang` column in main())
# mapped to the names printed in the per-language report headers.
LANGUAGE_DICT = {
    "ru": "Russian",
    "ja": "Japanese",
    "fi": "Finnish",
}
# Base `hf://` URI of the coastalcph/tydi_xor_rc dataset and the parquet file
# backing each split. Kept in one place so both loaders stay in sync.
_DATASET_URI = "hf://datasets/coastalcph/tydi_xor_rc/"
_SPLIT_FILES = {'train': 'train.parquet', 'validation': 'validation.parquet'}


def _load_split(split):
  """Read the parquet file for the named split into a DataFrame.

  Raises:
    ValueError: if `split` is not a key of `_SPLIT_FILES`.
  """
  if split not in _SPLIT_FILES:
    raise ValueError(
        f"Unknown split {split!r}; expected one of {sorted(_SPLIT_FILES)}"
    )
  return pd.read_parquet(_DATASET_URI + _SPLIT_FILES[split])


def get_train_df():
  """Return the dataset's training split as a pandas DataFrame."""
  return _load_split("train")


def get_validation_df():
  """Return the dataset's validation split as a pandas DataFrame."""
  return _load_split("validation")


In [28]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluation_metrics(y_pred, y_val):
    """Print and return accuracy, true positive rate and false positive rate.

    Note the argument order: predictions first, ground truth second.

    Args:
        y_pred: Predicted binary labels.
        y_val: Ground-truth binary labels, same length as ``y_pred``.
            Assumes labels are encoded as 0/1 (or False/True) -- TODO confirm
            against the dataset's ``answerable`` column.

    Returns:
        dict with keys ``'accuracy'``, ``'tpr'`` and ``'fpr'``. A rate is
        ``nan`` when its denominator is zero (no positive, respectively no
        negative, samples in ``y_val``).
    """
    accuracy = accuracy_score(y_val, y_pred)

    # Pin the label set so the matrix is always 2x2: [[TN, FP], [FN, TP]].
    # Without explicit labels, data that contains a single class produces a
    # 1x1 matrix and the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()

    # Guard the denominators instead of relying on a silent 0/0 division.
    positives = tp + fn
    negatives = fp + tn
    tpr = tp / positives if positives else float("nan")  # TP / (TP + FN)
    fpr = fp / negatives if negatives else float("nan")  # FP / (FP + TN)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"True Positive Rate (TPR): {tpr:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")

    return {
        'accuracy': accuracy,
        'tpr': tpr,
        'fpr': fpr
    }

def evaluate_model(model, X_val, y_val):
    """Predict with a fitted model and report its metrics on the given data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_val: Features to predict on.
        y_val: Ground-truth labels, row-aligned with ``X_val``.

    Returns:
        The dict returned by ``evaluation_metrics`` (keys ``'accuracy'``,
        ``'tpr'``, ``'fpr'``); that call also prints the values.
    """
    y_pred = model.predict(X_val)

    # Accuracy, precision, recall, F1 and a confusion matrix were previously
    # computed here and thrown away; only evaluation_metrics' result is used.
    # evaluation_metrics takes predictions first, ground truth second.
    return evaluation_metrics(y_pred, y_val)


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def train_evaluate_random_forest(X, y, test_size=0.2, random_state=42, max_features=50):
    """Fit a random forest on part of (X, y) and print metrics on the rest.

    Args:
        X: Feature matrix.
        y: Binary labels, row-aligned with ``X``.
        test_size: Fraction of the rows held out for the printed evaluation.
        random_state: Seed used for both the split and the forest.
        max_features: Passed to ``RandomForestClassifier(max_features=...)``.

    Returns:
        The fitted ``RandomForestClassifier``. It is trained on the training
        portion of the split only, not on the held-out rows.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rf_model = RandomForestClassifier(random_state=random_state, max_features=max_features)
    rf_model.fit(X_train, y_train)

    # Called for its printed report; the returned dict is not needed here.
    evaluation_metrics(rf_model.predict(X_test), y_test)

    return rf_model


In [31]:
def main():
  """Train and validate one random forest per language, printing metrics.

  For each language code, features are read from ``train_<code>_dBERT.csv`` /
  ``test_<code>_dBERT.csv`` in the working directory, and labels are taken
  from the ``answerable`` column of the matching dataset rows.
  """
  train_frame = get_train_df()
  val_frame = get_validation_df()

  for code in ("ru", "ja", "fi"):
    print(f"************ {LANGUAGE_DICT[code]} ************")

    # Fit on the training features; the metrics printed in this section come
    # from the hold-out split made inside train_evaluate_random_forest.
    print("----- Training -----")
    train_features = pd.read_csv(f"train_{code}_dBERT.csv")
    train_labels = train_frame.loc[train_frame['lang'] == code, "answerable"]
    forest = train_evaluate_random_forest(train_features, train_labels)

    # Score the fitted model on the dataset's validation split.
    print("----- Validation -----")
    val_features = pd.read_csv(f"test_{code}_dBERT.csv")
    val_labels = val_frame.loc[val_frame['lang'] == code, "answerable"]
    evaluate_model(forest, val_features, val_labels)

# Run the per-language train/validate pipeline; the metrics are printed below.
main()

************ Russian ************
----- Training -----
Accuracy: 0.8690
True Positive Rate (TPR): 1.0000
False Positive Rate (FPR): 0.9630
----- Validation -----
Accuracy: 0.7601
True Positive Rate (TPR): 0.9718
False Positive Rate (FPR): 0.7768
************ Japanese ************
----- Training -----
Accuracy: 0.8525
True Positive Rate (TPR): 0.9974
False Positive Rate (FPR): 0.9306
----- Validation -----
Accuracy: 0.7083
True Positive Rate (TPR): 0.9826
False Positive Rate (FPR): 0.7574
************ Finnish ************
----- Training -----
Accuracy: 0.8826
True Positive Rate (TPR): 1.0000
False Positive Rate (FPR): 0.9259
----- Validation -----
Accuracy: 0.7443
True Positive Rate (TPR): 0.9868
False Positive Rate (FPR): 0.8784


In [32]:
# Inspect the "ru" training feature file: row/column counts, dtypes and
# memory footprint.
df = pd.read_csv("train_ru_dBERT.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1983 entries, 0 to 1982
Columns: 1536 entries, 0 to 1535
dtypes: float64(1536)
memory usage: 23.2 MB


In [34]:
# Half of the 1536 feature columns reported by df.info() above.
# NOTE(review): presumably each row is two concatenated 768-dim embeddings -- TODO confirm.
1536/2

768.0