In [16]:
# import relevant packages

import pandas as pd
import os
import random

pd.set_option('display.max_colwidth', None)

## Uncertainty-based active learning for data selection

In [12]:
# assumes we have model predictions (pred_label) and uncertainty (pred_score) for each entry
# could also do cross-entropy for uncertainty
# we only use the train set
# the test set remains completely held-out

In [13]:
PATH = "./data/1_clean"

df_dict = dict()

for file in os.listdir(PATH):
    if "ipynb" not in file and "english" not in file:
        print(f"loading {file}")
        for file2 in os.listdir(f"{PATH}/{file}"):
            if "train_" in file2:
                df_dict[file] = pd.read_csv(f"{PATH}/{file}/{file2}")

loading ousidhoum2019_french
loading ousidhoum2019_arabic
loading fortuna2019_portuguese
loading sanguinetti2020_italian
loading basile2019_spanish


In [20]:
# create dummy column for uncertainty while waiting for real results
for dataset in df_dict:
    df_dict[dataset]["pred_score"] = df_dict[dataset].label.apply(lambda x: random.uniform(0,1))

In [26]:
# select top-n entries based on active learning
# this is deterministic, so no need for multiple random seeds

N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset)
    df_dict[dataset].sort_values(by="pred_score", inplace=True)
    for n in N_RANGE:
        if n<len(df_dict[dataset]):
            print(f"  saving n = {n} training set")
    print()

ousidhoum2019_french
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

ousidhoum2019_arabic
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

fortuna2019_portuguese
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

sanguinetti2020_italian
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1