In [12]:
# import relevant packages

import pandas as pd
import os
import random
import numpy as np
import glob as glob

from ast import literal_eval

pd.set_option('display.max_colwidth', None)

## Load original train datasets that active learner was applied to

In [13]:
df_dict = dict()

PATH = "../0_data/main/1_clean"
    
for dataset in os.listdir(PATH):
    for f in glob.glob(f"{PATH}/{dataset}/train*.csv"):
        if "dyn21" not in f and "ipynb" not in f:
            print(dataset[:8])
            df_dict[dataset[:8]] = pd.read_csv(f)

for19_pt
ous19_fr
bas19_es
ous19_ar
san20_it


## Merge train datasets with prediction logits

In [14]:
PATH = "../0_data/main/2_active_learning"
AL_MODEL = "xlmt_dyn21_en_20000_rs1"

for dataset in os.listdir(PATH):
    print(dataset[:8])
    df_dict[dataset[:8]] = df_dict[dataset[:8]].merge(pd.read_csv(f"{PATH}/{dataset}/{AL_MODEL}.csv")[["prediction", "logits"]], left_index=True, right_index=True)

ous19_ar
for19_pt
san20_it
ous19_fr
bas19_es


## Create columns for selection

In [16]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

for dataset in df_dict:
    df_dict[dataset]["softmax_scores"] = df_dict[dataset].logits.apply(lambda x: softmax(literal_eval(x)))
    df_dict[dataset]["softmax_diff"] = df_dict[dataset].softmax_scores.apply(lambda x: abs(x[0]-x[1]))

## Select based on difference in softmax scores across classes

In [24]:
# create differently-sized train portions from rest of data

N_RANGE = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000]

for dataset in df_dict:
    print(dataset.upper())
    for n in N_RANGE:
        print(f"  saving n = {n} training set (selected by active learning)")
        export_dict = df_dict[dataset].sort_values("softmax_diff")[["text", "label"]][:n]
        for file in glob.glob(f"../0_data/main/2_active_learning/{dataset}*"):
            export_dict.to_csv(f"{file}/train/train_{n}_al.csv",index=False)

    print()

FOR19_PT
  saving n = 10 training set (selected by active learning)
  saving n = 20 training set (selected by active learning)
  saving n = 30 training set (selected by active learning)
  saving n = 40 training set (selected by active learning)
  saving n = 50 training set (selected by active learning)
  saving n = 100 training set (selected by active learning)
  saving n = 200 training set (selected by active learning)
  saving n = 300 training set (selected by active learning)
  saving n = 400 training set (selected by active learning)
  saving n = 500 training set (selected by active learning)
  saving n = 1000 training set (selected by active learning)
  saving n = 2000 training set (selected by active learning)

OUS19_FR
  saving n = 10 training set (selected by active learning)
  saving n = 20 training set (selected by active learning)
  saving n = 30 training set (selected by active learning)
  saving n = 40 training set (selected by active learning)
  saving n = 50 training set

In [30]:
df_dict["san20_it"]

Unnamed: 0,index,label,prediction,logits
0,0,0,1,"(-0.88551676, 0.7890035)"
1,1,0,1,"(-0.9222206, 0.8876971)"
2,2,1,1,"(-1.284241, 1.1302452)"
3,3,1,1,"(-0.16130793, 0.031392984)"
4,4,0,0,"(0.9738463, -1.1099527)"
...,...,...,...,...
5595,5595,1,1,"(-0.50333995, 0.3105694)"
5596,5596,0,1,"(-2.8639174, 2.5761027)"
5597,5597,1,0,"(1.1118305, -1.1924204)"
5598,5598,1,1,"(-1.2798216, 1.0581983)"


In [29]:
softmax(literal_eval(df_dict["san20_it"].logits[0]))

array([0.15782244, 0.84217756])

In [23]:
np.asarray(df_dict["san20_it"].logits[0])

array('(-0.88551676, 0.7890035)', dtype='<U24')

In [22]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

softmax(np.asarray(df_dict["san20_it"].logits[0]))

UFuncTypeError: ufunc 'maximum' did not contain a loop with signature matching types (dtype('<U24'), dtype('<U24')) -> None

In [12]:
# assumes we have model predictions (pred_label) and uncertainty (pred_score) for each entry
# could also do §cross-entropy for uncertainty
# we only use the train set
# the test set remains completely held-out

In [20]:
# create dummy column for uncertainty while waiting for real results
for dataset in df_dict:
    df_dict[dataset]["pred_score"] = df_dict[dataset].label.apply(lambda x: random.uniform(0,1))

In [26]:
# select top-n entries based on active learning
# this is deterministic, so no need for multiple random seeds

N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset)
    df_dict[dataset].sort_values(by="pred_score", inplace=True)
    for n in N_RANGE:
        if n<len(df_dict[dataset]):
            print(f"  saving n = {n} training set")
    print()

ousidhoum2019_french
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

ousidhoum2019_arabic
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

fortuna2019_portuguese
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

sanguinetti2020_italian
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1