In [1]:
!pip install simpletransformers==0.70.1

Collecting simpletransformers==0.70.1
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers==0.70.1)
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting seqeval (from simpletransformers==0.70.1)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers==0.70.1)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers==0.70.1)
  Downloading streamlit-1.44.1-py3-none-any.whl.metadata (8.9 kB)
Collectin

# Imports
First we import the training data files, then we use the downsampling indices to isolate

In [2]:
import os
import sys

user = "mss423"
repo = "ACS-LessIsMore"

if os.path.isdir(repo):
  !rm -rf {repo}

!git clone https://github.com/{user}/{repo}.git

for dirpath, dirnames, filenames in os.walk("ACS-LessIsMore"):
  sys.path.append(dirpath)

Cloning into 'ACS-LessIsMore'...
remote: Enumerating objects: 793, done.[K
remote: Counting objects: 100% (42/42), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 793 (delta 31), reused 37 (delta 30), pack-reused 751 (from 1)[K
Receiving objects: 100% (793/793), 44.03 MiB | 27.87 MiB/s, done.
Resolving deltas: 100% (491/491), done.


In [3]:
import pickle
import pandas as pd
import numpy as np
from train_model import run_bert_train, run_ner_train
from IPython.display import clear_output
from load_data import *

datadir = "/content/ACS-LessIsMore/datasets"
savedir = "/content/ACS-LessIsMore/results"
NUM_REPS = 3

def load_idx_dict(path):
  with open(path, "rb") as f:
    idx_dict = pickle.load(f)
  return idx_dict

# SST2

Data loading declarations

In [5]:
import torch
import os

sst2_path = "/content/ACS-LessIsMore/datasets/SST2"
sst2_idx_path = os.path.join(sst2_path, "downsample")

base_output_dir = "/content/ACS-LessIsMore/datasets/SST2"

# Load data from .txt
sst2_human_train, _ = load_train_data(datadir, "sst2", synthetic=False)
# sst2_human_train    = sst2_human_train.sample(n=5000, random_state=42)
sst2_synthetic, _   = load_train_data(datadir, "sst2", synthetic=True)
# sst2_synthetic      = sst2_synthetic.sample(n=5000, random_state=42)
# sst2_human_dev      = load_dev_data(datadir, "sst2")
sst2_human_test     = load_test_data(datadir, "sst2")

def get_sst2_data(data_df, K, sample_method="acs"):
  if sample_method == "random":
    return data_df.sample(n=K, random_state=42)

  elif sample_method == "acs":
    sst2_idx_file = os.path.join(sst2_path, "acs/acs_subsample_idx.pkl")
    sst2_idx_dict = load_idx_dict(sst2_idx_file)
    K_idx = sst2_idx_dict[K]
    return data_df.iloc[K_idx]

  else:
    sst2_idx_file = os.path.join(sst2_path, "scores/" + sample_method + "_sst2.pkl")
    sst2_idx = load_idx_dict(sst2_idx_file)
    return data_df.iloc[sst2_idx[:K]]
  return []

def fit_model(train_df, test_df, K, r=5, epochs=5):
    results = []

    for i in range(r):
        run_seed= 42 * (i+1)
        run_output_dir = f"{base_output_dir}/run_{i+1}_seed_{run_seed}"
        print(f"\n--- Starting Run {i+1}/{r} with Seed {run_seed} ---")
        print(f"Outputting to: {run_output_dir}")

        use_cuda = torch.cuda.is_available()
        if use_cuda:
            torch.cuda.empty_cache() # Attempt to clear cache before initializing new model

        # train model
        metrics, acc = run_bert_train(train_df, test_df,
                                      num_labels=2,
                                      output_dir = run_output_dir,
                                      seed=42*(i+1),
                                      epochs=epochs)
        results.append({'K': K, \
                        'acc': acc, \
                        'precision': metrics['macro avg']['precision'], \
                        'recall': metrics['macro avg']['recall'], \
                        'f1': metrics['macro avg']['f1-score']})
        clear_output(wait=True)
        if use_cuda:
            torch.cuda.empty_cache()

    print("\n--- All Runs Completed ---")
    return results

## Model training

### Human Baseline

In [None]:
results = []
for i in range(NUM_REPS):
  metrics, acc = run_bert_train(sst2_human_train, sst2_human_test, num_labels=2, seed=42*(i+1))
  results.append({'acc': acc, \
                  'precision': metrics['macro avg']['precision'], \
                  'recall': metrics['macro avg']['recall'], \
                  'f1': metrics['macro avg']['f1-score']})
  clear_output(wait=True)

# Save results
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv(os.path.join(savedir, "sst2", "sst2_human_baseline.csv"))

'/content/ACS-LessIsMore/results/sst2/sst2_human_baseline.csv'

### Synthetic downsampling

In [None]:
reps = 1
sample_methods = ["acs", "random", "aum", "conf_agree", "confident_learning", \
                  "data_iq", "data_maps", "el2n", "forgetting", "large_loss", \
                  "prototypicality", "vog"] #
Ks = np.round(np.linspace(100, len(sst2_synthetic), 11)).astype(int)
# Ks = Ks[:-1]

for K in Ks:
  if K == len(sst2_human_train):
    train_df = sst2_synthetic
    full_results = fit_model(train_df, sst2_human_test, K)

    # Save full K results
    with open(os.path.join(sst2_idx_path, f"full_results.pkl"), "wb") as f:
        pickle.dump(full_results, f)
    continue

  else:
    # Results variable
    method_results = {}
    for method in sample_methods:
        train_df = get_sst2_data(sst2_synthetic, K, method)
        torch.set_num_threads(1)
        method_results[method] = fit_model(train_df, sst2_human_test, K, r=reps, epochs=3)

    # Save results for K with pickle
    # with open(os.path.join(sst2_path, f"{K}_results.pkl"), "wb") as f:
    #     pickle.dump(method_results, f)



# Save results
# results_df = pd.DataFrame(results)
# print(results_df)
# results_df.to_csv(os.path.join(savedir, "sst2", f"sst2_{sample_method}-32epochs.csv"))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- All Runs Completed ---

--- Starting Run 1/1 with Seed 42 ---
Outputting to: /content/ACS-LessIsMore/datasets/SST2/run_1_seed_42




0it [00:00, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 3:   0%|          | 0/13 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 3:   0%|          | 0/13 [00:00<?, ?it/s]

In [12]:
print(Ks[3:])

[1870 2460 3050 3640 4230 4820 5410 6000]


In [16]:
# Load from pickle
with open(os.path.join(sst2_path, "2460_results.pkl"), "rb") as f:
    full_results = pickle.load(f)

results_df = pd.DataFrame(full_results)
print(results_df)

                                                 acs  \
0  {'K': 2460, 'acc': 0.7478021978021978, 'precis...   
1  {'K': 2460, 'acc': 0.7516483516483516, 'precis...   
2  {'K': 2460, 'acc': 0.7450549450549451, 'precis...   

                                              random  \
0  {'K': 2460, 'acc': 0.7862637362637362, 'precis...   
1  {'K': 2460, 'acc': 0.7653846153846153, 'precis...   
2  {'K': 2460, 'acc': 0.7653846153846153, 'precis...   

                                                 aum  \
0  {'K': 2460, 'acc': 0.7593406593406593, 'precis...   
1  {'K': 2460, 'acc': 0.6835164835164835, 'precis...   
2  {'K': 2460, 'acc': 0.756043956043956, 'precisi...   

                                          conf_agree  \
0  {'K': 2460, 'acc': 0.7857142857142857, 'precis...   
1  {'K': 2460, 'acc': 0.8065934065934066, 'precis...   
2  {'K': 2460, 'acc': 0.782967032967033, 'precisi...   

                                  confident_learning  \
0  {'K': 2460, 'acc': 0.8060439560439561, 'p

# FewRel

Data loading and declarations

In [None]:
fewrel_path = "/content/ACS-LessIsMore/datasets/FewRel"
fewrel_idx_path = os.path.join(fewrel_path, "downsample")

# Load data from .txt
fewrel_human_train, _ = load_train_data(datadir, "fewrel", synthetic=False)
fewrel_human_train    = fewrel_human_train.sample(n=5000, random_state=42)
fewrel_synthetic, _   = load_train_data(datadir, "fewrel", synthetic=True)
fewrel_synthetic      = fewrel_synthetic.sample(n=5000, random_state=42)
# fewrel_human_dev      = load_dev_data(datadir, "fewrel")
fewrel_human_test     = load_test_data(datadir, "fewrel")

def get_fewrel_data(data_df, K, sample_method="acs"):
  if sample_method == "random":
    return data_df.sample(n=K, random_state=42)

  fewrel_idx_file = os.path.join(fewrel_idx_path, sample_method + "_subsample_idx-w-labels.pkl")
  fewrel_idx_dict = load_idx_dict(fewrel_idx_file)
  K_idx = fewrel_idx_dict[K]
  return data_df.iloc[K_idx]

## Model training

### Human baseline

In [None]:
results = []
for i in range(NUM_REPS):
  metrics, acc = run_bert_train(fewrel_human_train, fewrel_human_test, num_labels=64, seed=42*(i+1))
  results.append({'acc': acc, \
                  'precision': metrics['macro avg']['precision'], \
                  'recall': metrics['macro avg']['recall'], \
                  'f1': metrics['macro avg']['f1-score']})
  clear_output(wait=True)

# Save results
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv("fewrel_human_baseline.csv")

### Synthetic downsampling

In [None]:
results = []
sample_method = "acs" # "kmeans", "random", "acs"
Ks = np.round(np.linspace(100, len(fewrel_human_train), 15)).astype(int)

for K in Ks:
  if K == len(fewrel_human_train):
    # train_df = fewrel_synthetic
    continue
  else:
    train_df = get_fewrel_data(fewrel_synthetic, K, sample_method)

  for i in range(NUM_REPS):
    print(f"K = {K}, Iteration {i+1} of {NUM_REPS}")
    metrics, acc = run_bert_train(train_df, fewrel_human_test, num_labels=64, seed=42*(i+1))
    results.append({'K': K, \
                    'acc': acc, \
                    'precision': metrics['macro avg']['precision'], \
                    'recall': metrics['macro avg']['recall'], \
                    'f1': metrics['macro avg']['f1-score']})
    clear_output(wait=True)

# Save results
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv(f"fewrel_{sample_method}-1.csv")

       K       acc  precision    recall        f1
0    100  0.021652   0.006084  0.021652  0.005678
1    100  0.015402   0.000369  0.015402  0.000646
2    100  0.014063   0.019663  0.014062  0.004182
3    100  0.018973   0.003998  0.018973  0.005560
4    100  0.029911   0.003354  0.029911  0.005535
5    450  0.194866   0.179261  0.194866  0.141123
6    450  0.225223   0.133937  0.225223  0.145004
7    450  0.195089   0.172740  0.195089  0.128911
8    450  0.171875   0.119556  0.171875  0.112476
9    450  0.186607   0.113159  0.186607  0.109300
10   800  0.194196   0.144898  0.194196  0.133847
11   800  0.286161   0.231041  0.286161  0.216916
12   800  0.270313   0.234011  0.270312  0.203797
13   800  0.247321   0.221797  0.247321  0.179619
14   800  0.285268   0.221164  0.285268  0.210686
15  1150  0.325446   0.289370  0.325446  0.251853
16  1150  0.347321   0.274053  0.347321  0.278844
17  1150  0.349554   0.302833  0.349554  0.274944
18  1150  0.322098   0.281070  0.322098  0.246489


# ASTE

### Clear above variables and import ASTE codes

In [None]:
# clear all variables in memory
%reset -f

In [None]:
!git clone https://github.com/chiayewken/Span-ASTE.git
!cd Span-ASTE && git checkout f53ec3c
!cp -a Span-ASTE/* .
!echo boto3==1.16.46 >> requirements.txt
!bash setup.sh

Cloning into 'Span-ASTE'...
remote: Enumerating objects: 194, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 194 (delta 55), reused 39 (delta 39), pack-reused 116 (from 1)[K
Receiving objects: 100% (194/194), 615.04 KiB | 9.61 MiB/s, done.
Resolving deltas: 100% (87/87), done.
Note: switching to 'f53ec3c'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at f53ec3c Add command-line scoring instructions in README.md
Collecting Cython==0.29

In [None]:
!pip install overrides
!pip install allennlp
!pip install -U huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Downloading huggingface_hub-0.27.1-py3-none-any.whl (450 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.7/450.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.10.1
    Uninstalling huggingface-hub-0.10.1:
      Successfully uninstalled huggingface-hub-0.10.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cached-path 1.1.6 requires huggingface-hub<0.11.0,>=0.8.1, but you have huggingface-hub 0.27.1 which is incompatible.
peft 0.14.0 requires torch>=1.13.0, but you have torch 1.12.1 which is incompatible.
sentence-transformers 3.3.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.

In [None]:
import sys
import os

for dirpath, dirnames, filenames in os.walk("Span-ASTE"):
  sys.path.append(dirpath)

## Model Training

### Human Baseline

In [None]:
import json
from span_model.models import SpanModel
from IPython.display import clear_output

random_seed = 42
path_train = os.path.join(datadir, "ASTE", "train.txt")
path_dev = os.path.join(datadir, "ASTE", "dev.txt")
save_dir = os.path.join(datadir, "ASTE", "outputs") # /{data_name}/seed_{random_seed}") # fix!

results = []

for i in range(NUM_REPS):
    # Train model
    model = SpanModel(save_dir=save_dir, random_seed=random_seed*(i+1))
    model.fit(path_train, path_dev)

    # Test error
    path_pred = "pred.txt"
    path_test = os.path.join(datadir, "ASTE", "test.txt")
    model.predict(path_in=path_test, path_out=path_pred)
    metrics = model.score(path_pred, path_test)
    results.append({'precision': metrics['precision'], \
                    'recall': metrics['recall'], \
                    'f1': metrics['score']})
    clear_output(wait=True)

results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv(os.path.join(savedir, "aste", f"aste_human_baseline.csv"))

TypeError: BiAffineSpanExtractor.forward: `sequence_mask` is not present.

In [None]:
# Evaluate SpanModel F1 Score
import json

path_pred = "pred.txt"
path_test = f"aste/data/triplet_data/{data_name}/test.txt"
model.predict(path_in=path_test, path_out=path_pred)
results = model.score(path_pred, path_test)
print(json.dumps(results, indent=2))

# CrossNER

In [None]:
import random

ner_path = "/content/ACS-LessIsMore/datasets/CrossNER"
ner_idx_path = os.path.join(ner_path, "downsample")

# Load data from .txt
ner_human_train = pd.DataFrame(load_crossner_train(ner_path + "/train.txt"), \
                                            columns=["sentence_id", "words", "labels"])
ner_syn         = load_crossner_train(ner_path + "/syn-train.txt")
ner_dev         = load_crossner_train(ner_path + "/dev.txt", dev=True)
ner_synthetic   = pd.DataFrame(ner_syn + ner_dev, \
                                            columns=["sentence_id", "words", "labels"])
ner_human_test  = pd.DataFrame(load_crossner_train(ner_path + "/test.txt"), \
                                            columns=["sentence_id", "words", "labels"])

def get_ner_data(data_df, K, sample_method="acs"):
    n = data_df['sentence_id'].max()
    if sample_method == "random":
        random.seed(42)
        n_set = list(range(0, n))
        k_set = random.sample(n_set, K)
        return data_df[data_df['sentence_id'].isin(k_set)]

    ner_idx_file = os.path.join(ner_idx_path, sample_method + "_subsample_idx.pkl")
    ner_idx_dict = load_idx_dict(ner_idx_file)
    K_idx = ner_idx_dict[K]
    return data_df[data_df['sentence_id'].isin(K_idx)]

In [None]:
results = []
for i in range(NUM_REPS):
    print(f"Iteration {i+1} of {NUM_REPS}")
    metrics = run_ner_train(ner_human_train, ner_human_test, seed=42*(i+1), epochs=10)
    # print(metrics)
    results.append({'precision': metrics['precision'], \
                    'recall': metrics['recall'], \
                    'f1': metrics['f1_score']})
    clear_output(wait=True)

# Save results
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv("ner_human_baseline.csv")

   precision    recall        f1
0   0.410927  0.494748  0.448959
1   0.379206  0.485904  0.425975
2   0.400357  0.495301  0.442797
3   0.395434  0.478718  0.433108
4   0.387590  0.476506  0.427473


In [None]:
results = []
sample_method = "acs" # "kmeans", "random", "acs"
Ks = np.round(np.linspace(100, ner_synthetic['sentence_id'].max()+1, 15)).astype(int)
# Ks = [3000]

for K in Ks:
  if K == ner_synthetic['sentence_id'].max()+1:
    continue
    # train_df = ner_synthetic
  else:
    train_df = get_ner_data(ner_synthetic, K, sample_method)

  for i in range(NUM_REPS):
    print(f"K = {K}, Iteration {i+1} of {NUM_REPS}")
    metrics = run_ner_train(train_df, ner_human_test, seed=42*(i+1), epochs=25)
    # print(metrics)
    results.append({'K': K, \
                    'precision': metrics['precision'], \
                    'recall': metrics['recall'], \
                    'f1': metrics['f1_score']})
    clear_output(wait=True)

# Save results
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv(f"ner_{sample_method}-debug2.csv")

       K  precision    recall        f1
0    100   0.327119  0.407407  0.362875
1    100   0.326135  0.409066  0.362923
2    100   0.332171  0.421227  0.371436
3    100   0.330472  0.425650  0.372071
4    100   0.325210  0.427861  0.369539
..   ...        ...       ...       ...
65  2793   0.430802  0.335544  0.377253
66  2793   0.438010  0.326147  0.373891
67  2793   0.436578  0.327253  0.374092
68  2793   0.422467  0.338861  0.376074
69  2793   0.449038  0.348259  0.392279

[70 rows x 4 columns]


In [None]:
from google.colab import files
files.download('ner_acs-debug2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>