<a href="https://colab.research.google.com/github/mss423/ACS-LessIsMore/blob/main/downsample-driver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Clone GitHub project

In [1]:
import os

# GitHub cloning...
user = "mss423"
repo = "ACS-LessIsMore"

# remove local directory if it already exists
if os.path.isdir(repo):
    !rm -rf {repo}

# clone repo
!git clone https://github.com/{user}/{repo}.git

Cloning into 'ACS-LessIsMore'...
remote: Enumerating objects: 288, done.[K
remote: Counting objects: 100% (288/288), done.[K
remote: Compressing objects: 100% (197/197), done.[K
remote: Total 288 (delta 157), reused 202 (delta 83), pack-reused 0 (from 0)[K
Receiving objects: 100% (288/288), 16.50 MiB | 9.09 MiB/s, done.
Resolving deltas: 100% (157/157), done.


Add the cloned repository's directories to the Python import path (`sys.path`)

In [2]:
import os
import sys


def add_tree_to_sys_path(root, skip_dirs=(".git",)):
    """Append ``root`` and every directory beneath it to ``sys.path``.

    Directories whose name is listed in ``skip_dirs`` are pruned from the walk
    together with their subtrees, and a path that is already present in
    ``sys.path`` is not appended a second time, so re-running the cell does
    not grow ``sys.path``.

    Args:
        root: Directory to walk. A directory that does not exist simply
            produces no entries.
        skip_dirs: Directory names to leave out of the walk.

    Returns:
        None. ``sys.path`` is modified in place.
    """
    for dirpath, dirnames, _ in os.walk(root):
        # Editing dirnames in place stops os.walk from descending into them
        dirnames[:] = [name for name in dirnames if name not in skip_dirs]
        if dirpath not in sys.path:
            sys.path.append(dirpath)


# Make the modules of the cloned project importable from the notebook
add_tree_to_sys_path("ACS-LessIsMore")

Authenticate notebook for Google packages

In [3]:
# if it's Colab runtime, authenticate the user with Google Cloud
# (`sys` is imported by the earlier path-setup cell; outside Colab this branch
# is skipped and no authentication is attempted here)
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

# Google Cloud settings. LOCATION is not referenced by the cells visible in
# this notebook — presumably consumed by the imported project modules;
# TODO confirm.
LOCATION = "us-central1"
PROJECT_ID = "synthetic-data-432701"
# Enable the Compute Engine, AI Platform, Cloud Storage and BigQuery service
# APIs on the project above (shell escape; requires the gcloud CLI)
! gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com bigquery.googleapis.com --project {PROJECT_ID}

Operation "operations/acat.p2-29476227164-db79622d-76b1-4bc1-b6ae-be5d47b34e34" finished successfully.


# Run experiments
## Load Data

In [11]:
from load_data import *
import pandas as pd

# Synthetic training-set sizes of the datasets available for downsampling:
#   FewRel   --> synthetic n = 12800 (cut to 5k)
#   SST2     --> synthetic n =  6000 (cut to 5k)
#   ASTE     --> synthetic n =  2000
#   CrossNER --> synthetic n =  3000

# Pick one of: 'fewrel', 'sst2', 'aste', 'crossner'
dataset = "sst2"

# Lower-case dataset key -> name of that dataset's folder under `datadir`
namemap = {
    "fewrel": "FewRel",
    "sst2": "SST2",
    "aste": "ASTE",
    "crossner": "CrossNER",
}

datadir = "/content/ACS-LessIsMore/datasets"
# Downsampling results are written next to the selected dataset
# (alternative location: "/content/ACS-LessIsMore/results")
savedir = os.path.join(datadir, namemap[dataset], "downsample")

# Load the synthetic training split (the loader's second return value is kept
# as num_labels) and the test split
train_df, num_labels = load_train_data(datadir, dataset, synthetic=True)
test_df = load_test_data(datadir, dataset)

# (alternative: num_labels = test_df['label'].nunique())
print(f"Number of training samples: {len(train_df)}")

{'path': '/content/ACS-LessIsMore/datasets/FewRel/syn-train.json', 'data': (12801, 2), 'unique_labels': 64}
{'path': '/content/ACS-LessIsMore/datasets/FewRel/test.json', 'data': (4480, 2), 'unique_labels': 64}
Number of training samples: 12801


In [12]:
# Cap the SST2 and FewRel training sets at 5000 randomly selected rows
# (the "cut to 5k" noted in the data-loading cell). The fixed random_state
# makes the draw repeatable.
MAX_TRAIN_SAMPLES = 5000
if dataset in ["sst2", "fewrel"]:
    # min() keeps DataFrame.sample from raising ValueError when the frame
    # holds fewer rows than the cap
    train_df = train_df.sample(n=min(MAX_TRAIN_SAMPLES, len(train_df)), random_state=42)

## Run subsampling

In [7]:
import numpy as np

# Alternative single-budget setting: Ks = [len(train_df)//10]
# Take 15 evenly spaced sizes from 100 up to the full training-set size,
# round them to integers, and keep the first 14: the last one equals
# len(train_df), i.e. the whole dataset, so no sampler run is needed for it.
Ks = np.rint(np.linspace(100, len(train_df), num=15)).astype(int)[:-1]

### Downsample

In [8]:
from acs import acs_sample
from kmeans_utils import kmeans_sample

# Two project-local samplers are imported; switch between them by
# (un)commenting the calls below. The ACS call is currently disabled, so the
# k-means sampler is the one that runs.
# train_subsample_idx = acs_sample(train_df, Ks)
# The result is kept in `train_subsample_idx` for the saving cell
# (presumably one index selection per entry of Ks — TODO confirm).
train_subsample_idx = kmeans_sample(train_df, Ks)

100%|██████████| 313/313 [00:59<00:00,  5.25it/s]
Processing Ks...: 100%|██████████| 14/14 [36:41<00:00, 157.25s/it]


Save the subsample indices to disk

In [10]:
import pickle

# NOTE(review): the file name says "acs", but the sampling cell currently calls
# kmeans_sample — confirm the name matches the method that produced the indices.
save_name = "acs_subsample_idx.pkl"  # Choose your desired save name
save_path = os.path.join(savedir, save_name)  # Construct the full save path

# Never overwrite an earlier result: warn and leave the existing file untouched
if os.path.exists(save_path):
    print(f"Warning: File '{save_name}' already exists.")
else:
    # Create the target directory on demand; open() would otherwise fail with
    # FileNotFoundError when it is missing
    os.makedirs(savedir, exist_ok=True)
    # Save train_subsample_idx using pickle
    with open(save_path, 'wb') as f:
        pickle.dump(train_subsample_idx, f)
    print(f"Saved train_subsample_idx to '{save_path}'.")

Saved train_subsample_idx to '/content/ACS-LessIsMore/datasets/SST2/downsample/acs_subsample_idx.pkl'.
