In the following notebook, we investigate the use of SAS (subsets that maximize expected augmentation similarity) to select representative subsets that improve self-supervised learning in a text categorization task. We also investigate the use of multilingual embeddings to further strengthen the SAS selection.

# Setup

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install nlpaug
!pip install gensim

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/521.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/521.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m


In [None]:
import csv
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
import gensim.downloader as api
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from tqdm.auto import tqdm
# Delete some data from RAM to free up space for later processes
import gc

# Ensure you have the necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Import Google Drive for locally saved files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Global variables

In [None]:
# Seed shared by every seeded operation in this notebook
global_seed = 0
# Maximum number of tokens kept per text when tokenizing
max_length = 256
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set seeds for reproducibility (NumPy, Python's random module and PyTorch).
# PyTorch has its own generator; without seeding it, weight initialisation
# and dropout differ from run to run.
np.random.seed(global_seed)
random.seed(global_seed)
torch.manual_seed(global_seed)

## Helper functions

In [None]:
# Basic preprocessing: lowercasing and removing non-alphanumeric characters
def preprocess(text):
    """Normalise raw text before tokenization.

    The text is lowercased, then every run of characters matched by the
    regex class ``\\W`` (i.e. non-word characters) is collapsed into a
    single space.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string with non-word runs replaced by one space each.
    """
    return re.sub(r'\W+', ' ', text.lower())

## Load dataset

In [None]:
# Load the AG News dataset
dataset = load_dataset("ag_news")

# The dataset is divided into 'train' and 'test' splits
train_data = dataset['train']
test_data = dataset['test']

# For the parts that use GPT, we will use only 2.5% of the data due to the cost of running GPT.
train_data = train_data.train_test_split(test_size=0.975, stratify_by_column="label", seed = global_seed)
train_data = train_data['train']

# Example: Viewing the first training sample
print(train_data[0])
print(len(train_data))

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0}
3000


## Create NLPAug augmented texts

In [None]:
# aug_1 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 5, # Only use top 5 options
#     aug_p = 0.2 # 20% of the words augmented
# )
#
# aug_2 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 10, # Only use top 10 options
#     aug_p = 0.4 # 40% of the words augmented
# )
#
# def eda_augmentation_with_word2vec(sentence, type):
#     if type == 1:
#         return aug_1.augment(sentence)[0]
#     else:
#         return aug_2.augment(sentence)[0]

In [None]:
# Add augmented texts for each example using NLPAug
# def add_nlp_aug_columns(data):
#     data['augment_1'] = eda_augmentation_with_word2vec(data['text'], 1)
#     data['augment_2'] = eda_augmentation_with_word2vec(data['text'], 2)
#     return data

# train_data = train_data.map(
#     lambda example: add_nlp_aug_columns(example)
# )

In [None]:
# Save the augmentations as the process takes 2 hours
# train_df = train_data.to_pandas()
# train_df.to_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")

In [None]:
import datasets

# Load dataset with augmentations (pre-prepared)
train_df = pd.read_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")
augmented_data = datasets.Dataset.from_pandas(train_df)

In [None]:
# Sanity check: the cached augmentations are merged into train_data purely by
# row position, so both datasets must have the same length and the same rows.
assert len(augmented_data) == len(train_data), "Augmented dataset and train_data must have the same number of rows"
print(augmented_data)
# Print the last text from each dataset so they can be compared by eye
print(augmented_data['text'][-1])
print(train_data['text'][-1])

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', '__index_level_0__'],
    num_rows: 3000
})
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.


In [None]:
# Merge datasets by columns
def add_augmented_columns(data, augments, idx):
    """Copy the two pre-computed augmentations for row ``idx`` into ``data``.

    Args:
        data: Mutable mapping for one example; it is modified in place.
        augments: Column-indexable container providing 'augment_1' and
            'augment_2', each indexable by row position.
        idx: Row position of ``data`` inside ``augments``.

    Returns:
        The same ``data`` object with 'augment_1' and 'augment_2' set.
    """
    for column in ('augment_1', 'augment_2'):
        data[column] = augments[column][idx]
    return data

train_data = train_data.map(
    lambda example, idx: add_augmented_columns(example, augmented_data, idx),
    with_indices=True
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

## Load GPT augmented text and Spanish translations

In [None]:
gpt_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/COM SCI 260D/gpt_dataset.csv", split = 'train')

print(gpt_dataset)
print(gpt_dataset[0:5])
print(train_data[0:5])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Original Text', 'Translation', 'Paraphrase 1', 'Paraphrase 2'],
    num_rows: 3000
})


In [None]:
# Ensure both datasets have the same number of rows
assert len(gpt_dataset) == len(train_data), "Datasets must have the same number of rows"

# Merge datasets by columns
def add_gpt_columns(data, augments, idx):
    """Attach the translation and GPT paraphrases for row ``idx`` to ``data``.

    'Translation' and 'Paraphrase 1' are copied as they are. 'Paraphrase 2'
    is used only when it is a non-empty, non-whitespace string; otherwise
    the example's existing 'augment_1' value is used in its place.

    Args:
        data: Mutable mapping for one example (must already hold 'augment_1'
            when the fallback is needed); it is modified in place.
        augments: Column-indexable container with the columns 'Translation',
            'Paraphrase 1' and 'Paraphrase 2'.
        idx: Row position of ``data`` inside ``augments``.

    Returns:
        The same ``data`` object with 'translation', 'gpt_1' and 'gpt_2' set.
    """
    data['translation'] = augments['Translation'][idx]
    data['gpt_1'] = augments['Paraphrase 1'][idx]

    second_paraphrase = augments['Paraphrase 2'][idx]
    # Missing (None/empty) or whitespace-only paraphrases are unusable
    is_usable = bool(second_paraphrase) and bool(second_paraphrase.strip())
    # Fallback to EDA augmentation when the second paraphrase is unusable
    data['gpt_2'] = second_paraphrase if is_usable else data['augment_1']
    return data

# Add 'translation', 'gpt_1' and 'gpt_2' to every training example, matching
# rows of gpt_dataset to rows of train_data by position.
train_data = train_data.map(
    lambda example, idx: add_gpt_columns(example, gpt_dataset, idx),
    with_indices=True
)

print(train_data)
print(train_data[0])

# Collect the rows whose GPT paraphrases are missing (None) so they can be inspected.
# NOTE: this only builds `error_data`; nothing is removed from `train_data` here.
# An earlier note reported 8 examples that were not imported correctly.
error_data = train_data.filter(lambda row: row['gpt_2'] is None or row['gpt_1'] is None)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 3000
})
{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0, 'augment_1': '9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.', 'augment_2': "2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.", 'translation': '9 heridos en explosión en la Embajada de Indonesia PARÍS -- Una explosión golpeó la Embajada de Indonesia en París hoy, hiriendo levemente a nueve personas, según informó una estación de radio francesa.', 'gpt_1': '1. The Indonesian Embassy

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
print(error_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 0
})


In [None]:
print(train_data['text'][0])
print(train_data['gpt_1'][0])
print(train_data['gpt_2'][0])
print(train_data['augment_1'][0])
print(train_data['augment_2'][0])
print("")
print(train_data['text'][-1])
print(train_data['gpt_1'][-1])
print(train_data['gpt_2'][-1])
print(train_data['augment_1'][-1])
print(train_data['augment_2'][-1])

9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.
1. The Indonesian Embassy in Paris was struck by an explosion today, leading to minor injuries for nine individuals, as reported by a French radio station.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.

Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. un

# Latent class discovery

First, we randomly select 1% of the data as a small labelled set, fine-tune a small pre-trained classifier on it, and use that classifier to assign a latent class to every remaining training example.

In [None]:
# Select 1% of the data randomly
shuffled_dataset = train_data.shuffle(seed = global_seed)
sample_size = int(0.01 * len(shuffled_dataset))
labeled_data = shuffled_dataset.select(range(sample_size))
rest_of_data = shuffled_dataset.select(range(sample_size, len(shuffled_dataset)))

# Get number of classes from the 1% data
unique_labels = set(labeled_data['label'])
num_classes = len(unique_labels)

print(num_classes)

4


In [None]:
# Split the 1% dataset into training and validation sets. We only use 20% as validation set due to size of data.
labeled_data_for_training = labeled_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)

In [None]:
# Pre-process and tokenize data
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')

def tokenize(examples):
    """Preprocess and tokenize a batch of examples.

    Each entry of ``examples['text']`` is passed through ``preprocess`` and
    the results are tokenized together with the module-level ``tokenizer``,
    padded and truncated to ``max_length`` tokens.

    Args:
        examples: Batch mapping whose 'text' entry is an iterable of strings.

    Returns:
        The tokenizer output for the batch, as PyTorch tensors.
    """
    cleaned_texts = list(map(preprocess, examples['text']))
    return tokenizer(
        cleaned_texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

tokenized_dataset = labeled_data_for_training.map(tokenize, batched=True)

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# Import model for linear classification using the number of classes in 1% data.
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-small', num_labels=num_classes)

pytorch_model.bin:   0%|          | 0.00/116M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the 1% labelled data to train a BERT model
latent_num_epochs = 20

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=latent_num_epochs,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    # Warm up over the first 10% of the optimisation steps. A fixed
    # warmup_steps=500 is longer than this whole run (24 examples, batch size 4,
    # 20 epochs = 120 steps), so the learning rate would still be ramping up
    # when training ends and would never reach its configured value.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['test'])

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.257777
2,1.471200,1.255899
3,1.471200,1.252272
4,1.394200,1.247187
5,1.408400,1.241248
6,1.408400,1.234999
7,1.377800,1.226443
8,1.377800,1.214229
9,1.317600,1.202122
10,1.291200,1.192617


TrainOutput(global_step=120, training_loss=1.1886330604553224, metrics={'train_runtime': 3.8426, 'train_samples_per_second': 124.915, 'train_steps_per_second': 31.229, 'total_flos': 9492677591040.0, 'train_loss': 1.1886330604553224, 'epoch': 20.0})

In [None]:
# Save the model to a directory for local use
model.save_pretrained('./model')
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [None]:
# Use the model to predict latent classes for rest of training data
def predict_batch(batch):
    """Predict a latent class for every text in ``batch``.

    Uses the module-level ``tokenizer`` and fine-tuned ``model``.

    Args:
        batch: Mapping whose 'text' entry is an iterable of strings.

    Returns:
        A dict with one key, 'predictions': a NumPy array holding the index
        of the highest logit for each text.
    """
    # Make sure dropout is disabled; do not rely on the mode the Trainer
    # happened to leave the model in.
    model.eval()

    # Tokenize the examples
    preprocessed_texts = [preprocess(text) for text in batch['text']]
    inputs = tokenizer(preprocessed_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    # Move the inputs to whichever device the model lives on
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Perform the prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class indices
    predictions = outputs.logits.argmax(-1).cpu().numpy()
    return {'predictions': predictions}

# Create a DataLoader for the rest of the data.
# No shuffling is requested, so predictions come back in dataset order; the
# later step that attaches `results[idx]` to row `idx` depends on that order.
rest_of_data_loader = DataLoader(rest_of_data, batch_size=32)

# Run predictions in batches and flatten them into one list,
# with one predicted class index per example.
results = []
for batch in tqdm(rest_of_data_loader, desc="Predicting latent classes"):
    batch_results = predict_batch(batch)
    results.extend(batch_results['predictions'])

Predicting latent classes:   0%|          | 0/93 [00:00<?, ?it/s]

In [None]:
# Save the predicted classes for local use
file_name = './latent_classes.csv'

with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)

    for integer in results:
        writer.writerow([integer])

In [None]:
# Add the latent class to the dataset
def add_new_column(example, idx, new_data):
    """Store ``new_data[idx]`` on ``example`` under the key 'latent_class'.

    Args:
        example: Mutable mapping for one example; it is modified in place.
        idx: Row position of ``example``, used to index ``new_data``.
        new_data: Sequence of latent classes aligned with the dataset rows.

    Returns:
        The same ``example`` object with 'latent_class' set.
    """
    latent_class = new_data[idx]
    example['latent_class'] = latent_class
    return example

labeled_rest_of_data = rest_of_data.map(
    lambda example, idx: add_new_column(example, idx, results),
    with_indices=True
)

# For the labelled data, we use the label as the latent class as it is already known.
def insert_column_with_same_value(example):
    """Set 'latent_class' to the example's own 'label'.

    Args:
        example: Mutable mapping with a 'label' entry; it is modified in place.

    Returns:
        The same ``example`` object with 'latent_class' equal to 'label'.
    """
    known_label = example['label']
    example['latent_class'] = known_label
    return example

labeled_data = labeled_data.map(
    lambda example: insert_column_with_same_value(example)
)

Map:   0%|          | 0/2970 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [None]:
# How well does the latent classification work?
def check_columns_equality(example, column1, column2):
    """Report whether two columns of one example hold equal values.

    Args:
        example: Mapping for one example.
        column1: Key of the first value to compare.
        column2: Key of the second value to compare.

    Returns:
        The result of ``example[column1] == example[column2]``.
    """
    first_value, second_value = example[column1], example[column2]
    return first_value == second_value

check_accuracy_dataset = labeled_rest_of_data.filter(lambda example: check_columns_equality(example, 'label', 'latent_class'))

matching_count = len(check_accuracy_dataset)
print(f"Accuracy of latent classification: {matching_count / len(labeled_rest_of_data)}")

Filter:   0%|          | 0/2970 [00:00<?, ? examples/s]

Accuracy of latent classification: 0.6828282828282828


In [None]:
# Re-merge the 1% data and rest of the data for self-supervised learning
latent_class_data = concatenate_datasets([labeled_data, labeled_rest_of_data])
print(latent_class_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 3000
})


In [None]:
del results
del model
del tokenizer
del training_args
del trainer
del rest_of_data
del rest_of_data_loader
del labeled_rest_of_data
del check_accuracy_dataset
del labeled_data
del gpt_dataset

# Collect garbage
gc.collect()

131

# Subset selection

Now that we have some form of latent classes assigned to all of the training data, we can find subsets using three methods:

1. Original SAS algorithm
2. Random subset from each latent class
3. Original SAS algorithm + Spanish embeddings

We will select a 20% subset and compare the three. In the long run, we also need to compare with using the whole dataset but that is going to be very computationally intensive.

In [None]:
# Split into latent classes to select subsets from each
def create_filter_function(latent_class):
    """Build a predicate that keeps examples of one latent class.

    Args:
        latent_class: The latent class value to match.

    Returns:
        A function taking an example mapping and returning whether its
        'latent_class' entry equals ``latent_class``.
    """
    return lambda example: example['latent_class'] == latent_class

world_train_data = latent_class_data.filter(create_filter_function(0))
sports_train_data = latent_class_data.filter(create_filter_function(1))
business_train_data = latent_class_data.filter(create_filter_function(2))
tech_train_data = latent_class_data.filter(create_filter_function(3))

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# We will select the subset in this ratio for all subsets, for fairness
print(len(world_train_data))
print(len(sports_train_data))
print(len(business_train_data))
print(len(tech_train_data))

690
711
306
1293


## SAS

In [None]:
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')
model = AutoModel.from_pretrained('prajjwal1/bert-small')

def bert_embeddings(data):
    """Embed texts with the module-level BERT ``tokenizer`` and ``model``.

    All texts are tokenized (padded to the longest one, truncated to
    ``max_length`` tokens) and passed through the model in a single batch.

    Args:
        data: The texts to embed, in a form accepted by the tokenizer.

    Returns:
        A tensor with one row per text, taken from position 0 of the
        model's last hidden state.
    """
    inputs = tokenizer(data, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Inference only: skip building the autograd graph, which would otherwise
    # keep every intermediate activation for the whole batch in memory.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the first token ([CLS]) embeddings from each sentence
    return outputs.last_hidden_state[:,0,:].detach()

def sas_algorithm(data, subset_size):
    """Greedily select ``subset_size`` examples from ``data``.

    The texts are embedded with ``bert_embeddings`` and compared with cosine
    similarity. At every step the example that maximises the sum, over all
    examples, of their similarity to the most similar selected example is
    added to the subset.

    Args:
        data: The texts to select from.
        subset_size: Number of examples to select (an integer count).

    Returns:
        A list of the selected row positions. The list is built from a set,
        so it is not in selection order.

    Raises:
        ValueError: If ``subset_size`` exceeds the number of examples.
    """
    # Get embeddings for the text data
    embeddings = bert_embeddings(data)
    n = len(embeddings)
    if subset_size > n:
        raise ValueError(f"subset_size ({subset_size}) cannot exceed the number of examples ({n})")

    # Initialize the subset and similarities
    S = set()
    # Best similarity of every example to any selected example so far
    S_similarities = np.full(n, -np.inf)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings)

    del embeddings
    gc.collect()

    for _ in tqdm(range(subset_size)):
        # Gain of every candidate at once: row i holds the objective value
        # obtained if example i were added to the current subset.
        gains = np.maximum(S_similarities, similarity_matrix).sum(axis=1)
        # Already-selected examples must not be picked again
        if S:
            gains[list(S)] = -np.inf

        # Find the exemplar with the best gain (lowest index wins ties)
        best_exemplar = int(np.argmax(gains))

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        S.add(best_exemplar)

    return list(S)

In [None]:
# Run the SAS algorithm to select a 20% subset from each latent class.
# Note: `subset_size` here is a fraction; it is converted to an integer count
# per latent class before being passed to sas_algorithm.
subset_size = 0.2

world_sas_indices = sas_algorithm(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_sas_indices = sas_algorithm(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_sas_indices = sas_algorithm(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_sas_indices = sas_algorithm(tech_train_data['text'], int(subset_size * len(tech_train_data)))

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/142 [00:00<?, ?it/s]

  0%|          | 0/61 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

In [None]:
# Sanity check on indices
print(world_sas_indices)
print(sports_sas_indices)
print(business_sas_indices)
print(tech_sas_indices)

[513, 3, 9, 524, 12, 526, 16, 529, 19, 20, 27, 28, 31, 546, 549, 550, 38, 554, 561, 563, 54, 570, 63, 578, 582, 76, 589, 591, 593, 82, 599, 600, 89, 90, 601, 605, 609, 611, 612, 101, 614, 108, 114, 629, 119, 121, 633, 634, 636, 124, 635, 133, 137, 138, 650, 140, 142, 144, 659, 149, 661, 665, 676, 678, 682, 683, 172, 688, 179, 183, 190, 196, 198, 199, 200, 201, 204, 206, 210, 211, 225, 227, 233, 234, 235, 265, 271, 277, 279, 286, 290, 295, 296, 302, 312, 321, 323, 325, 328, 329, 330, 337, 339, 341, 343, 344, 345, 351, 352, 353, 355, 367, 369, 371, 381, 394, 395, 397, 399, 400, 409, 415, 416, 421, 425, 430, 454, 459, 461, 463, 473, 476, 481, 491, 492, 493, 501, 507]
[517, 523, 16, 21, 22, 23, 26, 541, 31, 39, 551, 41, 42, 555, 44, 558, 47, 50, 564, 570, 574, 576, 64, 69, 80, 595, 83, 84, 596, 86, 598, 89, 90, 602, 600, 87, 606, 97, 610, 612, 107, 619, 620, 626, 631, 635, 637, 639, 643, 133, 135, 654, 146, 148, 663, 664, 667, 156, 160, 672, 673, 162, 678, 171, 173, 685, 175, 688, 697, 700

In [None]:
world_sas_subset = world_train_data.select(world_sas_indices)
sports_sas_subset = sports_train_data.select(sports_sas_indices)
business_sas_subset = business_train_data.select(business_sas_indices)
tech_sas_subset = tech_train_data.select(tech_sas_indices)

# Create the subset to use for contrastive self-supervised learning
sas_subset_data = concatenate_datasets([world_sas_subset, sports_sas_subset, business_sas_subset, tech_sas_subset])

In [None]:
del world_sas_subset
del sports_sas_subset
del business_sas_subset
del tech_sas_subset
del world_sas_indices
del sports_sas_indices
del business_sas_indices
del tech_sas_indices
del tokenizer
del model

gc.collect()

15

In [None]:
print(sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


## Random

In [None]:
# Randomly sample 20% of the dataset for each latent class
def random_subset(data, subset_size):
    """Draw ``subset_size`` distinct row positions of ``data`` at random.

    Uses Python's global ``random`` generator, so the result depends on its
    current state.

    Args:
        data: Any sized collection; only its length is used.
        subset_size: Number of positions to draw.

    Returns:
        A list of ``subset_size`` unique integers in ``range(len(data))``.
    """
    candidate_positions = range(len(data))
    return random.sample(candidate_positions, k=subset_size)

In [None]:
subset_size = 0.2

world_random_indices = random_subset(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_random_indices = random_subset(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_random_indices = random_subset(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_random_indices = random_subset(tech_train_data['text'], int(subset_size * len(tech_train_data)))

In [None]:
# Sanity check on indices
print(world_random_indices)
print(sports_random_indices)
print(business_random_indices)
print(tech_random_indices)

[654, 114, 25, 281, 250, 228, 142, 104, 558, 89, 604, 432, 32, 30, 95, 223, 238, 517, 616, 27, 574, 203, 665, 681, 429, 225, 459, 603, 284, 6, 163, 678, 348, 661, 159, 220, 344, 682, 94, 389, 99, 367, 352, 618, 270, 44, 470, 549, 127, 387, 80, 565, 300, 633, 370, 591, 196, 71, 46, 233, 296, 81, 673, 103, 650, 656, 464, 373, 166, 379, 363, 214, 273, 73, 175, 546, 685, 167, 473, 388, 276, 570, 224, 332, 57, 234, 677, 323, 410, 274, 67, 216, 580, 322, 217, 511, 405, 469, 146, 271, 683, 252, 669, 551, 269, 438, 408, 635, 607, 141, 521, 505, 93, 48, 112, 156, 659, 658, 65, 394, 390, 479, 541, 257, 11, 117, 642, 617, 657, 688, 637, 445, 161, 623, 3, 585, 512, 182]
[519, 108, 640, 305, 654, 710, 623, 203, 156, 382, 165, 552, 543, 0, 613, 331, 500, 19, 114, 371, 314, 245, 59, 246, 580, 80, 87, 497, 70, 545, 128, 131, 675, 486, 562, 169, 271, 540, 621, 433, 216, 699, 205, 319, 408, 665, 701, 448, 529, 462, 123, 253, 230, 65, 346, 21, 602, 567, 235, 706, 225, 7, 72, 646, 60, 234, 69, 32, 338, 64

In [None]:
world_random_subset = world_train_data.select(world_random_indices)
sports_random_subset = sports_train_data.select(sports_random_indices)
business_random_subset = business_train_data.select(business_random_indices)
tech_random_subset = tech_train_data.select(tech_random_indices)

# Create the subset to use for contrastive self-supervised learning
random_subset_data = concatenate_datasets([world_random_subset, sports_random_subset, business_random_subset, tech_random_subset])

In [None]:
del world_random_subset
del sports_random_subset
del business_random_subset
del tech_random_subset
del world_random_indices
del sports_random_indices
del business_random_indices
del tech_random_indices

gc.collect()

23

In [None]:
print(random_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


## Multi-lingual SAS

In [None]:
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')
model = AutoModel.from_pretrained('prajjwal1/bert-small')

spanish_tokenizer = AutoTokenizer.from_pretrained('dccuchile/albert-tiny-spanish')
spanish_model = AutoModel.from_pretrained('dccuchile/albert-tiny-spanish')

def bert_embeddings(data):
    """Embed texts with the module-level BERT ``tokenizer`` and ``model``.

    All texts are tokenized (padded to the longest one, truncated to
    ``max_length`` tokens) and passed through the model in a single batch.

    Args:
        data: The texts to embed, in a form accepted by the tokenizer.

    Returns:
        A tensor with one row per text, taken from position 0 of the
        model's last hidden state.
    """
    inputs = tokenizer(data, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Inference only: skip building the autograd graph, which would otherwise
    # keep every intermediate activation for the whole batch in memory.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the first token ([CLS]) embeddings from each sentence
    return outputs.last_hidden_state[:,0,:].detach()

def albert_embeddings(data):
    """Embed texts with the module-level ``spanish_tokenizer`` and ``spanish_model``.

    All texts are tokenized (padded to the longest one, truncated to
    ``max_length`` tokens) and passed through the model in a single batch.

    Args:
        data: The texts to embed, in a form accepted by the tokenizer.

    Returns:
        A tensor with one row per text, taken from position 0 of the
        model's last hidden state.
    """
    inputs = spanish_tokenizer(data, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Inference only: skip building the autograd graph, which would otherwise
    # keep every intermediate activation for the whole batch in memory.
    with torch.no_grad():
        outputs = spanish_model(**inputs)
    # Take the first token ([CLS]) embeddings from each sentence
    return outputs.last_hidden_state[:,0,:].detach()

def sas_with_spanish_algorithm(data, subset_size):
    """Greedily select ``subset_size`` examples using two similarity sources.

    The cosine-similarity matrix of the embeddings of ``data['text']``
    (from ``bert_embeddings``) is added element-wise to the cosine-similarity
    matrix of the embeddings of ``data['translation']`` (from
    ``albert_embeddings``). At every step the example that maximises the sum,
    over all examples, of their combined similarity to the most similar
    selected example is added to the subset.

    Args:
        data: Column-indexable dataset providing 'text' and 'translation'.
        subset_size: Number of examples to select (an integer count).

    Returns:
        A list of the selected row positions. The list is built from a set,
        so it is not in selection order.

    Raises:
        ValueError: If ``subset_size`` exceeds the number of examples.
    """
    # Get embeddings for the text data
    embeddings = bert_embeddings(data['text'])
    n = len(embeddings)
    print(embeddings.shape)
    if subset_size > n:
        raise ValueError(f"subset_size ({subset_size}) cannot exceed the number of examples ({n})")

    # Initialize the subset and similarities
    S = set()
    # Best similarity of every example to any selected example so far
    S_similarities = np.full(n, -np.inf)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings)

    del(embeddings)
    gc.collect()

    # Get Spanish embeddings
    spanish_embeddings = albert_embeddings(data['translation'])
    print(spanish_embeddings.shape)

    # Add the similarities between the translations to the original ones
    similarity_matrix = np.add(similarity_matrix, cosine_similarity(spanish_embeddings))

    del(spanish_embeddings)
    gc.collect()

    for _ in tqdm(range(subset_size)):
        # Gain of every candidate at once: row i holds the objective value
        # obtained if example i were added to the current subset.
        gains = np.maximum(S_similarities, similarity_matrix).sum(axis=1)
        # Already-selected examples must not be picked again
        if S:
            gains[list(S)] = -np.inf

        # Find the exemplar with the best gain (lowest index wins ties)
        best_exemplar = int(np.argmax(gains))

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        S.add(best_exemplar)

    return list(S)

tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

In [None]:
# Run the SAS algorithm to select a 20% subset from each latent class
subset_size = 0.2

world_multilingual_sas_indices = sas_with_spanish_algorithm(world_train_data, int(subset_size * len(world_train_data)))
sports_multilingual_sas_indices = sas_with_spanish_algorithm(sports_train_data, int(subset_size * len(sports_train_data)))
business_multilingual_sas_indices = sas_with_spanish_algorithm(business_train_data, int(subset_size * len(business_train_data)))
tech_multilingual_sas_indices = sas_with_spanish_algorithm(tech_train_data, int(subset_size * len(tech_train_data)))

torch.Size([690, 512])
torch.Size([690, 312])


  0%|          | 0/138 [00:00<?, ?it/s]

torch.Size([711, 512])
torch.Size([711, 312])


  0%|          | 0/142 [00:00<?, ?it/s]

torch.Size([306, 512])
torch.Size([306, 312])


  0%|          | 0/61 [00:00<?, ?it/s]

torch.Size([1293, 512])
torch.Size([1293, 312])


  0%|          | 0/258 [00:00<?, ?it/s]

In [None]:
world_multilingual_sas_subset = world_train_data.select(world_multilingual_sas_indices)
sports_multilingual_sas_subset = sports_train_data.select(sports_multilingual_sas_indices)
business_multilingual_sas_subset = business_train_data.select(business_multilingual_sas_indices)
tech_multilingual_sas_subset = tech_train_data.select(tech_multilingual_sas_indices)

# Create the subset to use for contrastive self-supervised learning
multilingual_sas_subset_data = concatenate_datasets([world_multilingual_sas_subset, sports_multilingual_sas_subset, business_multilingual_sas_subset, tech_multilingual_sas_subset])

In [None]:
del world_multilingual_sas_subset
del sports_multilingual_sas_subset
del business_multilingual_sas_subset
del tech_multilingual_sas_subset
del world_multilingual_sas_indices
del sports_multilingual_sas_indices
del business_multilingual_sas_indices
del tech_multilingual_sas_indices
del tokenizer
del model
del spanish_tokenizer
del spanish_model

gc.collect()

15

In [None]:
print(multilingual_sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


# Contrastive learning pipeline

Now that we have selected a 20% representative subset of the data in three different ways, we use each subset, along with two different forms of textual augmentation:

1. Easy data augmentation such as synonym replacement, random addition and random swapping
2. GPT for paraphrasing sentences

to train a self-supervised contrastive learning encoder, which will be evaluated in a downstream prediction task.

In [None]:
# Define the encoder model. We will use pre-trained BERT as the initial embedding
# and use contrastive learning to further train embeddings that can be tested
# downstream, specifically in news article domain.
model_name = 'prajjwal1/bert-small'

class Encoder(nn.Module):
    """Text encoder wrapping the pre-trained model named by ``model_name``.

    Raw strings are tokenized inside ``forward``, so callers pass text
    directly rather than token ids.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, text):
        """Return one embedding per input text.

        Args:
            text: The text(s) to encode, in a form accepted by the tokenizer.

        Returns:
            The last hidden state at position 0 of every sequence.
        """
        encoded = self.tokenizer(
            text,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        # Move every tokenizer output onto the configured device
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        hidden_states = self.model(**on_device).last_hidden_state
        # Use the [CLS] token as the embedding
        return hidden_states[:, 0, :]

# Define the contrastive (InfoNCE) loss
class ContrastiveLoss(nn.Module):
    """InfoNCE-style contrastive loss with one positive and one negative per anchor.

    For every anchor the loss is

        -log( exp(s_pos / t) / (exp(s_pos / t) + exp(s_neg / t)) )

    where ``s_pos = cos(z_i, z_j)``, ``s_neg = cos(z_i, z_k)`` and ``t`` is the
    temperature. The per-anchor losses are averaged.

    Args:
        temperature: Positive factor the cosine similarities are divided by.
            The default of 1.0 reproduces the unscaled loss; smaller values
            sharpen the contrast between positive and negative.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """

    def __init__(self, temperature=1.0):
        super().__init__()
        if temperature <= 0:
            raise ValueError(f'temperature must be positive, got {temperature}')
        self.temperature = temperature

    def forward(self, z_i, z_j, z_k):
        """Return the mean loss for anchors ``z_i``, positives ``z_j``, negatives ``z_k``.

        The three tensors are compared row-wise with cosine similarity taken
        along ``dim=1``.
        """
        # We use cosine similarities between the embeddings
        cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

        # Similarity with positive augmented examples vs. negative examples
        positive_logit = cos(z_i, z_j) / self.temperature
        negative_logit = cos(z_i, z_k) / self.temperature

        # -log(e^p / (e^p + e^n)) == log(1 + e^(n - p)) == softplus(n - p).
        # This form never exponentiates a large positive number on its own, so
        # it stays finite even when a small temperature makes the logits large.
        loss = torch.nn.functional.softplus(negative_logit - positive_logit)

        return loss.mean()

In [None]:
# Helper function to randomly move around indices to create a list for negative examples
def shuffle_without_duplication(arr, seed=None):
    """Return a shuffled copy of ``arr`` in which no element keeps its original index.

    The permutation is found by rejection sampling: the index array is shuffled
    repeatedly until no index sits in its original position. It is used to pair
    every example of a batch with a *different* example as its negative.

    Args:
        arr: Sequence to shuffle; it is converted with ``np.array``.
        seed: Seed for the permutation. When omitted, the notebook-wide
            ``global_seed`` is used, so every call with the same input length
            yields the same permutation.

    Returns:
        A Python list with the elements of ``arr`` in the permuted order
        (an empty list for empty input).

    Raises:
        ValueError: If ``arr`` has exactly one element. No permutation can move
            a single element away from its position, and without this check the
            rejection loop would never terminate.
    """
    arr = np.array(arr)
    n = len(arr)
    if n == 1:
        raise ValueError('cannot shuffle a single element away from its own position')
    if seed is None:
        seed = global_seed

    # A private RandomState draws the same sequence as reseeding the global
    # NumPy generator with `seed` would, but leaves the global generator's
    # state untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)

    original_positions = np.arange(n)
    indices = original_positions.copy()
    while True:
        # Shuffle the indices in place
        rng.shuffle(indices)
        # Accept only if no element remains in its original position
        if not np.any(indices == original_positions):
            break
    # Return the shuffled array
    return arr[indices].tolist()

In [None]:
# Training function for self-supervised contrastive learning
def self_supervised_training(data, num_epochs, augment='eda', batch_size=32, learning_rate=1e-6):
    """Train a fresh ``Encoder`` with the contrastive loss on pre-computed augmentations.

    Each example contributes two augmented views: the first is the anchor, the
    second is its positive, and the anchor of a different example from the same
    batch is its negative.

    Args:
        data: Dataset exposing ``.shuffle(seed=...)`` and the columns
            ``augment_1``/``augment_2`` (EDA) and ``gpt_1``/``gpt_2`` (GPT).
        num_epochs: Number of passes over ``data``.
        augment: ``'eda'`` selects the EDA columns; any other value selects the
            GPT paraphrase columns.
        batch_size: Number of examples per optimization step (at least 2, since
            every anchor needs another example from its batch as a negative).
        learning_rate: Learning rate of the Adam optimizer.

    Returns:
        The trained ``Encoder`` (left in training mode).

    Raises:
        ValueError: If ``batch_size`` is smaller than 2.
    """
    if batch_size < 2:
        raise ValueError(f'batch_size must be at least 2, got {batch_size}')

    # Shuffle the data for better training. At first, all subsets are concatenated and
    # therefore separated by latent class
    data = data.shuffle(seed = global_seed)
    if (augment == 'eda'):
        existing_augments_1 = data['augment_1']
        existing_augments_2 = data['augment_2']
    else:
        existing_augments_1 = data['gpt_1']
        existing_augments_2 = data['gpt_2']
    num_examples = len(existing_augments_1)

    encoder = Encoder().to(device)
    loss_fn = ContrastiveLoss()
    optimizer = optim.Adam(encoder.parameters(), lr = learning_rate)

    # Training loop
    encoder.train()
    for epoch in range(num_epochs):
        for i in range(0, num_examples, batch_size):
            # Take the next batch of the two augmented views
            batch_existing_augments_1 = existing_augments_1[i:i + batch_size]
            batch_existing_augments_2 = existing_augments_2[i:i + batch_size]

            # A trailing batch with a single example has no other example to act
            # as its negative (the negative-sampling helper cannot produce one),
            # so it is skipped.
            if len(batch_existing_augments_1) < 2:
                continue

            # Clean texts and create a list of negative texts from the batch, ensuring
            # that the same example is not selected for the negative.
            clean_existing_augments_1 = [preprocess(text) for text in batch_existing_augments_1]
            clean_existing_augments_2 = [preprocess(text) for text in batch_existing_augments_2]
            negative_augments = shuffle_without_duplication(clean_existing_augments_1)

            # Get the embeddings from the encoder
            z_i = encoder(clean_existing_augments_1)
            z_j = encoder(clean_existing_augments_2)
            z_k = encoder(negative_augments)

            # Compute the contrastive loss
            loss = loss_fn(z_i, z_j, z_k)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print every 20 steps
            if (i // batch_size) % 20 == 0:
                print(f'Epoch {epoch}, Step {i // batch_size}, Loss: {loss.item()}')

    return encoder

# Evaluation pipeline

Through contrastive learning, we have learned an encoder $f$ that embeds our news article text for better downstream performance. We test it by freezing $f$, training a linear classifier head on top of it with the true labels, and measuring accuracy on the test dataset. Here, we compare the efficacy of the encoder $f$ across the three different ways of selecting a subset. A comparison against training on the full data is also needed, but is left as a future direction due to computational constraints.

In [None]:
# Define the linear classifier head
class LinearClassifier(nn.Module):
    """Single fully connected layer that maps an embedding to per-class scores."""

    def __init__(self, hidden_size, num_classes):
        super(LinearClassifier, self).__init__()
        # The attribute name `linear` determines the state_dict keys
        # ('linear.weight', 'linear.bias') used by saved checkpoints.
        self.linear = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        scores = self.linear(x)
        return scores

In [None]:
# Create dataloaders from the train and test data
def create_dataloaders(train_data, test_data, batch_size):
    """Build train/validation/test ``DataLoader`` objects of (text, label) pairs.

    ``train_data`` is split 80/20 into training and validation parts, stratified
    by the ``label`` column and seeded with ``global_seed``. Every text is passed
    through ``preprocess``; labels are converted to tensors. Only the training
    loader shuffles its examples.

    Returns:
        ``(train_dataloader, val_dataloader, test_dataloader)``
    """
    split = train_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)

    # The held-out part of the split serves as the validation set.
    partitions = {'train': split['train'], 'val': split['test'], 'test': test_data}

    # Clean the texts of each partition
    cleaned = {name: list(map(preprocess, part['text'])) for name, part in partitions.items()}

    # Create tensors for labels
    targets = {name: torch.tensor(part['label']) for name, part in partitions.items()}

    # Pair every text with its label
    pairs = {name: list(zip(cleaned[name], targets[name])) for name in partitions}

    return (
        DataLoader(pairs['train'], batch_size=batch_size, shuffle=True),
        DataLoader(pairs['val'], batch_size=batch_size),
        DataLoader(pairs['test'], batch_size=batch_size),
    )

# Evaluation function to train a linear classifier head on top of the learned encoder
# and evaluate on the test set
def evaluate_encoder(encoder, train_data, test_data):
    """Linear-probe evaluation of a trained encoder.

    The encoder is frozen, a ``LinearClassifier`` head is trained on its
    embeddings using the true labels, the head with the lowest validation loss
    is restored from a checkpoint, and accuracy is measured on ``test_data``.

    Side effects: ``encoder`` is put in eval mode and all of its parameters get
    ``requires_grad = False``; the best classifier head is written to
    ``./best_checkpoint.pth``; progress and the final accuracy are printed.

    Args:
        encoder: Trained ``Encoder``; ``encoder.model.config.hidden_size`` sets
            the input size of the classifier head.
        train_data: Labelled data used for training and validation of the head.
        test_data: Labelled data used for the final accuracy.

    Returns:
        The test-set accuracy as a float in [0, 1].
    """
    # Parameters
    batch_size = 64
    num_epochs = 5
    num_classes = 4
    learning_rate = 1e-4

    # Create dataloaders
    train_dataloader, val_dataloader, test_dataloader = create_dataloaders(train_data, test_data, batch_size)

    # Set the encoder to evaluation mode and freeze all layers to test the trained embeddings from contrastive learning
    encoder.eval()
    for param in encoder.parameters():
        param.requires_grad = False

    # Initialize the linear classifier head
    classifier = LinearClassifier(encoder.model.config.hidden_size, num_classes).to(device)

    # Loss function and optimizer for the classifier head
    # (no learning-rate scheduler is used).
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters(), lr = learning_rate)

    best_val_loss = float('inf')
    checkpoint_path = './best_checkpoint.pth'

    # Train the classifier
    for epoch in range(num_epochs):
        classifier.train()
        train_loss = 0.0

        for (index, data) in enumerate(train_dataloader):
            texts, labels = data
            labels = labels.to(device)
            optimizer.zero_grad()

            # Forward pass through the frozen encoder and classifier head.
            # The encoder is frozen, so no graph needs to be kept for it.
            with torch.no_grad():
                embeddings = encoder(texts)
            outputs = classifier(embeddings)
            loss = criterion(outputs, labels)

            # Backpropagation
            loss.backward()
            optimizer.step()
            # `loss` is the mean over the batch; weight it by the batch size so
            # that dividing by the dataset size below gives a per-sample average.
            train_loss += loss.item() * labels.size(0)

            # Print every 50 steps
            if index % 50 == 0:
                print(f'Epoch {epoch}, Step {index}, Loss: {loss.item()}')

        classifier.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data in val_dataloader:
                texts, labels = data
                labels = labels.to(device)

                embeddings = encoder(texts)
                outputs = classifier(embeddings)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * labels.size(0)

        # Average per-sample losses
        train_loss /= len(train_dataloader.dataset)
        val_loss /= len(val_dataloader.dataset)

        # Print stats
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(classifier.state_dict(), checkpoint_path)

    # Evaluate the classifier with the best (lowest validation loss) weights
    classifier.load_state_dict(torch.load(checkpoint_path, map_location=device))
    classifier.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for texts, labels in test_dataloader:
            labels = labels.to(device)

            embeddings = encoder(texts)
            outputs = classifier(embeddings)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f'Accuracy on the test set: {accuracy:.4f}')
    return accuracy

# Testing different subset selection processes + augmentation techniques

## Subset selection using SAS

### GPT

In [None]:
encoder = self_supervised_training(sas_subset_data, 10, 'gpt')

Epoch 0, Step 0, Loss: 0.5572147369384766
Epoch 1, Step 0, Loss: 0.5487239360809326
Epoch 2, Step 0, Loss: 0.5372329950332642
Epoch 3, Step 0, Loss: 0.5243123173713684
Epoch 4, Step 0, Loss: 0.5100106596946716
Epoch 5, Step 0, Loss: 0.5019803047180176
Epoch 6, Step 0, Loss: 0.48678863048553467
Epoch 7, Step 0, Loss: 0.4794415831565857
Epoch 8, Step 0, Loss: 0.4733237624168396
Epoch 9, Step 0, Loss: 0.46080881357192993


In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.261918544769287
Epoch 1, Train Loss: 0.0176, Val Loss: 0.0164
Epoch 1, Step 0, Loss: 1.041725516319275
Epoch 2, Train Loss: 0.0138, Val Loss: 0.0132
Epoch 2, Step 0, Loss: 0.7876819968223572
Epoch 3, Train Loss: 0.0114, Val Loss: 0.0112
Epoch 3, Step 0, Loss: 0.636788547039032
Epoch 4, Train Loss: 0.0098, Val Loss: 0.0099
Epoch 4, Step 0, Loss: 0.6050729751586914
Epoch 5, Train Loss: 0.0088, Val Loss: 0.0090
Accuracy on the test set: 0.8403


In [None]:
encoder = self_supervised_training(sas_subset_data, 25, 'gpt')

Epoch 0, Step 0, Loss: 0.5598907470703125
Epoch 1, Step 0, Loss: 0.5449574589729309
Epoch 2, Step 0, Loss: 0.5363734364509583
Epoch 3, Step 0, Loss: 0.5250856876373291
Epoch 4, Step 0, Loss: 0.5131714940071106
Epoch 5, Step 0, Loss: 0.5002628564834595
Epoch 6, Step 0, Loss: 0.49239659309387207
Epoch 7, Step 0, Loss: 0.4832301139831543
Epoch 8, Step 0, Loss: 0.47163814306259155
Epoch 9, Step 0, Loss: 0.4667903482913971
Epoch 10, Step 0, Loss: 0.4598616361618042
Epoch 11, Step 0, Loss: 0.4524853527545929
Epoch 12, Step 0, Loss: 0.4502246379852295
Epoch 13, Step 0, Loss: 0.4375549554824829
Epoch 14, Step 0, Loss: 0.4431660771369934
Epoch 15, Step 0, Loss: 0.43299710750579834
Epoch 16, Step 0, Loss: 0.4344552159309387
Epoch 17, Step 0, Loss: 0.42099636793136597
Epoch 18, Step 0, Loss: 0.43072542548179626
Epoch 19, Step 0, Loss: 0.42104795575141907
Epoch 20, Step 0, Loss: 0.4224153757095337
Epoch 21, Step 0, Loss: 0.4180498719215393
Epoch 22, Step 0, Loss: 0.418239951133728
Epoch 23, Step 0

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.541580319404602
Epoch 1, Train Loss: 0.0209, Val Loss: 0.0186
Epoch 1, Step 0, Loss: 1.062971830368042
Epoch 2, Train Loss: 0.0156, Val Loss: 0.0143
Epoch 2, Step 0, Loss: 0.8808524012565613
Epoch 3, Train Loss: 0.0123, Val Loss: 0.0117
Epoch 3, Step 0, Loss: 0.6497108340263367
Epoch 4, Train Loss: 0.0103, Val Loss: 0.0100
Epoch 4, Step 0, Loss: 0.543342649936676
Epoch 5, Train Loss: 0.0090, Val Loss: 0.0090
Accuracy on the test set: 0.8438


In [None]:
encoder = self_supervised_training(sas_subset_data, 50, 'gpt')

Epoch 0, Step 0, Loss: 0.5605013370513916
Epoch 1, Step 0, Loss: 0.5443581342697144
Epoch 2, Step 0, Loss: 0.5375685095787048
Epoch 3, Step 0, Loss: 0.5223040580749512
Epoch 4, Step 0, Loss: 0.513237476348877
Epoch 5, Step 0, Loss: 0.5031770467758179
Epoch 6, Step 0, Loss: 0.49425631761550903
Epoch 7, Step 0, Loss: 0.4817686676979065
Epoch 8, Step 0, Loss: 0.471854031085968
Epoch 9, Step 0, Loss: 0.46601226925849915
Epoch 10, Step 0, Loss: 0.4611518383026123
Epoch 11, Step 0, Loss: 0.45168453454971313
Epoch 12, Step 0, Loss: 0.4476463794708252
Epoch 13, Step 0, Loss: 0.44148051738739014
Epoch 14, Step 0, Loss: 0.4354843497276306
Epoch 15, Step 0, Loss: 0.43728622794151306
Epoch 16, Step 0, Loss: 0.4331229329109192
Epoch 17, Step 0, Loss: 0.42780807614326477
Epoch 18, Step 0, Loss: 0.42646524310112
Epoch 19, Step 0, Loss: 0.42299866676330566
Epoch 20, Step 0, Loss: 0.42078113555908203
Epoch 21, Step 0, Loss: 0.42040199041366577
Epoch 22, Step 0, Loss: 0.41348105669021606
Epoch 23, Step 

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.425477385520935
Epoch 1, Train Loss: 0.0199, Val Loss: 0.0173
Epoch 1, Step 0, Loss: 1.0360114574432373
Epoch 2, Train Loss: 0.0143, Val Loss: 0.0130
Epoch 2, Step 0, Loss: 0.8039239645004272
Epoch 3, Train Loss: 0.0111, Val Loss: 0.0105
Epoch 3, Step 0, Loss: 0.5347409844398499
Epoch 4, Train Loss: 0.0093, Val Loss: 0.0091
Epoch 4, Step 0, Loss: 0.4734962284564972
Epoch 5, Train Loss: 0.0083, Val Loss: 0.0082
Accuracy on the test set: 0.8478


In [None]:
encoder = self_supervised_training(sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5570412278175354
Epoch 1, Step 0, Loss: 0.5430622696876526
Epoch 2, Step 0, Loss: 0.5331190824508667
Epoch 3, Step 0, Loss: 0.5214006304740906
Epoch 4, Step 0, Loss: 0.5150640606880188
Epoch 5, Step 0, Loss: 0.4967668652534485
Epoch 6, Step 0, Loss: 0.4923696517944336
Epoch 7, Step 0, Loss: 0.4808675944805145
Epoch 8, Step 0, Loss: 0.4698641896247864
Epoch 9, Step 0, Loss: 0.4658689498901367
Epoch 10, Step 0, Loss: 0.4612331986427307
Epoch 11, Step 0, Loss: 0.45376938581466675
Epoch 12, Step 0, Loss: 0.44677072763442993
Epoch 13, Step 0, Loss: 0.44712138175964355
Epoch 14, Step 0, Loss: 0.4392704367637634
Epoch 15, Step 0, Loss: 0.4361751079559326
Epoch 16, Step 0, Loss: 0.42953330278396606
Epoch 17, Step 0, Loss: 0.426697701215744
Epoch 18, Step 0, Loss: 0.42929816246032715
Epoch 19, Step 0, Loss: 0.4275878071784973
Epoch 20, Step 0, Loss: 0.41755223274230957
Epoch 21, Step 0, Loss: 0.41537222266197205
Epoch 22, Step 0, Loss: 0.4198949933052063
Epoch 23, Step 

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4243147373199463
Epoch 1, Train Loss: 0.0194, Val Loss: 0.0166
Epoch 1, Step 0, Loss: 1.0005953311920166
Epoch 2, Train Loss: 0.0138, Val Loss: 0.0124
Epoch 2, Step 0, Loss: 0.742308497428894
Epoch 3, Train Loss: 0.0108, Val Loss: 0.0103
Epoch 3, Step 0, Loss: 0.6107576489448547
Epoch 4, Train Loss: 0.0091, Val Loss: 0.0090
Epoch 4, Step 0, Loss: 0.6126748919487
Epoch 5, Train Loss: 0.0082, Val Loss: 0.0082
Accuracy on the test set: 0.8401


### EDA

In [None]:
encoder = self_supervised_training(sas_subset_data, 10, 'eda')

Epoch 0, Step 0, Loss: 0.5682438015937805
Epoch 1, Step 0, Loss: 0.5560617446899414
Epoch 2, Step 0, Loss: 0.5350694060325623
Epoch 3, Step 0, Loss: 0.5295361280441284
Epoch 4, Step 0, Loss: 0.5190794467926025
Epoch 5, Step 0, Loss: 0.504417896270752
Epoch 6, Step 0, Loss: 0.5022239089012146
Epoch 7, Step 0, Loss: 0.4897363781929016
Epoch 8, Step 0, Loss: 0.48505640029907227
Epoch 9, Step 0, Loss: 0.47680777311325073


In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.427250623703003
Epoch 1, Train Loss: 0.0209, Val Loss: 0.0192
Epoch 1, Step 0, Loss: 1.124593734741211
Epoch 2, Train Loss: 0.0165, Val Loss: 0.0154
Epoch 2, Step 0, Loss: 0.971137523651123
Epoch 3, Train Loss: 0.0136, Val Loss: 0.0129
Epoch 3, Step 0, Loss: 0.6435054540634155
Epoch 4, Train Loss: 0.0115, Val Loss: 0.0112
Epoch 4, Step 0, Loss: 0.7384827136993408
Epoch 5, Train Loss: 0.0102, Val Loss: 0.0101
Accuracy on the test set: 0.8257


In [None]:
encoder = self_supervised_training(sas_subset_data, 25, 'eda')

Epoch 0, Step 0, Loss: 0.5682851672172546
Epoch 1, Step 0, Loss: 0.5502724647521973
Epoch 2, Step 0, Loss: 0.5362656116485596
Epoch 3, Step 0, Loss: 0.5263598561286926
Epoch 4, Step 0, Loss: 0.5197099447250366
Epoch 5, Step 0, Loss: 0.5054095983505249
Epoch 6, Step 0, Loss: 0.497341513633728
Epoch 7, Step 0, Loss: 0.49065521359443665
Epoch 8, Step 0, Loss: 0.48077771067619324
Epoch 9, Step 0, Loss: 0.46753257513046265
Epoch 10, Step 0, Loss: 0.46739423274993896
Epoch 11, Step 0, Loss: 0.4600472152233124
Epoch 12, Step 0, Loss: 0.45895975828170776
Epoch 13, Step 0, Loss: 0.4531223177909851
Epoch 14, Step 0, Loss: 0.44757282733917236
Epoch 15, Step 0, Loss: 0.44575124979019165
Epoch 16, Step 0, Loss: 0.4398602247238159
Epoch 17, Step 0, Loss: 0.43304383754730225
Epoch 18, Step 0, Loss: 0.43582475185394287
Epoch 19, Step 0, Loss: 0.4293878376483917
Epoch 20, Step 0, Loss: 0.425858736038208
Epoch 21, Step 0, Loss: 0.4304991364479065
Epoch 22, Step 0, Loss: 0.4259016513824463
Epoch 23, Step

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.3607174158096313
Epoch 1, Train Loss: 0.0194, Val Loss: 0.0178
Epoch 1, Step 0, Loss: 1.023360252380371
Epoch 2, Train Loss: 0.0147, Val Loss: 0.0139
Epoch 2, Step 0, Loss: 0.897286593914032
Epoch 3, Train Loss: 0.0118, Val Loss: 0.0116
Epoch 3, Step 0, Loss: 0.6274234652519226
Epoch 4, Train Loss: 0.0100, Val Loss: 0.0101
Epoch 4, Step 0, Loss: 0.568164050579071
Epoch 5, Train Loss: 0.0088, Val Loss: 0.0091
Accuracy on the test set: 0.8426


In [None]:
encoder = self_supervised_training(sas_subset_data, 50, 'eda')

Epoch 0, Step 0, Loss: 0.5662268400192261
Epoch 1, Step 0, Loss: 0.5494905710220337
Epoch 2, Step 0, Loss: 0.5412120819091797
Epoch 3, Step 0, Loss: 0.5244659185409546
Epoch 4, Step 0, Loss: 0.5203303098678589
Epoch 5, Step 0, Loss: 0.5064499378204346
Epoch 6, Step 0, Loss: 0.5035855770111084
Epoch 7, Step 0, Loss: 0.4891277849674225
Epoch 8, Step 0, Loss: 0.4893309473991394
Epoch 9, Step 0, Loss: 0.47127392888069153
Epoch 10, Step 0, Loss: 0.47098153829574585
Epoch 11, Step 0, Loss: 0.46015894412994385
Epoch 12, Step 0, Loss: 0.4552467465400696
Epoch 13, Step 0, Loss: 0.4518410861492157
Epoch 14, Step 0, Loss: 0.44855302572250366
Epoch 15, Step 0, Loss: 0.44130992889404297
Epoch 16, Step 0, Loss: 0.44055670499801636
Epoch 17, Step 0, Loss: 0.4333299398422241
Epoch 18, Step 0, Loss: 0.4341665208339691
Epoch 19, Step 0, Loss: 0.42963600158691406
Epoch 20, Step 0, Loss: 0.42963480949401855
Epoch 21, Step 0, Loss: 0.428066223859787
Epoch 22, Step 0, Loss: 0.42648211121559143
Epoch 23, Ste

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4928101301193237
Epoch 1, Train Loss: 0.0204, Val Loss: 0.0177
Epoch 1, Step 0, Loss: 1.0577315092086792
Epoch 2, Train Loss: 0.0146, Val Loss: 0.0131
Epoch 2, Step 0, Loss: 0.7526770830154419
Epoch 3, Train Loss: 0.0113, Val Loss: 0.0106
Epoch 3, Step 0, Loss: 0.5699315667152405
Epoch 4, Train Loss: 0.0095, Val Loss: 0.0092
Epoch 4, Step 0, Loss: 0.6116964817047119
Epoch 5, Train Loss: 0.0084, Val Loss: 0.0083
Accuracy on the test set: 0.8470


In [None]:
encoder = self_supervised_training(sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5670948028564453
Epoch 1, Step 0, Loss: 0.5504319667816162
Epoch 2, Step 0, Loss: 0.5420278906822205
Epoch 3, Step 0, Loss: 0.5296784043312073
Epoch 4, Step 0, Loss: 0.5158551335334778
Epoch 5, Step 0, Loss: 0.5072920322418213
Epoch 6, Step 0, Loss: 0.4976954460144043
Epoch 7, Step 0, Loss: 0.48742759227752686
Epoch 8, Step 0, Loss: 0.47817522287368774
Epoch 9, Step 0, Loss: 0.4769640564918518
Epoch 10, Step 0, Loss: 0.4673202633857727
Epoch 11, Step 0, Loss: 0.45871829986572266
Epoch 12, Step 0, Loss: 0.45726150274276733
Epoch 13, Step 0, Loss: 0.453164279460907
Epoch 14, Step 0, Loss: 0.4522694945335388
Epoch 15, Step 0, Loss: 0.44547775387763977
Epoch 16, Step 0, Loss: 0.44325220584869385
Epoch 17, Step 0, Loss: 0.4388344883918762
Epoch 18, Step 0, Loss: 0.43415701389312744
Epoch 19, Step 0, Loss: 0.4300851821899414
Epoch 20, Step 0, Loss: 0.4350349009037018
Epoch 21, Step 0, Loss: 0.4204506278038025
Epoch 22, Step 0, Loss: 0.42415428161621094
Epoch 23, Step

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.2404255867004395
Epoch 1, Train Loss: 0.0173, Val Loss: 0.0150
Epoch 1, Step 0, Loss: 0.8946734666824341
Epoch 2, Train Loss: 0.0125, Val Loss: 0.0115
Epoch 2, Step 0, Loss: 0.7681529521942139
Epoch 3, Train Loss: 0.0101, Val Loss: 0.0097
Epoch 3, Step 0, Loss: 0.5423387289047241
Epoch 4, Train Loss: 0.0088, Val Loss: 0.0087
Epoch 4, Step 0, Loss: 0.6535813808441162
Epoch 5, Train Loss: 0.0080, Val Loss: 0.0080
Accuracy on the test set: 0.8341


## Random subset selection

### GPT

In [None]:
encoder = self_supervised_training(random_subset_data, 10, 'gpt')

Epoch 0, Step 0, Loss: 0.5616491436958313
Epoch 1, Step 0, Loss: 0.5450156927108765
Epoch 2, Step 0, Loss: 0.5385290384292603
Epoch 3, Step 0, Loss: 0.525793194770813
Epoch 4, Step 0, Loss: 0.5222278833389282
Epoch 5, Step 0, Loss: 0.5069168210029602
Epoch 6, Step 0, Loss: 0.49150216579437256
Epoch 7, Step 0, Loss: 0.48625460267066956
Epoch 8, Step 0, Loss: 0.48564884066581726
Epoch 9, Step 0, Loss: 0.4767572283744812


In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.301161527633667
Epoch 1, Train Loss: 0.0178, Val Loss: 0.0162
Epoch 1, Step 0, Loss: 1.0136420726776123
Epoch 2, Train Loss: 0.0139, Val Loss: 0.0131
Epoch 2, Step 0, Loss: 0.7026005983352661
Epoch 3, Train Loss: 0.0115, Val Loss: 0.0111
Epoch 3, Step 0, Loss: 0.625704824924469
Epoch 4, Train Loss: 0.0099, Val Loss: 0.0099
Epoch 4, Step 0, Loss: 0.6277867555618286
Epoch 5, Train Loss: 0.0089, Val Loss: 0.0090
Accuracy on the test set: 0.8429


In [None]:
encoder = self_supervised_training(random_subset_data, 25, 'gpt')

Epoch 0, Step 0, Loss: 0.5588974952697754
Epoch 1, Step 0, Loss: 0.5520447492599487
Epoch 2, Step 0, Loss: 0.5354654788970947
Epoch 3, Step 0, Loss: 0.5254510641098022
Epoch 4, Step 0, Loss: 0.5146105885505676
Epoch 5, Step 0, Loss: 0.5078282952308655
Epoch 6, Step 0, Loss: 0.49606120586395264
Epoch 7, Step 0, Loss: 0.49308836460113525
Epoch 8, Step 0, Loss: 0.4848272502422333
Epoch 9, Step 0, Loss: 0.48157596588134766
Epoch 10, Step 0, Loss: 0.4672115445137024
Epoch 11, Step 0, Loss: 0.4618943929672241
Epoch 12, Step 0, Loss: 0.4583147168159485
Epoch 13, Step 0, Loss: 0.4523182511329651
Epoch 14, Step 0, Loss: 0.4523266553878784
Epoch 15, Step 0, Loss: 0.4531674385070801
Epoch 16, Step 0, Loss: 0.4440675377845764
Epoch 17, Step 0, Loss: 0.4446900486946106
Epoch 18, Step 0, Loss: 0.44128328561782837
Epoch 19, Step 0, Loss: 0.4348301291465759
Epoch 20, Step 0, Loss: 0.4353276789188385
Epoch 21, Step 0, Loss: 0.4283103346824646
Epoch 22, Step 0, Loss: 0.4329988360404968
Epoch 23, Step 0,

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.3853570222854614
Epoch 1, Train Loss: 0.0188, Val Loss: 0.0165
Epoch 1, Step 0, Loss: 0.9547069668769836
Epoch 2, Train Loss: 0.0141, Val Loss: 0.0128
Epoch 2, Step 0, Loss: 0.7631710767745972
Epoch 3, Train Loss: 0.0112, Val Loss: 0.0106
Epoch 3, Step 0, Loss: 0.6080537438392639
Epoch 4, Train Loss: 0.0095, Val Loss: 0.0093
Epoch 4, Step 0, Loss: 0.5396283268928528
Epoch 5, Train Loss: 0.0084, Val Loss: 0.0084
Accuracy on the test set: 0.8480


In [None]:
encoder = self_supervised_training(random_subset_data, 50, 'gpt')

Epoch 0, Step 0, Loss: 0.5656836628913879
Epoch 1, Step 0, Loss: 0.5482562184333801
Epoch 2, Step 0, Loss: 0.5400741696357727
Epoch 3, Step 0, Loss: 0.519801139831543
Epoch 4, Step 0, Loss: 0.5162656903266907
Epoch 5, Step 0, Loss: 0.5104528665542603
Epoch 6, Step 0, Loss: 0.5025877952575684
Epoch 7, Step 0, Loss: 0.4890882968902588
Epoch 8, Step 0, Loss: 0.48248323798179626
Epoch 9, Step 0, Loss: 0.4687253534793854
Epoch 10, Step 0, Loss: 0.469651997089386
Epoch 11, Step 0, Loss: 0.46645283699035645
Epoch 12, Step 0, Loss: 0.45916399359703064
Epoch 13, Step 0, Loss: 0.45045435428619385
Epoch 14, Step 0, Loss: 0.44754692912101746
Epoch 15, Step 0, Loss: 0.4472731351852417
Epoch 16, Step 0, Loss: 0.44419652223587036
Epoch 17, Step 0, Loss: 0.44103410840034485
Epoch 18, Step 0, Loss: 0.4380837082862854
Epoch 19, Step 0, Loss: 0.4360145330429077
Epoch 20, Step 0, Loss: 0.43723052740097046
Epoch 21, Step 0, Loss: 0.43075138330459595
Epoch 22, Step 0, Loss: 0.42917758226394653
Epoch 23, Ste

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4425017833709717
Epoch 1, Train Loss: 0.0198, Val Loss: 0.0173
Epoch 1, Step 0, Loss: 1.0469363927841187
Epoch 2, Train Loss: 0.0141, Val Loss: 0.0129
Epoch 2, Step 0, Loss: 0.8589673042297363
Epoch 3, Train Loss: 0.0109, Val Loss: 0.0105
Epoch 3, Step 0, Loss: 0.6419544816017151
Epoch 4, Train Loss: 0.0091, Val Loss: 0.0091
Epoch 4, Step 0, Loss: 0.4977751076221466
Epoch 5, Train Loss: 0.0081, Val Loss: 0.0083
Accuracy on the test set: 0.8450


In [None]:
encoder = self_supervised_training(random_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5682392120361328
Epoch 1, Step 0, Loss: 0.5497227311134338
Epoch 2, Step 0, Loss: 0.5406889915466309
Epoch 3, Step 0, Loss: 0.5253230333328247
Epoch 4, Step 0, Loss: 0.5162917971611023
Epoch 5, Step 0, Loss: 0.5070321559906006
Epoch 6, Step 0, Loss: 0.4999668598175049
Epoch 7, Step 0, Loss: 0.4863300621509552
Epoch 8, Step 0, Loss: 0.48076218366622925
Epoch 9, Step 0, Loss: 0.48079535365104675
Epoch 10, Step 0, Loss: 0.4694767892360687
Epoch 11, Step 0, Loss: 0.4649263620376587
Epoch 12, Step 0, Loss: 0.45607542991638184
Epoch 13, Step 0, Loss: 0.45336633920669556
Epoch 14, Step 0, Loss: 0.4487159848213196
Epoch 15, Step 0, Loss: 0.45155084133148193
Epoch 16, Step 0, Loss: 0.4437651038169861
Epoch 17, Step 0, Loss: 0.4448585510253906
Epoch 18, Step 0, Loss: 0.44071006774902344
Epoch 19, Step 0, Loss: 0.43289780616760254
Epoch 20, Step 0, Loss: 0.43254590034484863
Epoch 21, Step 0, Loss: 0.43437081575393677
Epoch 22, Step 0, Loss: 0.42776843905448914
Epoch 23, S

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.3581933975219727
Epoch 1, Train Loss: 0.0178, Val Loss: 0.0150
Epoch 1, Step 0, Loss: 0.9199560880661011
Epoch 2, Train Loss: 0.0122, Val Loss: 0.0111
Epoch 2, Step 0, Loss: 0.5776678323745728
Epoch 3, Train Loss: 0.0096, Val Loss: 0.0093
Epoch 3, Step 0, Loss: 0.5498312711715698
Epoch 4, Train Loss: 0.0083, Val Loss: 0.0083
Epoch 4, Step 0, Loss: 0.5526129603385925
Epoch 5, Train Loss: 0.0075, Val Loss: 0.0077
Accuracy on the test set: 0.8505


### EDA

In [None]:
encoder = self_supervised_training(random_subset_data, 10, 'eda')

Epoch 0, Step 0, Loss: 0.5611862540245056
Epoch 1, Step 0, Loss: 0.5452650189399719
Epoch 2, Step 0, Loss: 0.5279363393783569
Epoch 3, Step 0, Loss: 0.5228238105773926
Epoch 4, Step 0, Loss: 0.5079990029335022
Epoch 5, Step 0, Loss: 0.5028289556503296
Epoch 6, Step 0, Loss: 0.49340885877609253
Epoch 7, Step 0, Loss: 0.47854405641555786
Epoch 8, Step 0, Loss: 0.47642213106155396
Epoch 9, Step 0, Loss: 0.4690830707550049


In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4675335884094238
Epoch 1, Train Loss: 0.0210, Val Loss: 0.0194
Epoch 1, Step 0, Loss: 1.0984059572219849
Epoch 2, Train Loss: 0.0167, Val Loss: 0.0157
Epoch 2, Step 0, Loss: 1.0710303783416748
Epoch 3, Train Loss: 0.0137, Val Loss: 0.0132
Epoch 3, Step 0, Loss: 0.6844800114631653
Epoch 4, Train Loss: 0.0117, Val Loss: 0.0116
Epoch 4, Step 0, Loss: 0.7346972227096558
Epoch 5, Train Loss: 0.0103, Val Loss: 0.0104
Accuracy on the test set: 0.8236


In [None]:
encoder = self_supervised_training(random_subset_data, 25, 'eda')

Epoch 0, Step 0, Loss: 0.562839686870575
Epoch 1, Step 0, Loss: 0.5472465753555298
Epoch 2, Step 0, Loss: 0.536990761756897
Epoch 3, Step 0, Loss: 0.5246263146400452
Epoch 4, Step 0, Loss: 0.5139427185058594
Epoch 5, Step 0, Loss: 0.5013857483863831
Epoch 6, Step 0, Loss: 0.4897349774837494
Epoch 7, Step 0, Loss: 0.4854744076728821
Epoch 8, Step 0, Loss: 0.47729215025901794
Epoch 9, Step 0, Loss: 0.46480876207351685
Epoch 10, Step 0, Loss: 0.4652397930622101
Epoch 11, Step 0, Loss: 0.45546942949295044
Epoch 12, Step 0, Loss: 0.4468385577201843
Epoch 13, Step 0, Loss: 0.4492311477661133
Epoch 14, Step 0, Loss: 0.4437180757522583
Epoch 15, Step 0, Loss: 0.43426892161369324
Epoch 16, Step 0, Loss: 0.43815910816192627
Epoch 17, Step 0, Loss: 0.43640491366386414
Epoch 18, Step 0, Loss: 0.429151713848114
Epoch 19, Step 0, Loss: 0.42520177364349365
Epoch 20, Step 0, Loss: 0.425620436668396
Epoch 21, Step 0, Loss: 0.42069685459136963
Epoch 22, Step 0, Loss: 0.41895490884780884
Epoch 23, Step 0

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4020557403564453
Epoch 1, Train Loss: 0.0191, Val Loss: 0.0171
Epoch 1, Step 0, Loss: 1.0011433362960815
Epoch 2, Train Loss: 0.0144, Val Loss: 0.0133
Epoch 2, Step 0, Loss: 0.8023506999015808
Epoch 3, Train Loss: 0.0116, Val Loss: 0.0110
Epoch 3, Step 0, Loss: 0.6897092461585999
Epoch 4, Train Loss: 0.0099, Val Loss: 0.0097
Epoch 4, Step 0, Loss: 0.4998319745063782
Epoch 5, Train Loss: 0.0089, Val Loss: 0.0087
Accuracy on the test set: 0.8351


In [None]:
encoder = self_supervised_training(random_subset_data, 50, 'eda')

Epoch 0, Step 0, Loss: 0.5585384368896484
Epoch 1, Step 0, Loss: 0.5463334321975708
Epoch 2, Step 0, Loss: 0.5336614847183228
Epoch 3, Step 0, Loss: 0.5285465717315674
Epoch 4, Step 0, Loss: 0.5119081139564514
Epoch 5, Step 0, Loss: 0.5023180842399597
Epoch 6, Step 0, Loss: 0.49108460545539856
Epoch 7, Step 0, Loss: 0.48154622316360474
Epoch 8, Step 0, Loss: 0.480596125125885
Epoch 9, Step 0, Loss: 0.47172945737838745
Epoch 10, Step 0, Loss: 0.46133899688720703
Epoch 11, Step 0, Loss: 0.45828723907470703
Epoch 12, Step 0, Loss: 0.45179983973503113
Epoch 13, Step 0, Loss: 0.4495469629764557
Epoch 14, Step 0, Loss: 0.44525569677352905
Epoch 15, Step 0, Loss: 0.4350450932979584
Epoch 16, Step 0, Loss: 0.4343477785587311
Epoch 17, Step 0, Loss: 0.43219470977783203
Epoch 18, Step 0, Loss: 0.43333160877227783
Epoch 19, Step 0, Loss: 0.43253371119499207
Epoch 20, Step 0, Loss: 0.42576950788497925
Epoch 21, Step 0, Loss: 0.42392057180404663
Epoch 22, Step 0, Loss: 0.42071226239204407
Epoch 23,

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.5771007537841797
Epoch 1, Train Loss: 0.0211, Val Loss: 0.0184
Epoch 1, Step 0, Loss: 1.087188959121704
Epoch 2, Train Loss: 0.0152, Val Loss: 0.0138
Epoch 2, Step 0, Loss: 0.8025113344192505
Epoch 3, Train Loss: 0.0118, Val Loss: 0.0112
Epoch 3, Step 0, Loss: 0.629466712474823
Epoch 4, Train Loss: 0.0099, Val Loss: 0.0097
Epoch 4, Step 0, Loss: 0.5689693689346313
Epoch 5, Train Loss: 0.0088, Val Loss: 0.0088
Accuracy on the test set: 0.8357


In [None]:
encoder = self_supervised_training(random_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5605783462524414
Epoch 1, Step 0, Loss: 0.5489597916603088
Epoch 2, Step 0, Loss: 0.5381859540939331
Epoch 3, Step 0, Loss: 0.5269277095794678
Epoch 4, Step 0, Loss: 0.5102295875549316
Epoch 5, Step 0, Loss: 0.5050872564315796
Epoch 6, Step 0, Loss: 0.48847562074661255
Epoch 7, Step 0, Loss: 0.48335009813308716
Epoch 8, Step 0, Loss: 0.47349515557289124
Epoch 9, Step 0, Loss: 0.46755021810531616
Epoch 10, Step 0, Loss: 0.46186956763267517
Epoch 11, Step 0, Loss: 0.4526538550853729
Epoch 12, Step 0, Loss: 0.4498024582862854
Epoch 13, Step 0, Loss: 0.4470057487487793
Epoch 14, Step 0, Loss: 0.4427124857902527
Epoch 15, Step 0, Loss: 0.43587520718574524
Epoch 16, Step 0, Loss: 0.43631017208099365
Epoch 17, Step 0, Loss: 0.42642664909362793
Epoch 18, Step 0, Loss: 0.4338732659816742
Epoch 19, Step 0, Loss: 0.42876166105270386
Epoch 20, Step 0, Loss: 0.4235695004463196
Epoch 21, Step 0, Loss: 0.4191829562187195
Epoch 22, Step 0, Loss: 0.42134934663772583
Epoch 23, S

In [None]:
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.3725414276123047
Epoch 1, Train Loss: 0.0184, Val Loss: 0.0159
Epoch 1, Step 0, Loss: 0.8441670536994934
Epoch 2, Train Loss: 0.0132, Val Loss: 0.0122
Epoch 2, Step 0, Loss: 0.8145902156829834
Epoch 3, Train Loss: 0.0106, Val Loss: 0.0104
Epoch 3, Step 0, Loss: 0.6599157452583313
Epoch 4, Train Loss: 0.0093, Val Loss: 0.0094
Epoch 4, Step 0, Loss: 0.6390494108200073
Epoch 5, Train Loss: 0.0085, Val Loss: 0.0088
Accuracy on the test set: 0.8246


## Subset selection using multilingual SAS

### GPT

In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# The recorded output logs epochs 0-9, consistent with the second argument (10) being
# the epoch count; 'gpt' presumably selects the augmentation method — TODO confirm.
encoder = self_supervised_training(multilingual_sas_subset_data, 10, 'gpt')

Epoch 0, Step 0, Loss: 0.5544434785842896
Epoch 1, Step 0, Loss: 0.5423393845558167
Epoch 2, Step 0, Loss: 0.5276861190795898
Epoch 3, Step 0, Loss: 0.5203695893287659
Epoch 4, Step 0, Loss: 0.5036510229110718
Epoch 5, Step 0, Loss: 0.5011116862297058
Epoch 6, Step 0, Loss: 0.4924963116645813
Epoch 7, Step 0, Loss: 0.4825829267501831
Epoch 8, Step 0, Loss: 0.4754030704498291
Epoch 9, Step 0, Loss: 0.4702533781528473


In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 10, 'gpt').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4282073974609375
Epoch 1, Train Loss: 0.0199, Val Loss: 0.0187
Epoch 1, Step 0, Loss: 1.0272127389907837
Epoch 2, Train Loss: 0.0154, Val Loss: 0.0149
Epoch 2, Step 0, Loss: 0.8573707342147827
Epoch 3, Train Loss: 0.0125, Val Loss: 0.0125
Epoch 3, Step 0, Loss: 0.7143198847770691
Epoch 4, Train Loss: 0.0106, Val Loss: 0.0108
Epoch 4, Step 0, Loss: 0.6065401434898376
Epoch 5, Train Loss: 0.0094, Val Loss: 0.0097
Accuracy on the test set: 0.8361


In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# Presumably 25 is the epoch count and 'gpt' the augmentation method — TODO confirm
# against the function definition (the recorded output is truncated).
encoder = self_supervised_training(multilingual_sas_subset_data, 25, 'gpt')

Epoch 0, Step 0, Loss: 0.5608071088790894
Epoch 1, Step 0, Loss: 0.5429778099060059
Epoch 2, Step 0, Loss: 0.5326766967773438
Epoch 3, Step 0, Loss: 0.5167876482009888
Epoch 4, Step 0, Loss: 0.5111478567123413
Epoch 5, Step 0, Loss: 0.4985504150390625
Epoch 6, Step 0, Loss: 0.4919242858886719
Epoch 7, Step 0, Loss: 0.48079127073287964
Epoch 8, Step 0, Loss: 0.4733436405658722
Epoch 9, Step 0, Loss: 0.46560391783714294
Epoch 10, Step 0, Loss: 0.46051880717277527
Epoch 11, Step 0, Loss: 0.4576425552368164
Epoch 12, Step 0, Loss: 0.44770729541778564
Epoch 13, Step 0, Loss: 0.44859030842781067
Epoch 14, Step 0, Loss: 0.4456012547016144
Epoch 15, Step 0, Loss: 0.4426543414592743
Epoch 16, Step 0, Loss: 0.43385279178619385
Epoch 17, Step 0, Loss: 0.4266008138656616
Epoch 18, Step 0, Loss: 0.4310823678970337
Epoch 19, Step 0, Loss: 0.4301002621650696
Epoch 20, Step 0, Loss: 0.430492639541626
Epoch 21, Step 0, Loss: 0.42452529072761536
Epoch 22, Step 0, Loss: 0.418592631816864
Epoch 23, Step 0

In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 25, 'gpt').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.394880771636963
Epoch 1, Train Loss: 0.0199, Val Loss: 0.0178
Epoch 1, Step 0, Loss: 1.1220853328704834
Epoch 2, Train Loss: 0.0149, Val Loss: 0.0137
Epoch 2, Step 0, Loss: 0.8767866492271423
Epoch 3, Train Loss: 0.0118, Val Loss: 0.0112
Epoch 3, Step 0, Loss: 0.7110730409622192
Epoch 4, Train Loss: 0.0099, Val Loss: 0.0097
Epoch 4, Step 0, Loss: 0.653026282787323
Epoch 5, Train Loss: 0.0088, Val Loss: 0.0087
Accuracy on the test set: 0.8446


In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# Presumably 50 is the epoch count and 'gpt' the augmentation method — TODO confirm
# against the function definition (the recorded output is truncated).
encoder = self_supervised_training(multilingual_sas_subset_data, 50, 'gpt')

Epoch 0, Step 0, Loss: 0.553716778755188
Epoch 1, Step 0, Loss: 0.5367976427078247
Epoch 2, Step 0, Loss: 0.5312955975532532
Epoch 3, Step 0, Loss: 0.5167423486709595
Epoch 4, Step 0, Loss: 0.5095878839492798
Epoch 5, Step 0, Loss: 0.49815046787261963
Epoch 6, Step 0, Loss: 0.48728010058403015
Epoch 7, Step 0, Loss: 0.47940194606781006
Epoch 8, Step 0, Loss: 0.4750712513923645
Epoch 9, Step 0, Loss: 0.4692748785018921
Epoch 10, Step 0, Loss: 0.46360406279563904
Epoch 11, Step 0, Loss: 0.45640426874160767
Epoch 12, Step 0, Loss: 0.44836845993995667
Epoch 13, Step 0, Loss: 0.44386351108551025
Epoch 14, Step 0, Loss: 0.4433917999267578
Epoch 15, Step 0, Loss: 0.4392719268798828
Epoch 16, Step 0, Loss: 0.43892940878868103
Epoch 17, Step 0, Loss: 0.4326699674129486
Epoch 18, Step 0, Loss: 0.4295411705970764
Epoch 19, Step 0, Loss: 0.4315854012966156
Epoch 20, Step 0, Loss: 0.4263738989830017
Epoch 21, Step 0, Loss: 0.425609827041626
Epoch 22, Step 0, Loss: 0.42226195335388184
Epoch 23, Step

In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 50, 'gpt').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4368642568588257
Epoch 1, Train Loss: 0.0183, Val Loss: 0.0160
Epoch 1, Step 0, Loss: 1.0105388164520264
Epoch 2, Train Loss: 0.0132, Val Loss: 0.0121
Epoch 2, Step 0, Loss: 0.7068260312080383
Epoch 3, Train Loss: 0.0103, Val Loss: 0.0099
Epoch 3, Step 0, Loss: 0.6286647319793701
Epoch 4, Train Loss: 0.0088, Val Loss: 0.0087
Epoch 4, Step 0, Loss: 0.5006576776504517
Epoch 5, Train Loss: 0.0079, Val Loss: 0.0080
Accuracy on the test set: 0.8503


In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# Presumably 100 is the epoch count and 'gpt' the augmentation method — TODO confirm
# against the function definition (the recorded output is truncated).
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5587395429611206
Epoch 1, Step 0, Loss: 0.546870231628418
Epoch 2, Step 0, Loss: 0.5345774292945862
Epoch 3, Step 0, Loss: 0.5166549682617188
Epoch 4, Step 0, Loss: 0.5093525648117065
Epoch 5, Step 0, Loss: 0.5033179521560669
Epoch 6, Step 0, Loss: 0.48999136686325073
Epoch 7, Step 0, Loss: 0.4826696813106537
Epoch 8, Step 0, Loss: 0.4769364297389984
Epoch 9, Step 0, Loss: 0.4676419496536255
Epoch 10, Step 0, Loss: 0.462114155292511
Epoch 11, Step 0, Loss: 0.4553375840187073
Epoch 12, Step 0, Loss: 0.4520321786403656
Epoch 13, Step 0, Loss: 0.4442417025566101
Epoch 14, Step 0, Loss: 0.44591522216796875
Epoch 15, Step 0, Loss: 0.4410877823829651
Epoch 16, Step 0, Loss: 0.43970343470573425
Epoch 17, Step 0, Loss: 0.4348699152469635
Epoch 18, Step 0, Loss: 0.4355355501174927
Epoch 19, Step 0, Loss: 0.42767196893692017
Epoch 20, Step 0, Loss: 0.4300360679626465
Epoch 21, Step 0, Loss: 0.42687565088272095
Epoch 22, Step 0, Loss: 0.41823703050613403
Epoch 23, Step 0,

In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 100, 'gpt').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4818689823150635
Epoch 1, Train Loss: 0.0194, Val Loss: 0.0165
Epoch 1, Step 0, Loss: 1.0176829099655151
Epoch 2, Train Loss: 0.0135, Val Loss: 0.0123
Epoch 2, Step 0, Loss: 0.7160937786102295
Epoch 3, Train Loss: 0.0106, Val Loss: 0.0102
Epoch 3, Step 0, Loss: 0.5708342790603638
Epoch 4, Train Loss: 0.0091, Val Loss: 0.0090
Epoch 4, Step 0, Loss: 0.6696465015411377
Epoch 5, Train Loss: 0.0082, Val Loss: 0.0082
Accuracy on the test set: 0.8387


### EDA

In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# The recorded output logs epochs 0-9, consistent with the second argument (10) being
# the epoch count; 'eda' presumably selects the augmentation method — TODO confirm.
encoder = self_supervised_training(multilingual_sas_subset_data, 10, 'eda')

Epoch 0, Step 0, Loss: 0.5710535645484924
Epoch 1, Step 0, Loss: 0.5596981644630432
Epoch 2, Step 0, Loss: 0.5481948852539062
Epoch 3, Step 0, Loss: 0.5334618091583252
Epoch 4, Step 0, Loss: 0.5213579535484314
Epoch 5, Step 0, Loss: 0.5118145942687988
Epoch 6, Step 0, Loss: 0.5018508434295654
Epoch 7, Step 0, Loss: 0.4947406053543091
Epoch 8, Step 0, Loss: 0.49425673484802246
Epoch 9, Step 0, Loss: 0.4796586036682129


In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 10, 'eda').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.442926049232483
Epoch 1, Train Loss: 0.0198, Val Loss: 0.0185
Epoch 1, Step 0, Loss: 1.066210150718689
Epoch 2, Train Loss: 0.0156, Val Loss: 0.0148
Epoch 2, Step 0, Loss: 0.8748830556869507
Epoch 3, Train Loss: 0.0128, Val Loss: 0.0125
Epoch 3, Step 0, Loss: 0.7007070779800415
Epoch 4, Train Loss: 0.0110, Val Loss: 0.0109
Epoch 4, Step 0, Loss: 0.652547299861908
Epoch 5, Train Loss: 0.0098, Val Loss: 0.0099
Accuracy on the test set: 0.8309


In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# Presumably 25 is the epoch count and 'eda' the augmentation method — TODO confirm
# against the function definition (the recorded output is truncated).
encoder = self_supervised_training(multilingual_sas_subset_data, 25, 'eda')

Epoch 0, Step 0, Loss: 0.5677881240844727
Epoch 1, Step 0, Loss: 0.5570362210273743
Epoch 2, Step 0, Loss: 0.548351526260376
Epoch 3, Step 0, Loss: 0.5334172248840332
Epoch 4, Step 0, Loss: 0.5187644958496094
Epoch 5, Step 0, Loss: 0.5122770071029663
Epoch 6, Step 0, Loss: 0.5008004903793335
Epoch 7, Step 0, Loss: 0.4948563575744629
Epoch 8, Step 0, Loss: 0.49167919158935547
Epoch 9, Step 0, Loss: 0.47778189182281494
Epoch 10, Step 0, Loss: 0.46964168548583984
Epoch 11, Step 0, Loss: 0.4662141501903534
Epoch 12, Step 0, Loss: 0.45720764994621277
Epoch 13, Step 0, Loss: 0.4571516513824463
Epoch 14, Step 0, Loss: 0.4550139904022217
Epoch 15, Step 0, Loss: 0.45158910751342773
Epoch 16, Step 0, Loss: 0.44999590516090393
Epoch 17, Step 0, Loss: 0.4491574168205261
Epoch 18, Step 0, Loss: 0.4444947838783264
Epoch 19, Step 0, Loss: 0.4405610263347626
Epoch 20, Step 0, Loss: 0.43968138098716736
Epoch 21, Step 0, Loss: 0.4324338436126709
Epoch 22, Step 0, Loss: 0.43126368522644043
Epoch 23, Step

In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 25, 'eda').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.5377691984176636
Epoch 1, Train Loss: 0.0202, Val Loss: 0.0184
Epoch 1, Step 0, Loss: 1.1070119142532349
Epoch 2, Train Loss: 0.0155, Val Loss: 0.0144
Epoch 2, Step 0, Loss: 0.8658573627471924
Epoch 3, Train Loss: 0.0125, Val Loss: 0.0120
Epoch 3, Step 0, Loss: 0.8271716833114624
Epoch 4, Train Loss: 0.0106, Val Loss: 0.0104
Epoch 4, Step 0, Loss: 0.5964956879615784
Epoch 5, Train Loss: 0.0094, Val Loss: 0.0094
Accuracy on the test set: 0.8246


In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# Presumably 50 is the epoch count and 'eda' the augmentation method — TODO confirm
# against the function definition (the recorded output is truncated).
encoder = self_supervised_training(multilingual_sas_subset_data, 50, 'eda')

Epoch 0, Step 0, Loss: 0.5653917193412781
Epoch 1, Step 0, Loss: 0.5559509992599487
Epoch 2, Step 0, Loss: 0.5471234321594238
Epoch 3, Step 0, Loss: 0.5295686721801758
Epoch 4, Step 0, Loss: 0.5187992453575134
Epoch 5, Step 0, Loss: 0.5080201625823975
Epoch 6, Step 0, Loss: 0.5037364959716797
Epoch 7, Step 0, Loss: 0.49344807863235474
Epoch 8, Step 0, Loss: 0.48853111267089844
Epoch 9, Step 0, Loss: 0.47982311248779297
Epoch 10, Step 0, Loss: 0.4716777205467224
Epoch 11, Step 0, Loss: 0.465291827917099
Epoch 12, Step 0, Loss: 0.46639585494995117
Epoch 13, Step 0, Loss: 0.4617658853530884
Epoch 14, Step 0, Loss: 0.4563997983932495
Epoch 15, Step 0, Loss: 0.45130887627601624
Epoch 16, Step 0, Loss: 0.44064009189605713
Epoch 17, Step 0, Loss: 0.4408884644508362
Epoch 18, Step 0, Loss: 0.4405692517757416
Epoch 19, Step 0, Loss: 0.44296860694885254
Epoch 20, Step 0, Loss: 0.43685221672058105
Epoch 21, Step 0, Loss: 0.4375123381614685
Epoch 22, Step 0, Loss: 0.43286579847335815
Epoch 23, Ste

In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 50, 'eda').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.5924384593963623
Epoch 1, Train Loss: 0.0200, Val Loss: 0.0176
Epoch 1, Step 0, Loss: 1.060484766960144
Epoch 2, Train Loss: 0.0145, Val Loss: 0.0132
Epoch 2, Step 0, Loss: 0.8095983862876892
Epoch 3, Train Loss: 0.0114, Val Loss: 0.0108
Epoch 3, Step 0, Loss: 0.5994372367858887
Epoch 4, Train Loss: 0.0097, Val Loss: 0.0095
Epoch 4, Step 0, Loss: 0.5566151738166809
Epoch 5, Train Loss: 0.0087, Val Loss: 0.0086
Accuracy on the test set: 0.8238


In [None]:
# Train an encoder with self_supervised_training on `multilingual_sas_subset_data`.
# Presumably 100 is the epoch count and 'eda' the augmentation method — TODO confirm
# against the function definition (the recorded output is truncated).
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5642368197441101
Epoch 1, Step 0, Loss: 0.5632263422012329
Epoch 2, Step 0, Loss: 0.5443133115768433
Epoch 3, Step 0, Loss: 0.5308079719543457
Epoch 4, Step 0, Loss: 0.521986186504364
Epoch 5, Step 0, Loss: 0.5157982110977173
Epoch 6, Step 0, Loss: 0.5062921643257141
Epoch 7, Step 0, Loss: 0.4942970275878906
Epoch 8, Step 0, Loss: 0.4837270975112915
Epoch 9, Step 0, Loss: 0.47684550285339355
Epoch 10, Step 0, Loss: 0.47182080149650574
Epoch 11, Step 0, Loss: 0.4706572890281677
Epoch 12, Step 0, Loss: 0.4622383415699005
Epoch 13, Step 0, Loss: 0.45829254388809204
Epoch 14, Step 0, Loss: 0.454015851020813
Epoch 15, Step 0, Loss: 0.44583621621131897
Epoch 16, Step 0, Loss: 0.45024025440216064
Epoch 17, Step 0, Loss: 0.4421229958534241
Epoch 18, Step 0, Loss: 0.4392634630203247
Epoch 19, Step 0, Loss: 0.4386412799358368
Epoch 20, Step 0, Loss: 0.43177950382232666
Epoch 21, Step 0, Loss: 0.4317772388458252
Epoch 22, Step 0, Loss: 0.42838847637176514
Epoch 23, Step 0

In [None]:
# Evaluate the global `encoder` assigned by the preceding cell (multilingual SAS, 100, 'eda').
# The recorded output shows five epochs of train/val loss followed by test-set accuracy.
# NOTE(review): depends on notebook state — run directly after the training cell above.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4545848369598389
Epoch 1, Train Loss: 0.0197, Val Loss: 0.0169
Epoch 1, Step 0, Loss: 1.0673779249191284
Epoch 2, Train Loss: 0.0142, Val Loss: 0.0129
Epoch 2, Step 0, Loss: 0.7647920250892639
Epoch 3, Train Loss: 0.0114, Val Loss: 0.0109
Epoch 3, Step 0, Loss: 0.6776555180549622
Epoch 4, Train Loss: 0.0100, Val Loss: 0.0098
Epoch 4, Step 0, Loss: 0.6313803791999817
Epoch 5, Train Loss: 0.0091, Val Loss: 0.0091
Accuracy on the test set: 0.8184


# TODO: Further research to come.