In the following notebook, we investigate the use of SAS (subsets that maximize expected augmentation similarity) to select representative subsets that improve self-supervised learning in a text categorization task. We also investigate the use of multilingual embeddings to further strengthen the SAS selection.

# Setup

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install nlpaug
!pip install gensim

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-

In [None]:
import csv
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
import gensim.downloader as api
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from tqdm.auto import tqdm
# Delete some data from RAM to free up space for later processes
import gc

# Ensure you have the necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Import Google Drive for locally saved files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Global variables

In [None]:
# Seed shared by every stochastic step in the notebook (splits, shuffles, sampling)
global_seed = 0
# Maximum number of tokens per text when tokenizing (longer texts are truncated)
max_length = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set seeds for reproducibility
np.random.seed(global_seed)
random.seed(global_seed)
# Also seed PyTorch so that randomly initialised weights (e.g. the classifier head)
# and dropout are reproducible across runs
torch.manual_seed(global_seed)

## Helper functions

In [None]:
# Basic preprocessing: lowercase the text and collapse every run of non-word
# characters (anything other than letters, digits and underscore) into one space
def preprocess(text):
    """Return `text` lowercased, with each run of non-word characters replaced by a single space."""
    return re.sub(r'\W+', ' ', text.lower())

## Load dataset

In [None]:
# Load the AG News dataset
dataset = load_dataset("ag_news")

# The dataset is divided into 'train' and 'test' splits
train_data = dataset['train']
test_data = dataset['test']

# For the parts that use GPT, we will use only 2.5% of the data due to the cost of running GPT.
train_data = train_data.train_test_split(test_size=0.975, stratify_by_column="label", seed = global_seed)
train_data = train_data['train']

# Example: Viewing the first training sample
print(train_data[0])
print(len(train_data))

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0}
3000


## Create NLPAug augmented texts

In [None]:
# aug_1 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 5, # Only use top 5 options
#     aug_p = 0.2 # 20% of the words augmented
# )
#
# aug_2 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 10, # Only use top 10 options
#     aug_p = 0.4 # 40% of the words augmented
# )
#
# def eda_augmentation_with_word2vec(sentence, type):
#     if type == 1:
#         return aug_1.augment(sentence)[0]
#     else:
#         return aug_2.augment(sentence)[0]

In [None]:
# Add augmented texts for each example using NLPAug
# def add_nlp_aug_columns(data):
#     data['augment_1'] = eda_augmentation_with_word2vec(data['text'], 1)
#     data['augment_2'] = eda_augmentation_with_word2vec(data['text'], 2)
#     return data

# train_data = train_data.map(
#     lambda example: add_nlp_aug_columns(example)
# )

In [None]:
# Save the augmentations as the process takes 2 hours
# train_df = train_data.to_pandas()
# train_df.to_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")

In [None]:
import datasets

# Load dataset with augmentations (pre-prepared)
train_df = pd.read_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")
augmented_data = datasets.Dataset.from_pandas(train_df)

In [None]:
print(augmented_data)
print(augmented_data['text'][-1])
print(augmented_data['text'][-1])

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', '__index_level_0__'],
    num_rows: 3000
})
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.


In [None]:
# Merge datasets by columns
def add_augmented_columns(data, augments, idx):
    """Copy the two NLPAug augmentations at position `idx` of `augments` into `data`.

    `data` is modified in place and returned. Rows are matched purely by
    position, so `augments` must be in the same order as the dataset being mapped.
    """
    for column in ('augment_1', 'augment_2'):
        data[column] = augments[column][idx]
    return data

train_data = train_data.map(
    lambda example, idx: add_augmented_columns(example, augmented_data, idx),
    with_indices=True
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

## Load GPT augmented text and Spanish translations

In [None]:
gpt_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/COM SCI 260D/gpt_dataset.csv", split = 'train')

print(gpt_dataset)
print(gpt_dataset[0:5])
print(train_data[0:5])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Original Text', 'Translation', 'Paraphrase 1', 'Paraphrase 2'],
    num_rows: 3000
})


In [None]:
# Ensure both datasets have the same number of rows
assert len(gpt_dataset) == len(train_data), "Datasets must have the same number of rows"

# Merge datasets by columns
def add_gpt_columns(data, augments, idx):
    """Attach the translation and the two GPT paraphrases at position `idx` to `data`.

    `data` is modified in place and returned. When the second paraphrase is
    missing (None, empty or whitespace-only), `gpt_2` falls back to the row's
    existing `augment_1` value. No such fallback is applied to `gpt_1`.
    """
    data['translation'] = augments['Translation'][idx]
    data['gpt_1'] = augments['Paraphrase 1'][idx]

    second_paraphrase = augments['Paraphrase 2'][idx]
    has_text = bool(second_paraphrase) and bool(second_paraphrase.strip())
    # Fall back to the first NLPAug augmentation when GPT gave no usable text
    data['gpt_2'] = second_paraphrase if has_text else data['augment_1']
    return data

train_data = train_data.map(
    lambda example, idx: add_gpt_columns(example, gpt_dataset, idx),
    with_indices=True
)

print(train_data)
print(train_data[0])

# Collect the examples whose GPT paraphrases were not imported correctly
# (gpt_1 or gpt_2 is None); the original note reported 8 such examples.
# NOTE(review): this only gathers them into error_data for inspection --
# nothing is removed from train_data here.
error_data = train_data.filter(lambda row: row['gpt_2'] is None or row['gpt_1'] is None)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 3000
})
{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0, 'augment_1': '9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.', 'augment_2': "2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.", 'translation': '9 heridos en explosión en la Embajada de Indonesia PARÍS -- Una explosión golpeó la Embajada de Indonesia en París hoy, hiriendo levemente a nueve personas, según informó una estación de radio francesa.', 'gpt_1': '1. The Indonesian Embassy

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
print(error_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 0
})


In [None]:
print(train_data['text'][0])
print(train_data['gpt_1'][0])
print(train_data['gpt_2'][0])
print(train_data['augment_1'][0])
print(train_data['augment_2'][0])
print("")
print(train_data['text'][-1])
print(train_data['gpt_1'][-1])
print(train_data['gpt_2'][-1])
print(train_data['augment_1'][-1])
print(train_data['augment_2'][-1])

9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.
1. The Indonesian Embassy in Paris was struck by an explosion today, leading to minor injuries for nine individuals, as reported by a French radio station.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.

Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. un

# Latent class discovery

First, we randomly select 1% of the data to serve as a small labelled set. We use it to fine-tune a small pre-trained classifier, which then assigns a latent class to every remaining training data point.

In [None]:
# Select 1% of the data randomly
shuffled_dataset = train_data.shuffle(seed = global_seed)
sample_size = int(0.01 * len(shuffled_dataset))
labeled_data = shuffled_dataset.select(range(sample_size))
rest_of_data = shuffled_dataset.select(range(sample_size, len(shuffled_dataset)))

# Get number of classes from the 1% data
unique_labels = set(labeled_data['label'])
num_classes = len(unique_labels)

print(num_classes)

4


In [None]:
# Split the 1% dataset into training and validation sets. We only use 20% as validation set due to size of data.
labeled_data_for_training = labeled_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)

In [None]:
# Pre-process and tokenize data
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')

def tokenize(examples):
    """Preprocess and tokenize a batch of examples for the latent-class classifier.

    Every text in `examples['text']` is passed through `preprocess`, then
    tokenized, padded to `max_length` tokens and truncated if longer.
    """
    cleaned_texts = list(map(preprocess, examples['text']))
    encoded = tokenizer(cleaned_texts, padding='max_length', truncation=True, max_length = max_length, return_tensors="pt")
    return encoded

tokenized_dataset = labeled_data_for_training.map(tokenize, batched=True)

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# Import model for linear classification using the number of classes in 1% data.
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-small', num_labels=num_classes)

pytorch_model.bin:   0%|          | 0.00/116M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the 1% labelled data to train a BERT model
latent_num_epochs = 20

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=latent_num_epochs,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    # Warm up over the first 10% of the optimizer steps. A fixed warmup of 500
    # steps is longer than the whole run (24 examples / batch of 4 * 20 epochs
    # = 120 steps), so the learning rate would never reach its target value.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['test'])

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.341687
2,1.431500,1.339572
3,1.431500,1.335432
4,1.438000,1.330217
5,1.430600,1.323691
6,1.430600,1.316871
7,1.320800,1.308544
8,1.320800,1.298533
9,1.289300,1.286092
10,1.235500,1.276856


TrainOutput(global_step=120, training_loss=1.1530095338821411, metrics={'train_runtime': 3.9161, 'train_samples_per_second': 122.571, 'train_steps_per_second': 30.643, 'total_flos': 9492677591040.0, 'train_loss': 1.1530095338821411, 'epoch': 20.0})

In [None]:
# Save the model to a directory for local use
model.save_pretrained('./model')
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [None]:
# Use the model to predict latent classes for rest of training data
def predict_batch(batch):
    """Predict a latent class for every text in `batch`.

    Args:
        batch: mapping with a 'text' entry holding the raw texts of the batch.

    Returns:
        dict with key 'predictions': a NumPy array with the arg-max class
        index of each text, in input order.
    """
    # Tokenize the examples
    preprocessed_texts = [preprocess(text) for text in batch['text']]
    inputs = tokenizer(preprocessed_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Make sure dropout is disabled; do not rely on whatever mode training left the model in
    model.eval()

    # Perform the prediction
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class indices
    predictions = outputs.logits.argmax(-1).cpu().numpy()
    return {'predictions': predictions}

# Create a DataLoader for the rest of the data
rest_of_data_loader = DataLoader(rest_of_data, batch_size=32)

# Run predictions in batches
results = []
for batch in tqdm(rest_of_data_loader, desc="Predicting latent classes"):
    batch_results = predict_batch(batch)
    results.extend(batch_results['predictions'])

Predicting latent classes:   0%|          | 0/93 [00:00<?, ?it/s]

In [None]:
# Save the predicted classes for local use
file_name = './latent_classes.csv'

with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)

    for integer in results:
        writer.writerow([integer])

In [None]:
# Add the latent class to the dataset
def add_new_column(example, idx, new_data):
    """Store `new_data[idx]` as the example's 'latent_class' and return the example."""
    latent_class = new_data[idx]
    example['latent_class'] = latent_class
    return example

labeled_rest_of_data = rest_of_data.map(
    lambda example, idx: add_new_column(example, idx, results),
    with_indices=True
)

# For the labelled data, we use the label as the latent class as it is already known.
def insert_column_with_same_value(example):
    """Copy the example's known 'label' into 'latent_class' and return the example."""
    known_label = example['label']
    example['latent_class'] = known_label
    return example

labeled_data = labeled_data.map(
    lambda example: insert_column_with_same_value(example)
)

Map:   0%|          | 0/2970 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [None]:
# How well does the latent classification work?
def check_columns_equality(example, column1, column2):
    """Return whether the example holds equal values under `column1` and `column2`."""
    first_value, second_value = example[column1], example[column2]
    return first_value == second_value

check_accuracy_dataset = labeled_rest_of_data.filter(lambda example: check_columns_equality(example, 'label', 'latent_class'))

matching_count = len(check_accuracy_dataset)
print(f"Accuracy of latent classification: {matching_count / len(labeled_rest_of_data)}")

Filter:   0%|          | 0/2970 [00:00<?, ? examples/s]

Accuracy of latent classification: 0.7178451178451178


In [None]:
# Re-merge the 1% data and rest of the data for self-supervised learning
latent_class_data = concatenate_datasets([labeled_data, labeled_rest_of_data])
print(latent_class_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 3000
})


In [None]:
del results
del model
del tokenizer
del training_args
del trainer
del rest_of_data
del rest_of_data_loader
del labeled_rest_of_data
del check_accuracy_dataset
del labeled_data
del gpt_dataset

# Collect garbage
gc.collect()

107

# Subset selection

Now that we have some form of latent classes assigned to all of the training data, we can find subsets using three methods:

1. Original SAS algorithm
2. Random subset from each latent class
3. Original SAS algorithm + Spanish embeddings

We will select a 40% subset (`subset_size = 0.4` in the cells below) with each method and compare the three. In the long run, we also need to compare against training on the whole dataset, but that is going to be very computationally intensive.

In [None]:
# Split into latent classes to select subsets from each
def create_filter_function(latent_class):
    """Build a predicate that keeps only examples whose 'latent_class' equals `latent_class`."""
    return lambda example: example['latent_class'] == latent_class

world_train_data = latent_class_data.filter(create_filter_function(0))
sports_train_data = latent_class_data.filter(create_filter_function(1))
business_train_data = latent_class_data.filter(create_filter_function(2))
tech_train_data = latent_class_data.filter(create_filter_function(3))

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# We will select the subset in this ratio for all subsets, for fairness
print(len(world_train_data))
print(len(sports_train_data))
print(len(business_train_data))
print(len(tech_train_data))

922
714
387
977


## SAS

In [None]:
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')
model = AutoModel.from_pretrained('prajjwal1/bert-small')

def bert_embeddings(data, batch_size=32):
    """Embed texts with the module-level BERT `model` and `tokenizer`.

    The texts are encoded in mini-batches and without autograd bookkeeping, so
    memory use no longer grows with the size of the whole input.

    Args:
        data: iterable of strings to embed.
        batch_size: number of texts encoded per forward pass.

    Returns:
        A tensor with one row per input text holding its first-token ([CLS])
        embedding, detached from any autograd graph. An empty input yields a
        tensor with zero rows.
    """
    texts = list(data)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    cls_chunks = []
    # no_grad: the embeddings are only used to compute similarities
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(texts[start:start + batch_size], return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            outputs = model(**inputs)
            # Take the first token ([CLS]) embeddings from each sentence
            cls_chunks.append(outputs.last_hidden_state[:,0,:])
    return torch.cat(cls_chunks, dim=0)

def sas_algorithm(data, subset_size):
    """Greedily select `subset_size` exemplars from `data`.

    Each step adds the not-yet-selected text that maximizes the sum, over all
    texts, of the best cosine similarity to any selected text. Similarities
    are computed on the embeddings returned by `bert_embeddings`.

    Args:
        data: texts to select from (passed to `bert_embeddings`).
        subset_size: number of exemplars to select.

    Returns:
        List with the indices (into `data`) of the selected exemplars.

    Raises:
        ValueError: if `subset_size` is larger than the number of texts.
    """
    # Get embeddings for the text data
    embeddings = bert_embeddings(data)
    n = len(embeddings)
    if subset_size > n:
        raise ValueError(f"subset_size ({subset_size}) cannot exceed the number of examples ({n})")

    # Initialize the subset and similarities
    S = set()
    S_similarities = np.full(n, -np.inf)
    available = np.ones(n, dtype=bool)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings)

    del embeddings
    gc.collect()

    for _ in tqdm(range(subset_size)):
        not_in_S = np.flatnonzero(available)

        # Gain of every remaining candidate, computed in one vectorised pass:
        # row k is the coverage obtained if candidate not_in_S[k] were added
        gains = np.maximum(S_similarities, similarity_matrix[not_in_S]).sum(axis=1)

        # Find the exemplar with the best gain
        best_exemplar = not_in_S[np.argmax(gains)]

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        available[best_exemplar] = False
        S.add(best_exemplar)

    return list(S)

In [None]:
# Run the SAS algorithm to select a subset from each latent class.
# subset_size is the fraction of each class to keep (0.4 -> a 40% subset).
subset_size = 0.4

world_sas_indices = sas_algorithm(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_sas_indices = sas_algorithm(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_sas_indices = sas_algorithm(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_sas_indices = sas_algorithm(tech_train_data['text'], int(subset_size * len(tech_train_data)))

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/285 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/390 [00:00<?, ?it/s]

In [None]:
# Sanity check on indices
print(world_sas_indices)
print(sports_sas_indices)
print(business_sas_indices)
print(tech_sas_indices)

[1, 3, 9, 12, 13, 16, 18, 19, 21, 22, 26, 27, 31, 34, 37, 38, 39, 42, 46, 49, 50, 52, 53, 54, 56, 60, 64, 68, 69, 70, 73, 81, 86, 87, 89, 90, 91, 95, 97, 101, 102, 106, 107, 108, 109, 111, 114, 120, 121, 128, 130, 132, 134, 135, 136, 139, 141, 143, 145, 146, 151, 153, 154, 155, 157, 159, 160, 161, 162, 163, 170, 171, 172, 176, 179, 181, 193, 194, 195, 196, 197, 199, 205, 207, 211, 214, 215, 216, 217, 218, 229, 237, 240, 245, 247, 249, 250, 253, 258, 261, 262, 266, 269, 271, 272, 276, 279, 280, 283, 284, 285, 286, 288, 293, 294, 296, 297, 298, 301, 303, 304, 305, 314, 315, 317, 321, 323, 324, 326, 330, 333, 336, 338, 341, 343, 346, 349, 352, 354, 358, 360, 361, 362, 365, 367, 370, 371, 374, 375, 380, 384, 386, 387, 388, 390, 392, 394, 398, 400, 403, 406, 407, 408, 411, 413, 416, 417, 421, 422, 423, 434, 436, 437, 443, 445, 447, 448, 451, 453, 454, 461, 462, 463, 465, 467, 469, 471, 474, 479, 480, 481, 482, 484, 488, 492, 497, 498, 500, 502, 503, 508, 509, 510, 512, 517, 518, 521, 522, 5

In [None]:
world_sas_subset = world_train_data.select(world_sas_indices)
sports_sas_subset = sports_train_data.select(sports_sas_indices)
business_sas_subset = business_train_data.select(business_sas_indices)
tech_sas_subset = tech_train_data.select(tech_sas_indices)

# Create the subset to use for contrastive self-supervised learning
sas_subset_data = concatenate_datasets([world_sas_subset, sports_sas_subset, business_sas_subset, tech_sas_subset])

In [None]:
del world_sas_subset
del sports_sas_subset
del business_sas_subset
del tech_sas_subset
del world_sas_indices
del sports_sas_indices
del business_sas_indices
del tech_sas_indices
del tokenizer
del model

gc.collect()

15

In [None]:
print(sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 1197
})


## Random

In [None]:
# Randomly sample `subset_size` distinct row indices from a latent class
def random_subset(data, subset_size):
    """Return `subset_size` distinct indices into `data`, drawn with the global `random` state."""
    candidate_indices = range(len(data))
    return random.sample(candidate_indices, subset_size)

In [None]:
subset_size = 0.4

world_random_indices = random_subset(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_random_indices = random_subset(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_random_indices = random_subset(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_random_indices = random_subset(tech_train_data['text'], int(subset_size * len(tech_train_data)))

In [None]:
# Sanity check on indices
print(world_random_indices)
print(sports_random_indices)
print(business_random_indices)
print(tech_random_indices)

[654, 114, 25, 759, 281, 250, 228, 142, 754, 104, 692, 758, 558, 89, 604, 432, 32, 30, 95, 223, 238, 517, 616, 27, 574, 203, 733, 665, 718, 909, 429, 225, 459, 603, 284, 828, 6, 777, 825, 163, 714, 906, 348, 887, 159, 220, 781, 344, 912, 94, 389, 99, 367, 867, 352, 618, 270, 826, 44, 747, 470, 549, 127, 387, 80, 565, 300, 849, 643, 633, 370, 591, 196, 721, 71, 46, 677, 233, 791, 296, 81, 901, 103, 871, 878, 464, 650, 373, 166, 379, 363, 214, 686, 273, 893, 699, 663, 73, 623, 835, 175, 546, 746, 916, 167, 473, 388, 276, 655, 704, 570, 224, 701, 332, 786, 794, 57, 234, 905, 323, 410, 274, 67, 216, 580, 735, 322, 217, 671, 511, 405, 658, 469, 146, 271, 914, 252, 762, 897, 551, 269, 764, 598, 438, 597, 408, 851, 810, 141, 521, 505, 93, 48, 112, 156, 642, 882, 696, 880, 610, 65, 394, 390, 784, 479, 541, 257, 566, 11, 780, 738, 117, 698, 860, 886, 656, 879, 920, 855, 445, 161, 836, 3, 736, 875, 512, 182, 519, 108, 640, 305, 921, 734, 823, 896, 767, 382, 165, 552, 543, 0, 613, 331, 500, 19, 7

In [None]:
world_random_subset = world_train_data.select(world_random_indices)
sports_random_subset = sports_train_data.select(sports_random_indices)
business_random_subset = business_train_data.select(business_random_indices)
tech_random_subset = tech_train_data.select(tech_random_indices)

# Create the subset to use for contrastive self-supervised learning
random_subset_data = concatenate_datasets([world_random_subset, sports_random_subset, business_random_subset, tech_random_subset])

In [None]:
del world_random_subset
del sports_random_subset
del business_random_subset
del tech_random_subset
del world_random_indices
del sports_random_indices
del business_random_indices
del tech_random_indices

gc.collect()

23

In [None]:
print(random_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 1197
})


## Multi-lingual SAS

In [None]:
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')
model = AutoModel.from_pretrained('prajjwal1/bert-small')

spanish_tokenizer = AutoTokenizer.from_pretrained('dccuchile/albert-tiny-spanish')
spanish_model = AutoModel.from_pretrained('dccuchile/albert-tiny-spanish')

def bert_embeddings(data, batch_size=32):
    """Embed texts with the module-level BERT `model` and `tokenizer`.

    The texts are encoded in mini-batches and without autograd bookkeeping, so
    memory use no longer grows with the size of the whole input.
    NOTE(review): this re-declares the helper of the same name from the SAS
    section; consider keeping a single definition.

    Args:
        data: iterable of strings to embed.
        batch_size: number of texts encoded per forward pass.

    Returns:
        A tensor with one row per input text holding its first-token ([CLS])
        embedding, detached from any autograd graph. An empty input yields a
        tensor with zero rows.
    """
    texts = list(data)
    if not texts:
        return torch.empty((0, model.config.hidden_size))

    cls_chunks = []
    # no_grad: the embeddings are only used to compute similarities
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(texts[start:start + batch_size], return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            outputs = model(**inputs)
            # Take the first token ([CLS]) embeddings from each sentence
            cls_chunks.append(outputs.last_hidden_state[:,0,:])
    return torch.cat(cls_chunks, dim=0)

def albert_embeddings(data, batch_size=32):
    """Embed Spanish texts with the module-level `spanish_model` and `spanish_tokenizer`.

    The texts are encoded in mini-batches and without autograd bookkeeping, so
    memory use no longer grows with the size of the whole input.

    Args:
        data: iterable of strings to embed.
        batch_size: number of texts encoded per forward pass.

    Returns:
        A tensor with one row per input text holding its first-token ([CLS])
        embedding, detached from any autograd graph. An empty input yields a
        tensor with zero rows.
    """
    texts = list(data)
    if not texts:
        return torch.empty((0, spanish_model.config.hidden_size))

    cls_chunks = []
    # no_grad: the embeddings are only used to compute similarities
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = spanish_tokenizer(texts[start:start + batch_size], return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            outputs = spanish_model(**inputs)
            # Take the first token ([CLS]) embeddings from each sentence
            cls_chunks.append(outputs.last_hidden_state[:,0,:])
    return torch.cat(cls_chunks, dim=0)

def sas_with_spanish_algorithm(data, subset_size):
    """Greedily select `subset_size` exemplars using English plus Spanish similarities.

    The similarity between two examples is the sum of the cosine similarity of
    their English embeddings (`bert_embeddings` on 'text') and of their Spanish
    embeddings (`albert_embeddings` on 'translation'). Each step adds the
    not-yet-selected example that maximizes the sum, over all examples, of the
    best similarity to any selected example.

    Args:
        data: dataset exposing 'text' and 'translation' columns.
        subset_size: number of exemplars to select.

    Returns:
        List with the indices (into `data`) of the selected exemplars.

    Raises:
        ValueError: if `subset_size` is larger than the number of examples.
    """
    # Get embeddings for the text data
    embeddings = bert_embeddings(data['text'])
    n = len(embeddings)
    print(embeddings.shape)
    if subset_size > n:
        raise ValueError(f"subset_size ({subset_size}) cannot exceed the number of examples ({n})")

    # Initialize the subset and similarities
    S = set()
    S_similarities = np.full(n, -np.inf)
    available = np.ones(n, dtype=bool)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings)

    del(embeddings)
    gc.collect()

    # Get Spanish embeddings
    spanish_embeddings = albert_embeddings(data['translation'])
    print(spanish_embeddings.shape)

    # Add the Spanish similarities on top of the English ones
    similarity_matrix = np.add(similarity_matrix, cosine_similarity(spanish_embeddings))

    del(spanish_embeddings)
    gc.collect()

    for _ in tqdm(range(subset_size)):
        not_in_S = np.flatnonzero(available)

        # Gain of every remaining candidate, computed in one vectorised pass:
        # row k is the coverage obtained if candidate not_in_S[k] were added
        gains = np.maximum(S_similarities, similarity_matrix[not_in_S]).sum(axis=1)

        # Find the exemplar with the best gain
        best_exemplar = not_in_S[np.argmax(gains)]

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        available[best_exemplar] = False
        S.add(best_exemplar)

    return list(S)

tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/21.7M [00:00<?, ?B/s]

In [None]:
# Run the multilingual SAS algorithm to select a subset from each latent class.
# subset_size is the fraction of each class to keep (0.4 -> a 40% subset).
subset_size = 0.4

world_multilingual_sas_indices = sas_with_spanish_algorithm(world_train_data, int(subset_size * len(world_train_data)))
sports_multilingual_sas_indices = sas_with_spanish_algorithm(sports_train_data, int(subset_size * len(sports_train_data)))
business_multilingual_sas_indices = sas_with_spanish_algorithm(business_train_data, int(subset_size * len(business_train_data)))
tech_multilingual_sas_indices = sas_with_spanish_algorithm(tech_train_data, int(subset_size * len(tech_train_data)))

torch.Size([922, 512])
torch.Size([922, 312])


  0%|          | 0/368 [00:00<?, ?it/s]

torch.Size([714, 512])
torch.Size([714, 312])


  0%|          | 0/285 [00:00<?, ?it/s]

torch.Size([387, 512])
torch.Size([387, 312])


  0%|          | 0/154 [00:00<?, ?it/s]

torch.Size([977, 512])
torch.Size([977, 312])


  0%|          | 0/390 [00:00<?, ?it/s]

In [None]:
world_multilingual_sas_subset = world_train_data.select(world_multilingual_sas_indices)
sports_multilingual_sas_subset = sports_train_data.select(sports_multilingual_sas_indices)
business_multilingual_sas_subset = business_train_data.select(business_multilingual_sas_indices)
tech_multilingual_sas_subset = tech_train_data.select(tech_multilingual_sas_indices)

# Create the subset to use for contrastive self-supervised learning
multilingual_sas_subset_data = concatenate_datasets([world_multilingual_sas_subset, sports_multilingual_sas_subset, business_multilingual_sas_subset, tech_multilingual_sas_subset])

In [None]:
del world_multilingual_sas_subset
del sports_multilingual_sas_subset
del business_multilingual_sas_subset
del tech_multilingual_sas_subset
del world_multilingual_sas_indices
del sports_multilingual_sas_indices
del business_multilingual_sas_indices
del tech_multilingual_sas_indices
del tokenizer
del model
del spanish_tokenizer
del spanish_model

gc.collect()

15

In [None]:
print(multilingual_sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 1197
})


# Contrastive learning pipeline

We now have representative subsets of the data (40% of each latent class, given `subset_size = 0.4`) selected in three different ways. We use each subset, along with two different forms of textual augmentation:

1. Easy data augmentation such as synonym replacement, random addition and random swapping
2. GPT for paraphrasing sentences

to train a self-supervised contrastive learning encoder, which will be evaluated in a downstream prediction task.

In [None]:
# Define the encoder model. We will use pre-trained BERT as the initial embedding
# and use contrastive learning to further train embeddings that can be tested
# downstream, specifically in news article domain.
model_name = 'prajjwal1/bert-small'

class Encoder(nn.Module):
    """Text encoder that wraps the pre-trained model named by `model_name`.

    `forward` takes raw text, tokenizes it, and returns the first-token
    ([CLS]) hidden state of every input as its embedding.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, text):
        """Tokenize `text`, run the model on `device`, and return the [CLS] embeddings."""
        encoded = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        # Move every tokenizer output to the device the encoder runs on
        on_device = {name: tensor.to(device) for name, tensor in encoded.items()}
        hidden_states = self.model(**on_device).last_hidden_state
        # Use the [CLS] token as the embedding
        return hidden_states[:, 0, :]

# Define the contrastive (InfoNCE) loss
class ContrastiveLoss(nn.Module):
    """InfoNCE-style loss with one positive and one negative per anchor.

    For each row the loss is
    ``-log(exp(s_pos / t) / (exp(s_pos / t) + exp(s_neg / t)))``
    where ``s_pos`` / ``s_neg`` are the cosine similarities between the anchor
    and its positive / negative, and ``t`` is the temperature.

    Args:
        temperature: positive scaling factor applied to the cosine
            similarities. The default of 1.0 reproduces the un-tempered loss.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """

    def __init__(self, temperature=1.0):
        super().__init__()
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}")
        self.temperature = temperature

    def forward(self, z_i, z_j, z_k):
        """Return the mean loss over the batch.

        Args:
            z_i: anchor embeddings.
            z_j: positive embeddings, row-aligned with `z_i`.
            z_k: negative embeddings, row-aligned with `z_i`.
            Cosine similarity is taken along dim=1.
        """
        # We use cosine similarities between the embeddings
        cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)

        # Similarity with positive augmented examples vs. negative examples
        positive_logit = cos(z_i, z_j) / self.temperature
        negative_logit = cos(z_i, z_k) / self.temperature

        # -log(e^p / (e^p + e^n)) == log(1 + e^(n - p)) == softplus(n - p).
        # The softplus form never exponentiates the raw logits, so it stays
        # finite for small temperatures where exp() would overflow.
        loss = nn.functional.softplus(negative_logit - positive_logit)

        return loss.mean()

In [None]:
# Helper function to randomly move around indices to create a list for negative examples
def shuffle_without_duplication(arr):
    """Return the elements of `arr` permuted so that none keeps its original position.

    The global NumPy RNG is reseeded with `global_seed` on every call, so two
    calls with inputs of the same length always apply the same permutation.

    Args:
        arr: sequence of items (converted with `np.array`).

    Returns:
        list: the permuted items. An empty input yields an empty list.

    Raises:
        ValueError: if `arr` has exactly one element. A single element cannot
            be moved away from its own position, so the retry loop below
            would otherwise never terminate.
    """
    arr = np.array(arr)
    n = len(arr)
    if n == 1:
        raise ValueError(
            "shuffle_without_duplication needs at least two elements so that "
            "no element keeps its original position"
        )

    np.random.seed(global_seed)
    original_positions = np.arange(n)
    # Create an array of the same shape filled with the indices
    indices = original_positions.copy()
    while True:
        # Shuffle the indices
        np.random.shuffle(indices)
        # Check if no element remains in its original position
        if not np.any(indices == original_positions):
            break
    # Return the shuffled array
    return arr[indices].tolist()

In [None]:
# Training function for self-supervised contrastive learning
def self_supervised_training(data, num_epochs, augment='eda'):
    """Train a fresh `Encoder` with the contrastive loss on pre-computed augmentations.

    For every example the anchor is its first augmentation, the positive is its
    second augmentation, and the negative is the first augmentation of another
    example from the same batch.

    Args:
        data: dataset supporting `.shuffle(seed=...)` and column access by name;
            it must provide 'text' plus the augmentation columns selected below.
        num_epochs: number of passes over the shuffled data.
        augment: 'eda' uses the 'augment_1' / 'augment_2' columns; any other
            value uses 'gpt_1' / 'gpt_2'.

    Returns:
        The trained `Encoder` (left in training mode).
    """
    # Parameters
    batch_size = 32
    learning_rate = 1e-6

    # Shuffle the data for better training. At first, all subsets are concatenated and
    # therefore separated by latent class
    data = data.shuffle(seed = global_seed)
    texts = data['text']
    if (augment == 'eda'):
        existing_augments_1 = data['augment_1']
        existing_augments_2 = data['augment_2']
    else:
        existing_augments_1 = data['gpt_1']
        existing_augments_2 = data['gpt_2']

    encoder = Encoder().to(device)
    loss_fn = ContrastiveLoss()
    optimizer = optim.Adam(encoder.parameters(), lr = learning_rate)

    # Training loop
    encoder.train()
    for epoch in range(num_epochs):
        for i in range(0, len(texts), batch_size):
            # Take the next batch of augmented texts
            batch_existing_augments_1 = existing_augments_1[i:i + batch_size]
            batch_existing_augments_2 = existing_augments_2[i:i + batch_size]

            # A lone trailing example has no other in-batch example that could
            # serve as its negative, so such a batch is skipped.
            if len(batch_existing_augments_1) < 2:
                continue

            # Clean texts and create a list of negative texts from the batch, ensuring
            # that the same example is not selected for the negative.
            clean_existing_augments_1 = [preprocess(text) for text in batch_existing_augments_1]
            clean_existing_augments_2 = [preprocess(text) for text in batch_existing_augments_2]
            negative_augments = shuffle_without_duplication(clean_existing_augments_1)

            # Get the embeddings from the encoder
            z_i = encoder(clean_existing_augments_1)
            z_j = encoder(clean_existing_augments_2)
            z_k = encoder(negative_augments)

            # Compute the contrastive loss
            loss = loss_fn(z_i, z_j, z_k)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print every 20 steps
            if (i // batch_size) % 20 == 0:
                print(f'Epoch {epoch}, Step {i // batch_size}, Loss: {loss.item()}')

    return encoder

# Evaluation pipeline

Through contrastive learning, we have learned an encoder $f$ that can embed our news article text for better downstream performance. We test this by freezing $f$, training a linear classifier head on top of it using the true labels, and measuring accuracy on the test dataset. Here, we compare the efficacy of the encoder $f$ across the three different ways of selecting a subset. A comparison against training on the full data is also needed, but is left as a future direction due to computational constraints.

In [None]:
# Linear classifier head placed on top of the encoder embeddings
class LinearClassifier(nn.Module):
    """Single affine layer mapping an embedding to one logit per class.

    Args:
        hidden_size: size of the incoming embedding.
        num_classes: number of output logits.
    """

    def __init__(self, hidden_size, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the class logits for the embeddings `x`."""
        logits = self.linear(x)
        return logits

In [None]:
# Build the train / validation / test dataloaders
def create_dataloaders(train_data, test_data, batch_size):
    """Split `train_data` 80/20 into train/validation and wrap all splits in DataLoaders.

    Args:
        train_data: dataset exposing `train_test_split(...)`; the resulting
            splits and `test_data` must provide 'text' and 'label' columns.
        test_data: held-out data with 'text' and 'label' columns.
        batch_size: batch size used for all three loaders.

    Returns:
        tuple: (train_dataloader, val_dataloader, test_dataloader). Each loader
        wraps a list of (preprocessed text, label tensor) pairs; only the
        training loader shuffles.
    """
    split = train_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)

    def as_examples(subset):
        # Pair every cleaned text with its label (labels become tensor elements)
        cleaned = [preprocess(raw) for raw in subset['text']]
        targets = torch.tensor(subset['label'])
        return list(zip(cleaned, targets))

    train_examples = as_examples(split['train'])
    val_examples = as_examples(split['test'])
    test_examples = as_examples(test_data)

    return (
        DataLoader(train_examples, batch_size=batch_size, shuffle=True),
        DataLoader(val_examples, batch_size=batch_size),
        DataLoader(test_examples, batch_size=batch_size),
    )

# Evaluation function to train a linear classifier head on top of the learned encoder
# and evaluate on the test set
def evaluate_encoder(encoder, train_data, test_data):
    """Linear-probe evaluation of a trained encoder.

    The encoder is put in eval mode and all of its parameters are frozen
    (`requires_grad = False`; this is not undone on return). A linear head is
    trained on an 80/20 train/validation split of `train_data`, the head with
    the lowest validation loss is checkpointed to './best_checkpoint.pth', and
    that head is scored on `test_data`.

    Args:
        encoder: trained `Encoder`; must expose `model.config.hidden_size`.
        train_data: labelled data used to fit and validate the head.
        test_data: labelled data used for the final accuracy.

    Returns:
        float: accuracy on the test set (also printed).
    """
    # Parameters
    batch_size = 64
    num_epochs = 5
    num_classes = 4
    learning_rate = 1e-4

    # Create dataloaders
    train_dataloader, val_dataloader, test_dataloader = create_dataloaders(train_data, test_data, batch_size)

    # Set the encoder to evaluation mode and freeze all layers to test the trained embeddings from contrastive learning
    encoder.eval()
    for param in encoder.parameters():
        param.requires_grad = False

    # Initialize the linear classifier head
    classifier = LinearClassifier(encoder.model.config.hidden_size, num_classes).to(device)

    # Loss function and optimizer for the classifier head
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters(), lr = learning_rate)

    best_val_loss = float('inf')
    checkpoint_path = './best_checkpoint.pth'

    # Train the classifier
    for epoch in range(num_epochs):
        classifier.train()
        train_loss = 0.0

        for (index, data) in enumerate(train_dataloader):
            texts, labels = data
            labels = labels.to(device)
            optimizer.zero_grad()

            # Forward pass through the frozen encoder and classifier head
            embeddings = encoder(texts)
            outputs = classifier(embeddings)
            loss = criterion(outputs, labels)

            # Backpropagation
            loss.backward()
            optimizer.step()
            # `criterion` returns the batch mean; weight it by the batch size so
            # that dividing by the dataset size below gives a per-example average.
            train_loss += loss.item() * labels.size(0)

            # Print every 50 steps
            if index % 50 == 0:
                print(f'Epoch {epoch}, Step {index}, Loss: {loss.item()}')

        classifier.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data in val_dataloader:
                texts, labels = data
                labels = labels.to(device)

                embeddings = encoder(texts)
                outputs = classifier(embeddings)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * labels.size(0)

        # Average losses per example
        train_loss /= len(train_dataloader.dataset)
        val_loss /= len(val_dataloader.dataset)

        # Print stats
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(classifier.state_dict(), checkpoint_path)

    # Evaluate the classifier using the checkpoint with the lowest validation loss
    classifier.load_state_dict(torch.load(checkpoint_path))
    classifier.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for texts, labels in test_dataloader:
            labels = labels.to(device)

            embeddings = encoder(texts)
            outputs = classifier(embeddings)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    print(f'Accuracy on the test set: {accuracy:.4f}')
    return accuracy

# Testing different subset selection processes + augmentation techniques

## Subset selection using SAS

### GPT

In [None]:
# Contrastive pre-training on the SAS subset for 100 epochs, using the GPT paraphrase columns as augmentations
encoder = self_supervised_training(sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5624551773071289
Epoch 0, Step 20, Loss: 0.5557247400283813
Epoch 1, Step 0, Loss: 0.5455308556556702
Epoch 1, Step 20, Loss: 0.5284996032714844
Epoch 2, Step 0, Loss: 0.5210798382759094
Epoch 2, Step 20, Loss: 0.5087847113609314
Epoch 3, Step 0, Loss: 0.5035061836242676
Epoch 3, Step 20, Loss: 0.4934278130531311
Epoch 4, Step 0, Loss: 0.4917498230934143
Epoch 4, Step 20, Loss: 0.4736211895942688
Epoch 5, Step 0, Loss: 0.47641003131866455
Epoch 5, Step 20, Loss: 0.4676828384399414
Epoch 6, Step 0, Loss: 0.4634052813053131
Epoch 6, Step 20, Loss: 0.45456817746162415
Epoch 7, Step 0, Loss: 0.45593488216400146
Epoch 7, Step 20, Loss: 0.4514143466949463
Epoch 8, Step 0, Loss: 0.4444545805454254
Epoch 8, Step 20, Loss: 0.43791669607162476
Epoch 9, Step 0, Loss: 0.436351478099823
Epoch 9, Step 20, Loss: 0.43590256571769714
Epoch 10, Step 0, Loss: 0.43715015053749084
Epoch 10, Step 20, Loss: 0.42923519015312195
Epoch 11, Step 0, Loss: 0.4348379671573639
Epoch 11, Step

In [None]:
# Linear-probe the encoder trained in the previous cell (SAS subset, GPT augmentation)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.1403224468231201
Epoch 1, Train Loss: 0.0152, Val Loss: 0.0132
Epoch 1, Step 0, Loss: 0.872122585773468
Epoch 2, Train Loss: 0.0110, Val Loss: 0.0105
Epoch 2, Step 0, Loss: 0.602172315120697
Epoch 3, Train Loss: 0.0091, Val Loss: 0.0091
Epoch 3, Step 0, Loss: 0.5308443903923035
Epoch 4, Train Loss: 0.0081, Val Loss: 0.0084
Epoch 4, Step 0, Loss: 0.531055212020874
Epoch 5, Train Loss: 0.0076, Val Loss: 0.0079
Accuracy on the test set: 0.8380


### EDA

In [None]:
# Contrastive pre-training on the SAS subset for 100 epochs, using the EDA augmentation columns
encoder = self_supervised_training(sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5717929005622864
Epoch 0, Step 20, Loss: 0.5626691579818726
Epoch 1, Step 0, Loss: 0.5474741458892822
Epoch 1, Step 20, Loss: 0.5327763557434082
Epoch 2, Step 0, Loss: 0.5272518992424011
Epoch 2, Step 20, Loss: 0.5157229900360107
Epoch 3, Step 0, Loss: 0.5028746128082275
Epoch 3, Step 20, Loss: 0.49790751934051514
Epoch 4, Step 0, Loss: 0.48600804805755615
Epoch 4, Step 20, Loss: 0.48508933186531067
Epoch 5, Step 0, Loss: 0.4756471514701843
Epoch 5, Step 20, Loss: 0.47350192070007324
Epoch 6, Step 0, Loss: 0.4648036062717438
Epoch 6, Step 20, Loss: 0.46503931283950806
Epoch 7, Step 0, Loss: 0.4537002444267273
Epoch 7, Step 20, Loss: 0.4617970585823059
Epoch 8, Step 0, Loss: 0.45240840315818787
Epoch 8, Step 20, Loss: 0.45531153678894043
Epoch 9, Step 0, Loss: 0.4495851695537567
Epoch 9, Step 20, Loss: 0.44206786155700684
Epoch 10, Step 0, Loss: 0.43928322196006775
Epoch 10, Step 20, Loss: 0.43731075525283813
Epoch 11, Step 0, Loss: 0.43740516901016235
Epoch 11,

In [None]:
# Linear-probe the encoder trained in the previous cell (SAS subset, EDA augmentation)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.552817463874817
Epoch 1, Train Loss: 0.0196, Val Loss: 0.0169
Epoch 1, Step 0, Loss: 0.9543692469596863
Epoch 2, Train Loss: 0.0139, Val Loss: 0.0130
Epoch 2, Step 0, Loss: 0.7859453558921814
Epoch 3, Train Loss: 0.0111, Val Loss: 0.0111
Epoch 3, Step 0, Loss: 0.6054638028144836
Epoch 4, Train Loss: 0.0098, Val Loss: 0.0101
Epoch 4, Step 0, Loss: 0.4969537556171417
Epoch 5, Train Loss: 0.0089, Val Loss: 0.0094
Accuracy on the test set: 0.8109


## Random subset selection

### GPT

In [None]:
# Contrastive pre-training on the random subset for 100 epochs, using the GPT paraphrase columns as augmentations
encoder = self_supervised_training(random_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5752135515213013
Epoch 0, Step 20, Loss: 0.5482170581817627
Epoch 1, Step 0, Loss: 0.5432475805282593
Epoch 1, Step 20, Loss: 0.5222316980361938
Epoch 2, Step 0, Loss: 0.5159257650375366
Epoch 2, Step 20, Loss: 0.5040180087089539
Epoch 3, Step 0, Loss: 0.4937952160835266
Epoch 3, Step 20, Loss: 0.48634570837020874
Epoch 4, Step 0, Loss: 0.4787130355834961
Epoch 4, Step 20, Loss: 0.4815313220024109
Epoch 5, Step 0, Loss: 0.4617392122745514
Epoch 5, Step 20, Loss: 0.4664623737335205
Epoch 6, Step 0, Loss: 0.44846105575561523
Epoch 6, Step 20, Loss: 0.45259130001068115
Epoch 7, Step 0, Loss: 0.4401521682739258
Epoch 7, Step 20, Loss: 0.4468775987625122
Epoch 8, Step 0, Loss: 0.431944876909256
Epoch 8, Step 20, Loss: 0.44554150104522705
Epoch 9, Step 0, Loss: 0.4295223653316498
Epoch 9, Step 20, Loss: 0.43552497029304504
Epoch 10, Step 0, Loss: 0.42236775159835815
Epoch 10, Step 20, Loss: 0.4299592971801758
Epoch 11, Step 0, Loss: 0.41013556718826294
Epoch 11, Step

In [None]:
# Linear-probe the encoder trained in the previous cell (random subset, GPT augmentation)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.3936673402786255
Epoch 1, Train Loss: 0.0192, Val Loss: 0.0163
Epoch 1, Step 0, Loss: 1.0080592632293701
Epoch 2, Train Loss: 0.0136, Val Loss: 0.0122
Epoch 2, Step 0, Loss: 0.778862714767456
Epoch 3, Train Loss: 0.0109, Val Loss: 0.0102
Epoch 3, Step 0, Loss: 0.565380871295929
Epoch 4, Train Loss: 0.0095, Val Loss: 0.0092
Epoch 4, Step 0, Loss: 0.46891701221466064
Epoch 5, Train Loss: 0.0087, Val Loss: 0.0085
Accuracy on the test set: 0.8149


### EDA

In [None]:
# Contrastive pre-training on the random subset for 100 epochs, using the EDA augmentation columns
encoder = self_supervised_training(random_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5661666393280029
Epoch 0, Step 20, Loss: 0.5494167804718018
Epoch 1, Step 0, Loss: 0.5412541627883911
Epoch 1, Step 20, Loss: 0.5317007899284363
Epoch 2, Step 0, Loss: 0.5266916751861572
Epoch 2, Step 20, Loss: 0.5044028759002686
Epoch 3, Step 0, Loss: 0.5002685785293579
Epoch 3, Step 20, Loss: 0.48900240659713745
Epoch 4, Step 0, Loss: 0.4876033067703247
Epoch 4, Step 20, Loss: 0.4709535539150238
Epoch 5, Step 0, Loss: 0.4735378623008728
Epoch 5, Step 20, Loss: 0.46240684390068054
Epoch 6, Step 0, Loss: 0.4541855454444885
Epoch 6, Step 20, Loss: 0.4536859393119812
Epoch 7, Step 0, Loss: 0.44708579778671265
Epoch 7, Step 20, Loss: 0.44711318612098694
Epoch 8, Step 0, Loss: 0.4442305564880371
Epoch 8, Step 20, Loss: 0.43892085552215576
Epoch 9, Step 0, Loss: 0.43350622057914734
Epoch 9, Step 20, Loss: 0.4273674190044403
Epoch 10, Step 0, Loss: 0.4237707853317261
Epoch 10, Step 20, Loss: 0.43539389967918396
Epoch 11, Step 0, Loss: 0.4198702871799469
Epoch 11, Ste

In [None]:
# Linear-probe the encoder trained in the previous cell (random subset, EDA augmentation)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4526606798171997
Epoch 1, Train Loss: 0.0193, Val Loss: 0.0165
Epoch 1, Step 0, Loss: 0.9643312692642212
Epoch 2, Train Loss: 0.0137, Val Loss: 0.0128
Epoch 2, Step 0, Loss: 0.7874377369880676
Epoch 3, Train Loss: 0.0112, Val Loss: 0.0110
Epoch 3, Step 0, Loss: 0.5619303584098816
Epoch 4, Train Loss: 0.0098, Val Loss: 0.0101
Epoch 4, Step 0, Loss: 0.7131004929542542
Epoch 5, Train Loss: 0.0092, Val Loss: 0.0094
Accuracy on the test set: 0.7984


## Subset selection using multilingual SAS

### GPT

In [None]:
# Contrastive pre-training on the multilingual-SAS subset for 100 epochs, using the GPT paraphrase columns as augmentations
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5721149444580078
Epoch 0, Step 20, Loss: 0.556881308555603
Epoch 1, Step 0, Loss: 0.5494340658187866
Epoch 1, Step 20, Loss: 0.5362745523452759
Epoch 2, Step 0, Loss: 0.5272545218467712
Epoch 2, Step 20, Loss: 0.5160955190658569
Epoch 3, Step 0, Loss: 0.5013490915298462
Epoch 3, Step 20, Loss: 0.5023000240325928
Epoch 4, Step 0, Loss: 0.49027085304260254
Epoch 4, Step 20, Loss: 0.48825424909591675
Epoch 5, Step 0, Loss: 0.4732997715473175
Epoch 5, Step 20, Loss: 0.4746553897857666
Epoch 6, Step 0, Loss: 0.46465620398521423
Epoch 6, Step 20, Loss: 0.4696463346481323
Epoch 7, Step 0, Loss: 0.4562600255012512
Epoch 7, Step 20, Loss: 0.4604676365852356
Epoch 8, Step 0, Loss: 0.44463223218917847
Epoch 8, Step 20, Loss: 0.44969022274017334
Epoch 9, Step 0, Loss: 0.44560837745666504
Epoch 9, Step 20, Loss: 0.446366548538208
Epoch 10, Step 0, Loss: 0.43953442573547363
Epoch 10, Step 20, Loss: 0.4435325860977173
Epoch 11, Step 0, Loss: 0.428058385848999
Epoch 11, Step 2

In [None]:
# Linear-probe the encoder trained in the previous cell (multilingual-SAS subset, GPT augmentation)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.452978253364563
Epoch 1, Train Loss: 0.0196, Val Loss: 0.0169
Epoch 1, Step 0, Loss: 1.001478910446167
Epoch 2, Train Loss: 0.0140, Val Loss: 0.0128
Epoch 2, Step 0, Loss: 0.8323085904121399
Epoch 3, Train Loss: 0.0112, Val Loss: 0.0108
Epoch 3, Step 0, Loss: 0.5293457508087158
Epoch 4, Train Loss: 0.0096, Val Loss: 0.0096
Epoch 4, Step 0, Loss: 0.5835214853286743
Epoch 5, Train Loss: 0.0088, Val Loss: 0.0089
Accuracy on the test set: 0.8367


### EDA

In [None]:
# Contrastive pre-training on the multilingual-SAS subset for 100 epochs, using the EDA augmentation columns
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5604395866394043
Epoch 0, Step 20, Loss: 0.5603623390197754
Epoch 1, Step 0, Loss: 0.5462489128112793
Epoch 1, Step 20, Loss: 0.5437032580375671
Epoch 2, Step 0, Loss: 0.5212953090667725
Epoch 2, Step 20, Loss: 0.5226283073425293
Epoch 3, Step 0, Loss: 0.5056626796722412
Epoch 3, Step 20, Loss: 0.504928708076477
Epoch 4, Step 0, Loss: 0.4842556118965149
Epoch 4, Step 20, Loss: 0.49045801162719727
Epoch 5, Step 0, Loss: 0.4719673693180084
Epoch 5, Step 20, Loss: 0.4765450358390808
Epoch 6, Step 0, Loss: 0.4602634906768799
Epoch 6, Step 20, Loss: 0.4671669602394104
Epoch 7, Step 0, Loss: 0.4513300657272339
Epoch 7, Step 20, Loss: 0.45863381028175354
Epoch 8, Step 0, Loss: 0.4428727924823761
Epoch 8, Step 20, Loss: 0.457064688205719
Epoch 9, Step 0, Loss: 0.436479777097702
Epoch 9, Step 20, Loss: 0.4461488425731659
Epoch 10, Step 0, Loss: 0.4357358515262604
Epoch 10, Step 20, Loss: 0.4420330226421356
Epoch 11, Step 0, Loss: 0.4247455298900604
Epoch 11, Step 20, Lo

In [None]:
# Linear-probe the encoder trained in the previous cell (multilingual-SAS subset, EDA augmentation)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.3473504781723022
Epoch 1, Train Loss: 0.0183, Val Loss: 0.0167
Epoch 1, Step 0, Loss: 0.9729761481285095
Epoch 2, Train Loss: 0.0137, Val Loss: 0.0134
Epoch 2, Step 0, Loss: 0.8345464468002319
Epoch 3, Train Loss: 0.0113, Val Loss: 0.0116
Epoch 3, Step 0, Loss: 0.5988467931747437
Epoch 4, Train Loss: 0.0100, Val Loss: 0.0105
Epoch 4, Step 0, Loss: 0.58343106508255
Epoch 5, Train Loss: 0.0091, Val Loss: 0.0099
Accuracy on the test set: 0.8191


## Baseline with full data

### GPT

In [None]:
# encoder = self_supervised_training(train_data, 100, 'gpt')

In [None]:
# evaluate_encoder(encoder, train_data, test_data)

### EDA

In [None]:
# encoder = self_supervised_training(train_data, 100, 'eda')

In [None]:
# evaluate_encoder(encoder, train_data, test_data)

# TODO: Further research to come.