In the following notebook, we investigate the use of SAS (subsets that maximize expected augmentation similarity) to select representative subsets that improve self-supervised learning in a text categorization task. We also investigate the use of multilingual embeddings to further strengthen the SAS selection.

# Setup

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install nlpaug
!pip install gensim



In [None]:
import csv
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
import gensim.downloader as api
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from tqdm.auto import tqdm
# Delete some data from RAM to free up space for later processes
import gc

# Ensure you have the necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Import Google Drive for locally saved files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Global variables

In [None]:
# Seed shared by every random number generator and dataset split in the notebook
global_seed = 0
# Max length (in tokens) passed to the tokenizers
max_length = 256
# Run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set seeds for reproducibility. PyTorch is seeded as well so that torch-side
# randomness (e.g. weight initialisation) is repeatable, not only the
# NumPy / `random` based sampling.
np.random.seed(global_seed)
random.seed(global_seed)
torch.manual_seed(global_seed)

## Helper functions

In [None]:
# Basic preprocessing: lowercasing and removing non-alphanumeric characters
def preprocess(text):
    """Normalise raw text before tokenization.

    Lowercases `text` and collapses every run of non-word characters
    (anything other than letters, digits and underscore) into one space.
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

## Load dataset

In [None]:
# Load the AG News dataset
dataset = load_dataset("ag_news")

# The dataset is divided into 'train' and 'test' splits
train_data = dataset['train']
test_data = dataset['test']

# For the parts that use GPT, we will use only 2.5% of the data due to the cost of running GPT.
train_data = train_data.train_test_split(test_size=0.975, stratify_by_column="label", seed = global_seed)
train_data = train_data['train']

# Example: Viewing the first training sample
print(train_data[0])
print(len(train_data))

{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0}
3000


## Create NLPAug augmented texts

In [None]:
# aug_1 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 5, # Only use top 5 options
#     aug_p = 0.2 # 20% of the words augmented
# )
#
# aug_2 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 10, # Only use top 10 options
#     aug_p = 0.4 # 40% of the words augmented
# )
#
# def eda_augmentation_with_word2vec(sentence, type):
#     if type == 1:
#         return aug_1.augment(sentence)[0]
#     else:
#         return aug_2.augment(sentence)[0]

In [None]:
# Add augmented texts for each example using NLPAug
# def add_nlp_aug_columns(data):
#     data['augment_1'] = eda_augmentation_with_word2vec(data['text'], 1)
#     data['augment_2'] = eda_augmentation_with_word2vec(data['text'], 2)
#     return data

# train_data = train_data.map(
#     lambda example: add_nlp_aug_columns(example)
# )

In [None]:
# Save the augmentations as the process takes 2 hours
# train_df = train_data.to_pandas()
# train_df.to_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")

In [None]:
import datasets

# Load dataset with augmentations (pre-prepared)
train_df = pd.read_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")
augmented_data = datasets.Dataset.from_pandas(train_df)

In [None]:
print(augmented_data)
print(augmented_data['text'][-1])
print(augmented_data['text'][-1])

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', '__index_level_0__'],
    num_rows: 3000
})
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.


In [None]:
# Merge datasets by columns
def add_augmented_columns(data, augments, idx):
    """Copy both NLPAug augmentations for row `idx` into the example `data`.

    Args:
        data: mutable mapping for one example; it is updated in place.
        augments: column-indexable container exposing 'augment_1' and 'augment_2'.
        idx: position of the example inside `augments`.

    Returns:
        The same `data` object, now carrying 'augment_1' and 'augment_2'.
    """
    for column in ('augment_1', 'augment_2'):
        data[column] = augments[column][idx]
    return data

train_data = train_data.map(
    lambda example, idx: add_augmented_columns(example, augmented_data, idx),
    with_indices=True
)

## Load GPT augmented text and Spanish translations

In [None]:
gpt_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/COM SCI 260D/gpt_dataset.csv", split = 'train')

print(gpt_dataset)
print(gpt_dataset[0:5])
print(train_data[0:5])

Dataset({
    features: ['Original Text', 'Translation', 'Paraphrase 1', 'Paraphrase 2'],
    num_rows: 3000
})


In [None]:
# Ensure both datasets have the same number of rows
assert len(gpt_dataset) == len(train_data), "Datasets must have the same number of rows"

# Merge datasets by columns
def add_gpt_columns(data, augments, idx):
    """Attach the Spanish translation and both GPT paraphrases for row `idx`.

    When the second paraphrase is missing or blank, the example's existing
    'augment_1' value is used for 'gpt_2' instead.

    Args:
        data: mutable mapping for one example; it is updated in place.
        augments: column-indexable container exposing 'Translation',
            'Paraphrase 1' and 'Paraphrase 2'.
        idx: position of the example inside `augments`.

    Returns:
        The same `data` object with 'translation', 'gpt_1' and 'gpt_2' set.
    """
    data['translation'] = augments['Translation'][idx]
    data['gpt_1'] = augments['Paraphrase 1'][idx]

    second_paraphrase = augments['Paraphrase 2'][idx]
    has_second = bool(second_paraphrase) and bool(second_paraphrase.strip())
    # Fallback to EDA augmentation when no usable second paraphrase exists
    data['gpt_2'] = second_paraphrase if has_second else data['augment_1']
    return data

# Add the 'translation', 'gpt_1' and 'gpt_2' columns to every training row,
# pairing rows of `train_data` and `gpt_dataset` by position.
train_data = train_data.map(
    lambda example, idx: add_gpt_columns(example, gpt_dataset, idx),
    with_indices=True
)

print(train_data)
print(train_data[0])

# Collect the rows whose GPT paraphrases are still missing after the merge so
# they can be inspected. NOTE: this only builds `error_data`; no row is removed
# from `train_data` here. (An earlier note mentioned 8 badly imported examples
# -- TODO confirm whether that still applies.)
error_data = train_data.filter(lambda row: row['gpt_2'] is None or row['gpt_1'] is None)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 3000
})
{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0, 'augment_1': '9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.', 'augment_2': "2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.", 'translation': '9 heridos en explosión en la Embajada de Indonesia PARÍS -- Una explosión golpeó la Embajada de Indonesia en París hoy, hiriendo levemente a nueve personas, según informó una estación de radio francesa.', 'gpt_1': '1. The Indonesian Embassy

In [None]:
print(error_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 0
})


In [None]:
print(train_data['text'][0])
print(train_data['gpt_1'][0])
print(train_data['gpt_2'][0])
print(train_data['augment_1'][0])
print(train_data['augment_2'][0])
print("")
print(train_data['text'][-1])
print(train_data['gpt_1'][-1])
print(train_data['gpt_2'][-1])
print(train_data['augment_1'][-1])
print(train_data['augment_2'][-1])

9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.
1. The Indonesian Embassy in Paris was struck by an explosion today, leading to minor injuries for nine individuals, as reported by a French radio station.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.

Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. un

# Latent class discovery

First, we randomly select 1% of the data and treat it as a small labelled set. We fine-tune a small pre-trained classifier on it and then use that classifier to assign a latent class to each of the remaining training data points.

In [None]:
# Select 1% of the data randomly
shuffled_dataset = train_data.shuffle(seed = global_seed)
sample_size = int(0.01 * len(shuffled_dataset))
labeled_data = shuffled_dataset.select(range(sample_size))
rest_of_data = shuffled_dataset.select(range(sample_size, len(shuffled_dataset)))

# Get number of classes from the 1% data
unique_labels = set(labeled_data['label'])
num_classes = len(unique_labels)

print(num_classes)

4


In [None]:
# Split the 1% dataset into training and validation sets. We only use 20% as validation set due to size of data.
labeled_data_for_training = labeled_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)

In [None]:
# Pre-process and tokenize data
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')

def tokenize(examples):
    """Preprocess and tokenize a batch of examples.

    Every entry of `examples['text']` is cleaned with `preprocess`, then the
    batch is tokenized, padded/truncated to `max_length` tokens.
    """
    cleaned_texts = list(map(preprocess, examples['text']))
    return tokenizer(
        cleaned_texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

tokenized_dataset = labeled_data_for_training.map(tokenize, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# Import model for linear classification using the number of classes in 1% data.
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-small', num_labels=num_classes)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the 1% labelled data to train a BERT model
latent_num_epochs = 20

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=latent_num_epochs,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    # Warm up over a fraction of the run instead of a fixed 500 steps: this run
    # is only ~120 optimizer steps long, so a 500-step warmup would never
    # finish and the learning rate would stay far below its target value.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['test'])

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.414043
2,1.401400,1.410811
3,1.401400,1.405662
4,1.345400,1.39899
5,1.334100,1.3901
6,1.334100,1.380624
7,1.309600,1.368604
8,1.309600,1.354202
9,1.227000,1.337197
10,1.192000,1.323368


TrainOutput(global_step=120, training_loss=1.1128187338511148, metrics={'train_runtime': 2.7361, 'train_samples_per_second': 175.432, 'train_steps_per_second': 43.858, 'total_flos': 9492677591040.0, 'train_loss': 1.1128187338511148, 'epoch': 20.0})

In [None]:
# Save the model to a directory for local use
model.save_pretrained('./model')
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [None]:
# Use the model to predict latent classes for rest of training data
def predict_batch(batch):
    """Predict latent classes for one batch with the fine-tuned classifier.

    Args:
        batch: mapping with a 'text' entry holding the raw texts of the batch.

    Returns:
        Dict with key 'predictions' mapping to a NumPy array of predicted
        class indices, one per text.
    """
    # Make sure dropout is disabled for inference, whatever mode the training
    # run left the model in.
    model.eval()

    # Tokenize the examples
    preprocessed_texts = [preprocess(text) for text in batch['text']]
    inputs = tokenizer(preprocessed_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    # Inputs must live on the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Perform the prediction (no gradients needed)
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class indices
    predictions = outputs.logits.argmax(-1).cpu().numpy()
    return {'predictions': predictions}

# Create a DataLoader for the rest of the data
rest_of_data_loader = DataLoader(rest_of_data, batch_size=32)

# Run predictions in batches
results = []
for batch in tqdm(rest_of_data_loader, desc="Predicting latent classes"):
    batch_results = predict_batch(batch)
    results.extend(batch_results['predictions'])

Predicting latent classes:   0%|          | 0/93 [00:00<?, ?it/s]

In [None]:
# Save the predicted classes for local use
file_name = './latent_classes.csv'

with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)

    for integer in results:
        writer.writerow([integer])

In [None]:
# Add the latent class to the dataset
def add_new_column(example, idx, new_data):
    """Store `new_data[idx]` on `example` as its 'latent_class' and return it."""
    latent_class = new_data[idx]
    example['latent_class'] = latent_class
    return example

labeled_rest_of_data = rest_of_data.map(
    lambda example, idx: add_new_column(example, idx, results),
    with_indices=True
)

# For the labelled data, we use the label as the latent class as it is already known.
def insert_column_with_same_value(example):
    """Set the example's 'latent_class' to its 'label' value and return it."""
    known_label = example['label']
    example['latent_class'] = known_label
    return example

labeled_data = labeled_data.map(
    lambda example: insert_column_with_same_value(example)
)

Map:   0%|          | 0/2970 [00:00<?, ? examples/s]

In [None]:
# How well does the latent classification work?
def check_columns_equality(example, column1, column2):
    """Return whether `example` holds equal values under `column1` and `column2`."""
    left, right = example[column1], example[column2]
    return left == right

check_accuracy_dataset = labeled_rest_of_data.filter(lambda example: check_columns_equality(example, 'label', 'latent_class'))

matching_count = len(check_accuracy_dataset)
print(f"Accuracy of latent classification: {matching_count / len(labeled_rest_of_data)}")

Filter:   0%|          | 0/2970 [00:00<?, ? examples/s]

Accuracy of latent classification: 0.7296296296296296


In [None]:
# Re-merge the 1% data and rest of the data for self-supervised learning
latent_class_data = concatenate_datasets([labeled_data, labeled_rest_of_data])
print(latent_class_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 3000
})


In [None]:
del results
del model
del tokenizer
del training_args
del trainer
del rest_of_data
del rest_of_data_loader
del labeled_rest_of_data
del check_accuracy_dataset
del labeled_data
del gpt_dataset

# Collect garbage
gc.collect()

55

# Subset selection

Now that we have some form of latent classes assigned to all of the training data, we can find subsets using three methods:

1. Original SAS algorithm
2. Random subset from each latent class
3. Original SAS algorithm + Spanish embeddings

We will select a 20% subset and compare the three. In the long run, we also need to compare with using the whole dataset but that is going to be very computationally intensive.

In [None]:
# Split into latent classes to select subsets from each
def create_filter_function(latent_class):
    """Build a predicate that keeps examples whose 'latent_class' equals `latent_class`."""
    return lambda example: example['latent_class'] == latent_class

world_train_data = latent_class_data.filter(create_filter_function(0))
sports_train_data = latent_class_data.filter(create_filter_function(1))
business_train_data = latent_class_data.filter(create_filter_function(2))
tech_train_data = latent_class_data.filter(create_filter_function(3))

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# We will select the subset in this ratio for all subsets, for fairness
print(len(world_train_data))
print(len(sports_train_data))
print(len(business_train_data))
print(len(tech_train_data))

780
812
372
1036


## SAS

In [None]:
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')
model = AutoModel.from_pretrained('prajjwal1/bert-small')

def bert_embeddings(data):
    """Embed texts with the globally loaded `tokenizer` / `model` pair.

    Args:
        data: the texts to embed (callers pass a dataset's text column).

    Returns:
        Tensor holding, for each text, the last hidden state of its first
        ([CLS]) token.
    """
    inputs = tokenizer(data, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Inference only: skip building the autograd graph, which would otherwise
    # keep the intermediate activations of the whole input alive.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the first token ([CLS]) embeddings from each sentence
    return outputs.last_hidden_state[:,0,:].detach()

def sas_algorithm(data, subset_size):
    """Greedily select a representative subset of `data`.

    Every text is embedded with `bert_embeddings` and pairwise cosine
    similarities are computed. Exemplars are then added one at a time: at each
    step the chosen candidate is the one that maximises the sum, over all
    examples, of each example's similarity to its most similar selected
    exemplar.

    Args:
        data: the texts to select from.
        subset_size: number of exemplars to select; values larger than the
            number of texts are clamped to that number.

    Returns:
        List of the selected row indices into `data` (order not meaningful).
    """
    # Get embeddings for the text data
    embeddings = bert_embeddings(data)
    n = len(embeddings)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings)

    del embeddings
    gc.collect()

    # Never ask for more exemplars than exist; otherwise the candidate pool
    # runs dry and np.argmax fails on an empty sequence.
    subset_size = min(subset_size, n)

    # Initialize the subset and, per example, its best similarity to the subset
    S = set()
    S_similarities = np.full(n, -np.inf)

    for _ in tqdm(range(subset_size)):
        not_in_S = np.array([i for i in range(n) if i not in S])

        # Gains of all remaining candidates at once: row r holds
        # max(S_similarities, similarity_matrix[candidate r]), summed over examples.
        gains = np.maximum(S_similarities, similarity_matrix[not_in_S]).sum(axis=1)

        # Find the exemplar with the best gain
        best_exemplar = not_in_S[np.argmax(gains)]

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        S.add(best_exemplar)

    return list(S)

In [None]:
# Run the SAS algorithm to select a 20% subset from each latent class
subset_size = 0.2

world_sas_indices = sas_algorithm(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_sas_indices = sas_algorithm(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_sas_indices = sas_algorithm(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_sas_indices = sas_algorithm(tech_train_data['text'], int(subset_size * len(tech_train_data)))

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/74 [00:00<?, ?it/s]

  0%|          | 0/207 [00:00<?, ?it/s]

In [None]:
# Sanity check on indices
print(world_sas_indices)
print(sports_sas_indices)
print(business_sas_indices)
print(tech_sas_indices)

[517, 519, 11, 13, 14, 529, 18, 530, 21, 534, 23, 539, 30, 546, 549, 40, 553, 42, 44, 557, 559, 50, 565, 54, 567, 57, 61, 63, 64, 67, 68, 586, 76, 77, 590, 588, 592, 81, 600, 88, 98, 612, 614, 615, 626, 115, 117, 119, 122, 635, 126, 646, 134, 653, 661, 663, 156, 670, 671, 672, 161, 165, 678, 171, 684, 686, 177, 689, 690, 180, 694, 700, 189, 191, 713, 202, 204, 719, 723, 214, 727, 219, 736, 229, 747, 235, 749, 239, 753, 757, 246, 758, 247, 248, 765, 770, 771, 772, 261, 262, 775, 773, 269, 274, 277, 278, 279, 285, 287, 305, 307, 311, 332, 336, 337, 343, 348, 349, 356, 361, 370, 376, 378, 386, 387, 389, 390, 393, 395, 403, 404, 408, 410, 414, 422, 423, 426, 431, 432, 433, 438, 439, 443, 446, 448, 461, 463, 469, 470, 471, 475, 477, 479, 486, 488, 508]
[512, 517, 520, 12, 533, 534, 21, 536, 28, 30, 545, 34, 551, 40, 41, 43, 560, 562, 52, 565, 55, 568, 569, 58, 61, 576, 579, 74, 590, 591, 79, 596, 598, 90, 93, 96, 97, 98, 101, 102, 103, 105, 618, 110, 130, 642, 647, 136, 137, 653, 147, 148, 

In [None]:
world_sas_subset = world_train_data.select(world_sas_indices)
sports_sas_subset = sports_train_data.select(sports_sas_indices)
business_sas_subset = business_train_data.select(business_sas_indices)
tech_sas_subset = tech_train_data.select(tech_sas_indices)

# Create the subset to use for contrastive self-supervised learning
sas_subset_data = concatenate_datasets([world_sas_subset, sports_sas_subset, business_sas_subset, tech_sas_subset])

In [None]:
del world_sas_subset
del sports_sas_subset
del business_sas_subset
del tech_sas_subset
del world_sas_indices
del sports_sas_indices
del business_sas_indices
del tech_sas_indices
del tokenizer
del model

gc.collect()

10

In [None]:
print(sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


## Random

In [None]:
# Randomly sample 20% of the dataset for each latent class
def random_subset(data, subset_size):
    """Draw `subset_size` distinct row indices of `data` uniformly at random."""
    candidate_indices = range(len(data))
    return random.sample(candidate_indices, subset_size)

In [None]:
subset_size = 0.2

world_random_indices = random_subset(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_random_indices = random_subset(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_random_indices = random_subset(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_random_indices = random_subset(tech_train_data['text'], int(subset_size * len(tech_train_data)))

In [None]:
# Sanity check on indices
print(world_random_indices)
print(sports_random_indices)
print(business_random_indices)
print(tech_random_indices)

[654, 114, 25, 759, 281, 250, 228, 142, 754, 104, 692, 758, 558, 89, 604, 432, 32, 30, 95, 223, 238, 517, 616, 27, 574, 203, 733, 665, 718, 767, 429, 225, 459, 603, 284, 6, 163, 714, 764, 348, 745, 159, 220, 344, 770, 94, 389, 99, 367, 352, 618, 270, 44, 470, 549, 127, 387, 80, 565, 300, 643, 633, 370, 591, 196, 71, 46, 677, 233, 296, 81, 776, 103, 753, 739, 464, 650, 373, 166, 379, 363, 214, 686, 273, 663, 73, 623, 703, 175, 546, 774, 167, 473, 388, 276, 655, 570, 224, 332, 57, 234, 763, 323, 410, 274, 67, 216, 580, 322, 217, 511, 405, 658, 469, 146, 271, 772, 252, 755, 551, 269, 598, 438, 597, 408, 717, 682, 141, 521, 505, 93, 48, 112, 156, 642, 743, 741, 610, 65, 394, 390, 645, 479, 541, 257, 566, 11, 117, 725, 696, 740, 778, 720, 445, 161, 704]
[3, 739, 736, 269, 512, 780, 182, 519, 108, 640, 305, 654, 804, 623, 203, 156, 382, 806, 165, 552, 543, 0, 613, 331, 500, 19, 114, 371, 314, 245, 59, 246, 580, 80, 87, 749, 497, 70, 545, 128, 131, 675, 486, 562, 169, 271, 540, 621, 433, 216,

In [None]:
world_random_subset = world_train_data.select(world_random_indices)
sports_random_subset = sports_train_data.select(sports_random_indices)
business_random_subset = business_train_data.select(business_random_indices)
tech_random_subset = tech_train_data.select(tech_random_indices)

# Create the subset to use for contrastive self-supervised learning
random_subset_data = concatenate_datasets([world_random_subset, sports_random_subset, business_random_subset, tech_random_subset])

In [None]:
del world_random_subset
del sports_random_subset
del business_random_subset
del tech_random_subset
del world_random_indices
del sports_random_indices
del business_random_indices
del tech_random_indices

gc.collect()

15

In [None]:
print(random_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


## Multi-lingual SAS

In [None]:
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')
model = AutoModel.from_pretrained('prajjwal1/bert-small')

spanish_tokenizer = AutoTokenizer.from_pretrained('dccuchile/albert-tiny-spanish')
spanish_model = AutoModel.from_pretrained('dccuchile/albert-tiny-spanish')

def bert_embeddings(data):
    """Embed texts with the globally loaded `tokenizer` / `model` pair.

    Args:
        data: the texts to embed (callers pass a dataset's text column).

    Returns:
        Tensor holding, for each text, the last hidden state of its first
        ([CLS]) token.
    """
    inputs = tokenizer(data, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Inference only: skip building the autograd graph, which would otherwise
    # keep the intermediate activations of the whole input alive.
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the first token ([CLS]) embeddings from each sentence
    return outputs.last_hidden_state[:,0,:].detach()

def albert_embeddings(data):
    """Embed texts with the globally loaded `spanish_tokenizer` / `spanish_model` pair.

    Args:
        data: the texts to embed (callers pass the 'translation' column).

    Returns:
        Tensor holding, for each text, the last hidden state of its first
        ([CLS]) token.
    """
    inputs = spanish_tokenizer(data, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    # Inference only: skip building the autograd graph, which would otherwise
    # keep the intermediate activations of the whole input alive.
    with torch.no_grad():
        outputs = spanish_model(**inputs)
    # Take the first token ([CLS]) embeddings from each sentence
    return outputs.last_hidden_state[:,0,:].detach()

def sas_with_spanish_algorithm(data, subset_size):
    """Greedily select a representative subset using English and Spanish similarities.

    The pairwise cosine similarities of the `bert_embeddings` of `data['text']`
    and of the `albert_embeddings` of `data['translation']` are added together.
    Exemplars are then picked one at a time: at each step the chosen candidate
    is the one that maximises the sum, over all examples, of each example's
    (combined) similarity to its most similar selected exemplar.

    Args:
        data: dataset exposing 'text' and 'translation' columns.
        subset_size: number of exemplars to select; values larger than the
            number of rows are clamped to that number.

    Returns:
        List of the selected row indices into `data` (order not meaningful).
    """
    # Get embeddings for the text data
    embeddings = bert_embeddings(data['text'])
    n = len(embeddings)
    print(embeddings.shape)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings)

    del(embeddings)
    gc.collect()

    # Get Spanish embeddings
    spanish_embeddings = albert_embeddings(data['translation'])
    print(spanish_embeddings.shape)

    # Add the Spanish similarities on top of the English ones
    similarity_matrix = np.add(similarity_matrix, cosine_similarity(spanish_embeddings))

    del(spanish_embeddings)
    gc.collect()

    # Never ask for more exemplars than exist; otherwise the candidate pool
    # runs dry and np.argmax fails on an empty sequence.
    subset_size = min(subset_size, n)

    # Initialize the subset and, per example, its best similarity to the subset
    S = set()
    S_similarities = np.full(n, -np.inf)

    for _ in tqdm(range(subset_size)):
        not_in_S = np.array([i for i in range(n) if i not in S])

        # Gains of all remaining candidates at once: row r holds
        # max(S_similarities, similarity_matrix[candidate r]), summed over examples.
        gains = np.maximum(S_similarities, similarity_matrix[not_in_S]).sum(axis=1)

        # Find the exemplar with the best gain
        best_exemplar = not_in_S[np.argmax(gains)]

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        S.add(best_exemplar)

    return list(S)

In [None]:
# Run the SAS algorithm to select a 20% subset from each latent class
subset_size = 0.2

world_multilingual_sas_indices = sas_with_spanish_algorithm(world_train_data, int(subset_size * len(world_train_data)))
sports_multilingual_sas_indices = sas_with_spanish_algorithm(sports_train_data, int(subset_size * len(sports_train_data)))
business_multilingual_sas_indices = sas_with_spanish_algorithm(business_train_data, int(subset_size * len(business_train_data)))
tech_multilingual_sas_indices = sas_with_spanish_algorithm(tech_train_data, int(subset_size * len(tech_train_data)))

torch.Size([780, 512])
torch.Size([780, 312])


  0%|          | 0/156 [00:00<?, ?it/s]

torch.Size([812, 512])
torch.Size([812, 312])


  0%|          | 0/162 [00:00<?, ?it/s]

torch.Size([372, 512])
torch.Size([372, 312])


  0%|          | 0/74 [00:00<?, ?it/s]

torch.Size([1036, 512])
torch.Size([1036, 312])


  0%|          | 0/207 [00:00<?, ?it/s]

In [None]:
world_multilingual_sas_subset = world_train_data.select(world_multilingual_sas_indices)
sports_multilingual_sas_subset = sports_train_data.select(sports_multilingual_sas_indices)
business_multilingual_sas_subset = business_train_data.select(business_multilingual_sas_indices)
tech_multilingual_sas_subset = tech_train_data.select(tech_multilingual_sas_indices)

# Create the subset to use for contrastive self-supervised learning
multilingual_sas_subset_data = concatenate_datasets([world_multilingual_sas_subset, sports_multilingual_sas_subset, business_multilingual_sas_subset, tech_multilingual_sas_subset])

In [None]:
del world_multilingual_sas_subset
del sports_multilingual_sas_subset
del business_multilingual_sas_subset
del tech_multilingual_sas_subset
del world_multilingual_sas_indices
del sports_multilingual_sas_indices
del business_multilingual_sas_indices
del tech_multilingual_sas_indices
del tokenizer
del model
del spanish_tokenizer
del spanish_model

gc.collect()

10

In [None]:
print(multilingual_sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


# Contrastive learning pipeline

Now that we have selected a 20% representative subset of the data in three different ways, we use these subsets, along with two different forms of textual augmentation:

1. Easy data augmentation such as synonym replacement, random addition and random swapping
2. GPT for paraphrasing sentences

to train a self-supervised contrastive learning encoder, which will be evaluated in a downstream prediction task.

In [None]:
# Define the encoder model. We will use pre-trained BERT as the initial embedding
# and use contrastive learning to further train embeddings that can be tested
# downstream, specifically in news article domain.
model_name = 'prajjwal1/bert-small'

class Encoder(nn.Module):
    """Text encoder wrapping the pre-trained model named by `model_name`.

    `forward` takes raw text (a string or list of strings), tokenizes it and
    returns the last hidden state of the first ([CLS]) token of each input.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, text):
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        # Send the inputs to whichever device the wrapped model currently lives
        # on, so the encoder also works when it has not been moved to the
        # notebook-wide `device`.
        target_device = self.model.device
        inputs = {k: v.to(target_device) for k, v in inputs.items()}
        outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :] # Use the [CLS] token as the embedding

# Define the contrastive (InfoNCE) loss
class ContrastiveLoss(nn.Module):
    """Contrastive loss over one positive and one negative per anchor.

    For anchor `z_i`, positive `z_j` and negative `z_k` (row-aligned batches)
    the per-row loss is -log(exp(cos(z_i, z_j)) / (exp(cos(z_i, z_j)) + exp(cos(z_i, z_k))));
    the batch mean is returned.
    """

    def forward(self, z_i, z_j, z_k):
        # Row-wise cosine similarity of the anchor with its positive / negative
        sim_to_positive = torch.nn.functional.cosine_similarity(z_i, z_j, dim=1, eps=1e-6)
        sim_to_negative = torch.nn.functional.cosine_similarity(z_i, z_k, dim=1, eps=1e-6)

        positive_similarity = sim_to_positive.exp()
        negative_similarity = sim_to_negative.exp()

        # Share of the exponentiated similarity mass that goes to the positive
        positive_share = positive_similarity / (positive_similarity + negative_similarity)
        per_example_loss = -positive_share.log()

        return per_example_loss.mean()

In [None]:
# Helper function to randomly move around indices to create a list for negative examples
def shuffle_without_duplication(arr):
    """Return the elements of `arr` permuted so that none keeps its position.

    Permutations are drawn repeatedly until one moves every element.

    Args:
        arr: sequence of items to permute.

    Returns:
        A new list with the same items in a position-changing order. Inputs
        with fewer than two elements are returned unchanged, because no such
        permutation exists for them.
    """
    # NOTE: NumPy's global RNG is re-seeded on every call, so inputs of the
    # same length always receive the same permutation.
    np.random.seed(global_seed)
    arr = np.array(arr)
    n = len(arr)
    # A single element can only map to itself; bail out instead of retrying
    # forever in the loop below.
    if n < 2:
        return arr.tolist()
    original_positions = np.arange(n)
    # Create an array of the same shape filled with the indices
    indices = np.arange(n)
    while True:
        # Shuffle the indices
        np.random.shuffle(indices)
        # Check if no element remains in its original position
        if not np.any(indices == original_positions):
            break
    # Return the shuffled array
    return arr[indices].tolist()

In [None]:
# Training function for self-supervised contrastive learning
def self_supervised_training(data, num_epochs, augment='eda'):
    """Train a fresh `Encoder` with the contrastive loss on stored augmentations.

    Args:
        data: dataset exposing 'text' plus the augmentation columns
            ('augment_1'/'augment_2' or 'gpt_1'/'gpt_2').
        num_epochs: number of passes over `data`.
        augment: 'eda' to use the 'augment_*' columns; any other value uses
            the 'gpt_*' columns.

    Returns:
        The trained `Encoder`.
    """
    # Parameters
    batch_size = 32
    learning_rate = 1e-6

    # The incoming subsets are concatenated per latent class, so shuffle first
    data = data.shuffle(seed = global_seed)
    texts = data['text']

    # Pick the pair of columns that serve as the two views of each example
    if augment == 'eda':
        view_columns = ('augment_1', 'augment_2')
    else:
        view_columns = ('gpt_1', 'gpt_2')
    existing_augments_1 = data[view_columns[0]]
    existing_augments_2 = data[view_columns[1]]

    encoder = Encoder().to(device)
    loss_fn = ContrastiveLoss()
    optimizer = optim.Adam(encoder.parameters(), lr = learning_rate)

    # Training loop
    encoder.train()
    for epoch in range(num_epochs):
        for i in range(0, len(texts), batch_size):
            batch_end = i + batch_size

            # Clean both views of the current batch
            anchors = [preprocess(text) for text in existing_augments_1[i:batch_end]]
            positives = [preprocess(text) for text in existing_augments_2[i:batch_end]]
            # Negatives are the anchors re-ordered so that no example is
            # paired with itself
            negatives = shuffle_without_duplication(anchors)

            # Get the embeddings from the encoder
            z_i = encoder(anchors)
            z_j = encoder(positives)
            z_k = encoder(negatives)

            # Compute the contrastive loss
            loss = loss_fn(z_i, z_j, z_k)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print every 20 steps
            if (i // batch_size) % 20 == 0:
                print(f'Epoch {epoch}, Step {i // batch_size}, Loss: {loss.item()}')

    return encoder

# Evaluation pipeline

Through contrastive learning, we have learned an encoder $f$ that can embed our news article text for better downstream performance. We test by using this encoder $f$ and the true labels in training a linear classifier head, and testing on the test dataset. Here, we are trying to compare the efficacy of the encoder $f$ for the three different ways of selecting a subset. Additionally, we need to compare with using the full data, which will be left as a future direction due to computational issues.

In [None]:
# Define the linear classifier head
class LinearClassifier(nn.Module):
    """Single fully connected layer that maps an embedding to class logits."""

    def __init__(self, hidden_size, num_classes):
        """Create the layer with `hidden_size` inputs and `num_classes` outputs."""
        super(LinearClassifier, self).__init__()
        # The attribute name `linear` determines the state_dict keys
        # ("linear.weight", "linear.bias"), so it is kept as-is.
        self.linear = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return the raw (un-normalized) logits for the embeddings `x`."""
        logits = self.linear(x)
        return logits

In [None]:
# Create dataloaders from the train and test data
def create_dataloaders(train_data, test_data, batch_size):
    """Build train / validation / test DataLoaders of (text, label) pairs.

    `train_data` is split 80/20 into a training and a validation part,
    stratified on the "label" column and seeded with `global_seed`. Every
    text is run through `preprocess` and zipped with its label taken from a
    label tensor. Only the training loader shuffles its data.

    Returns:
        A tuple (train_dataloader, val_dataloader, test_dataloader).
    """
    split = train_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)
    partitions = (split['train'], split['test'], test_data)

    # Clean the texts of every partition first, then build the label tensors.
    cleaned = [[preprocess(text) for text in part['text']] for part in partitions]
    targets = [torch.tensor(part['label']) for part in partitions]

    # Materialize each partition as a list of (text, label) pairs.
    train_pairs, val_pairs, test_pairs = (
        list(zip(texts, labels)) for texts, labels in zip(cleaned, targets)
    )

    train_dataloader = DataLoader(train_pairs, batch_size=batch_size, shuffle=True)
    val_dataloader, test_dataloader = (
        DataLoader(pairs, batch_size=batch_size) for pairs in (val_pairs, test_pairs)
    )

    return train_dataloader, val_dataloader, test_dataloader

# Evaluation function to train a linear classifier head on top of the learned encoder
# and evaluate on the test set
def evaluate_encoder(encoder, train_data, test_data, batch_size=64, num_epochs=5,
                     num_classes=4, learning_rate=1e-4):
    """Linear-probe evaluation of an encoder.

    The encoder is switched to eval mode and all of its parameters are frozen
    (`requires_grad = False`; this is not undone afterwards). A
    `LinearClassifier` head is trained on top of it with Adam and
    cross-entropy. The head with the lowest validation loss is checkpointed
    to `./best_checkpoint.pth`, reloaded, and its accuracy on `test_data` is
    printed.

    Args:
        encoder: Module called as `encoder(texts)` to obtain embeddings; must
            expose `encoder.model.config.hidden_size`.
        train_data: Dataset handed to `create_dataloaders`, which splits it
            into training and validation parts.
        test_data: Held-out dataset used only for the final accuracy.
        batch_size: Batch size of all three dataloaders.
        num_epochs: Number of passes over the training part.
        num_classes: Output dimension of the classifier head.
        learning_rate: Adam learning rate for the classifier head.

    Returns:
        None. Progress, losses and the final accuracy are printed.
    """
    # Create dataloaders
    train_dataloader, val_dataloader, test_dataloader = create_dataloaders(train_data, test_data, batch_size)

    # Set the encoder to evaluation mode and freeze all layers to test the trained embeddings from contrastive learning
    encoder.eval()
    for param in encoder.parameters():
        param.requires_grad = False

    # Initialize the linear classifier head
    classifier = LinearClassifier(encoder.model.config.hidden_size, num_classes).to(device)

    # Loss function and optimizer for the classifier head
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters(), lr = learning_rate)

    best_val_loss = float('inf')
    checkpoint_path = './best_checkpoint.pth'
    # Tracks whether THIS run wrote the checkpoint, so that a stale file left
    # behind by an earlier run is never loaded by mistake.
    checkpoint_saved = False

    # Train the classifier
    for epoch in range(num_epochs):
        classifier.train()
        train_loss = 0.0

        for index, (texts, labels) in enumerate(train_dataloader):
            labels = labels.to(device)
            optimizer.zero_grad()

            # Forward pass through the frozen encoder and classifier head.
            # The encoder is frozen, so no autograd graph is needed for it.
            with torch.no_grad():
                embeddings = encoder(texts)
            outputs = classifier(embeddings)
            loss = criterion(outputs, labels)

            # Backpropagation
            loss.backward()
            optimizer.step()
            # `criterion` returns the mean over the batch; weight it by the
            # batch size so that dividing by the dataset size below yields a
            # true per-sample average (the last batch may be smaller).
            train_loss += loss.item() * labels.size(0)

            # Print every 50 steps
            if index % 50 == 0:
                print(f'Epoch {epoch}, Step {index}, Loss: {loss.item()}')

        classifier.eval()
        val_loss = 0.0
        with torch.no_grad():
            for texts, labels in val_dataloader:
                labels = labels.to(device)

                embeddings = encoder(texts)
                outputs = classifier(embeddings)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * labels.size(0)

        # Per-sample average losses (guard against empty splits)
        train_loss /= max(len(train_dataloader.dataset), 1)
        val_loss /= max(len(val_dataloader.dataset), 1)

        # Print stats
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(classifier.state_dict(), checkpoint_path)
            checkpoint_saved = True

    # Evaluate the classifier, restoring the best head if one was saved in this run
    if checkpoint_saved:
        classifier.load_state_dict(torch.load(checkpoint_path))
    classifier.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for texts, labels in test_dataloader:
            labels = labels.to(device)

            embeddings = encoder(texts)
            outputs = classifier(embeddings)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    # An empty test set has no defined accuracy; report NaN instead of crashing.
    accuracy = correct / total if total > 0 else float('nan')
    print(f'Accuracy on the test set: {accuracy:.4f}')

# Testing different subset selection processes + augmentation techniques

## Subset selection using SAS

### GPT

In [None]:
# Contrastive pre-training on the SAS-selected subset. The 'gpt' argument
# presumably selects the GPT-generated augmentations - confirm against
# self_supervised_training's signature.
encoder = self_supervised_training(sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5600748658180237
Epoch 1, Step 0, Loss: 0.5378324389457703
Epoch 2, Step 0, Loss: 0.5312172770500183
Epoch 3, Step 0, Loss: 0.5124976634979248
Epoch 4, Step 0, Loss: 0.5055180788040161
Epoch 5, Step 0, Loss: 0.4923444390296936
Epoch 6, Step 0, Loss: 0.490602970123291
Epoch 7, Step 0, Loss: 0.4779042601585388
Epoch 8, Step 0, Loss: 0.46538668870925903
Epoch 9, Step 0, Loss: 0.46234631538391113
Epoch 10, Step 0, Loss: 0.4579157531261444
Epoch 11, Step 0, Loss: 0.4555138945579529
Epoch 12, Step 0, Loss: 0.44756725430488586
Epoch 13, Step 0, Loss: 0.441618412733078
Epoch 14, Step 0, Loss: 0.4390103816986084
Epoch 15, Step 0, Loss: 0.433265745639801
Epoch 16, Step 0, Loss: 0.4344601333141327
Epoch 17, Step 0, Loss: 0.426557719707489
Epoch 18, Step 0, Loss: 0.42496681213378906
Epoch 19, Step 0, Loss: 0.4232085347175598
Epoch 20, Step 0, Loss: 0.4209192991256714
Epoch 21, Step 0, Loss: 0.4107571840286255
Epoch 22, Step 0, Loss: 0.41217851638793945
Epoch 23, Step 0, Lo

In [None]:
# Linear-probe the encoder trained in the cell above: evaluate_encoder freezes
# it, fits a linear head on train_data and prints the test accuracy.
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.0927131175994873
Epoch 1, Train Loss: 0.0149, Val Loss: 0.0128
Epoch 1, Step 0, Loss: 0.7969875931739807
Epoch 2, Train Loss: 0.0107, Val Loss: 0.0099
Epoch 2, Step 0, Loss: 0.5671672821044922
Epoch 3, Train Loss: 0.0087, Val Loss: 0.0085
Epoch 3, Step 0, Loss: 0.46570172905921936
Epoch 4, Train Loss: 0.0077, Val Loss: 0.0077
Epoch 4, Step 0, Loss: 0.47760969400405884
Epoch 5, Train Loss: 0.0071, Val Loss: 0.0072
Accuracy on the test set: 0.8499


### EDA

In [None]:
# Contrastive pre-training on the SAS-selected subset. The 'eda' argument
# presumably selects the EDA augmentations - confirm against
# self_supervised_training's signature.
encoder = self_supervised_training(sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5562006235122681
Epoch 1, Step 0, Loss: 0.5384521484375
Epoch 2, Step 0, Loss: 0.5296761989593506
Epoch 3, Step 0, Loss: 0.5178812742233276
Epoch 4, Step 0, Loss: 0.5067201852798462
Epoch 5, Step 0, Loss: 0.4917745888233185
Epoch 6, Step 0, Loss: 0.48616623878479004
Epoch 7, Step 0, Loss: 0.4759005904197693
Epoch 8, Step 0, Loss: 0.4667072296142578
Epoch 9, Step 0, Loss: 0.46327000856399536
Epoch 10, Step 0, Loss: 0.45665836334228516
Epoch 11, Step 0, Loss: 0.4491155445575714
Epoch 12, Step 0, Loss: 0.4462629556655884
Epoch 13, Step 0, Loss: 0.4418158531188965
Epoch 14, Step 0, Loss: 0.43808794021606445
Epoch 15, Step 0, Loss: 0.43351107835769653
Epoch 16, Step 0, Loss: 0.4299006462097168
Epoch 17, Step 0, Loss: 0.42780086398124695
Epoch 18, Step 0, Loss: 0.4193771481513977
Epoch 19, Step 0, Loss: 0.4192868173122406
Epoch 20, Step 0, Loss: 0.41830554604530334
Epoch 21, Step 0, Loss: 0.41315731406211853
Epoch 22, Step 0, Loss: 0.40581172704696655
Epoch 23, Step 

In [None]:
# Linear-probe the encoder trained in the cell above (frozen encoder, linear
# head fitted on train_data, accuracy printed for test_data).
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.5847781896591187
Epoch 1, Train Loss: 0.0208, Val Loss: 0.0181
Epoch 1, Step 0, Loss: 0.995040774345398
Epoch 2, Train Loss: 0.0150, Val Loss: 0.0137
Epoch 2, Step 0, Loss: 0.8486573696136475
Epoch 3, Train Loss: 0.0118, Val Loss: 0.0114
Epoch 3, Step 0, Loss: 0.6549249291419983
Epoch 4, Train Loss: 0.0102, Val Loss: 0.0100
Epoch 4, Step 0, Loss: 0.5393980145454407
Epoch 5, Train Loss: 0.0091, Val Loss: 0.0092
Accuracy on the test set: 0.8221


## Random subset selection

### GPT

In [None]:
# Contrastive pre-training on the randomly selected subset. The 'gpt' argument
# presumably selects the GPT-generated augmentations - confirm against
# self_supervised_training's signature.
encoder = self_supervised_training(random_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5777820348739624
Epoch 1, Step 0, Loss: 0.5547431111335754
Epoch 2, Step 0, Loss: 0.5521800518035889
Epoch 3, Step 0, Loss: 0.5420279502868652
Epoch 4, Step 0, Loss: 0.5276839733123779
Epoch 5, Step 0, Loss: 0.513462483882904
Epoch 6, Step 0, Loss: 0.5103956460952759
Epoch 7, Step 0, Loss: 0.4980087876319885
Epoch 8, Step 0, Loss: 0.4907521903514862
Epoch 9, Step 0, Loss: 0.484147310256958
Epoch 10, Step 0, Loss: 0.4772396981716156
Epoch 11, Step 0, Loss: 0.4727325439453125
Epoch 12, Step 0, Loss: 0.4686007797718048
Epoch 13, Step 0, Loss: 0.45894578099250793
Epoch 14, Step 0, Loss: 0.45758306980133057
Epoch 15, Step 0, Loss: 0.45225387811660767
Epoch 16, Step 0, Loss: 0.44931143522262573
Epoch 17, Step 0, Loss: 0.44230228662490845
Epoch 18, Step 0, Loss: 0.4457394480705261
Epoch 19, Step 0, Loss: 0.4389830231666565
Epoch 20, Step 0, Loss: 0.43539994955062866
Epoch 21, Step 0, Loss: 0.4327471852302551
Epoch 22, Step 0, Loss: 0.4338375926017761
Epoch 23, Step 0,

In [None]:
# Linear-probe the encoder trained in the cell above (frozen encoder, linear
# head fitted on train_data, accuracy printed for test_data).
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4105051755905151
Epoch 1, Train Loss: 0.0197, Val Loss: 0.0167
Epoch 1, Step 0, Loss: 1.0025835037231445
Epoch 2, Train Loss: 0.0137, Val Loss: 0.0122
Epoch 2, Step 0, Loss: 0.7725619673728943
Epoch 3, Train Loss: 0.0104, Val Loss: 0.0099
Epoch 3, Step 0, Loss: 0.5070099830627441
Epoch 4, Train Loss: 0.0087, Val Loss: 0.0086
Epoch 4, Step 0, Loss: 0.4345132112503052
Epoch 5, Train Loss: 0.0078, Val Loss: 0.0079
Accuracy on the test set: 0.8478


### EDA

In [None]:
# Contrastive pre-training on the randomly selected subset. The 'eda' argument
# presumably selects the EDA augmentations - confirm against
# self_supervised_training's signature.
encoder = self_supervised_training(random_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5745919346809387
Epoch 1, Step 0, Loss: 0.558458149433136
Epoch 2, Step 0, Loss: 0.5469321012496948
Epoch 3, Step 0, Loss: 0.535645604133606
Epoch 4, Step 0, Loss: 0.5263427495956421
Epoch 5, Step 0, Loss: 0.5219748616218567
Epoch 6, Step 0, Loss: 0.5112001299858093
Epoch 7, Step 0, Loss: 0.5054078698158264
Epoch 8, Step 0, Loss: 0.4921482801437378
Epoch 9, Step 0, Loss: 0.48368802666664124
Epoch 10, Step 0, Loss: 0.48155471682548523
Epoch 11, Step 0, Loss: 0.4779320955276489
Epoch 12, Step 0, Loss: 0.46922218799591064
Epoch 13, Step 0, Loss: 0.4646039307117462
Epoch 14, Step 0, Loss: 0.4649791717529297
Epoch 15, Step 0, Loss: 0.4607924520969391
Epoch 16, Step 0, Loss: 0.45970070362091064
Epoch 17, Step 0, Loss: 0.452333003282547
Epoch 18, Step 0, Loss: 0.4506819248199463
Epoch 19, Step 0, Loss: 0.45144712924957275
Epoch 20, Step 0, Loss: 0.44549599289894104
Epoch 21, Step 0, Loss: 0.44364893436431885
Epoch 22, Step 0, Loss: 0.44428542256355286
Epoch 23, Step 0

In [None]:
# Linear-probe the encoder trained in the cell above (frozen encoder, linear
# head fitted on train_data, accuracy printed for test_data).
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.507509469985962
Epoch 1, Train Loss: 0.0207, Val Loss: 0.0178
Epoch 1, Step 0, Loss: 1.0663799047470093
Epoch 2, Train Loss: 0.0149, Val Loss: 0.0133
Epoch 2, Step 0, Loss: 0.8383258581161499
Epoch 3, Train Loss: 0.0116, Val Loss: 0.0109
Epoch 3, Step 0, Loss: 0.5757902264595032
Epoch 4, Train Loss: 0.0098, Val Loss: 0.0096
Epoch 4, Step 0, Loss: 0.670987069606781
Epoch 5, Train Loss: 0.0088, Val Loss: 0.0087
Accuracy on the test set: 0.8263


## Subset selection using multilingual SAS

### GPT

In [None]:
# Contrastive pre-training on the multilingual-SAS subset. The 'gpt' argument
# presumably selects the GPT-generated augmentations - confirm against
# self_supervised_training's signature.
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5604792833328247
Epoch 1, Step 0, Loss: 0.5525549650192261
Epoch 2, Step 0, Loss: 0.5392814874649048
Epoch 3, Step 0, Loss: 0.524996817111969
Epoch 4, Step 0, Loss: 0.5146453380584717
Epoch 5, Step 0, Loss: 0.5088536143302917
Epoch 6, Step 0, Loss: 0.5030300617218018
Epoch 7, Step 0, Loss: 0.4921185374259949
Epoch 8, Step 0, Loss: 0.4853336811065674
Epoch 9, Step 0, Loss: 0.4768053889274597
Epoch 10, Step 0, Loss: 0.46909672021865845
Epoch 11, Step 0, Loss: 0.4644984304904938
Epoch 12, Step 0, Loss: 0.45974934101104736
Epoch 13, Step 0, Loss: 0.4609379768371582
Epoch 14, Step 0, Loss: 0.45077207684516907
Epoch 15, Step 0, Loss: 0.444475919008255
Epoch 16, Step 0, Loss: 0.4481048583984375
Epoch 17, Step 0, Loss: 0.44843780994415283
Epoch 18, Step 0, Loss: 0.4382556080818176
Epoch 19, Step 0, Loss: 0.4391845166683197
Epoch 20, Step 0, Loss: 0.4403272867202759
Epoch 21, Step 0, Loss: 0.4341546297073364
Epoch 22, Step 0, Loss: 0.43100103735923767
Epoch 23, Step 0, 

In [None]:
# Linear-probe the encoder trained in the cell above (frozen encoder, linear
# head fitted on train_data, accuracy printed for test_data).
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4955085515975952
Epoch 1, Train Loss: 0.0202, Val Loss: 0.0174
Epoch 1, Step 0, Loss: 1.0091978311538696
Epoch 2, Train Loss: 0.0141, Val Loss: 0.0127
Epoch 2, Step 0, Loss: 0.8569031357765198
Epoch 3, Train Loss: 0.0109, Val Loss: 0.0103
Epoch 3, Step 0, Loss: 0.517432451248169
Epoch 4, Train Loss: 0.0091, Val Loss: 0.0089
Epoch 4, Step 0, Loss: 0.5428343415260315
Epoch 5, Train Loss: 0.0081, Val Loss: 0.0081
Accuracy on the test set: 0.8508


### EDA

In [None]:
# Contrastive pre-training on the multilingual-SAS subset. The 'eda' argument
# presumably selects the EDA augmentations - confirm against
# self_supervised_training's signature.
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5690811276435852
Epoch 1, Step 0, Loss: 0.5566614866256714
Epoch 2, Step 0, Loss: 0.543766975402832
Epoch 3, Step 0, Loss: 0.5262359976768494
Epoch 4, Step 0, Loss: 0.5199500918388367
Epoch 5, Step 0, Loss: 0.512161135673523
Epoch 6, Step 0, Loss: 0.5060296654701233
Epoch 7, Step 0, Loss: 0.4975472688674927
Epoch 8, Step 0, Loss: 0.48652857542037964
Epoch 9, Step 0, Loss: 0.4818059206008911
Epoch 10, Step 0, Loss: 0.48214417695999146
Epoch 11, Step 0, Loss: 0.4709904193878174
Epoch 12, Step 0, Loss: 0.46362996101379395
Epoch 13, Step 0, Loss: 0.46156322956085205
Epoch 14, Step 0, Loss: 0.45478999614715576
Epoch 15, Step 0, Loss: 0.4518885612487793
Epoch 16, Step 0, Loss: 0.45335108041763306
Epoch 17, Step 0, Loss: 0.44403108954429626
Epoch 18, Step 0, Loss: 0.44454753398895264
Epoch 19, Step 0, Loss: 0.44807687401771545
Epoch 20, Step 0, Loss: 0.4395509362220764
Epoch 21, Step 0, Loss: 0.43824949860572815
Epoch 22, Step 0, Loss: 0.4354933798313141
Epoch 23, Ste

In [None]:
# Linear-probe the encoder trained in the cell above (frozen encoder, linear
# head fitted on train_data, accuracy printed for test_data).
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4021613597869873
Epoch 1, Train Loss: 0.0188, Val Loss: 0.0167
Epoch 1, Step 0, Loss: 0.994870662689209
Epoch 2, Train Loss: 0.0134, Val Loss: 0.0127
Epoch 2, Step 0, Loss: 0.8064404129981995
Epoch 3, Train Loss: 0.0107, Val Loss: 0.0106
Epoch 3, Step 0, Loss: 0.5686622262001038
Epoch 4, Train Loss: 0.0092, Val Loss: 0.0095
Epoch 4, Step 0, Loss: 0.5793561339378357
Epoch 5, Train Loss: 0.0083, Val Loss: 0.0088
Accuracy on the test set: 0.8316


## Baseline with full data

### GPT

In [None]:
# Baseline: contrastive pre-training on all of train_data (no subset
# selection). The 'gpt' argument presumably selects the GPT-generated
# augmentations - confirm against self_supervised_training's signature.
encoder = self_supervised_training(train_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5612486600875854
Epoch 0, Step 20, Loss: 0.5575088262557983
Epoch 0, Step 40, Loss: 0.5431331396102905
Epoch 0, Step 60, Loss: 0.5322815179824829
Epoch 0, Step 80, Loss: 0.5085015296936035
Epoch 1, Step 0, Loss: 0.5128942728042603
Epoch 1, Step 20, Loss: 0.49806106090545654
Epoch 1, Step 40, Loss: 0.49045172333717346
Epoch 1, Step 60, Loss: 0.4946742355823517
Epoch 1, Step 80, Loss: 0.46657395362854004
Epoch 2, Step 0, Loss: 0.47391512989997864
Epoch 2, Step 20, Loss: 0.46016156673431396
Epoch 2, Step 40, Loss: 0.46043696999549866
Epoch 2, Step 60, Loss: 0.4631285071372986
Epoch 2, Step 80, Loss: 0.4402851462364197
Epoch 3, Step 0, Loss: 0.4468011260032654
Epoch 3, Step 20, Loss: 0.43825122714042664
Epoch 3, Step 40, Loss: 0.4443293809890747
Epoch 3, Step 60, Loss: 0.4470658302307129
Epoch 3, Step 80, Loss: 0.4253312349319458
Epoch 4, Step 0, Loss: 0.42991697788238525
Epoch 4, Step 20, Loss: 0.42591428756713867
Epoch 4, Step 40, Loss: 0.4336296021938324
Epoch 4

In [None]:
# Linear-probe the baseline encoder trained in the cell above (frozen encoder,
# linear head fitted on train_data, accuracy printed for test_data).
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.4588570594787598
Epoch 1, Train Loss: 0.0199, Val Loss: 0.0173
Epoch 1, Step 0, Loss: 1.0806512832641602
Epoch 2, Train Loss: 0.0147, Val Loss: 0.0136
Epoch 2, Step 0, Loss: 0.8538839817047119
Epoch 3, Train Loss: 0.0122, Val Loss: 0.0119
Epoch 3, Step 0, Loss: 0.6252935528755188
Epoch 4, Train Loss: 0.0109, Val Loss: 0.0109
Epoch 4, Step 0, Loss: 0.749545693397522
Epoch 5, Train Loss: 0.0101, Val Loss: 0.0103
Accuracy on the test set: 0.7941


### EDA

In [None]:
# Baseline: contrastive pre-training on all of train_data (no subset
# selection). The 'eda' argument presumably selects the EDA augmentations -
# confirm against self_supervised_training's signature.
encoder = self_supervised_training(train_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5731405019760132
Epoch 0, Step 20, Loss: 0.5587068796157837
Epoch 0, Step 40, Loss: 0.5431428551673889
Epoch 0, Step 60, Loss: 0.5234215259552002
Epoch 0, Step 80, Loss: 0.5087127685546875
Epoch 1, Step 0, Loss: 0.507163405418396
Epoch 1, Step 20, Loss: 0.5020095705986023
Epoch 1, Step 40, Loss: 0.496784508228302
Epoch 1, Step 60, Loss: 0.4808645248413086
Epoch 1, Step 80, Loss: 0.4672463536262512
Epoch 2, Step 0, Loss: 0.4715239107608795
Epoch 2, Step 20, Loss: 0.4677581191062927
Epoch 2, Step 40, Loss: 0.461266428232193
Epoch 2, Step 60, Loss: 0.45377853512763977
Epoch 2, Step 80, Loss: 0.4395783841609955
Epoch 3, Step 0, Loss: 0.4506603479385376
Epoch 3, Step 20, Loss: 0.448574423789978
Epoch 3, Step 40, Loss: 0.4431917071342468
Epoch 3, Step 60, Loss: 0.4392007887363434
Epoch 3, Step 80, Loss: 0.42451316118240356
Epoch 4, Step 0, Loss: 0.440254807472229
Epoch 4, Step 20, Loss: 0.432794451713562
Epoch 4, Step 40, Loss: 0.4352969527244568
Epoch 4, Step 60, Lo

In [None]:
# Linear-probe the baseline encoder trained in the cell above (frozen encoder,
# linear head fitted on train_data, accuracy printed for test_data).
# NOTE: `encoder` is rebound in every section, so run this right after its training cell.
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.1965320110321045
Epoch 1, Train Loss: 0.0170, Val Loss: 0.0155
Epoch 1, Step 0, Loss: 0.9454852342605591
Epoch 2, Train Loss: 0.0136, Val Loss: 0.0133
Epoch 2, Step 0, Loss: 0.8512250185012817
Epoch 3, Train Loss: 0.0120, Val Loss: 0.0120
Epoch 3, Step 0, Loss: 0.6426646113395691
Epoch 4, Train Loss: 0.0111, Val Loss: 0.0112
Epoch 4, Step 0, Loss: 0.7705690264701843
Epoch 5, Train Loss: 0.0103, Val Loss: 0.0106
Accuracy on the test set: 0.7625


# TODO: Further research to come.