In the following notebook, we investigate the use of SAS (subsets that maximize expected augmentation similarity) to select representative subsets that improve self-supervised learning in a text categorization task. We also investigate the use of multilingual embeddings to further strengthen the SAS selection.

# Setup

In [None]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
!pip install nlpaug
!pip install gensim

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-

In [None]:
import csv
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
import gensim.downloader as api
import nlpaug.augmenter.word as naw
import nltk
from nltk.corpus import wordnet
import numpy as np
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from tqdm.auto import tqdm
# Delete some data from RAM to free up space for later processes
import gc

# Ensure you have the necessary NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Import Google Drive for locally saved files
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Global variables

In [None]:
# Seed shared by every random number generator and dataset shuffle in the notebook
global_seed = 0
# Max length (in tokens) of the tokenized text passed to the models
max_length = 256
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set seed for reproducibility
np.random.seed(global_seed)
random.seed(global_seed)
# torch drives the random initialisation of the classification head and dropout
# during training, so it has to be seeded as well for runs to be repeatable.
torch.manual_seed(global_seed)

## Helper functions

In [None]:
# Basic preprocessing: lowercasing and removing non-alphanumeric characters
def preprocess(text):
    """Normalise raw text before tokenisation.

    The input is lowercased and every run of non-word characters (punctuation,
    whitespace, symbols) is collapsed into a single space. Underscores count as
    word characters and are kept. The result is not stripped, so a leading or
    trailing separator survives as one space.

    Args:
        text: Raw string to clean.

    Returns:
        str: The lowercased, separator-normalised text.
    """
    return re.sub(r'\W+', ' ', text.lower())

## Load dataset

In [None]:
# Load the AG News dataset
dataset = load_dataset("ag_news")

# The dataset is divided into 'train' and 'test' splits
train_data = dataset['train']
test_data = dataset['test']

# For the parts that use GPT, we will use only 2.5% of the data due to the cost of running GPT.
train_data = train_data.train_test_split(test_size=0.975, stratify_by_column="label", seed = global_seed)
train_data = train_data['train']

# Example: Viewing the first training sample
print(train_data[0])
print(len(train_data))

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0}
3000


## Create NLPAug augmented texts

In [None]:
# aug_1 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 5, # Only use top 5 options
#     aug_p = 0.2 # 20% of the words augmented
# )
#
# aug_2 = naw.WordEmbsAug(
#     model_type='word2vec',
#     model_path='/content/drive/MyDrive/COM SCI 260D/word2vec-google-news-300.bin',
#     top_k = 10, # Only use top 10 options
#     aug_p = 0.4 # 40% of the words augmented
# )
#
# def eda_augmentation_with_word2vec(sentence, type):
#     if type == 1:
#         return aug_1.augment(sentence)[0]
#     else:
#         return aug_2.augment(sentence)[0]

In [None]:
# Add augmented texts for each example using NLPAug
# def add_nlp_aug_columns(data):
#     data['augment_1'] = eda_augmentation_with_word2vec(data['text'], 1)
#     data['augment_2'] = eda_augmentation_with_word2vec(data['text'], 2)
#     return data

# train_data = train_data.map(
#     lambda example: add_nlp_aug_columns(example)
# )

In [None]:
# Save the augmentations as the process takes 2 hours
# train_df = train_data.to_pandas()
# train_df.to_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")

In [None]:
import datasets

# Load dataset with augmentations (pre-prepared)
train_df = pd.read_json("/content/drive/MyDrive/COM SCI 260D/augmented_dataset.json")
augmented_data = datasets.Dataset.from_pandas(train_df)

In [None]:
print(augmented_data)
print(augmented_data['text'][-1])
print(augmented_data['text'][-1])

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', '__index_level_0__'],
    num_rows: 3000
})
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.
Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. unveiled Monday a new memory-packed personal digital assistant that can double as a portable data storage drive.


In [None]:
# Merge datasets by columns
def add_augmented_columns(data, augments, idx):
    """Copy the two pre-computed NLPAug augmentations onto one example.

    Rows are matched purely by position: row ``idx`` of ``augments`` is assumed
    to belong to the example currently being mapped.

    Args:
        data: The example (mutable mapping) being processed by ``Dataset.map``.
        augments: Row-indexable collection (e.g. a ``datasets.Dataset``) whose
            rows expose ``'augment_1'`` and ``'augment_2'``.
        idx: Position of the example, supplied by ``map(..., with_indices=True)``.

    Returns:
        The same ``data`` mapping with ``'augment_1'`` and ``'augment_2'`` set.
    """
    # Fetch the single matching row once. Indexing by column name first would
    # materialise the entire column for every example, making the map quadratic.
    row = augments[idx]
    data['augment_1'] = row['augment_1']
    data['augment_2'] = row['augment_2']
    return data

train_data = train_data.map(
    lambda example, idx: add_augmented_columns(example, augmented_data, idx),
    with_indices=True
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

## Load GPT augmented text and Spanish translations

In [None]:
gpt_dataset = load_dataset("csv", data_files="/content/drive/MyDrive/COM SCI 260D/gpt_dataset.csv", split = 'train')

print(gpt_dataset)
print(gpt_dataset[0:5])
print(train_data[0:5])

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['Original Text', 'Translation', 'Paraphrase 1', 'Paraphrase 2'],
    num_rows: 3000
})


In [None]:
# Ensure both datasets have the same number of rows
assert len(gpt_dataset) == len(train_data), "Datasets must have the same number of rows"

# Merge datasets by columns
def add_gpt_columns(data, augments, idx):
    """Attach the Spanish translation and GPT paraphrases to one example.

    Rows are matched purely by position: row ``idx`` of ``augments`` is assumed
    to belong to the example currently being mapped.

    Args:
        data: The example (mutable mapping) being processed by ``Dataset.map``;
            must already contain ``'augment_1'`` for the fallback.
        augments: Row-indexable collection (e.g. a ``datasets.Dataset``) whose
            rows expose ``'Translation'``, ``'Paraphrase 1'`` and ``'Paraphrase 2'``.
        idx: Position of the example, supplied by ``map(..., with_indices=True)``.

    Returns:
        The same ``data`` mapping with ``'translation'``, ``'gpt_1'`` and
        ``'gpt_2'`` set.
    """
    # Fetch the single matching row once. Indexing by column name first would
    # materialise the entire column for every example, making the map quadratic.
    row = augments[idx]
    data['translation'] = row['Translation']
    data['gpt_1'] = row['Paraphrase 1']

    second_paraphrase = row['Paraphrase 2']
    if second_paraphrase and second_paraphrase.strip():
        data['gpt_2'] = second_paraphrase
    else:
        # Missing or whitespace-only paraphrase: fall back to EDA augmentation
        data['gpt_2'] = data['augment_1']
    return data

# Merge the GPT columns into the training data, matching rows by position
train_data = train_data.map(
    lambda example, idx: add_gpt_columns(example, gpt_dataset, idx),
    with_indices=True
)

print(train_data)
print(train_data[0])

# Some examples were reported as not being imported correctly. Collect every row whose
# GPT paraphrase is still None so it can be inspected.
# NOTE: this only builds `error_data`; nothing is removed from `train_data` here.
error_data = train_data.filter(lambda row: row['gpt_2'] is None or row['gpt_1'] is None)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 3000
})
{'text': '9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.', 'label': 0, 'augment_1': '9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.', 'augment_2': "2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.", 'translation': '9 heridos en explosión en la Embajada de Indonesia PARÍS -- Una explosión golpeó la Embajada de Indonesia en París hoy, hiriendo levemente a nueve personas, según informó una estación de radio francesa.', 'gpt_1': '1. The Indonesian Embassy

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
print(error_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2'],
    num_rows: 0
})


In [None]:
print(train_data['text'][0])
print(train_data['gpt_1'][0])
print(train_data['gpt_2'][0])
print(train_data['augment_1'][0])
print(train_data['augment_2'][0])
print("")
print(train_data['text'][-1])
print(train_data['gpt_1'][-1])
print(train_data['gpt_2'][-1])
print(train_data['augment_1'][-1])
print(train_data['augment_2'][-1])

9 hurt in blast at Indonesian Embassy PARIS -- An explosion struck the Indonesian Embassy in Paris today, slightly injuring nine people, a French radio station reported.
1. The Indonesian Embassy in Paris was struck by an explosion today, leading to minor injuries for nine individuals, as reported by a French radio station.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
9 hurt in blast at Indonesian Embassy PARIS - - An thunderous_explosion hit the Indonesian Embassy in Pantheon_Sorbonne today, slightly injure three people, a Paris radio station reports.
2 hurt in explosion Tuesdayat Indonesian Embassy LYON_France - - An explosion struck in Indonesian charge_d'_affaires in Paris today, tad wounding nine people, a Algerian radio Finsbury_Park_Tube reported.

Palm Introduces a Memory-Packed Organizer (AP) AP - Handheld computer maker PalmOne Inc. un

# Latent class discovery

First, we randomly select 1% of the data to serve as a small labelled set. We fine-tune a pre-trained classifier on this set and then use it to assign a latent class to every remaining training data point.

In [None]:
# Select 1% of the data randomly
shuffled_dataset = train_data.shuffle(seed = global_seed)
sample_size = int(0.01 * len(shuffled_dataset))
labeled_data = shuffled_dataset.select(range(sample_size))
rest_of_data = shuffled_dataset.select(range(sample_size, len(shuffled_dataset)))

# Get number of classes from the 1% data
unique_labels = set(labeled_data['label'])
num_classes = len(unique_labels)

print(num_classes)

4


In [None]:
# Split the 1% dataset into training and validation sets. We only use 20% as validation set due to size of data.
labeled_data_for_training = labeled_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)

In [None]:
# Pre-process and tokenize data
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-small')

def tokenize(examples):
    """Clean and tokenize a batch of examples for the latent-class classifier.

    Args:
        examples: Batch mapping with a ``'text'`` entry holding the raw strings.

    Returns:
        The tokenizer output for the preprocessed texts, padded/truncated to
        ``max_length`` tokens and returned as PyTorch tensors.
    """
    cleaned = list(map(preprocess, examples['text']))
    return tokenizer(
        cleaned,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

tokenized_dataset = labeled_data_for_training.map(tokenize, batched=True)

config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [None]:
# Import model for linear classification using the number of classes in 1% data.
model = AutoModelForSequenceClassification.from_pretrained('prajjwal1/bert-small', num_labels=num_classes)

pytorch_model.bin:   0%|          | 0.00/116M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Use the 1% labelled data to train a BERT model
latent_num_epochs = 20

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=latent_num_epochs,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=32,
    # Warm up over the first 10% of optimizer steps. A fixed warmup_steps=500 was
    # longer than the whole run (120 steps for 24 examples, batch size 4, 20 epochs),
    # so the learning rate never left the warm-up ramp.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['test'])

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.336391
2,1.409100,1.333906
3,1.409100,1.328684
4,1.384900,1.32195
5,1.374100,1.31249
6,1.374100,1.300743
7,1.316000,1.283679
8,1.316000,1.267876
9,1.286400,1.254886
10,1.196100,1.244909


TrainOutput(global_step=120, training_loss=1.1333855390548706, metrics={'train_runtime': 3.901, 'train_samples_per_second': 123.046, 'train_steps_per_second': 30.761, 'total_flos': 9492677591040.0, 'train_loss': 1.1333855390548706, 'epoch': 20.0})

In [None]:
# Save the model to a directory for local use
model.save_pretrained('./model')
tokenizer.save_pretrained('./tokenizer')

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [None]:
# Use the model to predict latent classes for rest of training data
def predict_batch(batch):
    """Predict a latent class index for every text in ``batch``.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw strings of the batch.

    Returns:
        dict: ``{'predictions': ndarray}`` holding, for each text in input order,
        the index of the largest model logit.
    """
    # Disable dropout explicitly instead of relying on whichever train/eval mode
    # the Trainer happened to leave the model in.
    model.eval()

    # Tokenize the examples
    preprocessed_texts = [preprocess(text) for text in batch['text']]
    inputs = tokenizer(preprocessed_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Perform the prediction without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class indices
    predictions = outputs.logits.argmax(-1).cpu().numpy()
    return {'predictions': predictions}

# Create a DataLoader for the rest of the data
rest_of_data_loader = DataLoader(rest_of_data, batch_size=32)

# Run predictions in batches
results = []
for batch in tqdm(rest_of_data_loader, desc="Predicting latent classes"):
    batch_results = predict_batch(batch)
    results.extend(batch_results['predictions'])

Predicting latent classes:   0%|          | 0/93 [00:00<?, ?it/s]

In [None]:
# Save the predicted classes for local use
file_name = './latent_classes.csv'

with open(file_name, 'w', newline='') as file:
    writer = csv.writer(file)

    for integer in results:
        writer.writerow([integer])

In [None]:
# Add the latent class to the dataset
def add_new_column(example, idx, new_data):
    """Store the ``idx``-th entry of ``new_data`` as the example's latent class.

    Args:
        example: The example (mutable mapping) being processed by ``Dataset.map``.
        idx: Position of the example, used to look up its value in ``new_data``.
        new_data: Indexable sequence holding one latent class per example.

    Returns:
        The same ``example`` with ``'latent_class'`` set.
    """
    latent_class = new_data[idx]
    example['latent_class'] = latent_class
    return example

labeled_rest_of_data = rest_of_data.map(
    lambda example, idx: add_new_column(example, idx, results),
    with_indices=True
)

# For the labelled data, we use the label as the latent class as it is already known.
def insert_column_with_same_value(example):
    """Copy the known ``'label'`` of an example into its ``'latent_class'`` field.

    Args:
        example: The example (mutable mapping) being processed by ``Dataset.map``.

    Returns:
        The same ``example`` with ``'latent_class'`` equal to ``'label'``.
    """
    known_label = example['label']
    example['latent_class'] = known_label
    return example

labeled_data = labeled_data.map(
    lambda example: insert_column_with_same_value(example)
)

Map:   0%|          | 0/2970 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [None]:
# How well does the latent classification work?
def check_columns_equality(example, column1, column2):
    """Tell whether two fields of an example hold equal values.

    Args:
        example: Mapping from column name to value.
        column1: Name of the first column to compare.
        column2: Name of the second column to compare.

    Returns:
        The result of comparing the two values with ``==``.
    """
    first_value, second_value = example[column1], example[column2]
    return first_value == second_value

check_accuracy_dataset = labeled_rest_of_data.filter(lambda example: check_columns_equality(example, 'label', 'latent_class'))

matching_count = len(check_accuracy_dataset)
print(f"Accuracy of latent classification: {matching_count / len(labeled_rest_of_data)}")

Filter:   0%|          | 0/2970 [00:00<?, ? examples/s]

Accuracy of latent classification: 0.7276094276094276


In [None]:
# Re-merge the 1% data and rest of the data for self-supervised learning
latent_class_data = concatenate_datasets([labeled_data, labeled_rest_of_data])
print(latent_class_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 3000
})


In [None]:
del results
del model
del tokenizer
del training_args
del trainer
del rest_of_data
del rest_of_data_loader
del labeled_rest_of_data
del check_accuracy_dataset
del labeled_data
del gpt_dataset

# Collect garbage
gc.collect()

107

# Subset selection

Now that we have some form of latent classes assigned to all of the training data, we can find subsets using three methods:

1. Original SAS algorithm
2. Random subset from each latent class
3. Original SAS algorithm + Spanish embeddings

We will select a 20% subset and compare the three. In the long run, we also need to compare with using the whole dataset but that is going to be very computationally intensive.

In [None]:
# Split into latent classes to select subsets from each
def create_filter_function(latent_class):
    """Build a predicate that keeps only examples of one latent class.

    Args:
        latent_class: The latent class value to keep.

    Returns:
        A one-argument callable that returns whether the ``'latent_class'``
        field of the given example equals ``latent_class``.
    """
    return lambda example: example['latent_class'] == latent_class

world_train_data = latent_class_data.filter(create_filter_function(0))
sports_train_data = latent_class_data.filter(create_filter_function(1))
business_train_data = latent_class_data.filter(create_filter_function(2))
tech_train_data = latent_class_data.filter(create_filter_function(3))

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
# We will select the subset in this ratio for all subsets, for fairness
print(len(world_train_data))
print(len(sports_train_data))
print(len(business_train_data))
print(len(tech_train_data))

751
691
507
1051


## SAS

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def sas_algorithm(data, subset_size):
    """Greedily pick exemplars that maximise total similarity coverage.

    Texts are embedded with TF-IDF and compared by cosine similarity. At each
    step the text that most increases the sum, over all texts, of the best
    similarity to any selected exemplar is added to the subset.

    Args:
        data: Sequence of text strings.
        subset_size: Number of exemplars to select. Values larger than
            ``len(data)`` are clamped, and values <= 0 select nothing.

    Returns:
        list: Positions (into ``data``) of the selected exemplars. The list is
        built from a set, so it is not in selection order.
    """
    n = len(data)
    # Asking for more exemplars than exist used to crash in np.argmax on an
    # empty candidate list; empty input used to fail inside the vectorizer.
    subset_size = min(subset_size, n)
    if subset_size <= 0:
        return []

    vectorizer = TfidfVectorizer()

    # Get embeddings for the text data
    embeddings = vectorizer.fit_transform(data)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings, embeddings)

    del embeddings
    gc.collect()

    # Initialize the subset, the best similarity of each text to the subset so far,
    # and a mask of the texts that are still candidates
    S = set()
    S_similarities = np.full(n, -np.inf)
    is_candidate = np.ones(n, dtype=bool)

    for _ in tqdm(range(subset_size)):
        not_in_S = np.flatnonzero(is_candidate)

        # Gain of each candidate, computed for all candidates at once: the coverage
        # obtained if that candidate were added to the current subset
        gains = np.maximum(S_similarities, similarity_matrix[not_in_S]).sum(axis=1)

        # Find the exemplar with the best gain (first one wins on ties)
        best_exemplar = not_in_S[np.argmax(gains)]

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        S.add(best_exemplar)
        is_candidate[best_exemplar] = False

    return list(S)

In [None]:
# Run the SAS algorithm to select a 20% subset from each latent class
subset_size = 0.2

world_sas_indices = sas_algorithm(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_sas_indices = sas_algorithm(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_sas_indices = sas_algorithm(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_sas_indices = sas_algorithm(tech_train_data['text'], int(subset_size * len(tech_train_data)))

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

  0%|          | 0/101 [00:00<?, ?it/s]

  0%|          | 0/210 [00:00<?, ?it/s]

In [None]:
# Sanity check on indices
print(world_sas_indices)
print(sports_sas_indices)
print(business_sas_indices)
print(tech_sas_indices)

[512, 514, 515, 5, 7, 8, 9, 525, 17, 18, 19, 532, 21, 25, 27, 547, 548, 46, 49, 570, 60, 572, 576, 577, 581, 75, 78, 590, 89, 606, 96, 101, 102, 104, 616, 618, 620, 108, 113, 626, 628, 118, 119, 633, 634, 635, 126, 646, 136, 652, 140, 144, 657, 145, 661, 155, 670, 159, 162, 163, 677, 681, 682, 688, 689, 178, 691, 695, 184, 186, 700, 188, 198, 203, 206, 719, 722, 214, 216, 217, 732, 225, 740, 230, 231, 744, 233, 236, 748, 249, 251, 252, 254, 262, 264, 267, 274, 278, 279, 281, 284, 285, 287, 288, 301, 302, 309, 313, 319, 322, 325, 332, 334, 335, 347, 355, 357, 358, 359, 364, 365, 371, 372, 374, 378, 384, 390, 391, 393, 395, 396, 399, 402, 412, 416, 418, 419, 427, 432, 441, 470, 474, 475, 478, 485, 487, 493, 494, 507, 508]
[512, 516, 519, 12, 15, 528, 17, 18, 532, 535, 24, 537, 26, 27, 542, 35, 549, 550, 552, 41, 43, 555, 45, 556, 560, 52, 567, 56, 58, 59, 581, 70, 79, 86, 87, 89, 612, 104, 616, 106, 623, 115, 629, 122, 634, 640, 641, 642, 129, 643, 136, 651, 141, 656, 659, 150, 154, 672,

In [None]:
world_sas_subset = world_train_data.select(world_sas_indices)
sports_sas_subset = sports_train_data.select(sports_sas_indices)
business_sas_subset = business_train_data.select(business_sas_indices)
tech_sas_subset = tech_train_data.select(tech_sas_indices)

# Create the subset to use for contrastive self-supervised learning
sas_subset_data = concatenate_datasets([world_sas_subset, sports_sas_subset, business_sas_subset, tech_sas_subset])

In [None]:
del world_sas_subset
del sports_sas_subset
del business_sas_subset
del tech_sas_subset
del world_sas_indices
del sports_sas_indices
del business_sas_indices
del tech_sas_indices

gc.collect()

In [None]:
print(sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


## Random

In [None]:
# Randomly sample 20% of the dataset for each latent class
def random_subset(data, subset_size):
    """Draw ``subset_size`` distinct positions of ``data`` uniformly at random.

    Uses the module-level ``random`` generator, so the result depends on its
    current state.

    Args:
        data: Sized collection; only its length is used.
        subset_size: Number of positions to draw.

    Returns:
        list: ``subset_size`` distinct integers in ``range(len(data))``.
    """
    positions = range(len(data))
    return random.sample(positions, subset_size)

In [None]:
subset_size = 0.2

world_random_indices = random_subset(world_train_data['text'], int(subset_size * len(world_train_data)))
sports_random_indices = random_subset(sports_train_data['text'], int(subset_size * len(sports_train_data)))
business_random_indices = random_subset(business_train_data['text'], int(subset_size * len(business_train_data)))
tech_random_indices = random_subset(tech_train_data['text'], int(subset_size * len(tech_train_data)))

In [None]:
# Sanity check on indices
print(world_random_indices)
print(sports_random_indices)
print(business_random_indices)
print(tech_random_indices)

[654, 114, 25, 281, 250, 228, 142, 104, 692, 558, 89, 604, 432, 32, 30, 95, 223, 238, 517, 616, 27, 574, 203, 665, 718, 741, 429, 225, 459, 603, 284, 6, 163, 714, 738, 348, 720, 159, 220, 344, 743, 94, 389, 99, 367, 352, 618, 270, 44, 470, 549, 127, 387, 80, 565, 300, 643, 633, 370, 591, 196, 71, 46, 677, 233, 296, 81, 733, 103, 708, 717, 464, 650, 373, 166, 379, 363, 214, 273, 663, 73, 623, 678, 175, 546, 746, 167, 473, 388, 276, 655, 570, 224, 332, 57, 234, 737, 323, 410, 274, 67, 216, 580, 322, 217, 511, 405, 469, 146, 271, 744, 252, 729, 551, 269, 598, 438, 597, 408, 742, 658, 141, 521, 505, 93, 48, 112, 156, 726, 716, 610, 65, 394, 390, 620, 479, 541, 257, 566, 11, 117, 700, 672, 715, 749, 695, 445, 161, 679, 3]
[269, 512, 182, 519, 108, 640, 305, 654, 687, 623, 203, 156, 382, 165, 552, 543, 0, 613, 331, 500, 19, 114, 371, 314, 245, 59, 246, 580, 80, 87, 497, 70, 545, 128, 131, 486, 562, 169, 271, 540, 621, 433, 216, 676, 205, 319, 408, 678, 448, 529, 462, 123, 253, 230, 65, 346, 

In [None]:
world_random_subset = world_train_data.select(world_random_indices)
sports_random_subset = sports_train_data.select(sports_random_indices)
business_random_subset = business_train_data.select(business_random_indices)
tech_random_subset = tech_train_data.select(tech_random_indices)

# Create the subset to use for contrastive self-supervised learning
random_subset_data = concatenate_datasets([world_random_subset, sports_random_subset, business_random_subset, tech_random_subset])

In [None]:
del world_random_subset
del sports_random_subset
del business_random_subset
del tech_random_subset
del world_random_indices
del sports_random_indices
del business_random_indices
del tech_random_indices

gc.collect()

38

In [None]:
print(random_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


## Multi-lingual SAS

In [None]:
def sas_with_spanish_algorithm(data, subset_size):
    """Greedy exemplar selection using English plus Spanish similarities.

    Builds one TF-IDF cosine-similarity matrix from ``data['text']`` and one
    from ``data['translation']``, adds them, and then greedily selects the
    texts that most increase the sum, over all texts, of the best combined
    similarity to any selected exemplar.

    Args:
        data: Mapping/dataset exposing ``'text'`` and ``'translation'`` columns
            of equal length.
        subset_size: Number of exemplars to select. Values larger than the
            number of texts are clamped, and values <= 0 select nothing.

    Returns:
        list: Positions of the selected exemplars. The list is built from a
        set, so it is not in selection order.
    """
    texts = data['text']
    n = len(texts)
    # Asking for more exemplars than exist used to crash in np.argmax on an
    # empty candidate list; empty input used to fail inside the vectorizer.
    subset_size = min(subset_size, n)
    if subset_size <= 0:
        return []

    vectorizer = TfidfVectorizer()

    # Get embeddings for the text data
    embeddings = vectorizer.fit_transform(texts)
    print(embeddings.shape)

    # Compute similarity matrix using cosine_similarity
    similarity_matrix = cosine_similarity(embeddings, embeddings)

    del embeddings
    gc.collect()

    # Get Spanish embeddings
    spanish_vectorizer = TfidfVectorizer()
    spanish_embeddings = spanish_vectorizer.fit_transform(data['translation'])
    print(spanish_embeddings.shape)

    # Combine both languages by summing their cosine-similarity matrices
    similarity_matrix = np.add(similarity_matrix, cosine_similarity(spanish_embeddings, spanish_embeddings))

    del spanish_embeddings
    gc.collect()

    # Initialize the subset, the best similarity of each text to the subset so far,
    # and a mask of the texts that are still candidates
    S = set()
    S_similarities = np.full(n, -np.inf)
    is_candidate = np.ones(n, dtype=bool)

    for _ in tqdm(range(subset_size)):
        not_in_S = np.flatnonzero(is_candidate)

        # Gain of each candidate, computed for all candidates at once: the coverage
        # obtained if that candidate were added to the current subset
        gains = np.maximum(S_similarities, similarity_matrix[not_in_S]).sum(axis=1)

        # Find the exemplar with the best gain (first one wins on ties)
        best_exemplar = not_in_S[np.argmax(gains)]

        # Update the similarity for the best-selected subset
        S_similarities = np.maximum(S_similarities, similarity_matrix[best_exemplar])
        S.add(best_exemplar)
        is_candidate[best_exemplar] = False

    return list(S)

In [None]:
# Run the SAS algorithm to select a 20% subset from each latent class
subset_size = 0.2

world_multilingual_sas_indices = sas_with_spanish_algorithm(world_train_data, int(subset_size * len(world_train_data)))
sports_multilingual_sas_indices = sas_with_spanish_algorithm(sports_train_data, int(subset_size * len(sports_train_data)))
business_multilingual_sas_indices = sas_with_spanish_algorithm(business_train_data, int(subset_size * len(business_train_data)))
tech_multilingual_sas_indices = sas_with_spanish_algorithm(tech_train_data, int(subset_size * len(tech_train_data)))

(751, 5622)
(751, 6479)


  0%|          | 0/150 [00:00<?, ?it/s]

(691, 5486)
(691, 6130)


  0%|          | 0/138 [00:00<?, ?it/s]

(507, 4498)
(507, 5097)


  0%|          | 0/101 [00:00<?, ?it/s]

(1051, 7461)
(1051, 8363)


  0%|          | 0/210 [00:00<?, ?it/s]

In [None]:
world_multilingual_sas_subset = world_train_data.select(world_multilingual_sas_indices)
sports_multilingual_sas_subset = sports_train_data.select(sports_multilingual_sas_indices)
business_multilingual_sas_subset = business_train_data.select(business_multilingual_sas_indices)
tech_multilingual_sas_subset = tech_train_data.select(tech_multilingual_sas_indices)

# Create the subset to use for contrastive self-supervised learning
multilingual_sas_subset_data = concatenate_datasets([world_multilingual_sas_subset, sports_multilingual_sas_subset, business_multilingual_sas_subset, tech_multilingual_sas_subset])

In [None]:
del world_multilingual_sas_subset
del sports_multilingual_sas_subset
del business_multilingual_sas_subset
del tech_multilingual_sas_subset
del world_multilingual_sas_indices
del sports_multilingual_sas_indices
del business_multilingual_sas_indices
del tech_multilingual_sas_indices

gc.collect()

In [None]:
print(multilingual_sas_subset_data)

Dataset({
    features: ['text', 'label', 'augment_1', 'augment_2', 'translation', 'gpt_1', 'gpt_2', 'latent_class'],
    num_rows: 599
})


# Contrastive learning pipeline

Now that we have selected a 20% representative subset of the data in three different ways, we use each subset, along with two different forms of textual augmentation:

1. Easy data augmentation such as synonym replacement, random addition and random swapping
2. GPT for paraphrasing sentences

to train a self-supervised contrastive learning encoder, which will be evaluated in a downstream prediction task.

In [None]:
# Define the encoder model. We will use pre-trained BERT as the initial embedding
# and use contrastive learning to further train embeddings that can be tested
# downstream, specifically in news article domain.
model_name = 'prajjwal1/bert-small'

class Encoder(nn.Module):
    """Text encoder that wraps the pre-trained ``model_name`` checkpoint.

    The module owns both the tokenizer and the transformer, so ``forward``
    accepts raw strings and returns one embedding vector per string.
    """

    def __init__(self):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def forward(self, text):
        """Embed a string or a list of strings.

        Args:
            text: Text(s) to encode; truncated to ``max_length`` tokens and
                padded to the longest item in the batch.

        Returns:
            Tensor holding, for each input, the last hidden state at the first
            token position.
        """
        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        # Send the inputs to wherever the wrapped model actually lives, so the
        # encoder also works when it was not moved to the notebook-level `device`.
        target_device = self.model.device
        inputs = {k: v.to(target_device) for k, v in inputs.items()}
        outputs = self.model(**inputs)
        return outputs.last_hidden_state[:, 0, :] # Use the [CLS] token as the embedding

# Define the contrastive (InfoNCE) loss
class ContrastiveLoss(nn.Module):
    """Contrastive loss over one positive and one negative per anchor.

    For each anchor the loss is ``-log(exp(s_pos) / (exp(s_pos) + exp(s_neg)))``,
    where ``s_pos`` and ``s_neg`` are the cosine similarities of the anchor to
    its positive and negative embedding. The batch mean is returned.
    """

    def forward(self, z_i, z_j, z_k):
        """Compute the mean loss for anchors ``z_i``, positives ``z_j``, negatives ``z_k``.

        All three inputs are compared row by row (cosine similarity along dim 1).
        """
        similarity = nn.functional.cosine_similarity

        # Exponentiated similarity of each anchor to its positive / negative
        pos = similarity(z_i, z_j, dim=1, eps=1e-6).exp()
        neg = similarity(z_i, z_k, dim=1, eps=1e-6).exp()

        per_example = -(pos / (pos + neg)).log()
        return per_example.mean()

In [None]:
# Helper function to randomly move around indices to create a list for negative examples
def shuffle_without_duplication(arr, seed=None):
    """Return ``arr`` permuted so that no element stays at its original position.

    Used to build negative examples from a batch: position ``p`` of the result
    never holds the item that was at position ``p`` of the input. Only positions
    are checked, so equal values at different positions may still line up.

    The global NumPy RNG is re-seeded on every call, so the same input length
    always yields the same permutation.

    Args:
        arr: Sequence of items to permute.
        seed: Seed for the global NumPy RNG. Defaults to ``global_seed``.

    Returns:
        list: The permuted items. Inputs with fewer than two items are returned
        unchanged because no such permutation exists for them.
    """
    arr = np.array(arr)
    n = len(arr)
    # With 0 or 1 items there is no permutation that moves every element;
    # the rejection loop below would never terminate for n == 1.
    if n < 2:
        return arr.tolist()

    np.random.seed(global_seed if seed is None else seed)
    # Create an array of the same shape filled with the indices
    original_positions = np.arange(n)
    indices = original_positions.copy()
    while True:
        # Shuffle the indices
        np.random.shuffle(indices)
        # Check if no element remains in its original position
        if not np.any(indices == original_positions):
            break
    # Return the shuffled array
    return arr[indices].tolist()

In [None]:
# Training function for self-supervised contrastive learning
def self_supervised_training(data, num_epochs, augment='eda', batch_size=32, learning_rate=1e-6):
    """Train a fresh ``Encoder`` with the contrastive loss on pre-computed augmentations.

    For every batch, the first augmented view is the anchor, the second view of
    the same example is the positive, and a different example's first view
    (taken from the same batch) is the negative.

    Args:
        data: Dataset exposing ``shuffle(seed=...)`` and column access by name
            (presumably a Hugging Face ``Dataset`` -- confirm against the caller).
            It must contain the two augmentation columns selected by ``augment``.
        num_epochs: Number of passes over the data.
        augment: ``'eda'`` reads the ``augment_1``/``augment_2`` columns; any
            other value reads ``gpt_1``/``gpt_2``.
        batch_size: Number of examples per optimization step.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The trained ``Encoder`` (left in training mode).
    """
    # Shuffle the data for better training. At first, all subsets are concatenated and
    # therefore separated by latent class
    data = data.shuffle(seed = global_seed)
    if (augment == 'eda'):
        existing_augments_1 = data['augment_1']
        existing_augments_2 = data['augment_2']
    else:
        existing_augments_1 = data['gpt_1']
        existing_augments_2 = data['gpt_2']
    num_examples = len(existing_augments_1)

    encoder = Encoder().to(device)
    loss_fn = ContrastiveLoss()
    optimizer = optim.Adam(encoder.parameters(), lr = learning_rate)

    # Training loop
    encoder.train()
    for epoch in range(num_epochs):
        for i in range(0, num_examples, batch_size):
            # Take the next contiguous batch of both augmented views
            batch_existing_augments_1 = existing_augments_1[i:i + batch_size]
            batch_existing_augments_2 = existing_augments_2[i:i + batch_size]

            # A lone trailing example has no other text in its batch that could
            # act as its negative, so negative sampling is impossible; skip it.
            if len(batch_existing_augments_1) < 2:
                continue

            # Clean texts and create a list of negative texts from the batch, ensuring
            # that the same example is not selected for the negative.
            clean_existing_augments_1 = [preprocess(text) for text in batch_existing_augments_1]
            clean_existing_augments_2 = [preprocess(text) for text in batch_existing_augments_2]
            negative_augments = shuffle_without_duplication(clean_existing_augments_1)

            # Get the embeddings from the encoder
            z_i = encoder(clean_existing_augments_1)
            z_j = encoder(clean_existing_augments_2)
            z_k = encoder(negative_augments)

            # Compute the contrastive loss
            loss = loss_fn(z_i, z_j, z_k)

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print every 20 steps
            if (i // batch_size) % 20 == 0:
                print(f'Epoch {epoch}, Step {i // batch_size}, Loss: {loss.item()}')

    return encoder

# Evaluation pipeline

Through contrastive learning, we have learned an encoder $f$ that embeds our news article text, with the goal of better downstream performance. To test it, we freeze $f$, train a linear classifier head on its embeddings using the true labels, and measure accuracy on the test dataset. Here, we compare the efficacy of the encoder $f$ across the three different ways of selecting a subset. Additionally, a comparison against training on the full data is needed; it is left as a future direction due to computational constraints.

In [None]:
# Linear probe: a single fully connected layer placed on top of the encoder embeddings
class LinearClassifier(nn.Module):
    """Maps an embedding of size ``hidden_size`` to ``num_classes`` raw scores."""

    def __init__(self, hidden_size, num_classes):
        super(LinearClassifier, self).__init__()
        self.linear = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the unnormalized class scores."""
        scores = self.linear(x)
        return scores

In [None]:
# Build the train / validation / test dataloaders used by the linear evaluation
def create_dataloaders(train_data, test_data, batch_size):
    """Split ``train_data`` 80/20 into train and validation and wrap all splits in DataLoaders.

    The split is stratified on the ``label`` column and seeded with the
    notebook-level ``global_seed``. Every text goes through ``preprocess`` and
    is paired with its label tensor.

    Args:
        train_data: Labelled dataset providing ``train_test_split`` plus
            ``text`` and ``label`` columns.
        test_data: Held-out dataset with ``text`` and ``label`` columns.
        batch_size: Batch size shared by the three loaders.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``; only the
        training loader shuffles its examples.
    """
    split = train_data.train_test_split(test_size=0.2, stratify_by_column="label", seed = global_seed)

    def as_pairs(subset):
        # One (cleaned text, label tensor) tuple per example
        cleaned = [preprocess(text) for text in subset['text']]
        targets = torch.tensor(subset['label'])
        return list(zip(cleaned, targets))

    train_pairs = as_pairs(split['train'])
    val_pairs = as_pairs(split['test'])
    test_pairs = as_pairs(test_data)

    return (
        DataLoader(train_pairs, batch_size=batch_size, shuffle=True),
        DataLoader(val_pairs, batch_size=batch_size),
        DataLoader(test_pairs, batch_size=batch_size),
    )

# Evaluation function to train a linear classifier head on top of the learned encoder
# and evaluate on the test set
def evaluate_encoder(encoder, train_data, test_data, batch_size=64, num_epochs=5,
                     num_classes=4, learning_rate=1e-4, checkpoint_path='./best_checkpoint.pth'):
    """Linear evaluation: train a classifier head on the frozen encoder and score it.

    The encoder is switched to eval mode and all of its parameters are frozen
    (this mutates the ``encoder`` passed in). A ``LinearClassifier`` is trained
    on its embeddings; the weights from the epoch with the lowest validation
    loss are used for the final test-set accuracy.

    Args:
        encoder: Trained ``Encoder`` whose embeddings are evaluated.
        train_data: Labelled data, split into train/validation by ``create_dataloaders``.
        test_data: Labelled held-out data used for the final accuracy.
        batch_size: Batch size for all three dataloaders.
        num_epochs: Number of training epochs for the classifier head.
        num_classes: Number of output classes of the classifier head.
        learning_rate: Learning rate for the Adam optimizer.
        checkpoint_path: File that receives the best classifier weights.

    Returns:
        Accuracy on the test set as a float (``nan`` if the test set is empty).
        The same value is also printed.
    """
    # Create dataloaders
    train_dataloader, val_dataloader, test_dataloader = create_dataloaders(train_data, test_data, batch_size)

    # Set the encoder to evaluation mode and freeze all layers to test the trained embeddings from contrastive learning
    encoder.eval()
    for param in encoder.parameters():
        param.requires_grad = False

    # Initialize the linear classifier head
    classifier = LinearClassifier(encoder.model.config.hidden_size, num_classes).to(device)

    # Loss function and optimizer for the classifier head
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(classifier.parameters(), lr = learning_rate)

    best_val_loss = float('inf')
    # The best weights are also kept in memory so that the final evaluation can
    # never pick up a stale checkpoint file left behind by an earlier run.
    best_state = None

    # Train the classifier
    for epoch in range(num_epochs):
        classifier.train()
        train_loss = 0.0

        for (index, data) in enumerate(train_dataloader):
            texts, labels = data
            labels = labels.to(device)
            optimizer.zero_grad()

            # Forward pass through the frozen encoder and classifier head
            with torch.no_grad():
                embeddings = encoder(texts)
            outputs = classifier(embeddings)
            loss = criterion(outputs, labels)

            # Backpropagation
            loss.backward()
            optimizer.step()
            # `loss` is the mean over the batch; weight it by the batch size so
            # that dividing by the dataset size below gives a per-example mean.
            train_loss += loss.item() * labels.size(0)

            # Print every 50 steps
            if index % 50 == 0:
                print(f'Epoch {epoch}, Step {index}, Loss: {loss.item()}')

        classifier.eval()
        val_loss = 0.0
        with torch.no_grad():
            for data in val_dataloader:
                texts, labels = data
                labels = labels.to(device)

                embeddings = encoder(texts)
                outputs = classifier(embeddings)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * labels.size(0)

        # Average losses per example
        train_loss /= max(len(train_dataloader.dataset), 1)
        val_loss /= max(len(val_dataloader.dataset), 1)

        # Print stats
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_state = {name: tensor.detach().clone() for name, tensor in classifier.state_dict().items()}
            torch.save(classifier.state_dict(), checkpoint_path)

    # Evaluate the classifier with the best weights seen during this run
    if best_state is not None:
        classifier.load_state_dict(best_state)
    classifier.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for texts, labels in test_dataloader:
            labels = labels.to(device)

            embeddings = encoder(texts)
            outputs = classifier(embeddings)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total if total else float('nan')
    print(f'Accuracy on the test set: {accuracy:.4f}')
    return accuracy

# Testing different subset selection processes + augmentation techniques

## Subset selection using SAS

### GPT

In [None]:
# Contrastive training for 100 epochs on the SAS subset, using the GPT augmentation columns (gpt_1 / gpt_2)
encoder = self_supervised_training(sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5632005333900452
Epoch 1, Step 0, Loss: 0.5509656667709351
Epoch 2, Step 0, Loss: 0.5395742058753967
Epoch 3, Step 0, Loss: 0.5238931179046631
Epoch 4, Step 0, Loss: 0.5073396563529968
Epoch 5, Step 0, Loss: 0.5052011609077454
Epoch 6, Step 0, Loss: 0.49774664640426636
Epoch 7, Step 0, Loss: 0.4870452880859375
Epoch 8, Step 0, Loss: 0.4786990284919739
Epoch 9, Step 0, Loss: 0.46859487891197205
Epoch 10, Step 0, Loss: 0.45873740315437317
Epoch 11, Step 0, Loss: 0.4538267254829407
Epoch 12, Step 0, Loss: 0.4479732811450958
Epoch 13, Step 0, Loss: 0.44804850220680237
Epoch 14, Step 0, Loss: 0.4400217533111572
Epoch 15, Step 0, Loss: 0.43273597955703735
Epoch 16, Step 0, Loss: 0.4317689538002014
Epoch 17, Step 0, Loss: 0.4314153790473938
Epoch 18, Step 0, Loss: 0.4281109571456909
Epoch 19, Step 0, Loss: 0.4226861894130707
Epoch 20, Step 0, Loss: 0.41884517669677734
Epoch 21, Step 0, Loss: 0.42297276854515076
Epoch 22, Step 0, Loss: 0.41542738676071167
Epoch 23, Ste

In [None]:
# Linear evaluation of the encoder trained above (SAS subset, GPT augmentations)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.5764225721359253
Epoch 1, Train Loss: 0.0204, Val Loss: 0.0175
Epoch 1, Step 0, Loss: 1.0386879444122314
Epoch 2, Train Loss: 0.0142, Val Loss: 0.0128
Epoch 2, Step 0, Loss: 0.8465198278427124
Epoch 3, Train Loss: 0.0109, Val Loss: 0.0104
Epoch 3, Step 0, Loss: 0.6313831806182861
Epoch 4, Train Loss: 0.0092, Val Loss: 0.0090
Epoch 4, Step 0, Loss: 0.5107360482215881
Epoch 5, Train Loss: 0.0083, Val Loss: 0.0082
Accuracy on the test set: 0.8437


### EDA

In [None]:
# Contrastive training for 100 epochs on the SAS subset, using the EDA augmentation columns (augment_1 / augment_2)
encoder = self_supervised_training(sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5611304044723511
Epoch 1, Step 0, Loss: 0.5500785112380981
Epoch 2, Step 0, Loss: 0.5371167659759521
Epoch 3, Step 0, Loss: 0.5282013416290283
Epoch 4, Step 0, Loss: 0.5122260451316833
Epoch 5, Step 0, Loss: 0.49994704127311707
Epoch 6, Step 0, Loss: 0.48881596326828003
Epoch 7, Step 0, Loss: 0.48516592383384705
Epoch 8, Step 0, Loss: 0.4738141596317291
Epoch 9, Step 0, Loss: 0.4695310890674591
Epoch 10, Step 0, Loss: 0.46564480662345886
Epoch 11, Step 0, Loss: 0.45699024200439453
Epoch 12, Step 0, Loss: 0.45234787464141846
Epoch 13, Step 0, Loss: 0.45202767848968506
Epoch 14, Step 0, Loss: 0.4406840205192566
Epoch 15, Step 0, Loss: 0.4372349679470062
Epoch 16, Step 0, Loss: 0.4336959719657898
Epoch 17, Step 0, Loss: 0.4276503324508667
Epoch 18, Step 0, Loss: 0.4299049377441406
Epoch 19, Step 0, Loss: 0.425557941198349
Epoch 20, Step 0, Loss: 0.424533486366272
Epoch 21, Step 0, Loss: 0.42136096954345703
Epoch 22, Step 0, Loss: 0.4169996976852417
Epoch 23, Step 

In [None]:
# Linear evaluation of the encoder trained above (SAS subset, EDA augmentations)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.8447624444961548
Epoch 1, Train Loss: 0.0243, Val Loss: 0.0212
Epoch 1, Step 0, Loss: 1.256259799003601
Epoch 2, Train Loss: 0.0171, Val Loss: 0.0155
Epoch 2, Step 0, Loss: 0.9051374793052673
Epoch 3, Train Loss: 0.0130, Val Loss: 0.0123
Epoch 3, Step 0, Loss: 0.6701733469963074
Epoch 4, Train Loss: 0.0108, Val Loss: 0.0106
Epoch 4, Step 0, Loss: 0.569615364074707
Epoch 5, Train Loss: 0.0095, Val Loss: 0.0096
Accuracy on the test set: 0.8175


## Random subset selection

### GPT

In [None]:
# Contrastive training for 100 epochs on the random subset, using the GPT augmentation columns (gpt_1 / gpt_2)
encoder = self_supervised_training(random_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5641740560531616
Epoch 1, Step 0, Loss: 0.558645486831665
Epoch 2, Step 0, Loss: 0.545375406742096
Epoch 3, Step 0, Loss: 0.5327896475791931
Epoch 4, Step 0, Loss: 0.5211867094039917
Epoch 5, Step 0, Loss: 0.5120052099227905
Epoch 6, Step 0, Loss: 0.5057783722877502
Epoch 7, Step 0, Loss: 0.49204832315444946
Epoch 8, Step 0, Loss: 0.4893980026245117
Epoch 9, Step 0, Loss: 0.4822220206260681
Epoch 10, Step 0, Loss: 0.47499218583106995
Epoch 11, Step 0, Loss: 0.4701765477657318
Epoch 12, Step 0, Loss: 0.4656178057193756
Epoch 13, Step 0, Loss: 0.45999014377593994
Epoch 14, Step 0, Loss: 0.4520881175994873
Epoch 15, Step 0, Loss: 0.45173555612564087
Epoch 16, Step 0, Loss: 0.4475706219673157
Epoch 17, Step 0, Loss: 0.4394904375076294
Epoch 18, Step 0, Loss: 0.44536012411117554
Epoch 19, Step 0, Loss: 0.44064098596572876
Epoch 20, Step 0, Loss: 0.43569618463516235
Epoch 21, Step 0, Loss: 0.4351831078529358
Epoch 22, Step 0, Loss: 0.42618411779403687
Epoch 23, Step 

In [None]:
# Linear evaluation of the encoder trained above (random subset, GPT augmentations)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.8358432054519653
Epoch 1, Train Loss: 0.0236, Val Loss: 0.0205
Epoch 1, Step 0, Loss: 1.2686395645141602
Epoch 2, Train Loss: 0.0162, Val Loss: 0.0145
Epoch 2, Step 0, Loss: 0.9222849607467651
Epoch 3, Train Loss: 0.0119, Val Loss: 0.0113
Epoch 3, Step 0, Loss: 0.613990843296051
Epoch 4, Train Loss: 0.0097, Val Loss: 0.0096
Epoch 4, Step 0, Loss: 0.506876528263092
Epoch 5, Train Loss: 0.0084, Val Loss: 0.0086
Accuracy on the test set: 0.8445


### EDA

In [None]:
# Contrastive training for 100 epochs on the random subset, using the EDA augmentation columns (augment_1 / augment_2)
encoder = self_supervised_training(random_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5710808634757996
Epoch 1, Step 0, Loss: 0.5556999444961548
Epoch 2, Step 0, Loss: 0.5427685976028442
Epoch 3, Step 0, Loss: 0.5336276292800903
Epoch 4, Step 0, Loss: 0.5221565961837769
Epoch 5, Step 0, Loss: 0.5144083499908447
Epoch 6, Step 0, Loss: 0.5091357231140137
Epoch 7, Step 0, Loss: 0.4930974841117859
Epoch 8, Step 0, Loss: 0.48900270462036133
Epoch 9, Step 0, Loss: 0.48175057768821716
Epoch 10, Step 0, Loss: 0.47555962204933167
Epoch 11, Step 0, Loss: 0.4688972234725952
Epoch 12, Step 0, Loss: 0.4602883458137512
Epoch 13, Step 0, Loss: 0.46042001247406006
Epoch 14, Step 0, Loss: 0.4576275646686554
Epoch 15, Step 0, Loss: 0.4465128183364868
Epoch 16, Step 0, Loss: 0.44868698716163635
Epoch 17, Step 0, Loss: 0.44865310192108154
Epoch 18, Step 0, Loss: 0.4430893361568451
Epoch 19, Step 0, Loss: 0.4377768337726593
Epoch 20, Step 0, Loss: 0.4352160692214966
Epoch 21, Step 0, Loss: 0.4377955198287964
Epoch 22, Step 0, Loss: 0.4337537884712219
Epoch 23, Step 

In [None]:
# Linear evaluation of the encoder trained above (random subset, EDA augmentations)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.7301530838012695
Epoch 1, Train Loss: 0.0225, Val Loss: 0.0193
Epoch 1, Step 0, Loss: 1.165528416633606
Epoch 2, Train Loss: 0.0155, Val Loss: 0.0140
Epoch 2, Step 0, Loss: 0.8649449944496155
Epoch 3, Train Loss: 0.0119, Val Loss: 0.0113
Epoch 3, Step 0, Loss: 0.6911087036132812
Epoch 4, Train Loss: 0.0100, Val Loss: 0.0098
Epoch 4, Step 0, Loss: 0.5739361643791199
Epoch 5, Train Loss: 0.0088, Val Loss: 0.0090
Accuracy on the test set: 0.8200


## Subset selection using multilingual SAS

### GPT

In [None]:
# Contrastive training for 100 epochs on the multilingual SAS subset, using the GPT augmentation columns (gpt_1 / gpt_2)
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'gpt')

Epoch 0, Step 0, Loss: 0.5663215517997742
Epoch 1, Step 0, Loss: 0.5516553521156311
Epoch 2, Step 0, Loss: 0.5441715121269226
Epoch 3, Step 0, Loss: 0.5365208983421326
Epoch 4, Step 0, Loss: 0.5259498953819275
Epoch 5, Step 0, Loss: 0.5151268243789673
Epoch 6, Step 0, Loss: 0.5032510161399841
Epoch 7, Step 0, Loss: 0.49017563462257385
Epoch 8, Step 0, Loss: 0.4873460531234741
Epoch 9, Step 0, Loss: 0.48072147369384766
Epoch 10, Step 0, Loss: 0.4710867702960968
Epoch 11, Step 0, Loss: 0.47311121225357056
Epoch 12, Step 0, Loss: 0.47102612257003784
Epoch 13, Step 0, Loss: 0.4563906788825989
Epoch 14, Step 0, Loss: 0.4527590572834015
Epoch 15, Step 0, Loss: 0.4539031386375427
Epoch 16, Step 0, Loss: 0.44574493169784546
Epoch 17, Step 0, Loss: 0.44063982367515564
Epoch 18, Step 0, Loss: 0.4463937282562256
Epoch 19, Step 0, Loss: 0.43666934967041016
Epoch 20, Step 0, Loss: 0.43026208877563477
Epoch 21, Step 0, Loss: 0.43306833505630493
Epoch 22, Step 0, Loss: 0.43067467212677
Epoch 23, Step

In [None]:
# Linear evaluation of the encoder trained above (multilingual SAS subset, GPT augmentations)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.3927090167999268
Epoch 1, Train Loss: 0.0193, Val Loss: 0.0165
Epoch 1, Step 0, Loss: 1.0212249755859375
Epoch 2, Train Loss: 0.0136, Val Loss: 0.0123
Epoch 2, Step 0, Loss: 0.7707505822181702
Epoch 3, Train Loss: 0.0106, Val Loss: 0.0101
Epoch 3, Step 0, Loss: 0.616077184677124
Epoch 4, Train Loss: 0.0090, Val Loss: 0.0089
Epoch 4, Step 0, Loss: 0.47930994629859924
Epoch 5, Train Loss: 0.0081, Val Loss: 0.0082
Accuracy on the test set: 0.8482


### EDA

In [None]:
# Contrastive training for 100 epochs on the multilingual SAS subset, using the EDA augmentation columns (augment_1 / augment_2)
encoder = self_supervised_training(multilingual_sas_subset_data, 100, 'eda')

Epoch 0, Step 0, Loss: 0.5700315237045288
Epoch 1, Step 0, Loss: 0.555937647819519
Epoch 2, Step 0, Loss: 0.5470132827758789
Epoch 3, Step 0, Loss: 0.5334316492080688
Epoch 4, Step 0, Loss: 0.5214264392852783
Epoch 5, Step 0, Loss: 0.507258415222168
Epoch 6, Step 0, Loss: 0.5000195503234863
Epoch 7, Step 0, Loss: 0.49756163358688354
Epoch 8, Step 0, Loss: 0.4853055477142334
Epoch 9, Step 0, Loss: 0.4820448160171509
Epoch 10, Step 0, Loss: 0.4718504250049591
Epoch 11, Step 0, Loss: 0.46417999267578125
Epoch 12, Step 0, Loss: 0.4645654857158661
Epoch 13, Step 0, Loss: 0.4548589289188385
Epoch 14, Step 0, Loss: 0.4522782266139984
Epoch 15, Step 0, Loss: 0.44653773307800293
Epoch 16, Step 0, Loss: 0.4411824941635132
Epoch 17, Step 0, Loss: 0.4414566457271576
Epoch 18, Step 0, Loss: 0.44016143679618835
Epoch 19, Step 0, Loss: 0.4373800754547119
Epoch 20, Step 0, Loss: 0.43322765827178955
Epoch 21, Step 0, Loss: 0.4292086362838745
Epoch 22, Step 0, Loss: 0.4235588312149048
Epoch 23, Step 0, 

In [None]:
# Linear evaluation of the encoder trained above (multilingual SAS subset, EDA augmentations)
evaluate_encoder(encoder, train_data, test_data)

Epoch 0, Step 0, Loss: 1.562373399734497
Epoch 1, Train Loss: 0.0201, Val Loss: 0.0170
Epoch 1, Step 0, Loss: 1.0424948930740356
Epoch 2, Train Loss: 0.0142, Val Loss: 0.0126
Epoch 2, Step 0, Loss: 0.7893850803375244
Epoch 3, Train Loss: 0.0113, Val Loss: 0.0105
Epoch 3, Step 0, Loss: 0.5720924735069275
Epoch 4, Train Loss: 0.0097, Val Loss: 0.0094
Epoch 4, Step 0, Loss: 0.48923778533935547
Epoch 5, Train Loss: 0.0089, Val Loss: 0.0087
Accuracy on the test set: 0.8183


# TODO: Further research to come.