# Augmentation for Sentence Transformers

<p align="center">
<img src="/Users/mariavivo/repos/meri/learning/transformers/data-aug/data-aug-sentence-transf.png" width="500" height="300">
</p>

In [None]:
# env transformers
import datasets

# fetch the STSb train and validation splits of the GLUE benchmark
stsb, stsb_dev = (
    datasets.load_dataset('glue', 'stsb', split=split_name)
    for split_name in ('train', 'validation')
)
# last expression of the cell: shows the train split summary
stsb

In [None]:
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

# Build the cross-encoder training examples from the STSb train split.
# STSb similarity scores are fractional values on a 0-5 scale. The
# cross-encoder is created with num_labels=1, whose default training loss
# expects targets in [0, 1], so the score is rescaled by 1/5 and kept as a
# float (truncating with int() would throw away the fractional part and
# leave targets outside the expected range).
train_data = [
    InputExample(
        texts=[row['sentence1'], row['sentence2']],
        label=float(row['label']) / 5.0
    )
    for row in stsb
]

batch_size = 16
# load all of the training examples into a shuffling dataloader
loader = DataLoader(
    train_data, shuffle=True, batch_size=batch_size
)

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# build the cross-encoder on top of the pretrained BERT checkpoint,
# with a single output value per sentence pair (num_labels=1)
base_checkpoint = 'bert-base-uncased'
cross_encoder = CrossEncoder(base_checkpoint, num_labels=1)

In [None]:
# fine-tune the cross-encoder on the labelled sentence pairs
num_epochs = 1
# number of batches seen over the whole run; 40% of them are warmup
total_steps = len(loader) * num_epochs
warmup = int(total_steps * 0.4)

# the fine-tuned model is written to 'bert-stsb-cross-encoder'
cross_encoder.fit(
    train_dataloader=loader,
    epochs=num_epochs,
    warmup_steps=warmup,
    output_path='bert-stsb-cross-encoder',
)

* ***The number of warmup steps is set to 40% of the total training steps. This is high, but it helps prevent overfitting.***

* ***The same could likely be achieved with a lower learning rate (the default is 2e-5).***

_Evaluation of the cross-encoder model on the dev set returns a correlation score of 0.578_

# Create Unlabeled Data

In [None]:
import pandas as pd

# "gold" data: the human-labelled STSb training pairs
gold = datasets.load_dataset('glue', 'stsb', split='train')

gold = pd.DataFrame({
    'sentence1': gold['sentence1'],
    'sentence2': gold['sentence2'],
    # Keep the human labels. Without this column every gold row ends up
    # with a NaN label once the gold and silver pairs are merged for
    # training. Scores are rescaled from the 0-5 STSb scale to [0, 1] so
    # they are comparable with the cross-encoder's predicted scores.
    'label': [score / 5.0 for score in gold['label']]
})

In [None]:
from tqdm.auto import tqdm

# Build the unlabeled candidate pairs: every unique sentence of the
# 'sentence1' column is paired with 5 randomly sampled 'sentence2'
# sentences taken from rows whose 'sentence1' is a different sentence.
#
# Rows are collected in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and, when called inside a
# loop, copies the whole frame on every iteration.
new_rows = []
# loop through each unique sentence in 'sentence1'
for sentence1 in tqdm(gold['sentence1'].unique()):
    # sample 5 'sentence2' values from rows that do not contain the
    # current 'sentence1'
    sampled = gold.loc[gold['sentence1'] != sentence1, 'sentence2'].sample(5)
    new_rows.extend(
        {'sentence1': sentence1, 'sentence2': sentence2}
        for sentence2 in sampled
    )

# explicit columns keep the frame well-formed even if no rows were produced
pairs = pd.DataFrame(new_rows, columns=['sentence1', 'sentence2'])

# remove duplicates
pairs = pairs.drop_duplicates()
print(f"Now there are {len(pairs)} unlabeled sentence pairs")

# Labeling the Silver Dataset

In [None]:
# reload the cross-encoder that was fine-tuned and saved earlier
cross_encoder = CrossEncoder('bert-stsb-cross-encoder')

# score the unlabeled pairs
#--------------------------------------------------------------
# the cross-encoder is fed a list of (sentence1, sentence2) tuples
sentence_columns = pairs[['sentence1', 'sentence2']]
silver = list(sentence_columns.itertuples(index=False, name=None))
# one predicted score per pair
scores = cross_encoder.predict(silver)

# store each predicted score as the label of its pair
pairs['label'] = scores.tolist()
# last expression of the cell: preview the labelled pairs
pairs.head()

# Fine-Tune Sentence Transformer

In [None]:
# Before training, merge the gold and silver pairs into a single frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
all_data = pd.concat([gold, pairs], ignore_index=True)

# Fail fast if any pair has no label: a frame without a 'label' column
# contributes NaN labels after the merge, and training on NaN targets
# silently produces a useless model.
if 'label' not in all_data.columns or all_data['label'].isna().any():
    raise ValueError(
        "Some sentence pairs have no 'label'; both the gold and the silver "
        "frames must provide one before training."
    )

# format into input examples
train = [
    InputExample(
        texts=[row.sentence1, row.sentence2],
        label=float(row.label)
    )
    for row in all_data.itertuples(index=False)
]

# initialize dataloader
loader = DataLoader(
    train, shuffle=True, batch_size=batch_size
)

The model consists of:
- a ***core transformer model*** (`bert-base-uncased`)
- a ***pooling layer*** that _transforms the (up to 512) token-level vectors into a single sentence vector_ (`mean pooling`)

In [None]:
from sentence_transformers import models, SentenceTransformer
from sentence_transformers import losses

# assemble the sentence transformer: BERT followed by mean pooling
bert = models.Transformer('bert-base-uncased')
# the pooling layer must match the width of BERT's token embeddings
embedding_dim = bert.get_word_embedding_dimension()
pooler = models.Pooling(embedding_dim, pooling_mode_mean_tokens=True)
model = SentenceTransformer(modules=[bert, pooler])

# training objective -> cosine similarity loss
# (a natural fit when the targets are similarity scores)
loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# train the sentence transformer
# no learning rate is passed, so the library default is used
epochs = 1
# number of batches seen over the whole run; the first 15% are warmup
steps_total = len(loader) * epochs
warmup_steps = int(steps_total * 0.15)

# the trained model is written to 'bert-stsb-aug'
model.fit(
    train_objectives=[(loader, loss)],
    epochs=epochs,
    warmup_steps=warmup_steps,
    output_path='bert-stsb-aug',
)

# RESULTS

<p align="center">
<img src="/Users/mariavivo/repos/meri/learning/transformers/data-aug/results-data-aug-sentence-transf.png" width="500" height="300">
</p>