### Step 1: Install the necessary packages

In [1]:
# !pip install datasets peft

### Step 2: Generate data for fine-tuning

In [2]:
import json

file_path = '../data/full_test/questions.json'

with open(file_path, 'r') as file:
    data = json.load(file)

questions = [data[q]['question']['question'] for q in range(len(data))]
questions

['How does a basic game of Dungeons & Dragons progress with respect to the roles of the Dungeon Master and the players?',
 "In a tabletop role-playing game, what is the process that the game master follows when a player's action results in a challenging situation?",
 'How does the nature of gameplay in a tabletop role-playing game like Dungeons & Dragons typically vary with the circumstances?',
 'What methods can a Dungeon Master use to help set the scene in a tabletop role-playing game, and how do players also contribute to character portrayal?',
 'In the game Dungeons & Dragons, how is the result of a roll of percentile dice, or d100, generated and read, particularly when the ten-sided dice are numbered in tens?',
 'How can one simulate a roll of 1d3 or 1d2 in a tabletop role-playing game using a d6 or any die?',
 'In the gameplay of Dungeons & Dragons, how is the success or failure of an action determined?',
 'What are the steps involved in making a d20 roll in a tabletop role-playi

In [3]:
from dotenv import load_dotenv
import os
from langchain.chat_models import AzureChatOpenAI

load_dotenv()

llm = AzureChatOpenAI(
    openai_api_key=os.getenv('OPENAI_API_KEY'),
    azure_endpoint=os.getenv('AZURE_ENDPOINT'),
    openai_api_version='2023-08-01-preview',
    deployment_name='gpt4_1106-Preview',
    temperature=0.5,
)

In [4]:
import sys
sys.path.append('..')

from synthlume.pipeline.step.text_generation_step import QuestionNewStyleStep

question_new_style_step = QuestionNewStyleStep(llm=llm, language='en')

generated_questions = []
for i in range(len(questions)):
    questions_dict = {'question': questions[i], 'new_style': 'informal'}
    generated_question = question_new_style_step.generate(**questions_dict)
    generated_questions.append(generated_question)

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

original_df = pd.DataFrame({'question': questions, 'style': 'original'})

generated_df = pd.DataFrame(generated_questions)
generated_df['style'] = generated_df['new_style']
generated_df.drop('new_style', axis=1, inplace=True)

df = pd.concat([original_df, generated_df], ignore_index=True)

label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['style'])

df

Unnamed: 0,question,style,label
0,How does a basic game of Dungeons & Dragons pr...,original,1
1,"In a tabletop role-playing game, what is the p...",original,1
2,How does the nature of gameplay in a tabletop ...,original,1
3,What methods can a Dungeon Master use to help ...,original,1
4,"In the game Dungeons & Dragons, how is the res...",original,1
...,...,...,...
79,"Hey, so if you're rolling up a dwarf character...",informal,0
80,"Hey, so in D&D, what stuff decides what gear y...",informal,0
81,"Hey, so in D&D, how do you figure out a charac...",informal,0
82,"Hey, so when you're swinging a sword or someth...",informal,0


In [6]:
# split df into train and validation
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.15, random_state=369, stratify=df['label'])

print(f'Train data: {len(train_df)} samples')
print(f'Validation data: {len(val_df)} samples')

Train data: 71 samples
Validation data: 13 samples


In [7]:
train_df.to_csv('../data/embeddings_finetune_data_train.csv', index=False)
val_df.to_csv('../data/embeddings_finetune_data_val.csv', index=False)

### Step 3: Prepare data for fine-tuning

In [8]:
from datasets import Dataset
from sentence_transformers import InputExample

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_examples = []
val_examples = []

for i in range(len(train_dataset)):
  example = train_dataset[i]
  train_examples.append(InputExample(texts=[example['question']], label=example['label']))

for i in range(len(val_dataset)):
  example = val_dataset[i]
  val_examples.append(InputExample(texts=[example['question']], label=example['label']))

### Step 4: Prepare the model for fine-tuning

In [9]:
from sentence_transformers import SentenceTransformer

model_id = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(model_id)

In [10]:
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (2): Normalize()
)

In [11]:
embeddings_1 = model.encode(questions[0], normalize_embeddings=True)
embeddings_2 = model.encode(generated_questions[0]['question'], normalize_embeddings=True)

# compute cosine similarity as dot product of normalized embeddings
import numpy as np
np.dot(embeddings_1, embeddings_2)

0.41161135

In [12]:
encoder_model = list(model[0].children())[0]
encoder_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [13]:
print('Trainable parameters:', sum(p.numel() for p in encoder_model.parameters() if p.requires_grad))

Trainable parameters: 22713216


In [14]:
from peft import get_peft_model, LoraConfig, TaskType

lora_target_modules = [f'encoder.layer.{n}.attention.self.query' for n in range(6)]
lora_target_modules = lora_target_modules + [f'encoder.layer.{n}.attention.self.key' for n in range(6)]
lora_target_modules = lora_target_modules + [f'encoder.layer.{n}.attention.self.value' for n in range(6)]

peft_config = LoraConfig(
task_type=TaskType.FEATURE_EXTRACTION, inference_mode=False, r=16, lora_alpha=16, lora_dropout=0.1, bias='all', target_modules=lora_target_modules
)

encoder_model = get_peft_model(encoder_model, peft_config)

In [15]:
encoder_model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 384, padding_idx=0)
        (position_embeddings): Embedding(512, 384)
        (token_type_embeddings): Embedding(2, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-5): 6 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=384, out_features=384, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=384, out_features=16, bias=False)
                  )
                  

In [16]:
print('Trainable parameters:', sum(p.numel() for p in encoder_model.parameters() if p.requires_grad))

Trainable parameters: 247296


Although the loss function takes care of mining the triplets (an anchor sentence paired with a positive sentence and paired with a negative sentence), according to the label they belong, the builtin evaluator functions to be used during training, such as the [`BinaryEmbeddingSimilarityEvaluator`](https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/evaluation/BinaryClassificationEvaluator.py) need to have sentence pairs prepared beforehand with labels being 0 for dissimilar pairs and 1 for similar pairs.

The function below is used to create such pairs from the validation dataset.

In [17]:
import itertools

def create_question_pairs(df, samples_per_category):
    # Create an empty dataframe with the required columns
    result = pd.DataFrame(columns=['question1', 'question2', 'label'])
  
    # Sample data from each category
    sampled_data = {}
    for category, num_samples in samples_per_category.items():
        sampled_data[category] = df[df['style'] == category].sample(n=num_samples)
  
    label_counts = {0: 0, 1: 0}
    rows_to_add = []  # A list to collect rows before concatenating
  
    # Generate all possible combinations of sampled data
    for category1, category2 in itertools.combinations(samples_per_category.keys(), 2):
        # Within-category combinations
        for i, j in itertools.combinations(sampled_data[category1].index, 2):
            if i < j:
                question1 = df.loc[i, 'question']
                question2 = df.loc[j, 'question']
            else:
                question1 = df.loc[j, 'question']
                question2 = df.loc[i, 'question']
              
            # Check for duplicates before adding to the result
            if not ((result['question1'] == question1) & (result['question2'] == question2)).any():
                rows_to_add.append({'question1': question1, 'question2': question2, 'label': 1})
                label_counts[1] += 1
  
        # Between-category combinations
        for i, j in itertools.product(sampled_data[category1].index, sampled_data[category2].index):
            if i < j:
                question1 = df.loc[i, 'question']
                question2 = df.loc[j, 'question']
            else:
                question1 = df.loc[j, 'question']
                question2 = df.loc[i, 'question']
  
            # Check for duplicates before adding to the result
            if not ((result['question1'] == question1) & (result['question2'] == question2)).any():
                rows_to_add.append({'question1': question1, 'question2': question2, 'label': 0})
                label_counts[0] += 1
  
    # Use pd.concat to add all the rows at once
    if rows_to_add:
        result = pd.concat([result, pd.DataFrame(rows_to_add)], ignore_index=True)
  
    # Trim the output to have an equal number of pairs for each label
    min_label_count = min(label_counts.values())
    trimmed_result = pd.DataFrame()
    for label in [0, 1]:
        trimmed_result = pd.concat([trimmed_result, result[result['label'] == label].sample(n=min_label_count)], ignore_index=True)
  
    return trimmed_result

In [18]:
val_df['style'].value_counts()

style
original    7
informal    6
Name: count, dtype: int64

In [19]:
val_df_pairs = create_question_pairs(val_df, {'original': 4, 'informal': 4})
val_df_pairs

Unnamed: 0,question1,question2,label
0,"In the game Dungeons & Dragons, how is the tou...","Hey, so like, what cool perks do you snag when...",0
1,"In the context of the game Dungeons & Dragons,...","Hey, so like, what cool perks do you snag when...",0
2,What signifies the entry of a character into t...,"Hey, so like, what cool perks do you snag when...",0
3,How can one simulate a roll of 1d3 or 1d2 in a...,"Hey, so like, what cool perks do you snag when...",0
4,How can one simulate a roll of 1d3 or 1d2 in a...,"Hey, so when you're rolling up a character in ...",0
5,What signifies the entry of a character into t...,"Hey, so when you're rolling up a character in ...",0
6,"In the context of the game Dungeons & Dragons,...","In the game Dungeons & Dragons, how is the tou...",1
7,How can one simulate a roll of 1d3 or 1d2 in a...,"In the game Dungeons & Dragons, how is the tou...",1
8,"In the context of the game Dungeons & Dragons,...",What signifies the entry of a character into t...,1
9,What signifies the entry of a character into t...,"In the game Dungeons & Dragons, how is the tou...",1


Instead of directly using the `BinaryClassificationEvaluator`, here we show how to create a custom evaluator derived from it but implementing a custom evaluation metric.

We call this metric separation score. It computes a value such that the closer this value to 1, the better separated are the sentences of different categories.

In [20]:
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sklearn.metrics.pairwise import paired_cosine_distances
from typing import List

class CustomBinaryClassificationEvaluator(BinaryClassificationEvaluator):  
    def __init__(  
        self,  
        questions1: List[str],  
        questions2: List[str],  
        labels: List[int],  
        name: str = "",  
        show_progress_bar: bool = False,  
        batch_size: int = 512,  
        write_csv: bool = False,
        
        num_steps: int = 0,
    ):  
        super().__init__(questions1, questions2, labels, name, show_progress_bar, batch_size, write_csv)

        self.questions1 = questions1
        self.questions2 = questions2
        self.labels = labels
        self.num_steps = num_steps
  
    def __call__(self, model: SentenceTransformer, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
        
        # First, compute the sentence embeddings using the model  
        questions = list(set(self.questions1 + self.questions2))
        embeddings = model.encode(questions, batch_size=self.batch_size, convert_to_numpy=True, normalize_embeddings=True)
        emb_dict = {question: emb for question, emb in zip(questions, embeddings)}
        embeddings1 = [emb_dict[question] for question in self.questions1]
        embeddings2 = [emb_dict[question] for question in self.questions2]

        # Next, compute the cosine similarity between the embeddings
        dist_scores = paired_cosine_distances(embeddings1, embeddings2)
        
        # Cmpute the average similarity score for similar and non-similar pairs
        labels = np.asarray(self.labels)
        positive_pairs = labels == 1
        positive_pairs_dist_scores = dist_scores[positive_pairs]
        negative_pairs = labels == 0
        negative_pairs_dist_scores = dist_scores[negative_pairs]

        # Compute the average distance between similar pairs and non-similar pairs
        mean_positive_dist_score = np.mean(positive_pairs_dist_scores)
        mean_negative_dist_score = np.mean(negative_pairs_dist_scores)

        # Compute a normalized separation score such that the closer the value to 1, the better separated are the sentences of different categories 
        separation_score = ((1 - mean_positive_dist_score) + mean_negative_dist_score) / 2

        if steps == self.num_steps:
            print(f"epoch: {epoch}")
            print(f"Separation score: {separation_score:.4f}")
            print(f"Average distance between similar pairs: {mean_positive_dist_score:.4f}")
            print(f"Average distance between non-similar pairs: {mean_negative_dist_score:.4f}")

        return separation_score

### Step 5: Fine-tune

In [21]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=16)

In [22]:
from sentence_transformers.losses import BatchSemiHardTripletLoss

train_loss = BatchSemiHardTripletLoss(model=model)

num_epochs = 15
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data

In [23]:
questions1 = val_df_pairs['question1'].tolist()
questions2 = val_df_pairs['question2'].tolist()
labels = val_df_pairs['label'].tolist()

evaluator = CustomBinaryClassificationEvaluator(questions1=questions1, questions2=questions2, labels=labels, batch_size=16, show_progress_bar=True,
                                                num_steps=len(train_dataloader))

In [24]:
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=num_epochs, warmup_steps=warmup_steps, optimizer_params={'lr': 1e-4},
          evaluator=evaluator, evaluation_steps=len(train_dataloader))

# model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=50, warmup_steps=10, optimizer_params={'lr': 5e-5})

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 0
Separation score: 0.4837
Average distance between similar pairs: 0.7386
Average distance between non-similar pairs: 0.7060


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 1
Separation score: 0.4842
Average distance between similar pairs: 0.7407
Average distance between non-similar pairs: 0.7090


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 2
Separation score: 0.4856
Average distance between similar pairs: 0.7409
Average distance between non-similar pairs: 0.7120


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 3
Separation score: 0.4871
Average distance between similar pairs: 0.7411
Average distance between non-similar pairs: 0.7152


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 4
Separation score: 0.4883
Average distance between similar pairs: 0.7423
Average distance between non-similar pairs: 0.7190


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 5
Separation score: 0.4889
Average distance between similar pairs: 0.7439
Average distance between non-similar pairs: 0.7216


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 6
Separation score: 0.4894
Average distance between similar pairs: 0.7457
Average distance between non-similar pairs: 0.7246


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 7
Separation score: 0.4903
Average distance between similar pairs: 0.7468
Average distance between non-similar pairs: 0.7273


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 8
Separation score: 0.4917
Average distance between similar pairs: 0.7462
Average distance between non-similar pairs: 0.7296


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 9
Separation score: 0.4928
Average distance between similar pairs: 0.7466
Average distance between non-similar pairs: 0.7322


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 10
Separation score: 0.4941
Average distance between similar pairs: 0.7464
Average distance between non-similar pairs: 0.7345


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 11
Separation score: 0.4951
Average distance between similar pairs: 0.7459
Average distance between non-similar pairs: 0.7362


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 12
Separation score: 0.4957
Average distance between similar pairs: 0.7461
Average distance between non-similar pairs: 0.7374


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 13
Separation score: 0.4962
Average distance between similar pairs: 0.7459
Average distance between non-similar pairs: 0.7383


Iteration:   0%|          | 0/5 [00:00<?, ?it/s]

epoch: 14
Separation score: 0.4964
Average distance between similar pairs: 0.7459
Average distance between non-similar pairs: 0.7387


In [25]:
embeddings_1 = model.encode(questions[0], normalize_embeddings=True)
embeddings_2 = model.encode(generated_questions[0]['question'], normalize_embeddings=True)

# compute cosine similarity as dot product of normalized embeddings
import numpy as np
np.dot(embeddings_1, embeddings_2)

0.32563233

In [26]:
encoder_model.save_pretrained('../finetuned_models/embeddings_finetune_model')

### Step 6: Test fine-tuned model

In [27]:
from peft import PeftModel

new_model = SentenceTransformer(model_id)

new_encoder_model = list(new_model[0].children())[0]
new_encoder_model = PeftModel.from_pretrained(model_id='../finetuned_models/embeddings_finetune_model', model=new_encoder_model)

In [28]:
embeddings_1 = new_model.encode(questions[0], normalize_embeddings=True)
embeddings_2 = new_model.encode(generated_questions[0]['question'], normalize_embeddings=True)

# compute cosine similarity as dot product of normalized embeddings
import numpy as np
np.dot(embeddings_1, embeddings_2)

0.32563233