### Step 1: Install the necessary packages

In [1]:
# Optional setup: uncomment the line below to install the `datasets` and `peft`
# packages that later cells in this notebook import.
# !pip install datasets peft

### Step 2: Generate data for fine-tuning

In [2]:
import json

# Location of the source question set, relative to the notebook's working directory.
file_path = "../data/full_test/questions.json"

# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding (which is not UTF-8 everywhere).
with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)

# Each record nests the question text under record['question']['question'].
questions = [record['question']['question'] for record in data]
questions

['How does a basic game of Dungeons & Dragons progress with respect to the roles of the Dungeon Master and the players?',
 "In a tabletop role-playing game, what is the process that the game master follows when a player's action results in a challenging situation?",
 'How does the nature of gameplay in a tabletop role-playing game like Dungeons & Dragons typically vary with the circumstances?',
 'What methods can a Dungeon Master use to help set the scene in a tabletop role-playing game, and how do players also contribute to character portrayal?',
 'In the game Dungeons & Dragons, how is the result of a roll of percentile dice, or d100, generated and read, particularly when the ten-sided dice are numbered in tens?',
 'How can one simulate a roll of 1d3 or 1d2 in a tabletop role-playing game using a d6 or any die?',
 'In the gameplay of Dungeons & Dragons, how is the success or failure of an action determined?',
 'What are the steps involved in making a d20 roll in a tabletop role-playi

In [3]:
from dotenv import load_dotenv
import os
from langchain.chat_models import AzureChatOpenAI

# Load environment variables (python-dotenv) before the os.getenv lookups below.
load_dotenv()

# Connection and sampling settings for the Azure chat deployment.
# os.getenv yields None for any variable that is not set.
azure_chat_settings = dict(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
    openai_api_version="2023-08-01-preview",
    deployment_name="gpt4_1106-Preview",
    temperature=0.9,
)
llm = AzureChatOpenAI(**azure_chat_settings)

In [4]:
import sys

# Add the parent directory to the import path; presumably that is where the
# local `synthlume` package lives.
sys.path.append('..')

from synthlume.pipeline.step.text_generation_step import QuestionNewStyleStep

question_new_style_step = QuestionNewStyleStep(llm=llm, language='en')

# Ask the step for an 'informal' rewrite of every source question, keeping the
# results in the same order as `questions`.
generated_questions = []
for original_question in questions:
    rewritten = question_new_style_step.generate(question=original_question, new_style='informal')
    generated_questions.append(rewritten)

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Source questions, all tagged with the 'original' style.
original_df = pd.DataFrame({'question': questions, 'style': 'original'})

# Generated rewrites: expose their 'new_style' field under the shared 'style'
# column name, then discard the old column.
generated_df = pd.DataFrame(generated_questions)
generated_df = generated_df.assign(style=generated_df['new_style']).drop(columns='new_style')

# Stack both sets into one frame with a fresh consecutive index.
df = pd.concat([original_df, generated_df], ignore_index=True)

# Encode each style name as an integer class id in a new 'label' column.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['style'])

df

Unnamed: 0,question,style,label
0,How does a basic game of Dungeons & Dragons pr...,original,1
1,"In a tabletop role-playing game, what is the p...",original,1
2,How does the nature of gameplay in a tabletop ...,original,1
3,What methods can a Dungeon Master use to help ...,original,1
4,"In the game Dungeons & Dragons, how is the res...",original,1
...,...,...,...
79,"Hey, so if we're talkin' about a Dwarf who's k...",informal,0
80,"Hey, so in D&D, what decides the gear you kick...",informal,0
81,"Hey, so in D&D, how do you figure out someone'...",informal,0
82,"Hey, so when you're going at it with a melee w...",informal,0


In [6]:
# Persist the labelled question set as CSV, leaving out the index column.
finetune_data_path = '../data/embeddings_finetune_data.csv'
df.to_csv(finetune_data_path, index=False)

### Step 3: Prepare data for fine-tuning

In [7]:
from datasets import Dataset
from sentence_transformers import InputExample

dataset = Dataset.from_pandas(df)
n_examples = len(df)

# One single-text InputExample per row, carrying that row's integer style label.
train_examples = [
    InputExample(texts=[row['question']], label=row['label'])
    for row in (dataset[row_idx] for row_idx in range(n_examples))
]

In [8]:
from torch.utils.data import DataLoader

# Serve the training examples in shuffled mini-batches of 16.
train_batch_size = 16
train_dataloader = DataLoader(train_examples, batch_size=train_batch_size, shuffle=True)

### Step 4: Prepare the model for fine-tuning

In [9]:
from sentence_transformers import SentenceTransformer, models
from transformers import AutoModel

# Identifier of the pretrained sentence-embedding checkpoint to adapt.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_name_or_path=model_id)

In [10]:
# Display the module stack of the loaded SentenceTransformer.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
  (2): Normalize()
)

In [11]:
import numpy as np

# Baseline check before fine-tuning: embed the first source question and its
# generated rewrite with the unmodified model.
source_text = questions[0]
rewritten_text = generated_questions[0]['question']
embeddings_1 = model.encode(source_text, normalize_embeddings=True)
embeddings_2 = model.encode(rewritten_text, normalize_embeddings=True)

# The embeddings are normalized, so their dot product is the cosine similarity.
np.dot(embeddings_1, embeddings_2)

0.60229886

In [12]:
# Take the first child module of the SentenceTransformer's first stage
# (the repr printed below shows it is the underlying BertModel).
transformer_children = list(model[0].children())
encoder_model = transformer_children[0]
encoder_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [13]:
# Count the scalar weights that currently receive gradients.
n_trainable = sum(
    param.numel()
    for param in encoder_model.parameters()
    if param.requires_grad
)
print('Trainable parameters:', n_trainable)

Trainable parameters: 22713216


In [14]:
from peft import get_peft_model, LoraConfig, TaskType

# Names of the attention projections that receive LoRA adapters, ordered as
# all query layers, then all key layers, then all value layers.
layer_ids = range(6)
query_modules = [f'encoder.layer.{n}.attention.self.query' for n in layer_ids]
key_modules = [f'encoder.layer.{n}.attention.self.key' for n in layer_ids]
value_modules = [f'encoder.layer.{n}.attention.self.value' for n in layer_ids]
lora_target_modules = [*query_modules, *key_modules, *value_modules]

peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='all',
    target_modules=lora_target_modules,
)

# NOTE(review): this rebinds `encoder_model` to the PEFT wrapper, so the cell is
# not idempotent; re-run the cell that defines `encoder_model` before re-running
# this one.
encoder_model = get_peft_model(encoder_model, peft_config)

In [15]:
# Display the PEFT-wrapped encoder to inspect where the LoRA layers were inserted.
encoder_model

PeftModelForFeatureExtraction(
  (base_model): LoraModel(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 384, padding_idx=0)
        (position_embeddings): Embedding(512, 384)
        (token_type_embeddings): Embedding(2, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-5): 6 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): lora.Linear(
                  (base_layer): Linear(in_features=384, out_features=384, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=384, out_features=16, bias=False)
                  )
                  

In [16]:
# Re-count the weights that receive gradients now that the encoder is PEFT-wrapped.
trainable_sizes = [
    param.numel()
    for param in encoder_model.parameters()
    if param.requires_grad
]
print('Trainable parameters:', sum(trainable_sizes))

Trainable parameters: 247296


In [17]:
from sentence_transformers.losses import BatchSemiHardTripletLoss

# Triplet loss over the SentenceTransformer; the training examples built earlier
# carry the integer style label on each example.
train_loss = BatchSemiHardTripletLoss(model)

### Step 5: Fine-tune

In [18]:
# Single training objective: the triplet loss fed by the shuffled dataloader.
train_objectives = [(train_dataloader, train_loss)]

model.fit(
    train_objectives=train_objectives,
    epochs=50,
    warmup_steps=10,
    optimizer_params={'lr': 5e-5},
)

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6 [00:00<?, ?it/s]

In [19]:
import numpy as np

# Repeat the similarity check on the same question pair after training.
question_pair = (questions[0], generated_questions[0]['question'])
embeddings_1 = model.encode(question_pair[0], normalize_embeddings=True)
embeddings_2 = model.encode(question_pair[1], normalize_embeddings=True)

# Dot product of normalized embeddings, i.e. cosine similarity.
np.dot(embeddings_1, embeddings_2)

0.40957442

In [20]:
# Alternative (disabled): save the whole SentenceTransformer instead.
# model.save("../models/embeddings_finetune_model")

# Save through the PEFT-wrapped encoder's save_pretrained.
adapter_dir = "../models/embeddings_finetune_model"
encoder_model.save_pretrained(adapter_dir)

### Step 6: Test fine-tuned model

In [21]:
from peft import PeftModel

# Start from a fresh copy of the base checkpoint.
new_model = SentenceTransformer(model_id)

# Attach the saved adapter to the fresh model's encoder (first child of stage 0).
new_model_children = list(new_model[0].children())
new_encoder_model = PeftModel.from_pretrained(
    model=new_model_children[0],
    model_id="../models/embeddings_finetune_model",
)

In [22]:
import numpy as np

# Same question pair as before, now embedded with the reloaded model, to check
# that it reproduces the fine-tuned model's similarity.
first_original = questions[0]
first_generated = generated_questions[0]['question']
embeddings_1 = new_model.encode(first_original, normalize_embeddings=True)
embeddings_2 = new_model.encode(first_generated, normalize_embeddings=True)

# Cosine similarity computed as the dot product of the normalized embeddings.
np.dot(embeddings_1, embeddings_2)

0.40957442