In [21]:
import logging
import os
from datetime import datetime
from pathlib import Path

from datasets import load_dataset
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers import losses, evaluation, util
from sentence_transformers.datasets import NoDuplicatesDataLoader
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

## Config


In [18]:
# Training hyperparameters.
num_epochs = 10
train_batch_size = 64
# Name of the pretrained model handed to SentenceTransformer.
model_id = "stsb-distilbert-base"
# Local directory used as the dataset cache.
dataset_path = Path("../dataset/")
# Dataset repository id. This is an identifier, not a filesystem path, so it is
# kept as a plain string: wrapping it in Path would render it with backslashes
# ("Open-Orca\SlimOrca") on Windows once converted back with str().
dataset_name = "Open-Orca/SlimOrca"

In [13]:
# Local directory used as the cache for downloaded model weights.
model_weights_path = Path("./weights")
# Timestamp that names this run's output directory. It is purely numeric and
# free of colons and spaces so the resulting path is valid on Windows, does not
# depend on the locale's month names, and sorts chronologically.
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_save_path = (
    Path("train-output/training_MultipleNegativesRankingLoss") / current_time
)

In [14]:
# Create the run's output directory, including any missing parent directories;
# an already existing directory is not an error.
model_save_path.mkdir(parents=True, exist_ok=True)

In [8]:
# Configure root logging at INFO level: records are emitted through
# sentence_transformers' LoggingHandler as "<timestamp> - <message>".
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt="%Y-%b-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

## Get model


In [12]:
# Load the pretrained model named by model_id, using model_weights_path as the
# cache folder for its files.
model = SentenceTransformer(model_id, cache_folder=model_weights_path)

2024-Jan-28 22:33:20 - Load pretrained SentenceTransformer: stsb-distilbert-base


.gitattributes:   0%|          | 0.00/523 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/489 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

2024-Jan-28 22:33:49 - Use pytorch device: cuda


  return self.fget.__get__(instance, owner)()


## Get dataset


In [19]:
# Load the dataset identified by dataset_name, caching it under dataset_path.
dataset = load_dataset(str(dataset_name), cache_dir=dataset_path)

In [35]:
# Peek at the first training conversation to see its structure.
# Indexing the row first (split[0][column]) reads a single record, whereas
# split[column] can materialise the entire column just to look at one item.
if len(dataset["train"]) > 0:
    element = dataset["train"][0]["conversations"]
    print(element)
    # Flag a first record that does not have exactly three turns.
    if len(element) != 3:
        print("OMG")



### Preprocess dataset


In [43]:
# Build the training pairs from the conversations.
# A conversation is expected to hold either 2 turns (query, response) or
# 3 turns (one leading turn, then query, response); in both cases the pair is
# formed by the last two turns.
train_samples = []
skipped_conversations = 0
for data in tqdm(dataset["train"]["conversations"]):
    if len(data) not in (2, 3):
        # Skip the malformed record instead of aborting the loop: stopping here
        # would silently leave train_samples truncated for every later cell.
        # Only the first offender is printed to keep the output readable.
        if skipped_conversations == 0:
            print("[ERROR] With processing data!")
            print(data)
        skipped_conversations += 1
        continue
    query, response = (turn["value"] for turn in data[-2:])
    # Each pair is added in both directions.
    train_samples.append(InputExample(texts=[query, response], label=1))
    train_samples.append(InputExample(texts=[response, query], label=1))

if skipped_conversations:
    print(
        f"Skipped {skipped_conversations} conversation(s) "
        "with an unexpected number of turns"
    )

  0%|          | 0/517982 [00:00<?, ?it/s]

In [45]:
# MultipleNegativesRankingLoss uses every other pair in the batch as a negative
# example. Because each pair is present in both directions (and texts may repeat
# across conversations), a plain shuffled DataLoader can place the same text
# twice in one batch, turning a true match into a "negative".
# NoDuplicatesDataLoader shuffles the samples and guarantees that no text occurs
# more than once per batch.
train_dataloader = NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model)

## Evaluate data

We are using three evaluators:

- Classification: decides whether a query and a text are similar or not
- Duplicate Query Mining
- Information Retrieval


In [46]:
# Starts empty; holds the evaluators used by this notebook.
evaluators = list()

### Classification


In [None]:
# Empty containers for the classification dev data: two sentence lists and
# their labels (three independent list objects).
dev_sentences1, dev_sentences2, dev_labels = [], [], []