# Fine-tuning Embedding for PSTuts Q&A data

The structure of the notebook follows the AIE6 09 notebook,
with certain functions packed away into `evaluator_utils.py` for
the sake of readability and efficiency.

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


Wall of imports:

In [2]:
import os
import logging

import requests
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from qdrant_client import QdrantClient

from pstuts_rag.rag import RAGChainInstance
import nest_asyncio


from dataclasses import dataclass
from datasets import load_dataset
from langsmith import EvaluationResult
from ragas import EvaluationDataset
from pandas import DataFrame
from langchain_core.runnables import Runnable


# Load variables from a `.env` file into the process environment before the
# key checks below run.
load_dotenv()

def set_api_key_if_not_present(key_name, prompt_message=""):
    """Make sure ``os.environ[key_name]`` holds a non-empty value.

    If the variable is missing or empty, the user is asked for it through a
    hidden-input prompt and the answer is stored in ``os.environ``.

    Args:
        key_name: Name of the environment variable to check.
        prompt_message: Text shown in the prompt. Falls back to ``key_name``
            when empty.
    """
    # Imported here because the notebook's import cell never imports
    # ``getpass``; without this the prompt path raises NameError.
    import getpass

    if not prompt_message:
        prompt_message = key_name
    if not os.environ.get(key_name):
        os.environ[key_name] = getpass.getpass(prompt_message)

# Check each required environment variable in turn, prompting for any that is
# missing or empty. An empty prompt falls back to the variable name.
for _key_name, _prompt in (
    ("WANDB_API_KEY", "Please enter your WANDB API key!"),
    ("HF_TOKEN", "Please enter your HF token!"),
    ("HF_USER", "Please enter your HuggingFace username!"),
    ("OPENAI_API_KEY", ""),
):
    set_api_key_if_not_present(_key_name, _prompt)


# Log in to the Hugging Face Hub from inside the notebook; the login widget is
# rendered as this cell's output.
from huggingface_hub import notebook_login
notebook_login()


# Only let warnings and above through from the chatty HTTP / LangChain loggers.
for _logger_name in ("httpx", "langchain"):
    logging.getLogger(_logger_name).setLevel(logging.WARNING)

# Patch asyncio so event loops can be nested inside the notebook's own loop.
nest_asyncio.apply()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
import torch
import gc


def mem_clean():
    """Collect garbage, release cached CUDA memory and print a memory summary.

    Tensors that are no longer needed should be deleted by the caller first
    (e.g. ``del inputs, labels, outputs``); this helper then reclaims them.

    When no CUDA device is available the GPU steps are skipped and a short
    note is printed instead, so the notebook also runs on CPU-only machines.
    """
    # Collect unreachable Python objects first so the tensors they hold are
    # released before the CUDA cache is emptied.
    gc.collect()

    if not torch.cuda.is_available():
        print("CUDA is not available; no GPU memory to clean.")
        return

    torch.cuda.empty_cache()

    print(torch.cuda.memory_summary())

mem_clean()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

This is the package where most functions can be found:

In [4]:
import pstuts_rag.evaluator_utils as eu


## Load data for RAG

Data is loaded from the dataset uploaded to HF.
It is in JSON.

Loading is analogous to the loading in Chainlit apps.

In [5]:
import pstuts_rag.loader

# Download the raw JSON of the `train` section of the PsTuts-VQA dataset.
url = "https://huggingface.co/datasets/mbudisic/PsTuts-VQA/raw/main/train.json"
# A timeout keeps the cell from hanging forever if the server stops responding.
resp = requests.get(url, timeout=60)
resp.raise_for_status()
# The group label is the file stem of the URL ("train" for this file).
group = url.split('/')[-1].split('.')[0]
docs_json = pstuts_rag.loader.load_json_string(resp.content.decode('utf-8'), group)


## Load the fine-tuning dataset

Query-document pairs have been constructed by RAGAS knowledge graph
and uploaded to another HF dataset. Here we are using the largest
section of that data (`train`) and subdividing it into three 
components: training, validation, and test.

(Since the original PsTuts-VQA dataset already has the train-validate-test
split, it would be ideal to respect it but for simplicity, we will just
work with the train section as if it was the entire dataset.)


In [6]:
from collections import defaultdict
from typing import Dict, List


# Fetch the `train` split of the query/context pairs from the Hugging Face Hub
# and convert it to a pandas DataFrame.
full_dataset = load_dataset("mbudisic/pstuts_rag_qa",split="train").to_pandas()

Out of curiosity, let's compute how many queries
have single-document and how many have multi-document contexts.

This should match the query distribution used to create the query-doc pairs (spoiler: it does).

In [7]:

def histogram_by_length(input: List[List]) -> Dict[int, int]:
    """Count how many of the given sequences have each length.

    Args:
        input: Iterable of sized items (e.g. a list of lists).

    Returns:
        A ``defaultdict(int)`` mapping a length to the number of items in
        ``input`` that have that length.
    """
    counts = defaultdict(int)
    for size in map(len, input):
        counts[size] += 1
    return counts

histogram_by_length(full_dataset["reference_contexts"])

defaultdict(int, {1: 50, 2: 51})

Now, split that dataset in a 60-20-20 ratio (the called function
wraps calls to a convenient `sklearn` helper function).

In [8]:
# Split the full frame with weights 6:2:2 (fixed seed for repeatability) and
# key the three resulting parts by their role. The previous `dataset = {}`
# initialisation was overwritten immediately, so it has been dropped.
dataset = dict(
    zip(
        ["train", "validate", "test"],
        eu.train_val_test_split(full_dataset, (6, 2, 2), seed=42),
    )
)



[0.6, 0.2, 0.2]
Dataset of 101 split into [60, 20, 21] which is [0.59, 0.2, 0.21]


Now, we have to convert `dataset["train"]` into a format acceptable to the
optimization loop, while `dataset["validate"]` should be converted into the
format expected by the `InformationRetrievalEvaluator`.


### Encoding to `InformationRetrievalEvaluator`

The function `encode_corpus` below assigns UUIDs to each (unique) query (`Q-uuid`) and context (`D-uuid`) document,
then converts the input dataset into pairs ( `Q-uuid:List[D-uuid]` ).

In [9]:
print(type(dataset["train"]))

# Shared id -> text tables that grow as each split is encoded, plus one
# query-id -> document-ids mapping per split.
relevant_docs = {}
queries = {}
corpus = {}
print(f"Q:{len(queries)}, C:{len(corpus)}")

# Encode the three splits and then the full frame, threading the shared
# tables through every call and reporting their sizes after each one.
for _split_name, _frame in (
    ("train", dataset["train"]),
    ("validate", dataset["validate"]),
    ("test", dataset["test"]),
    ("full", full_dataset),
):
    queries, corpus, relevant_docs[_split_name] = eu.encode_corpus(
        queries, corpus, _frame
    )
    print(f"Q:{len(queries)}, C:{len(corpus)}")


<class 'pandas.core.frame.DataFrame'>
Q:0, C:0
Q:59, C:13
Q:78, C:13
Q:99, C:13
Q:99, C:13


## Creating the training input format

Additionally, we have to convert the dataset into the format
used by the objective function downstream.

In [10]:
from sentence_transformers import InputExample
# Query-id -> list of relevant document ids for the training split.
relevant_context = relevant_docs['train']

examples = []
skipped_query_ids = []  # queries dropped because an id could not be resolved
for query_id, context in relevant_context.items():
    try:
        query = queries[query_id]
        scale = 1.0
        for doc_id in context:
            text = corpus[doc_id]
            example = InputExample(texts=[query, text], label=scale)
            # Make each lower-ranked document worth half of the previous one.
            # NOTE(review): MultipleNegativesRankingLoss (used below) appears
            # to ignore `label` -- confirm this weighting has any effect.
            scale /= 2.0
            examples.append(example)
    except KeyError:
        # Best effort: keep going, but remember what was dropped instead of
        # swallowing the error silently. Pairs already added for this query
        # before the failing id are kept.
        skipped_query_ids.append(query_id)

if skipped_query_ids:
    print(f"Skipped {len(skipped_query_ids)} queries with unresolved query/document ids")

# The f-prefix was missing, so the placeholder was printed literally.
print(f"Number of examples in the training set: {len(examples)}")
examples[2].texts

Number of examples in the training set: {len(examples)}


['How can the ALT key be used as a shortcut when zooming with the Zoom tool in Photoshop CC?',
 "Zooming and panning are ways to navigate around an image that you'll use often as you work on images in Photoshop CC. To practice working with the zoom and pan controls, open this image from the tutorial practice files, or open a large image of your own. Zooming means changing the magnification of the image, as you might do if you were looking at the sky through a telescope. You may want to zoom in for a closer view of part of an image, or you may want to zoom out to see more of an image on your screen. The most straightforward way to zoom is to select the Zoom tool, toward the bottom of the Tools panel here. Then go up to the Options bar for the Zoom tool, where you'll find a plus icon for zooming in, and a minus icon for zooming out. Let's start with the plus icon activated which is the default. Then to zoom in, move into the image and click. And each time you click, you'll zoom in a litt

## Fine-tuning the embedding model

The class tuned `snowflake-arctic-embed-l`; here we'll be tuning its smaller sibling, `snowflake-arctic-embed-s`.

In [11]:


# `fit`, the sentence-transformers losses and `push_to_hub` used further down
# all need a SentenceTransformer instance. The previous `HuggingFaceEmbeddings`
# wrapper was never imported in this notebook and offers none of those methods.
from sentence_transformers import SentenceTransformer

model_id = "Snowflake/snowflake-arctic-embed-s"
# Short name (the part after the organisation), reused for output paths,
# the W&B project and the Hub repository name.
model_tag = model_id.split("/")[1]
print(model_tag)
model = SentenceTransformer(model_id)

snowflake-arctic-embed-s


Weights and Biases will be used to track the progress and profile the results.

In [12]:

import wandb


# Start an online Weights & Biases run for this fine-tuning session.
wandb.init(mode="online",
               # Set the wandb entity where your project will be logged (generally your team name).
    entity="budisicm-virginia-commonwealth-university",
    # Set the wandb project where this run will be logged.
    project=f"Finetuning of {model_tag} using PsTuts-VQA"
    )
# Use the run id as the run's display name.
wandb.run.name = wandb.run.id
# NOTE(review): `save()` is called without arguments -- presumably kept to
# persist the rename; confirm it is still needed with the installed wandb.
wandb.run.save()



[34m[1mwandb[0m: Currently logged in as: [33mbudisicm[0m ([33mbudisicm-virginia-commonwealth-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




True

We're using a toy batch size here to reflect the limited number of examples we have AND because this is running on local hardware for Marko.

Now we can create a `torch` `DataLoader`!

In [13]:
from torch.utils.data import DataLoader
# Number of query/document pairs per training batch.
BATCH_SIZE = 8

# Reshuffle the training pairs every epoch and serve them in mini-batches.
data_loader = DataLoader(examples, shuffle=True, batch_size=BATCH_SIZE)
data_loader

<torch.utils.data.dataloader.DataLoader at 0x773f767a57d0>

## Loss function
Next up, we'll prepare our loss function!

Loss is an important part of training, fine-tuning, and more. If you want a deep dive on loss - you can check out our [event on loss!](https://www.youtube.com/watch?v=iB8FWR9aD5Q&t=8s).

The core loss we're using today is called `MultipleNegativesRankingLoss` - you can find more information [here](https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/losses/MultipleNegativesRankingLoss.py).

This is "wrapped" in `MatryoshkaLoss`, which you can read the implementation of [here](https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/losses/MatryoshkaLoss.py).

In [14]:
from sentence_transformers.losses import MatryoshkaLoss, MultipleNegativesRankingLoss

# Embedding sizes at which the wrapped loss is evaluated, largest first.
matryoshka_dimensions = [384, 256, 128, 64]

# Core ranking loss, wrapped so that it is applied at each of the sizes above.
inner_train_loss = MultipleNegativesRankingLoss(model=model)
train_loss = MatryoshkaLoss(
    model=model,
    loss=inner_train_loss,
    matryoshka_dims=matryoshka_dimensions,
)

In [15]:
# Each Matryoshka size expressed as a multiple of the smallest one (64).
[dim / 64.0 for dim in matryoshka_dimensions]

[6.0, 4.0, 2.0, 1.0]

## Evaluator

Now, we'll set up the evaluator

In [16]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator

# Retrieval evaluator scored against the validation split's relevance
# judgements; it gets its own copies of the shared query and corpus tables.
evaluator = InformationRetrievalEvaluator(
    queries=dict(queries),
    corpus=dict(corpus),
    relevant_docs=relevant_docs["validate"],
)

In [17]:
mem_clean()

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 129743 KiB | 129743 KiB | 129743 KiB |      0 B   |
|       from large pool | 101079 KiB | 101079 KiB | 101079 KiB |      0 B   |
|       from small pool |  28664 KiB |  28664 KiB |  28664 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         | 129743 KiB | 129743 KiB | 129743 KiB |      0 B   |
|       from large pool | 101079 KiB | 101079 KiB | 101079 KiB |      0 B   |
|       from small pool |  28664 KiB |  28664 KiB |  28664 KiB |      0 B   |
|---------------------------------------------------------------

We'll train this model for 50 epochs (see `EPOCHS` below), though this number could be increased if we had significantly more data.

In [18]:
# Number of full passes over the training pairs.
EPOCHS = 50
# Warm-up length: 10% of all training steps (batches per epoch * epochs).
warmup_steps = int(len(data_loader) * EPOCHS * 0.1)

# Fine-tune the model; the result is written to a local directory named after
# the base model.
model.fit(
    train_objectives=[(data_loader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path=f"{model_tag}-ft-pstuts-vqa",
    show_progress_bar=True,
    evaluator=evaluator,
    # Run the evaluator every 10 training steps (see the table in the output).
    evaluation_steps=10
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Cosine Accuracy@1,Cosine Accuracy@3,Cosine Accuracy@5,Cosine Accuracy@10,Cosine Precision@1,Cosine Precision@3,Cosine Precision@5,Cosine Precision@10,Cosine Recall@1,Cosine Recall@3,Cosine Recall@5,Cosine Recall@10,Cosine Ndcg@10,Cosine Mrr@10,Cosine Map@100
10,No log,No log,0.4,0.95,0.95,1.0,0.4,0.316667,0.26,0.15,0.3,0.7,0.875,1.0,0.720808,0.658333,0.596667
12,No log,No log,0.4,0.95,0.95,1.0,0.4,0.316667,0.26,0.15,0.3,0.7,0.875,1.0,0.720808,0.658333,0.596667
20,No log,No log,0.45,0.95,1.0,1.0,0.45,0.316667,0.27,0.15,0.325,0.7,0.925,1.0,0.731123,0.685,0.606667
24,No log,No log,0.45,0.95,1.0,1.0,0.45,0.316667,0.27,0.15,0.325,0.7,0.925,1.0,0.73986,0.695833,0.6175
30,No log,No log,0.45,1.0,1.0,1.0,0.45,0.333333,0.27,0.15,0.325,0.75,0.925,1.0,0.74467,0.7,0.624167
36,No log,No log,0.4,1.0,1.0,1.0,0.4,0.333333,0.27,0.15,0.3,0.75,0.925,1.0,0.732166,0.675,0.61
40,No log,No log,0.45,0.9,1.0,1.0,0.45,0.3,0.27,0.15,0.35,0.65,0.925,1.0,0.745031,0.691667,0.629167
48,No log,No log,0.5,0.85,0.95,1.0,0.5,0.3,0.26,0.15,0.35,0.625,0.875,1.0,0.74161,0.7,0.625
50,No log,No log,0.45,0.85,0.95,1.0,0.45,0.316667,0.26,0.15,0.325,0.65,0.875,1.0,0.726215,0.6725,0.605833
60,No log,No log,0.5,0.95,0.95,1.0,0.5,0.366667,0.29,0.15,0.35,0.775,0.95,1.0,0.758144,0.716667,0.646667


In [20]:
import datetime
## uncomment with specific model name to override push
# force local store
from huggingface_hub import snapshot_download, scan_cache_dir



# NOTE(review): `timestamp` is not referenced again in the visible part of
# the notebook -- confirm whether it is still needed.
timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
# Target repository on the Hub: "<HF_USER>/<model_tag>-ft-pstuts".
model_hf_name = f"{os.environ['HF_USER']}/{model_tag}-ft-pstuts"

print(f"Pushing model to {model_hf_name}")  
# `exist_ok=True`: do not fail if the repository already exists.
model.push_to_hub(model_hf_name,exist_ok=True)

# path = snapshot_download(
#     repo_id=model_hf_name,
#     force_download=True
# )


# Check if model exists in cache before removing


Pushing model to mbudisic/snowflake-arctic-embed-s-ft-pstuts


No files have been modified since last commit. Skipping to prevent empty commit.


'https://huggingface.co/mbudisic/snowflake-arctic-embed-s-ft-pstuts/commit/a374cdbaa713a71d5ba7fbb74fea7151c3dbc35b'