In [1]:
import sys
from pathlib import Path

def _find_project_root(start: Path = Path.cwd()):
	for p in [start] + list(start.parents):
		if (p / "pyproject.toml").exists():
			return p
	return start

project_root = _find_project_root()

# Put the project root first on sys.path so project-local modules are importable.
# Any existing occurrence is dropped first so re-running this cell does not
# accumulate duplicate entries.
_project_root_str = str(project_root)
sys.path[:] = [_project_root_str] + [entry for entry in sys.path if entry != _project_root_str]

In [2]:
import os
import json
from typing import List, Tuple
from tqdm.notebook import tqdm
from dotenv import load_dotenv
from load_data import LoadData
from lora.core import LORA, FUSE
from datasets import load_dataset
from mlx_lm import generate, utils
from langchain_postgres import PGVector
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import JSONLoader

In [3]:
# Load environment variables from the project's .env file; PG_CONN_URI is read
# from the environment further down when the vector store is created.
load_dotenv(project_root / ".env")

True

In [4]:
# Configuration

# Base folder for this experiment's artifacts; the data folder, adapter file and
# fused model path below are all derived from it.
root_folder = "../../.cache/ragVsFinetuning/model_2"

# Fine-tuning data: folder handed to LoadData and to the LoRA run, and the dataset to pull.
data_folder = f"./{root_folder}/data"
dataset_name = "LangChainDatasets/question-answering-paul-graham"
# Optional record cap passed to both data-preparation steps; process_rag_data
# uses the whole dataset when this is None.
n = None
# Split ratios forwarded to LoadData.save.
test_split_ratio = 0.2
valid_split_ratio = 0.2

# Base model (used for the RAG answer and as the fine-tuning starting point),
# the LoRA adapter file, and the path the fused model is saved to / loaded from.
model_path = "mistralai/Mistral-7B-Instruct-v0.2"
adapter_file = f"./{root_folder}/adapters.npz"
save_model_path = f"./{root_folder}/model"

# PGVector collection name and the JSON file the RAG records are exported to.
# NOTE(review): this file sits outside root_folder — confirm that is intended.
collection_name = "rag_finetuning_comparison_2"
rag_data_file="../../.cache/ragVsFinetuning/data/data_2.json"

## Prepare Data

In [5]:
# Prepare data for finetuning

# System prompt placed at the start of every fine-tuning conversation.
# The leading/trailing newlines are part of the string and are kept as-is.
system_message = """
You are a helpful chat assistant. Provide clear and concise responses to user's queries.
"""

def create_conversation(input: dict) -> dict:
    """Convert one question/answer record into a chat-style training example.

    Args:
        input: Record with a "question" and an "answer" entry.

    Returns:
        A dict with a single "messages" list holding the system prompt, the
        question as the user turn and the answer as the assistant turn.

    Raises:
        KeyError: If "question" or "answer" is absent from the record.
        ValueError: If either value is None. The previous guard ended in
            `pass`, so such records silently became training examples whose
            message content was None.
    """
    question, answer = input["question"], input["answer"]
    if question is None or answer is None:
        raise ValueError(
            "Cannot build a conversation from a record with a missing "
            f"question or answer: {input!r}"
        )
    return {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ]
    }

# Pull the dataset, turn each record into a conversation, and write the
# fine-tuning files into data_folder.
data_loader = LoadData(
    folder=data_folder,
    dataset_name=dataset_name,
)
data_loader.save(
    function=create_conversation,
    n=n,
    test_split_ratio=test_split_ratio,
    valid_split_ratio=valid_split_ratio,
    write_files=True,
)

# Prepare data for RAG

def process_rag_data(dataset_name: str, output_file: str, n: int = None, seed: int = None) -> List[dict]:
    """Export the dataset's question/answer pairs as JSON records for RAG ingestion.

    Args:
        dataset_name: Hugging Face dataset identifier; its "train" split is used.
        output_file: Path of the JSON file to write. Missing parent
            directories are created.
        n: Optional cap on the number of records, taken from the start of the
            split before shuffling. None uses the whole split.
        seed: Optional shuffle seed. None keeps the original unseeded shuffle;
            pass a value to make record order (and therefore ids) reproducible.

    Returns:
        The list of {"id", "query", "response"} records that was written.
        Records with a missing question or answer are skipped; ids are the
        positions in the shuffled split, so skipped records leave gaps.
    """
    # Load the split directly: without `split=` load_dataset returns a
    # dictionary of splits, which offers shuffle() but not select().
    dataset = load_dataset(dataset_name, split="train")
    if n is not None:
        dataset = dataset.select(range(min(n, len(dataset))))
    dataset = dataset.shuffle(seed=seed)

    rag_data = [
        {"id": i, "query": item["question"], "response": item["answer"]}
        for i, item in enumerate(tqdm(dataset))
        if item["question"] is not None and item["answer"] is not None
    ]

    # The export target may live in a folder nothing else has created yet.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(rag_data, f, indent=4)

    return rag_data

# Assigned so the returned list is not echoed as the cell's output.
rag_records = process_rag_data(dataset_name=dataset_name, output_file=rag_data_file, n=n)

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

  0%|          | 0/22 [00:00<?, ?it/s]

## Prepare RAG

In [6]:
# Embedding model and Postgres-backed vector store used for retrieval.
# os.environ[...] fails immediately with a KeyError when PG_CONN_URI is unset,
# instead of handing None to PGVector; the settings mirror the store that
# compare() builds so both point at the same collection the same way.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name=collection_name,
    connection=os.environ["PG_CONN_URI"],
    use_jsonb=True,
)

def metadata_func(sample: dict, metadata: dict) -> dict:
    """Attach the record id, dataset name and a fixed type tag to a document's metadata.

    The incoming `metadata` dict is modified in place and also returned;
    existing "id", "source" or "type" entries are overwritten.
    """
    extra_fields = {
        "id": sample["id"],
        "source": dataset_name,
        "type": "article",
    }
    for key, value in extra_fields.items():
        metadata[key] = value
    return metadata

# Load the exported RAG records; the ".[]" jq schema selects each element of
# the top-level JSON array, and metadata_func enriches each document's metadata.
loader = JSONLoader(
    file_path=rag_data_file,
    jq_schema=".[]",
    text_content=False,
    metadata_func=metadata_func,
)
docs = loader.load()

def batch_add_documents(vector_store: PGVector, documents: List[Document], batch_size: int = 20):
    """Add `documents` to `vector_store` in consecutive slices of at most `batch_size`.

    A progress bar advances once per slice. Nothing is returned.
    """
    slice_starts = range(0, len(documents), batch_size)
    for start in tqdm(slice_starts):
        stop = start + batch_size
        vector_store.add_documents(documents[start:stop])

# Add every loaded document to the vector store, 20 documents per call.
# NOTE(review): re-running this cell presumably inserts the same documents
# again — confirm whether the collection should be cleared first.
batch_add_documents(vector_store, docs, batch_size=20)

  0%|          | 0/2 [00:00<?, ?it/s]

## Finetune Model

In [7]:
# Train a LoRA adapter on the prepared data; the adapter weights are written
# to adapter_file.
lora = LORA(
    config={
        "train": True,
        "adapter_file": adapter_file,
        "batch_size": 1,
        "lora_layers": 4,
    }
)
lora.invoke(
    model_path=model_path,
    data=data_folder,
)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

Total parameters 7242.158M
Trainable parameters 0.426M
Loading datasets
Training
Iter 1: Val loss 3.579, Val took 2.946s
Iter 10: Train loss 3.517, It/sec 4.166, Tokens/sec 296.625
Iter 20: Train loss 2.674, It/sec 4.194, Tokens/sec 269.258
Iter 30: Train loss 1.894, It/sec 3.854, Tokens/sec 265.509
Iter 40: Train loss 1.550, It/sec 4.039, Tokens/sec 268.182
Iter 50: Train loss 1.331, It/sec 3.890, Tokens/sec 267.217
Iter 60: Train loss 1.258, It/sec 4.076, Tokens/sec 270.227
Iter 70: Train loss 1.196, It/sec 3.908, Tokens/sec 266.160
Iter 80: Train loss 1.031, It/sec 3.923, Tokens/sec 271.851
Iter 90: Train loss 0.977, It/sec 3.906, Tokens/sec 265.223
Iter 100: Train loss 0.847, It/sec 4.074, Tokens/sec 274.159
Iter 100: Saved adapter weights to ./../../.cache/ragVsFinetuning/model_2/adapters.npz.
Iter 110: Train loss 0.814, It/sec 3.760, Tokens/sec 260.535
Iter 120: Train loss 0.719, It/sec 4.186, Tokens/sec 277.542
Iter 130: Train loss 0.645, It/sec 3.893, Tokens/sec 265.102
Iter 14

In [8]:
# Fuse the trained adapter into the base model and save the result to
# save_model_path, which compare() later loads as the fine-tuned model.
fuse = FUSE(
    config={
        "adapter_file": adapter_file,
    }
)
fuse.invoke(
    model_path=model_path,
    save_path=save_model_path,
)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

## Comparison

In [9]:
def _answer_with_rag(query: str, k: int) -> str:
    """Answer `query` with the base model, prompted with the top-`k` retrieved documents."""
    vectordb = PGVector(
        embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
        collection_name=collection_name,
        connection=os.environ['PG_CONN_URI'],
        use_jsonb=True,
    )
    retrieved_docs = vectordb.similarity_search(query=query, k=k)
    context = " ".join(doc.page_content for doc in retrieved_docs)

    model, tokenizer = utils.load(model_path)
    prompt_template = """System: You are a helpful chat assistant. Provide clear and concise responses to user's queries. RAG Context: {context} User: {query} Answer:"""
    return generate(model=model, tokenizer=tokenizer, prompt=prompt_template.format(context=context, query=query))


def _answer_with_finetuned(query: str) -> str:
    """Answer `query` with the fused fine-tuned model, without any retrieved context."""
    model, tokenizer = utils.load(save_model_path)
    prompt_template = """System: You are a helpful chat assistant. Provide clear and concise responses to user's queries. User: {query} Answer:"""
    return generate(model=model, tokenizer=tokenizer, prompt=prompt_template.format(query=query))


def compare(query: str, k: int = 5) -> Tuple[str, str]:
    """Answer the same query with the RAG pipeline and with the fine-tuned model.

    Args:
        query: The user question.
        k: Number of documents retrieved for the RAG prompt (default 5, the
            value that was previously hard-coded).

    Returns:
        A `(rag_result, finetuned_result)` tuple of generated strings.

    Raises:
        KeyError: If the PG_CONN_URI environment variable is not set.
    """
    # Each pipeline runs in its own helper, so the model and tokenizer it
    # loaded go out of scope before the next model is loaded; this replaces
    # the manual `del` bookkeeping.
    rag_result = _answer_with_rag(query, k)
    finetuned_result = _answer_with_finetuned(query)
    return rag_result, finetuned_result

In [10]:
query = "Why did the author hire more people for his startup?"
rag_answer, finetuned_answer = compare(query=query)

# One print call with a newline separator emits the same seven lines as
# seven separate prints.
rule = "======================================"
print(
    rule,
    "================RAG==================",
    rag_answer,
    rule,
    "==============Fine-tuned===============",
    finetuned_answer,
    rule,
    sep="\n",
)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

The author hired more people for his startup due to investor pressure and the common practice during the Internet Bubble.
The author hired more people for his startup partly because the investors wanted him to and partly because that's what startups did during the Internet Bubble.


In [11]:
query = "Why did the author move to England?"
rag_answer, finetuned_answer = compare(query=query)

# One print call with a newline separator emits the same seven lines as
# seven separate prints.
rule = "======================================"
print(
    rule,
    "================RAG==================",
    rag_answer,
    rule,
    "==============Fine-tuned===============",
    finetuned_answer,
    rule,
    sep="\n",
)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

The author moved to England to let their kids experience living in another country and because the author was a British citizen by birth.
The author moved to England partly for reasons of employment and partly for reasons of wanting to be closer to the then-reigning monarch.


In [12]:
query = "What is the purpose of YC?"
rag_answer, finetuned_answer = compare(query=query)

# One print call with a newline separator emits the same seven lines as
# seven separate prints.
rule = "======================================"
print(
    rule,
    "================RAG==================",
    rag_answer,
    rule,
    "==============Fine-tuned===============",
    finetuned_answer,
    rule,
    sep="\n",
)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

The purpose of Y Combinator is to cause startups to be founded that would not otherwise have existed. This is achieved through their batch model, where they fund a large number of startups at once, twice a year, and spend three months helping them grow.
The purpose of Y Combinator (YC) is to invest in a group of the world's most System: System: System: promising startups twice a year. They provide the companies with funding, advice, and System: Networking connections. The program is structured into three System: Sessions: the startup school, the demo day, and the follow-on funding. The goal is to help the startups grow into successful companies.
