<a href="https://colab.research.google.com/github/nataliakzm/colab_collection/blob/main/FineTuning_02_LangSmith_%26_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LangSmith + OpenAI Fine-tuning Guide

## Context

This is a guide for fine-tuning OpenAI's `gpt-3.5-turbo` model on an example knowledge-extraction task.

## Environment

First, we'll set our `LANGCHAIN_API_KEY` so that we can access our LangSmith datasets, as well as an `OPENAI_API_KEY` for fine-tuning and inference.

In [None]:
# Fill in both values before running: each line exports the key as an
# environment variable for the rest of the session. Do not commit real keys
# together with the notebook.
%env LANGCHAIN_API_KEY=
%env OPENAI_API_KEY=

In [None]:
%pip install --quiet -U langchain
%pip install --quiet -U langsmith
%pip install --quiet -U openai

# Get dataset

We are loading a private dataset from LangSmith (in this case, `Carb-IE-train`). We will convert this to an OpenAI-compatible format.


In [None]:
import langsmith
import json

# LangSmith client; used below to list dataset examples and to run evaluations.
client = langsmith.Client()

def craft_messages(input, output) -> list[dict]:
    """Turn one dataset example into a two-turn chat transcript.

    Args:
        input: Mapping holding the source text under the ``"sentence"`` key.
        output: Mapping holding the expected extraction under the
            ``"clusters"`` key; it is serialized to a JSON string.

    Returns:
        A list with a ``user`` message (fixed instruction followed by the
        sentence) and an ``assistant`` message (the JSON-encoded clusters).
    """
    # Serialize the target first, matching the order in which the keys are read.
    assistant_turn = {"role": "assistant", "content": json.dumps(output["clusters"])}
    user_turn = {
        "role": "user",
        "content": "Extract triplets from the following sentence:\n\n" + input["sentence"],
    }
    return [user_turn, assistant_turn]


In [None]:
import itertools
# Convert the first 50 examples of the training dataset into chat-format
# message lists, one list per example.
data = [
    craft_messages(ex.inputs, ex.outputs)
    for ex in itertools.islice(
        client.list_examples(dataset_name="Carb-IE-train"),
        50,
    )
]

#### Upload training file to OpenAI

In [None]:
import openai
from io import BytesIO

# Serialize the examples as JSON Lines (one {"messages": [...]} object per
# line, UTF-8 encoded) into an in-memory buffer.
my_file = BytesIO()
my_file.writelines(
    (json.dumps({"messages": m}) + "\n").encode("utf-8") for m in data
)

# Rewind so the upload reads the buffer from its beginning.
my_file.seek(0)
training_file = openai.File.create(file=my_file, purpose="fine-tune")

#### Begin training

In [None]:
# Start a gpt-3.5-turbo fine-tuning job on the uploaded training file.
job = openai.FineTuningJob.create(
    model="gpt-3.5-turbo",
    training_file=training_file.id,
)

#### Wait for training to complete

In [None]:
import time
start = time.time()

# Poll the job every 10 seconds until it reports a fine-tuned model name.
while True:
  ftj = openai.FineTuningJob.retrieve(job.id)
  if ftj.fine_tuned_model is not None:
    print(ftj.fine_tuned_model, flush=True)
    break
  # A failed or cancelled job never gets a model name; stop instead of
  # polling forever. getattr() tolerates a response without a status field.
  # NOTE(review): status values taken from the OpenAI fine-tuning job
  # object documentation — confirm for the installed client version.
  status = getattr(ftj, "status", None)
  if status in ("failed", "cancelled"):
    raise RuntimeError(f"Fine-tuning job {job.id} ended with status {status!r}")
  print(f"Waiting for fine-tuning to complete... Elapsed: {time.time() - start}", end="\r", flush=True)
  time.sleep(10)

#### Construct fine-tuned chain

In [None]:
from langchain import prompts
from langchain import chat_models

# Single human turn with the same instruction text that craft_messages puts in
# front of each training sentence; `{sentence}` is filled in per example.
prompt = prompts.ChatPromptTemplate.from_messages(
    [("human", "Extract triplets from the following sentence:\n\n{sentence}")]
)

# Query the freshly fine-tuned model at temperature 0.
llm = chat_models.ChatOpenAI(temperature=0, model=ftj.fine_tuned_model)

finetuned_chain = prompt | llm

#### Evaluate

We'll make a custom evaluator for triplets.

In [None]:
from langchain import smith
import json
from typing import Any, Optional
from langchain.evaluation import StringEvaluator
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import openai_functions

# Grading prompt for the evaluator model. It is given the input, the predicted
# triplets and the ground truth (template variables `input`, `prediction`,
# `reference`) and is instructed to record its verdict by calling the
# `commit_grade` function with a score out of 100.
eval_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an impartial grader tasked with measuring the accuracy of extracted entity relations."),
        ("human", "Please evaluate the following data:\n\n"
         "<INPUT>\n{input}</INPUT>\n"
         "<PREDICTED>\n{prediction}</PREDICTED>\n"
         "<GROUND_TRUTH>\n{reference}</GROUND_TRUTH>\n\n"
         "Please save your reasoning and grading by calling the commit_grade function."
         " First, enumerate all factual discrepancies in the predicted triplets relative to the ground truth."
         " Finally, score the prediction on a scale out of 100, taking into account factuality and"
         " correctness according to the ground truth."),

    ]
)

# Function schema bound to the grader model. All three arguments are required:
# "mistakes" and "reasoning" are strings, "grade" is a number in [0, 100].
commit_grade_schema = {
    "name": "commit_grade",
    "description": "Commits a grade with reasoning.",
    "parameters": {
        "title": "commit_grade_parameters",
        "description": "Parameters for the commit_grade function.",
        "type": "object",
        "properties": {
            # The argument key is "mistakes"; "discrepancies" is only its title.
            "mistakes": {
                "title": "discrepancies",
                "type": "string",
                "description": "Any discrepencies between the predicted and ground truth."
            },
            "reasoning": {
                "title": "reasoning",
                "type": "string",
                "description": "The explanation or logic behind the final grade."
            },
            # Scored out of 100, as requested in the grading prompt.
            "grade": {
                "title": "grade",
                "type": "number",
                "description": "The numerical value representing the grade.",
                "minimum": 0,
                "maximum": 100
            }
        },
        "required": ["reasoning", "grade", "mistakes"],
    }
}

def normalize_grade(func_args: str) -> dict:
    """Convert the grader's ``commit_grade`` arguments into an evaluation result.

    Args:
        func_args: JSON-encoded arguments of the ``commit_grade`` function call.

    Returns:
        A dict with ``"reasoning"`` (the reasoning text followed by the listed
        discrepancies, separated by a blank line and stripped) and ``"score"``
        (the 0-100 grade divided by 100).

    Raises:
        json.JSONDecodeError: If ``func_args`` is not valid JSON.
    """
    args = json.loads(func_args)
    # commit_grade_schema names this argument "mistakes" ("discrepancies" is
    # only its title). Read the real key, and keep accepting the old one, so
    # the enumerated discrepancies are no longer silently dropped.
    mistakes = args.get("mistakes") or args.get("discrepancies") or ""
    reasoning = args.get("reasoning") or ""
    # Treat a missing or null grade as 0 rather than failing on None / 100.
    grade = args.get("grade") or 0
    return {
        "reasoning": (reasoning + "\n\n" + mistakes).strip(),
        "score": grade / 100,
    }

# Grading pipeline: format the grading prompt, call gpt-4 at temperature 0 with
# the commit_grade function schema bound, parse the resulting function call,
# and normalize it into a {"reasoning", "score"} dict via normalize_grade.
eval_chain = (
    eval_prompt
    | ChatOpenAI(model="gpt-4", temperature=0).bind(functions=[commit_grade_schema])
    | openai_functions.OutputFunctionsParser()
    | normalize_grade
)

class EvaluateTriplets(StringEvaluator):
    """String evaluator that grades predicted triplets by running `eval_chain`."""

    @property
    def requires_input(self) -> bool:
        """The original input is always passed to the grader."""
        return True

    @property
    def requires_reference(self) -> bool:
        """A ground-truth reference is always passed to the grader."""
        return True

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Grade one prediction and return whatever `eval_chain` produces.

        Args:
            prediction: The predicted output to grade.
            reference: The ground truth to compare against.
            input: The input the prediction was generated from.
            **kwargs: Extra options; only ``callbacks`` is used and it is
                forwarded to the chain in its run config.
        """
        grader_inputs = {"prediction": prediction, "reference": reference, "input": input}
        run_config = {"callbacks": kwargs.pop("callbacks", None)}
        return eval_chain.invoke(grader_inputs, run_config)

# Evaluation config that applies only the custom triplet grader.
config = smith.RunEvalConfig(custom_evaluators=[EvaluateTriplets()])

In [None]:
validation_dataset_name = "Carb-IE-test"
results = await client.arun_on_dataset(validation_dataset_name, finetuned_chain, evaluation=config)

View the evaluation results for project '6f1ba87705c54527bf5bbc4668938260-RunnableSequence' at:
https://smith.langchain.com/projects/p/6ab2ed7a-ca7f-44f4-913e-ce10459f7a79?eval=true




#### Compare against few-shot variants

Here, we benchmark the fine-tuned model against gpt-3.5 and gpt-4 chains prompted with 5 examples.

In [None]:
# raw_example

from langchain import prompts
from langchain import chat_models

# Use the first five training examples as in-context demonstrations.
first_5 = list(itertools.islice(client.list_examples(dataset_name="Carb-IE-train"), 5))

# Each demonstration becomes a human/ai message pair whose text is a template
# placeholder; the actual sentence and JSON answer are supplied as partial
# variables rather than written into the template text.
messages = []
partials = {}
for i, shot in enumerate(first_5):
    messages += [
        ("human", f"Extract triplets from the following sentence:\n\n{{input_{i}}}"),
        ("ai", f"{{output_{i}}}"),
    ]
    partials[f"input_{i}"] = shot.inputs["sentence"]
    partials[f"output_{i}"] = json.dumps(shot.outputs["clusters"])

# Final turn: the sentence to extract from, filled in per test example.
messages.append(("human", "Extract triplets from the following sentence:\n\n{sentence}"))

prompt = prompts.ChatPromptTemplate.from_messages(messages).partial(**partials)
llm = chat_models.ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

fewshot_chain = prompt | llm

In [None]:
validation_dataset_name = "Carb-IE-test"
results = await client.arun_on_dataset(validation_dataset_name, fewshot_chain, evaluation=config, project_name="Few-shot-GPT-3.5")

View the evaluation results for project 'Few-shot-GPT-3.5' at:
https://smith.langchain.com/projects/p/34798b10-cfed-471b-89f3-1743725993d8?eval=true


In [None]:
fewshot_gpt4_chain = prompt | chat_models.ChatOpenAI(model="gpt-4", temperature=0)
validation_dataset_name = "Carb-IE-test"
results = await client.arun_on_dataset(validation_dataset_name, fewshot_gpt4_chain, evaluation=config, project_name="Few-shot-GPT-4")

View the evaluation results for project 'Few-shot-GPT-4' at:
https://smith.langchain.com/projects/p/80ae7670-56df-470c-8c57-ccf86678a728?eval=true
