### Dynamic Schema Creation 

In [5]:
import os
import yaml
import instructor
from typing import Optional, Any, get_type_hints
from pydantic import BaseModel, Field, create_model
from openai import OpenAI

In [6]:
def load_yaml_schema(file_path):
    """Read a YAML file and return its parsed content.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Absolute, machine-specific location of the YAML file that describes the output schema.
# NOTE(review): "ouput_schema.yaml" looks like a misspelling of "output" — confirm the
# actual file name on disk before changing it.
filepath = '/Users/nandy/Documents/GitHub/LLM-Foundry/configs/ouput_schema.yaml'
# Parsed YAML content. The model-building loop further down reuses the name
# `schema` for each per-class entry, so this binding is overwritten there.
schema = load_yaml_schema(filepath)

In [7]:
# Lookup table from the type names written in the YAML schema to Python types.
# Start with the scalar types, derive the Optional[...] variant of each one,
# and finish with the catch-all Any entry (key order matches declaration order).
type_mapping = {"str": str, "int": int, "float": float, "bool": bool}
type_mapping.update(
    {f"Optional[{name}]": Optional[tp] for name, tp in type_mapping.items()}
)
type_mapping["Any"] = Any

In [8]:
def create_pydantic_model(class_name: str, schema: dict, type_mapping: dict):
    """Build a Pydantic model class from a mapping of field specifications.

    Each entry of ``schema`` maps a field name to a mapping that may carry a
    ``type`` name and a ``description``. The type name is looked up in
    ``type_mapping``; a name that is missing or not found resolves to ``Any``.
    A missing description becomes an empty string.

    Returns:
        The model class produced by ``create_model(class_name, ...)``.
    """
    def _definition(spec):
        # (annotation, FieldInfo) pair, the form create_model accepts per field.
        annotation = type_mapping.get(spec.get('type'), Any)
        return annotation, Field(description=spec.get('description', ''))

    definitions = {name: _definition(spec) for name, spec in schema.items()}
    return create_model(class_name, **definitions)

# Load schema from YAML
schema_yaml = load_yaml_schema(file_path=filepath)

# Sample record used to smoke-test each generated model. Its keys match the
# ingredient/quantity/process fields; a schema class with other required
# fields would fail validation here.
example_data = {
    "ingredient": "Tomato",
    "quantity": 2.5,
    "process": "chopped",
}

# Build one Pydantic model per top-level entry of the YAML file. After the
# loop, `model` and `instance` refer to the last class that was processed.
for class_name, schema in schema_yaml.items():
    model = create_pydantic_model(class_name, schema, type_mapping)
    # model_json_schema() is the Pydantic v2 replacement for the deprecated
    # BaseModel.schema(); it prints the same JSON-schema dict.
    print(model.model_json_schema())

    instance = model(**example_data)
    print(instance)

{'properties': {'ingredient': {'description': 'Name of the ingredient', 'title': 'Ingredient', 'type': 'string'}, 'quantity': {'description': 'Quantity of the ingredient', 'title': 'Quantity', 'type': 'number'}, 'process': {'anyOf': [{'type': 'string'}, {'type': 'null'}], 'description': 'Process of the ingredient eg., chopped, sliced, etc.', 'title': 'Process'}}, 'required': ['ingredient', 'quantity', 'process'], 'title': 'FoodSchema', 'type': 'object'}
ingredient='Tomato' quantity=2.5 process='chopped'


In [9]:
# Validate the sample data again with the most recently built model class
# (`model` is whatever the loop above assigned last) and display the result.
instance = model(**example_data)
print(instance)

ingredient='Tomato' quantity=2.5 process='chopped'


### Dynamic Schema (D.S) Testing

In [10]:
# Model name strings: a dated gpt-4o snapshot and gpt-4o-mini.
GPT_4o = "gpt-4o-2024-08-06"
GPT_4o_mini = "gpt-4o-mini"

In [20]:
# Patch the OpenAI client with instructor so that chat completions accept a
# `response_model` argument (return_oai_response below relies on this).
client = instructor.from_openai(OpenAI())

def return_oai_response(inputs):
    """Send one example to the chat model and return the structured result.

    Args:
        inputs: Mapping holding the example's inputs; only
            ``inputs['content']`` is read and sent as the user message.

    Returns:
        Whatever ``client.chat.completions.create`` returns when called with
        ``response_model=model`` (the module-level model class).
    """
    model_name = "gpt-4o-mini"

    return client.chat.completions.create(
        model=model_name,
        temperature=0.05,
        max_tokens=1250,
        top_p=0.05,
        frequency_penalty=0.1,
        presence_penalty=1,
        response_model=model,
        messages=[{"role": "user", "content": inputs['content']}],
    )

### LangSmith Evaluation

In [13]:
import openai
from langsmith.wrappers import wrap_openai

# Wrap a fresh OpenAI client with LangSmith's wrap_openai first, then patch the
# wrapped client with instructor in TOOLS mode. This rebinds the global
# `client` that return_oai_response looks up at call time.
client = wrap_openai(openai.Client())
client = instructor.from_openai(client, mode=instructor.Mode.TOOLS)

In [None]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator
from langsmith.schemas import Run, Example
# Evaluators

# Dataset name handed to evaluate() as its `data` argument.
dataset_name = "Sample-NER-test"

def is_answered(run: Run, example: Example) -> dict:
    """Report whether the run produced a non-empty ``output`` value.

    Args:
        run: The run being scored; only ``run.outputs["output"]`` is read.
        example: The dataset example (unused by this evaluator).

    Returns:
        ``{"key": "is_answered", "score": 1}`` when the output is truthy,
        otherwise the same dict with ``score`` 0.
    """
    # run.outputs is optional in the LangSmith Run schema (e.g. when the
    # target call failed); treat a missing dict as "not answered" instead of
    # raising AttributeError on None.
    outputs = run.outputs or {}
    answered = bool(outputs.get("output"))
    return {"key": "is_answered", "score": int(answered)}

# Evaluator functions passed to evaluate() for this experiment.
qa_evalulator = [is_answered]

# Evaluate return_oai_response against the named dataset with the evaluators
# above; the experiment is labelled with the "test-instructor-fc" prefix.
experiment_results = evaluate(
    return_oai_response,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-instructor-fc",
)

In [23]:
from langsmith.evaluation import evaluate, LangChainStringEvaluator
from langsmith.schemas import Run, Example
from difflib import SequenceMatcher

def similarity_score(a, b):
    """Return the difflib similarity ratio of two strings (0.0 to 1.0)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def evaluate_dict(run: Run, example: Example) -> dict:
    """Score how closely the extracted fields match the reference values.

    The ``ingredient``, ``quantity`` and ``process`` fields are compared one
    by one. An exact match (case-insensitive for strings) earns 1 point, two
    differing strings earn their ``similarity_score`` ratio, and any other
    mismatch earns 0. The total is averaged over the fields and scaled to
    the range 0-100.

    Field names are matched case-insensitively on both sides, so a reference
    that spells a key ``Quantity`` still lines up with the model's lowercase
    ``quantity``. ``None`` and ``''`` both count as "no value" on either side.

    Args:
        run: The run whose ``outputs["output"]`` holds the model result,
            either a Pydantic model instance or a plain dict.
        example: The dataset example whose ``outputs`` holds the reference.

    Returns:
        ``{"key": "EvaluationResults", "score": <number between 0 and 100>}``.
        The score is 0 when the run produced no output at all.
    """
    field_names = ("ingredient", "quantity", "process")
    empty_values = (None, "")

    def _lowercase_keys(mapping) -> dict:
        # Key casing differs between sources (e.g. 'Quantity' vs 'quantity'),
        # so both sides are normalised before any lookup.
        return {str(key).lower(): value for key, value in mapping.items()}

    output = (run.outputs or {}).get("output")
    if output is None:
        # Nothing was produced (e.g. the target call failed): score it as a
        # complete miss instead of raising inside the evaluator.
        return {"key": "EvaluationResults", "score": 0.0}
    if not isinstance(output, dict):
        # model_dump() is the Pydantic v2 spelling; .dict() is the deprecated
        # v1 name, kept only as a fallback.
        dump = getattr(output, "model_dump", None) or output.dict
        output = dump()

    output = _lowercase_keys(output)
    reference = _lowercase_keys(example.outputs or {})

    score = 0
    for name in field_names:
        ref_value = reference.get(name)
        out_value = output.get(name)

        # Both sides carry "no value" (None or empty string): full credit.
        if ref_value in empty_values and out_value in empty_values:
            score += 1
            continue

        # Compare strings case-insensitively.
        if isinstance(ref_value, str) and isinstance(out_value, str):
            ref_value = ref_value.lower()
            out_value = out_value.lower()

        if ref_value == out_value:
            score += 1
        elif isinstance(ref_value, str) and isinstance(out_value, str):
            # Partial credit for near-matching strings.
            score += similarity_score(ref_value, out_value)
        # Non-string mismatches earn nothing.

    # Average over the compared fields and express as a percentage.
    normalized_score = score / len(field_names)
    return {"key": "EvaluationResults", "score": normalized_score * 100}

# Dataset name handed to evaluate() as its `data` argument.
dataset_name = "Sample-NER-test"
# Evaluator functions passed to evaluate() for this experiment.
qa_evalulator = [evaluate_dict]

# Evaluate return_oai_response against the named dataset with the evaluators
# above; the experiment is labelled with the "test-custom-fc-test" prefix.
experiment_results = evaluate(
    return_oai_response,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="test-custom-fc-test",
)

# # Example usage
# A = {'ingredient': 'refined sugar', 'quantity': 21.5, 'process': None}
# B = {'Process': '', 'Quantity': 21.5, 'ingredient': 'refined sugar'}

# evaluation = evaluate_dict(A, B)
# print(evaluation)

View the evaluation results for experiment: 'test-custom-fc-test-ce85c72b' at:
https://smith.langchain.com/o/8c28e029-234e-55f2-addf-105eb152accb/datasets/85e68a36-80f0-47eb-800e-f21db38a1792/compare?selectedSessions=b9daa867-3b91-49c5-9606-5ddbe945ed41




0it [00:00, ?it/s]

### Prompt based Extraction 

In [68]:
import json
# Template of the JSON object requested for each ingredient. The values are
# instructions to the model describing what each attribute should contain.
json_output = [
    {
        # What the ingredient is and how much of it
        "ing": "name of ingredient.",
        "qty": "the quantity is a number or fraction only formatted as a string e.g. 1/2, 1, 1 1/2, 1.5, 0.5, 1-1/2",
        "uom": "options: '' or the unit of measure",
        "is_food_item": "true / false if this is an actual food item",
        "brand": "options : '' or commercial brand of input if present",
        # Measurements, each paired with its unit of measure
        "weight": "weight is a number or fraction only, formatted as a string",
        "weight_uom": "weight unit of measure",
        "volume": "volume is a number or fraction only, formatted as a string",
        "volume_uom": "volume unit of measure",
        "length": "length is a number or fraction only, formatted as a string",
        "length_uom": "length unit of measure",
        "width": "width is a number or fraction only, formatted as a string",
        "width_uom": "width unit of measure",
        "height": "height is a number or fraction only, formatted as a string",
        "height_uom": "height unit of measure",
        # Qualifiers
        "part": "options : '' or the part of a meat ingredient e.g. beef chin -> chin, chicken breast => breast",
        "process": "transformation done to the ingredient e.g. sliced, shredded, cut into something",
        "pq": "physical quality of the product : e.g. 20% sodium, 2% fat, lean #/fat # %",
        "purpose": "options: '' or purpose e.g. for frying, for serving",
        # Disabled attribute, kept for reference:
        # "origin": "when the 'ing' is extracted from the origin e.g. juice from lemon => origin = lemon",
        "multiple_keyword": "options: '' for single ing | 'or' for alternative options | 'and' for multiple required ingredients",
        "culture": "two letter iso code of the language of the ingredient",
        "add_info": "additional information not in the other json attributes",
    }
]
json_output_string = json.dumps(json_output)

# Instruction text, the serialized template, then the closing rule for
# multi-ingredient input.
# NOTE(review): there is no separator between the JSON and the closing
# sentence, so the prompt reads '...}]If there are multiple ...'.
prompt_template = "".join(
    (
        "You are a multilingual food ingredient parser agent. "
        "For each ingredient in the list that i will provide, give the following details "
        "in a RFC8259 compliant JSON response format. "
        "Do not include any explanations and only provide the json response without deviation.: ",
        json_output_string,
        "If there are multiple ingredients, put them in their respective json entities "
        "within an array and respective keyword filled.",
    )
)
