<a href="https://colab.research.google.com/github/mikola11/cs224u/blob/main/dspy_synthetic_data_test1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers dspy-ai openai python-dotenv

In [2]:
import openai
import dspy
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
from tqdm import tqdm
import re
import random
from dspy.evaluate.metrics import answer_exact_match
from dspy.evaluate import Evaluate
from dspy.teleprompt import LabeledFewShot, BootstrapFewShot, BootstrapFewShotWithRandomSearch
import pdb

In [3]:
load_dotenv('keys.env')
openai_key = os.getenv('OPENAI_API_KEY')
repo_path = 'dspy'
os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join(repo_path, 'cache')

In [8]:
lm = dspy.OpenAI(model='gpt-4-turbo', api_key=openai_key, max_tokens=4096)
rm = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')
dspy.settings.configure(lm=lm, rm=rm)

In [9]:
class Deviation(dspy.Signature):
    """
    You are a quality assurance specialist working on the floor where a Biologics product is manufactured.
    You just encountered a deviation that you need to report. Write a deviation report based on the the following information below.
    The report should contain only two fields: 1) deviation title and 2) deviation description.
    Brevity is important. Use shorter sentences and common biomanufacturing acronyms where possible.
    """

    date = dspy.InputField(desc='Date of occurrence')
    problem = dspy.InputField(desc='Nature of the problem')
    problem_context = dspy.InputField(desc='Context of the problem') #To be provided through a retreiver
    process = dspy.InputField(desc='Process where the problem occurred')
    batch_id = dspy.InputField(desc='Manufactured product Batch ID that was involved')
    order_nr = dspy.InputField(desc='Process Order number for batch')
    product_name = dspy.InputField(desc='Name of the product manufactured')
    operator = dspy.InputField(desc='Name and ID of the person who performed the operation')
    verifier = dspy.InputField(desc='Name and ID of the person who verified the operation')
    qa_name = dspy.InputField(desc='Name and ID of the person who answered the question')
    material = dspy.InputField(desc='Material used in the operation')
    equipment = dspy.InputField(desc='Equipment used in the operation')
    document = dspy.InputField(desc='Document, SOP or work instruction, governing the operation and that was breached')
    text_structure = dspy.InputField(desc='Desired text structure of the deviation description')

    deviation_title = dspy.OutputField(desc='Deviation title, usually no more than 255 symbols')
    deviation_description = dspy.OutputField(desc='Deviation description, usually between 100 and 600 words')

In [10]:
rec = {
    "date": "Apr-1, 2023",
    "problem": "Filter failed FIT ",
    "problem_context": "Prior to actual filtration of the product, the filter should be flushed with either product or water for injection to reduce potential extractables and downstream particles. The filter is then subjected to a filter integrity test (pre-filtration filter integrity test), and after the solution is filtered, the filter is again subjected to a second filter integrity test (post-filtration filter integrity test). This integrity test is usually performed either as the bubble point test or as the diffusion or forward flow test. The principle of the bubble point test is that a fully wetted membrane filter of extremely small pore size will hold liquid in the pores by surface tension and capillary force. The pressure of a gas required to force the entrapped liquid both through and out of the fully wetted pore capillary is referred to as the bubble point because air bubbles will appear after the liquid is forced out. The bubble point is a function of the type and pore size of the filter membrane, the surface tension of the liquid, and temperature. The bubble point pressure is correlated to the microbial log reduction value as determined by the filter manufacturer, so that filter having a bubble point greater than the established specification is shown to retain B.diminuta. Filters and the corresponding bubble points: 0.1 μm PVDF: 70; 0.22 μm PVDF: 50, 0.2 μm MCE: 55, 0.45 μm MCE: 30, 0.65 μm MCE: 17.",
    "process": "Aseptic fill of the final product",
    "batch_id": "A23001",
    "order_nr": "1030002",
    "product_name": "ferolizumab",
    "operator": "John Smith 130299",
    "verifier": "Barbara Johnson 120392",
    "qa_name": "Bill Maher 140002",
    "material": "1002201 GVWP04700 PVDF Membrane Filter, 0.22 μm MILLIPORE",
    "equipment": "HMBG:A1:000-FLT-120",
    "location": "Suite A1",
    "site": "Hamburg",
    "document": "SOP-080299 Aseptic Filling Operations in Biologic Drug Production v2.0",
    "text_structure": "The 5Ws and H framework"
    }

example = dspy.Example(date=rec['date'],
             problem=rec['problem'],
             problem_context=rec['problem_context'],
             process=rec['process'],
             batch_id=rec['batch_id'],
             order_nr=rec['order_nr'],
             product_name=rec['product_name'],
             operator=rec['operator'],
             verifier=rec['verifier'],
             qa_name=rec['qa_name'],
             material=rec['material'],
             equipment=rec['equipment'],
             document=rec['document'],
             text_structure=rec['text_structure']
             ).with_inputs('date',
                           'problem',
                           'problem_context',
                           'process',
                           'batch_id',
                           'order_nr',
                           'product_name',
                           'operator',
                           'verifier',
                           'qa_name',
                           'material',
                           'equipment',
                           'document',
                           'text_structure')

In [12]:
generator = dspy.Predict(Deviation, temperature=1, n=3)
result = generator(date=rec['date'],
             problem=rec['problem'],
             problem_context=rec['problem_context'],
             process=rec['process'],
             batch_id=rec['batch_id'],
             order_nr=rec['order_nr'],
             product_name=rec['product_name'],
             operator=rec['operator'],
             verifier=rec['verifier'],
             qa_name=rec['qa_name'],
             material=rec['material'],
             equipment=rec['equipment'],
             document=rec['document'],
             text_structure=rec['text_structure'])
result

Prediction(
    deviation_title='Deviation Title: Failed Pre-Filtration Integrity Test of PVDF Membrane Filter during Aseptic Fill of Ferolizumab Batch A23001',
    deviation_description='On Apr-1, 2023, during the aseptic filling process of Ferolizumab (Batch ID: A23001, Order Nr: 1030002), a deviation was noted concerning a PVDF membrane filter (Material: 1002201 GVWP04700, 0.22 μm MILLIPORE) which failed the pre-filtration integrity test. Specifically, the 0.22 μm PVDF membrane filter did not meet the bubble point specification of 50 psi required to ensure microbial retention as per SOP-080299 Aseptic Filling Operations in Biologic Drug Production v2.0. The operation was performed by Operator John Smith (ID: 130299), and verified by Barbara Johnson (ID: 120392). The deviation was identified and recorded by QA representative Bill Maher (ID: 140002). The equipment involved in the incident was identified as HMBG:A1:000-FLT-120. This failure to meet the bubble point specification raised

In [16]:
result.completions['deviation_description'][1]

'On April 1, 2023, during the aseptic fill process of ferolizumab Batch ID A23001, process order number 1030002, a critical deviation occurred involving a filter integrity failure. The operation, performed by Operator John Smith (ID 130299) and verified by Barbara Johnson (ID 120392), required a pre-filtration integrity test on a 0.22 μm PVDF membrane filter (Material ID: 1002201 GVWP04700, MILLIPORE). The equipment used was designated HMBG:A1:000-FLT-120. According to SOP-080299, Aseptic Filling Operations in Biologic Drug Production v2.0, the filter should initially be flushed followed by the integrity testing, usually by the bubble point method. This method necessitates a minimum bubble point pressure corresponding to the filter’s microbial retention capability, with specifications for a 0.22 μm PVDF filter set at a bubble point of 50. However, the filter failed to meet this requirement. The QA oversight was provided by Bill Maher (ID 140002). Immediate actions included isolating th

In [70]:
lm.inspect_history(1)




You are a quality assurance specialist working on the floor where a Biologics product is manufactured.
    You just encountered a deviation that you need to report. Write a deviation report based on the the following information below.
    The report should contain only two fields: 1) deviation title and 2) deviation description.
    Brevity is important. Use shorter sentences and common biomanufacturing acronyms where possible.

---

Follow the following format.

Date: Date of occurrence

Problem: Nature of the problem

Problem Context: Context of the problem

Process: Process where the problem occurred

Batch Id: Manufactured product Batch ID that was involved

Order Nr: Process Order number for batch

Product Name: Name of the product manufactured

Operator: Name and ID of the person who performed the operation

Verifier: Name and ID of the person who verified the operation

Qa Name: Name and ID of the person who answered the question

Material: Material used in the operation

Eq

'\n\n\nYou are a quality assurance specialist working on the floor where a Biologics product is manufactured.\n    You just encountered a deviation that you need to report. Write a deviation report based on the the following information below.\n    The report should contain only two fields: 1) deviation title and 2) deviation description.\n    Brevity is important. Use shorter sentences and common biomanufacturing acronyms where possible.\n\n---\n\nFollow the following format.\n\nDate: Date of occurrence\n\nProblem: Nature of the problem\n\nProblem Context: Context of the problem\n\nProcess: Process where the problem occurred\n\nBatch Id: Manufactured product Batch ID that was involved\n\nOrder Nr: Process Order number for batch\n\nProduct Name: Name of the product manufactured\n\nOperator: Name and ID of the person who performed the operation\n\nVerifier: Name and ID of the person who verified the operation\n\nQa Name: Name and ID of the person who answered the question\n\nMaterial: M