In [None]:
!pip install --upgrade --quiet  langchain-openai pypdf sentence_transformers faiss-cpu

In [None]:
import os, yaml, wandb
# Work from the project root so the relative paths used later in the notebook
# ("secrets.yaml", "data/00/...") resolve. The default is the original
# machine-specific location; set the WANDB_PRACTICE_DIR environment variable to
# point at the project root when running on another machine.
os.chdir(
    os.environ.get(
        "WANDB_PRACTICE_DIR",
        "/Users/1zuu/Desktop/Desktop - Isuru’s Mac mini/ML Research/MLOps Projects/wandb practice/",
    )
)

from rich.markdown import Markdown
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
from langchain.vectorstores.faiss import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [None]:
# Read API credentials from the local YAML file (resolved relative to the
# current working directory).
with open("secrets.yaml") as secrets_file:
    secrets = yaml.safe_load(secrets_file)

# Export the W&B settings as environment variables. The API key comes from the
# secrets file; a missing "WANDB_API_KEY" entry raises KeyError here.
os.environ["LANGCHAIN_WANDB_TRACING"] = "true"
os.environ["WANDB_API_KEY"] = secrets["WANDB_API_KEY"]
os.environ.update(
    WANDB_NOTEBOOK_NAME="00-simple-rag-experiment/pipe.ipynb",
    WANDB_PROJECT="llmops-wandb",
)

In [None]:
# Authenticate this session with Weights & Biases. No key is passed explicitly;
# presumably the WANDB_API_KEY environment variable set earlier is used — TODO confirm.
wandb.login()

In [None]:
# OpenAI LLM used to answer questions. The key is looked up with .get(), so a
# missing "OPENAI_API_KEY" entry passes None rather than raising here.
chat_llm = OpenAI(api_key=secrets.get("OPENAI_API_KEY"), max_tokens=500)

# BGE embedding model used to vectorise the document chunks.
# NOTE(review): the device is hard-coded to "mps" — presumably the Apple-silicon
# GPU backend; confirm before running on other hardware.
embed_llm = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en",
    model_kwargs={"device": "mps"},
    encode_kwargs={"normalize_embeddings": True},
)

In [None]:
# Loader for the source PDF; the pages are read further down via loader.load().
loader = PyPDFLoader("data/00/2022-annual-report.pdf")

# Log the contents of data/00/ to W&B as a "raw_doc" artifact of type "dataset",
# inside a run with job_type "upload_docs". The run is closed when the
# `with` block exits.
with wandb.init(job_type="upload_docs", id="00-rag-experiment") as run:
    artifact = wandb.Artifact("raw_doc", type="dataset")
    artifact.add_dir(local_path="data/00/", name="2022-annual-report")
    run.log_artifact(artifact)

# Load the PDF and split it into overlapping chunks
# (chunk_size=500, chunk_overlap=50) for embedding.
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

In [None]:
# Embed every chunk with embed_llm and build a FAISS vector store from them.
docsearch = FAISS.from_documents(texts, embed_llm)

In [None]:
# Retrieval QA chain: the FAISS store acts as the retriever and chat_llm as the
# answering model, combined with the "stuff" chain type.
qa_chain = RetrievalQA.from_chain_type(
    llm=chat_llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

In [None]:
# Query sent to the QA chain and logged next to its response in the W&B table.
question = "How much is Consumer Products net revenues?"

In [44]:
# Table that pairs the chain's answer with the question that produced it.
wandb_table = wandb.Table(columns=["response", "question"])

# This run reuses the id of the earlier "upload_docs" run. resume="allow" tells
# W&B to continue an existing run with that id (or create it if absent) rather
# than overwrite the data that run already logged.
with wandb.init(
    job_type="run_rag",
    id="00-rag-experiment",
    resume="allow",
) as run:
    # Ask the question, then log the (response, question) row under "qa_table".
    response = qa_chain.run(question)
    wandb_table.add_data(response, question)
    run.log({"qa_table": wandb_table})