In [1]:
import os
from getpass import getpass
from pprint import pp

from dotenv import load_dotenv



In [2]:
import pstuts_rag

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before
# each cell executes, so edits to local packages are picked up without a restart.
%load_ext autoreload
%autoreload 2


In [4]:

# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

def set_api_key_if_not_present(key_name, prompt_message=""):
    """Ensure ``os.environ[key_name]`` holds a non-empty value, prompting if needed.

    Args:
        key_name: Name of the environment variable to check and, if missing
            or empty, populate.
        prompt_message: Text shown at the hidden-input prompt. Falls back to
            ``key_name`` when empty.

    The variable is left untouched when it already has a non-empty value.
    """
    if not prompt_message:
        prompt_message = key_name
    if not os.environ.get(key_name):
        # `getpass` is the function itself (`from getpass import getpass`);
        # calling `getpass.getpass(...)` on it raises AttributeError.
        os.environ[key_name] = getpass(prompt_message)

# Prompt for the OpenAI key only when the environment does not already supply one.
set_api_key_if_not_present("OPENAI_API_KEY")

# Data Preparation

First, we will read in the transcripts of the videos and convert them to Documents
with appropriate metadata.

In [6]:
import json
# Transcript file, relative to the notebook's working directory.
filename = "../data/test.json"

# Use a context manager so the file handle is closed as soon as parsing
# finishes, instead of being left open until garbage collection.
with open(filename, "rb") as transcript_file:
    data = json.load(transcript_file)


In [8]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from pstuts_rag.datastore import transcripts_load

# OpenAI embedding client; handed to the transcript loader below.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

# Convert the raw transcript JSON into chunked documents. Chunking happens
# inside `transcripts_load` (presumably semantic, per the variable name — confirm).
docs_chunks_semantic = transcripts_load(data, embeddings)

## R - Retrieval

Let's index the chunks produced by the semantic chunker in a vector store so that we can retrieve from them.

In [45]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# In-process Qdrant instance held in memory.
client = QdrantClient(":memory:")

# NOTE(review): the name embeds the relative path held in `filename`
# (e.g. "../data/test.json_qdrant"); consider using only the file stem.
collection_name = f"{filename}_qdrant"

# The vector size must match the output dimension of the embedding model.
client.create_collection(
    vectors_config=VectorParams(distance=Distance.COSINE, size=1536),
    collection_name=collection_name,
)

# LangChain wrapper that embeds with `embeddings` and stores in the collection.
vector_store = QdrantVectorStore(
    embedding=embeddings,
    collection_name=collection_name,
    client=client,
)

In [46]:
# Index the chunks; the result is bound to `_` so the cell does not echo it.
_ = vector_store.add_documents(documents=docs_chunks_semantic)

In [47]:
# Retriever over the vector store, limited to the top 2 matches per query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 2},
)

def retrieve(state):
    """Graph node: fetch documents relevant to ``state["question"]``.

    Returns a partial state update whose ``"context"`` entry is whatever the
    module-level ``retriever`` returns for the question.
    """
    question = state["question"]
    return {"context": retriever.invoke(question)}


In [None]:
# Smoke-test the retriever on a sample question and pretty-print each chunk.
# A plain loop is used instead of a list comprehension so the cell does not
# echo a list of `None` values as its output.
a = retrieve({"question":"What is a layer?"})
for d in a["context"]:
    pp(d.page_content)

## A - Augmentation

We need to populate a prompt for the LLM.


In [49]:
from langchain.prompts import ChatPromptTemplate

# System instructions for the assistant: answer only from the supplied
# context, refuse with a fixed sentence otherwise, and always cite the video.
SYSTEM_PROMPT = """\
You are a helpful expert on Photoshop and your goal is to help users
gain knowledge from a database of training videos. 
You answer questions based on provided context. 
Your answers use emojis for emphasis.

IMPORTANT: You must only use the provided context, and cannot use your own knowledge.
If there is no context that corresponds to the query, respond by saying
"I don't know. This is not available in our training library."

Most of the users' questions will be in the form:
"How can I do ..."
or
"What is ..."

When appropriate, provide your answers in a step-by-step form.
ALWAYS list the URL and the title of the reference video.
NEVER invent the explanation. ALWAYS use ONLY the context information.

"""

# Human-turn template. `{question}` and `{context}` are filled in by
# `rag_prompt.format_messages(...)`; the grounding instruction is repeated
# between the question and the context.
RAG_PROMPT="""\

### Question
{question}

NEVER invent the explanation. ALWAYS use ONLY the context information.

### Context
{context}


"""

# Two-message chat template: fixed system instructions followed by the
# human turn carrying the question and retrieved context.
rag_prompt = ChatPromptTemplate(
    [
        ("system", SYSTEM_PROMPT),
        ("human", RAG_PROMPT),
    ]
)

## G - Generation

We will use GPT-4.1-nano (`gpt-4.1-nano`) to generate answers.

In [50]:
from langchain_openai import ChatOpenAI

# Chat model used for answer generation, with sampling temperature 0.
llm = ChatOpenAI(
    model="gpt-4.1-nano",
    temperature=0,
)

In [51]:
def generate(state):
    """Graph node: answer ``state["question"]`` from ``state["context"]``.

    The page contents of the context documents are joined with blank lines
    and substituted into ``rag_prompt``; the model reply is followed by a
    JSON dump of each document's title/source/start/stop metadata.

    Returns:
        dict with ``"response"`` (answer text plus references block) and
        ``"context"`` (the same documents that were passed in).

    Raises:
        KeyError: if a context document lacks one of the reference keys.
    """
    context_docs = state["context"]
    reference_keys = ("title", "source", "start", "stop")

    joined_text = "\n\n".join(d.page_content for d in context_docs)

    references = []
    for d in context_docs:
        references.append({key: d.metadata[key] for key in reference_keys})

    prompt_messages = rag_prompt.format_messages(
        question=state["question"],
        context=joined_text,
    )
    answer = llm.invoke(prompt_messages)

    references_json = json.dumps(references, indent=2)
    return {
        "response": f"{answer.content}\n\n**References**:\n{references_json}",
        "context": context_docs,
    }


In [52]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict,Annotated
from langchain_core.documents import Document
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_openai.chat_models import ChatOpenAI
import operator

class State(TypedDict):
    """State dictionary passed between the `retrieve` and `generate` graph nodes."""
    # The user's question; read by both `retrieve` and `generate`.
    question: str
    # Documents produced by `retrieve` and consumed by `generate`.
    context: List[Document]
    # Answer text produced by `generate`, including the references block.
    response: str
        
# Build a two-node pipeline (retrieve -> generate), enter at "retrieve",
# and compile it into a runnable graph.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

graph = graph_builder.compile()

In [53]:
from langchain.schema.output_parser import StrOutputParser
# Run the full retrieve -> generate pipeline on a sample question.
response = graph.invoke(
    {"question": "What is the layer in Photoshop"},
)

In [None]:
# Inspect which state keys the graph returned.
response.keys()

In [None]:
# Check the concrete type of the graph's result.
type(response)

In [None]:
# Pretty-print the full result state.
pp(response)

In [None]:
# NOTE(review): repeats the earlier `response.keys()` inspection cell;
# candidate for removal.
response.keys()