In [None]:
import os

# Rename /content/file.env to /content/.env (presumably so the later
# load_dotenv() call can pick it up). Guarded so that re-running this cell
# after a successful rename does not fail.
env_source = "/content/file.env"
env_target = "/content/.env"
if os.path.exists(env_source):
    os.rename(env_source, env_target)
elif not os.path.exists(env_target):
    # Neither file is present: fail with an explicit message, as the
    # unguarded rename would have.
    raise FileNotFoundError(f"Neither {env_source} nor {env_target} exists")

In [None]:
# Installing required modules
!pip install openai

In [None]:
!pip install langchain-community langchain

In [None]:
!pip install python-dotenv

In [None]:
from dotenv import load_dotenv
from openai import OpenAI

In [None]:
# Load variables from the .env file so that an API key saved there is
# available to the cells below.
load_dotenv()

In [None]:
# Never store the key in the notebook itself: take it from the environment
# (OPENAI_API_KEY, e.g. loaded from the .env file) and, if it is not set,
# ask for it interactively without echoing it.
import os
from getpass import getpass

ashu_key = os.getenv("OPENAI_API_KEY") or getpass("OpenAI API key: ")

In [None]:
from openai import OpenAI

# Build the OpenAI client with the key passed in explicitly.
# To rely on the key from the .env file instead, use: client = OpenAI()
client = OpenAI(api_key=ashu_key)

In [None]:

# Ask the model one question through the Responses API and print the reply text.
# (The reply length could be capped with max_output_tokens=1000.)
rag_question = "What is a RAG in the context of LLM?"
response = client.responses.create(input=rag_question, model="gpt-5-nano")
print(response.output_text)


# Prompt Templating using Langchain

In [None]:
# from langchain.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_community.chat_models import ChatOpenAI

In [None]:
# Prompt text with a {topic} placeholder that is filled in later.
my_query = (
    "What are LLM models available by {topic}. "
    "Describe one liner for each."
)
# Explicit constructor form; PromptTemplate.from_template(my_query) is the
# alternative way to build it.
mytemplate = PromptTemplate(template=my_query, input_variables=["topic"])

In [None]:
# Show the template object as the cell output to inspect it.
mytemplate

In [None]:
# Fill the {topic} placeholder with "OpenAI" to get the final prompt, then
# print it to check the result.
my_prompt = mytemplate.format(topic="OpenAI")
print(my_prompt)

In [None]:
my_str = "What are LLM models available by Anthropic. Describe one liner for each."
# Author's notes comparing a plain str prompt with a LangChain template
# (claims are not verified in this notebook):
# - LangChain supports agents, RAG, execution, and flow chains.
# - A plain str has no native LLM support.
# - Templates can be stored in a hub.
# - Template variables are checked up front (original note: "at compile time").
# - Created prompts can be shared with the community.

In [None]:
# Send the templated prompt to the model and print the reply text.
# (Pass input=my_str instead to send the plain string; the reply length could
# be capped with max_output_tokens=1000.)
response = client.responses.create(input=my_prompt, model="gpt-5-nano")
print(response.output_text)

In [None]:
# Another way to invoke LLM

In [None]:
# Create a LangChain chat model so the LLM itself is called through LangChain.
llm = ChatOpenAI(model="gpt-4.1", temperature=0.5, api_key=ashu_key)

In [None]:
# Call the chat model with the formatted prompt; the returned value is shown
# as the cell output.
llm.invoke(my_prompt)

In [None]:
# Call the chat model with the formatted prompt again and keep the reply.
# NOTE(review): this repeats the request made by the previous cell.
response1 = llm.invoke(my_prompt)

In [None]:
# Print only the content of the reply.
print(response1.content)

# Vector embedding for semantic **similarity**

In [None]:
# Vector embedding for semantic similarity

In [None]:
# importing scikit learn module

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
from openai import OpenAI
# Client used by get_embedding below, built with the explicit key.
client = OpenAI(api_key=ashu_key)

def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding of ``text`` as a NumPy array.

    Args:
        text: String to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Name of the embedding model passed to the API.

    Returns:
        numpy.ndarray built from the first embedding in the API response.

    Raises:
        ValueError: If ``text`` is empty. The check runs before any request
            is made, so no API call is spent on input that cannot be embedded.
    """
    if not text:
        raise ValueError("text must be a non-empty string")

    cleaned = text.replace("\n", " ")
    # Uses the module-level `client` created earlier in the notebook.
    response = client.embeddings.create(input=[cleaned], model=model)

    return np.array(response.data[0].embedding)

# df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
# df.to_csv('output/embedded_1k_reviews.csv', index=False)

In [None]:
# Three sample sentences to compare: text1 and text2 both mention RAG,
# text3 does not.
text1 = "Exploring and learning RAG is fun!"
text2 = "FAISS is one of the embeddings that can be used in building RAG"
text3 = "I am big fan of Sachin Tendulkar!"

In [None]:
# getting the embedding of texts

In [None]:
# Embed the three sample sentences, in order.
emb1, emb2, emb3 = (get_embedding(sentence) for sentence in (text1, text2, text3))

In [None]:
# Print the embedding array of text1.
print(emb1)

In [None]:
# Pairwise cosine similarity. Each embedding is turned into a single-row 2-D
# array, which is the input shape cosine_similarity works on.
row1, row2, row3 = (vec.reshape(1, -1) for vec in (emb1, emb2, emb3))
similarity1 = cosine_similarity(row1, row2)
similarity2 = cosine_similarity(row1, row3)
similarity3 = cosine_similarity(row3, row2)


In [None]:
# Report each pairwise score; the single value is read from position [0][0]
# of each result.
for pair_label, score_matrix in (
    ("text1 and text2", similarity1),
    ("text1 and text3", similarity2),
    ("text3 and text2", similarity3),
):
    print(f"Similarity between {pair_label}: {score_matrix[0][0]}")

# Implementing RAG with external CSV: Using FAISS

In [None]:
# installing required library
!pip install faiss-cpu pandas openai langchain-community langchain python-dotenv scikit-learn langchain_openai

In [None]:
# Importing required modules

In [None]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_openai import OpenAIEmbeddings

import os

In [None]:
# Load user.csv into a DataFrame, print its column summary, and show the
# first three rows as the cell output.
df = pd.read_csv("user.csv")
df.info()
df.head(3)

In [None]:
# Convert the DataFrame into LangChain documents, using the "name" column as
# the page content.
# NOTE(review): presumably the remaining columns end up as document metadata,
# so only the name text would be embedded — confirm this is what retrieval
# should search on.
loader = DataFrameLoader(df, page_content_column="name")
data = loader.load()

In [None]:
# Show the loader object as the cell output.
loader

In [None]:
# Before embedding, the documents need to be split into chunks


In [None]:
# Step 3: Split into chunks (though small, we'll treat each row as a chunk)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # The third separator is a regex look-behind (split after ". "). It is
    # written as a raw string so the backslash is not an invalid escape, and
    # regex handling is switched on; without is_separator_regex=True the
    # splitter treats the pattern as literal text and it never matches.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)
chunks = text_splitter.split_documents(data)

In [None]:
# Embedding model handed to the vector store; the key is passed explicitly
# (alternative: openai_api_key=os.getenv("OPENAI_API_KEY")).
embeddings = OpenAIEmbeddings(openai_api_key=ashu_key, model="text-embedding-3-small")


In [None]:
# Show the embeddings object as the cell output.
embeddings

In [None]:
# Print the embeddings object's configuration serialised as JSON.
# NOTE(review): check the output for sensitive values before sharing the notebook.
print(embeddings.json())

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOpenAI
from langchain_core.runnables import Runnable
from langchain_core.runnables import RunnablePassthrough

In [None]:
# Build a FAISS vector store from the chunks, using `embeddings` to embed them.
vector_db = FAISS.from_documents(chunks, embeddings)

In [None]:
# Print the vector store object.
print(vector_db)

In [None]:
# Number of vectors currently held in the FAISS index.
print(vector_db.index.ntotal)

In [None]:
# Step 7: Define the RAG prompt. The template has two placeholders:
# {context} for the retrieved employee data and {question} for the user query.
# It tells the model to answer only from that context and sets the output format.
template = """Answer the question based only on the following employee database context:
{context}

Question: {question}

Format your answer with these details:
- Name: [full name]
- Email: [email]
- Department: [department]
- Position: [position]
- Salary: [salary]
- Hire Date: [hire_date]

If multiple employees match, list them all.

If the user is asking very specific question or field then no need to follow the format above."""

# Turn the template text into a chat prompt template.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Show the prompt template object as the cell output.
prompt

In [None]:
# Chat model used by the RAG chain. This rebinds `llm`, replacing the model
# created earlier in the notebook.
llm = ChatOpenAI(model="gpt-3.5-turbo", api_key=ashu_key)

In [None]:
# Build the RAG chain: the retriever fills {context} with up to 30 matches
# (k=30), the input string is passed through unchanged as {question}, and the
# filled prompt is sent to the LLM.
# (`prompt | llm` alone would be the chain without the retrieval step.)
retriever = vector_db.as_retriever(search_kwargs={"k": 30})
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm

In [None]:
# Ask the RAG chain for the distinct positions and print the answer text.
question = "Can you print distinct positions of employee?"
response = rag_chain.invoke(question)
print(response.content)

In [None]:
# Ask the RAG chain for the employee with the highest salary and print the answer text.
question = "Compare all salaries and find Who has the highest salary. share details of only that employee"
response = rag_chain.invoke(question)
print(response.content)

In [None]:
# A question that is not about employees, while the prompt restricts answers
# to the employee database context; print whatever the chain replies.
question = "Explain machine learning."
response = rag_chain.invoke(question)
print(response.content)

In [None]:
# An open-ended question about the employee data; print the answer text.
question = "Tell me something about employee."
response = rag_chain.invoke(question)
print(response.content)