In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

False

In [2]:
os.environ['GOOGLE_API_KEY'] = os.getenv('GG_API_KEY')

In [3]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

1. Load document
2. Convert to text
3. Chunking
4. Embed
5. Store 
6. Retrieval

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
loader = PyPDFLoader('NguyenThienNhan_CV.pdf')
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100
)
chunks = text_splitter.split_documents(docs)

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings_model = HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
"""
!docker run \
    --name pgvector-container \
    -e POSTGRES_USER=langchain \
    -e POSTGRES_PASSWORD=langchain \
    -e POSTGRES_DB=langchain \
    -p 6024:5432 \
    -d pgvector/pgvector:pg16
"""

'\n!docker run     --name pgvector-container     -e POSTGRES_USER=langchain     -e POSTGRES_PASSWORD=langchain     -e POSTGRES_DB=langchain     -p 6024:5432     -d pgvector/pgvector:pg16\n'

In [7]:
conn = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"

In [8]:
from langchain_postgres.vectorstores import PGVector
import uuid

In [9]:
db = PGVector.from_documents(chunks, embeddings_model, connection=conn)

In [10]:
db.similarity_search("What is education of this applicant",k=4)

[Document(id='1d051d5f-560d-41ab-9005-d78f0b23f578', metadata={'page': 0, 'title': "Nhan's CV", 'author': 'Thien Nhan', 'source': 'NguyenThienNhan_CV.pdf', 'creator': 'LaTeX with RenderCV', 'moddate': '2025-06-25T12:48:35+00:00', 'subject': '', 'trapped': '/False', 'keywords': '', 'producer': 'pdfTeX-1.40.26', 'page_label': '', 'total_pages': 1, 'creationdate': '2025-06-25T12:48:35+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.26 (TeX Live 2024) kpathsea version 6.4.0'}, page_content='Nguyen Thien Nhan\nAI Engineer\nHo Chi Minh | nhanthien.tnn@gmail.com | 30/11/2003 | 0328980403 | github.com/nhanth301| Linkedin\nSummary\nAnalytical AI Engineer with a passion for rigorous research and creative problem-solving. Skilled in transforming complex\nrequirements into innovative, data-driven solutions. Seeking to apply these abilities to solve challenging real-world problems.\nSkills\nLanguages & Frameworks:C++, Python, SQL, PyTorch, Scikit-learn, NumPy , Pandas, Open

In [11]:
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import chain
template = PromptTemplate.from_template("""
    Answer the question based on the context provided
    Context: {context}
    Question: {question}
    Answer:
""")
@chain 
def chat_with_docs(question):
    topk_docs = db.similarity_search(question, k=2)
    context = ""
    for doc in topk_docs:
        context += '\n' + doc.page_content 
    print(context)
    print("=============")
    prompt = template.invoke({'context' : context, 'question' : question})
    return llm.invoke(prompt)

In [12]:
print(chat_with_docs.invoke("Describe briefly about Smart Public Library System project of the applicant").content)


content preservation across multiple, sequential style applications.
• Outperformed pure Mamba (MambaST) and Transformer (StyTr²)architectures in content preservation, achieving a
superior Content Feature Structural Distance (CFSD) of 0.2551.
• Dominated serial style transfer benchmarksby reducing perceptual error (LPIPS) by 61.4% and improving structural
similarity (SSIM) by 48.9% compared to previous works.
Smart Public Library System[GitHub]
Oct 2024 - Dec 2024
• Developed an end-to-end smart library systemfeaturing a real-time recommendation engine using Reinforcement
Learning (REINFORCE) and Contrastive Learning to personalize user experience.
• Engineered a real-time data pipelinewith Apache Kafka and Quix Stream to process live user interactions, feeding
dynamic data into the recommendation model.
• Implemented a smart chatbot using Retrieval-Augmented Generation (RAG)to provide users with intuitive,
context-aware book suggestions based on natural language queries.
content pres

In [13]:
from pydantic import BaseModel
class Answer(BaseModel):
    """An answer as brief summary along with rating"""
    summary : str
    """Summary of a project"""
    technologies : str
    """the technologies have been used in the project"""
structured_llm = llm.with_structured_output(Answer)

In [14]:
@chain 
def chat_with_docs(question):
    topk_docs = db.similarity_search(question, k=2)
    context = ""
    for doc in topk_docs:
        context += '\n' + doc.page_content 
    prompt = template.invoke({'context' : context, 'question' : question})
    return structured_llm.invoke(prompt)

In [15]:
result = chat_with_docs.invoke("Describe briefly about Smart Public Library System project of the applicant")
print(result.summary)
print(result.technologies)

The applicant developed an end-to-end smart library system featuring a real-time recommendation engine using Reinforcement Learning (REINFORCE) and Contrastive Learning to personalize user experience. They also engineered a real-time data pipeline with Apache Kafka and Quix Stream to process live user interactions, feeding dynamic data into the recommendation model. Additionally, they implemented a smart chatbot using Retrieval-Augmented Generation (RAG) to provide users with intuitive, context-aware book suggestions based on natural language queries.
Reinforcement Learning (REINFORCE), Contrastive Learning, Apache Kafka, Quix Stream, Retrieval-Augmented Generation (RAG


In [17]:
from langchain.indexes import SQLRecordManager, index
from langchain.docstore.document import Document

In [18]:
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"
collection_name = "my_docs"
embeddings_model = HuggingFaceEmbeddings()
namespace = "my_docs_namespace"
	
vectorstore = PGVector(
    embeddings=embeddings_model,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)
	
record_manager = SQLRecordManager(
    namespace,
    db_url="postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)

record_manager.create_schema()

In [19]:
docs = [
    Document(page_content='there are cats in the pond', metadata={
        "id": 1, "source": "cats.txt"}),
    Document(page_content='ducks are also found in the pond', metadata={
        "id": 2, "source": "ducks.txt"}),
]

In [20]:
# Index the documents
index_1 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",  # prevent duplicate documents
    source_id_key="source",  # use the source field as the source_id
)
	
print("Index attempt 1:", index_1)

Index attempt 1: {'num_added': 2, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}


In [21]:
# second time you attempt to index, it will not add the documents again
index_2 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)
	
print("Index attempt 2:", index_2)

Index attempt 2: {'num_added': 0, 'num_updated': 0, 'num_skipped': 2, 'num_deleted': 0}


In [22]:
docs[0].page_content = "I just modified this document!"
	
index_3 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup="incremental",
    source_id_key="source",
)
	
print("Index attempt 3:", index_3)

Index attempt 3: {'num_added': 1, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 1}


In [25]:
docs = [Document(page_content='there are dogs in the pond', metadata={"id": 1, "source": "dogs.txt"})]

index_4 = index(
    docs,
    record_manager,
    vectorstore,
    cleanup='full',
    source_id_key='source'
)
print(index_4)

{'num_added': 1, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 2}
