## A simple LangChain system
Pipeline: semantic chunking -> retrieve the top-k chunks -> rerank them with Cohere to filter again

In [52]:
from langchain_experimental.text_splitter import SemanticChunker
# from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

import openai
import cohere

from dotenv import load_dotenv
import os
import sys
import numpy as np
import fitz
import google.generativeai as genai


In [3]:
# Load variables from a .env file (python-dotenv) before any key is read.
load_dotenv()

# OpenAI key from the environment; None when the variable is not set.
api_key = os.environ.get("OPENAI_API_KEY")

def read_pdf_to_string(path):
    """
    Read a PDF document and return the text of all its pages as one string.

    Args:
        path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The text of every page, concatenated in page order with no
        separator added between pages. An empty document yields "".
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(path) as doc:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in doc)

Text_splitter:
https://python.langchain.com/docs/how_to/semantic-chunker/

Embeddings: Hugging Face
https://python.langchain.com/docs/integrations/providers/huggingface/#huggingfaceembeddings

or OpenAI (paid API)
https://python.langchain.com/docs/integrations/text_embedding/openai/

In [None]:
# Sentence-embedding model, shared by the chunker and (later) the vector store.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Semantic chunker: breakpoints are chosen with the "percentile" rule at a
# threshold of 90.
text_splitter = SemanticChunker(
    embeddings=embeddings,
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=90,
)

### Split documents

In [None]:
# Location of the source PDF. Set PDF_PATH in the environment (or the .env file)
# to point at your own copy; the fallback keeps the original local path working.
path = os.getenv("PDF_PATH", "C:/Users/PC/Downloads/Understanding_Climate_Change.pdf")
content = read_pdf_to_string(path)   # Whole document as a single string

# Split the text into semantically coherent chunks (objects exposing .page_content).
docs = text_splitter.create_documents([content])
docs

### Embedding + storing the documents in vector space

Vectorstore:
https://python.langchain.com/docs/integrations/vectorstores/

In [44]:
# LangChain's FAISS wrapper defaults to an IndexFlatL2 index (Euclidean distance),
# so a smaller score means a closer match. With IndexFlatIP (inner product) the
# opposite holds: a larger score means a better match.
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)

### Retriever and similarity search

In [None]:
query = "What will happen if climate change happen?"

# Retriever that returns the 2 chunks most semantically similar to the query.
chunks_query_retriever = vectorstore.as_retriever(search_kwargs=dict(k=2))
best_suit_docs = chunks_query_retriever.invoke(query)

for chunk in best_suit_docs:
    print("Retrieved chunk:", chunk.page_content)


In [None]:
# Embed the query with the same model that embedded the chunks.
query_embedding = embeddings.embed_query(query)

# Query the raw FAISS index directly so the distances are returned as well.
# FAISS works on float32 matrices, so the dtype is set explicitly.
query_vector = np.asarray([query_embedding], dtype=np.float32)
scores, indices = vectorstore.index.search(query_vector, k=4)

# FAISS pads the result with index -1 when fewer than k vectors exist; drop those
# rows, otherwise docs[-1] would silently return the last chunk.
found = indices[0] >= 0
# NOTE(review): indexing `docs` by FAISS position assumes the store was built from
# `docs` in order with nothing added or removed since — confirm.
retrieved_docs = [docs[i] for i in indices[0][found]]
similarity_scores = scores[0][found]

# Display results (the smaller the score the better)
for doc, score in zip(retrieved_docs, similarity_scores):
    print(f"Similarity Score: {score}, Document: {doc.page_content}\n")

In [56]:
# Normalise the retrieved items to plain strings: take .page_content when the
# item has one, otherwise fall back to str(). Items that are already strings
# pass through unchanged, so the cell can be re-run safely.
plain_texts = []
for item in retrieved_docs:
    if hasattr(item, "page_content"):
        plain_texts.append(item.page_content)
    else:
        plain_texts.append(str(item))
retrieved_docs = plain_texts
retrieved_docs

 'Understanding Climate Change \nChapter 1: Introduction to Climate Change \nClimate change refers to significant, long-term changes in the global climate. The term \n"global climate" encompasses the planet\'s overall weather patterns, including temperature, \nprecipitation, and wind patterns, over an extended period. Over the past century, human \nactivities, particularly the burning of fossil fuels and deforestation, have significantly \ncontributed to climate change. Historical Context \nThe Earth\'s climate has changed throughout history. Over the past 650,000 years, there have \nbeen seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about \n11,700 years ago marking the beginning of the modern climate era and human civilization. Most of these climate changes are attributed to very small variations in Earth\'s orbit that \nchange the amount of solar energy our planet receives. During the Holocene epoch, which \nbegan at the end of the last ice age,

### Cohere Rerank

In [None]:
# Read the Cohere key from the environment (.env is loaded at the top of the
# notebook) instead of embedding it in the notebook. A key that has been stored
# in plain text in a shared file should be treated as compromised and revoked.
cohere_api_key = os.getenv("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError("COHERE_API_KEY is not set; add it to the environment or the .env file.")
co = cohere.Client(cohere_api_key)

# Re-score the retrieved chunks against the same query and keep the 3 most relevant.
response = co.rerank(
    model="rerank-v3.5",
    query=query,
    documents=retrieved_docs,
    top_n=3,
)
print(response)

In [None]:
# Results arrive ordered by relevance, so the position in the list is the rank;
# r.index is the chunk's position in the `retrieved_docs` list sent to Cohere.
for rank, r in enumerate(response.results, start=1):
    print(f"Rank: {rank}, Index: {r.index}, Score: {r.relevance_score:.4f}, Text: {retrieved_docs[r.index]}")

### Use an LLM to generate the response (optional)

In [None]:
# Read the Gemini key from the environment (.env is loaded at the top of the
# notebook) rather than hard-coding it in the notebook.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError("GOOGLE_API_KEY is not set; add it to the environment or the .env file.")
genai.configure(api_key=GOOGLE_API_KEY)
model_gen = genai.GenerativeModel("gemini-1.5-flash")

# Context = the chunks Cohere kept, in reranked order.
reranked_context = "\n\n".join(retrieved_docs[r.index] for r in response.results)

# Prompt = instructions + reranked context + the user's query.
# TODO: prepend any user information the answer should take into account.
prompt = (
    "Answer the question using only the context below. "
    "If the context is not sufficient, say so.\n\n"
    f"Context:\n{reranked_context}\n\n"
    f"Question: {query}"
)

# NOTE: this rebinds `response` (previously the Cohere rerank result), so re-run
# the rerank cell before executing this cell again.
response = model_gen.generate_content(prompt)