In [2]:
import os
from dotenv import load_dotenv
import numpy as np
import faiss
import json
from pathlib import Path
from pypdf import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from tqdm.notebook import tqdm

import pandas as pd
from typing import Optional, List, Tuple
#from datasets import Dataset
import matplotlib.pyplot as plt

from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

In [3]:
# Load variables from a local .env file into the environment; a later cell reads
# MISTRAL_API_KEY from os.environ.
load_dotenv()

True

# load and chunk

In [4]:
# Concatenate the extracted text of every page of every PDF found in data/.
# Sorting into a list makes the concatenation order deterministic and lets
# `pdf_files` be inspected again afterwards (a bare glob generator is single-use).
pdf_files = sorted(Path("data").glob("*.pdf"))
text = ""

for pdf_file in pdf_files:
    reader = PdfReader(pdf_file)
    # The page loop has to be nested inside the file loop: when it sat at top level
    # only the last PDF's pages were read, and an empty data/ folder raised a
    # NameError because `reader` was never assigned.
    for page in reader.pages:
        # Separate pages with a blank line so the splitter can break on "\n\n".
        text += page.extract_text() + "\n\n"

In [5]:
# Hierarchical list of separators tailored for splitting Markdown documents, tried in order
# from the coarsest (headings) to the finest (single characters).
# This list is taken from LangChain's MarkdownTextSplitter class.
# Several entries are regular expressions (e.g. "\n#{1,6} " or "\n---+\n"), so the splitter
# below must be told to interpret them as such.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # The maximum number of characters in a chunk: we selected this value arbitrarily
    chunk_overlap=40,  # The number of characters to overlap between chunks
    add_start_index=True,  # If `True`, includes chunk's start index in metadata
    strip_whitespace=True,  # If `True`, strips whitespace from the start and end of every document
    separators=MARKDOWN_SEPARATORS,
    # Without this flag the patterns above are escaped and matched literally, so the
    # heading / horizontal-rule separators would never match anything.
    is_separator_regex=True,
)

# `split_text` returns a list of plain strings (one per chunk).
docs_processed = text_splitter.split_text(text)

In [6]:
# Sanity check: number of chunks produced by the splitter and the container type.
len(docs_processed), type(docs_processed)

(96, list)

# embed and store to VDB for free

In [7]:
# Same sanity check as in the previous cell, repeated before the embedding step.
len(docs_processed), type(docs_processed)

(96, list)

In [1]:
# To get the model's maximum sequence length, we can query the `SentenceTransformer` object of the embedding model that will encode the chunks (it is not part of the RecursiveCharacterTextSplitter)
#print(f"Model's maximum sequence length: {SentenceTransformer('thenlper/gte-small').max_seq_length}")

In [8]:
# Model id passed as `model_name` to HuggingFaceEmbeddings in the next cell.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"

In [9]:
# Local sentence-transformers model used to embed the chunks.
# - No explicit `device`: the underlying SentenceTransformer then picks a GPU when one is
#   available and falls back to CPU otherwise, instead of failing on machines without CUDA.
# - `multi_process=False`: a worker pool is unnecessary for a corpus this small.
#   NOTE(review): in some langchain_community versions the multi-process path does not
#   forward `encode_kwargs`, which would silently skip the normalisation below — confirm
#   before re-enabling it.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=False,
    encode_kwargs={"normalize_embeddings": True},  # Set `True` for cosine similarity
)

# `docs_processed` is a list of plain strings (it comes from `split_text`), so the store is
# built with `from_texts`; `from_documents` expects Document objects and fails on strings.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_texts(
    docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

: 

In [None]:
# Token length of every chunk under the embedding model's own tokenizer, to compare against
# the model's maximum sequence length.
# Reuse EMBEDDING_MODEL_NAME so the tokenizer always matches the embedding model.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
# The chunks are plain strings (output of `split_text`), so they are encoded directly;
# they have no `.page_content` attribute.
lengths = [len(tokenizer.encode(doc)) for doc in tqdm(docs_processed)]

In [None]:
# `chunks` was never assigned anywhere earlier in this notebook, so on a fresh kernel this
# cell raised NameError. Use the text chunks produced by the splitter.
# NOTE(review): confirm that `docs_processed` is the intended source for `chunks`.
chunks = list(docs_processed)

# Create a dictionary to store chunks keyed by their position.
# json.dump writes the integer keys as strings, which is why the lookups done after
# reloading the file use string keys.
chunk_dict = {i: chunk for i, chunk in enumerate(chunks)}

# Save chunks to a JSON file
with open("data/cv_chunks.json", "w", encoding="utf-8") as f:
    json.dump(chunk_dict, f)

print(f"Number of chunks: {len(chunks)}")

In [None]:
# Read the Mistral API key from the environment (raises KeyError if it is not set) and
# create the client used by `embed`.
# NOTE(review): `MistralClient` is not imported in the import cell at the top of this
# notebook — confirm the right import and add it there.
api_key = os.environ["MISTRAL_API_KEY"]
client = MistralClient(api_key=api_key)

def embed(input: str):
    """Return the `mistral-embed` embedding of `input`.

    Sends `input` to `client.embeddings` and returns the `embedding` attribute of the
    first entry in the response's `data`.
    """
    response = client.embeddings("mistral-embed", input=input)
    first_entry = response.data[0]
    return first_entry.embedding


# Embed every chunk (one `embed` call per chunk) and stack the vectors into a 2-D array.
# faiss works on float32 matrices; building the array from Python floats would otherwise
# give float64, which older faiss Python wrappers reject.
embeddings = np.array([embed(chunk) for chunk in chunks], dtype="float32")
# Embedding dimensionality = number of columns of the stacked array.
dimension = embeddings.shape[1]

In [None]:
# Expected to be (number of chunks, embedding dimension).
embeddings.shape

In [None]:
# Build a flat L2 faiss index sized to the embedding dimensionality and add all chunk vectors.
# NOTE(review): a later section heading mentions querying "w/ COS", but this index is an
# L2 one — confirm which metric is intended.
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(embeddings)

In [None]:
# Persist the index next to the chunk JSON so the query cells can reload it later.
faiss.write_index(index, "data/vector_cv.index")
print(f"Index saved with {len(embeddings)} embeddings.")

# Read VDB, query w/ COS

In [4]:
# Restore the persisted FAISS index from disk.
index = faiss.read_index("data/vector_cv.index")

# Restore the chunk texts; JSON object keys come back as strings ("0", "1", ...).
chunk_dict = json.loads(Path("data/cv_chunks.json").read_text(encoding="utf-8"))

# Report how many chunks were read back as a quick sanity check.
print(f"Loaded {len(chunk_dict)} chunks.")

Loaded 73 chunks.


# query test w/ MISTRAL

In [6]:
# Re-create the Mistral client so the query section can run without the indexing cells.
# NOTE(review): `MistralClient` is not imported in the import cell at the top of this
# notebook — confirm the right import and add it there.
api_key = os.environ["MISTRAL_API_KEY"]
client = MistralClient(api_key=api_key)
def embed(input: str):
    """Return the `mistral-embed` embedding of `input`.

    Sends `input` to `client.embeddings` and returns the `embedding` attribute of the
    first entry in the response's `data`.
    """
    response = client.embeddings("mistral-embed", input=input)
    first_entry = response.data[0]
    return first_entry.embedding

In [7]:
# Example query embedding, shaped (1, dimension) because faiss searches a batch of queries.
# float32 is requested explicitly: faiss works on float32 matrices, and an array built from
# Python floats would otherwise be float64.
query_text = "Does Antoine speak spanish?"
query_embedding = np.array([embed(query_text)], dtype="float32")

MistralAPIException: Status: 403. Message: {"message":"Inactive subscription or usage limit reached"}

In [8]:
# Query the index for the two nearest chunks: D holds the distances, I the chunk ids.
D, I = index.search(query_embedding, k=2)

# Map the ids of the (single) query row back to their text; the keys of the reloaded
# JSON dictionary are strings.
retrieved_chunks = [chunk_dict[str(i)] for i in I[0]]

print("Retrieved chunks:")
for chunk in retrieved_chunks:
    # Each chunk is followed by a separator line.
    print(chunk, "-----", sep="\n")

NameError: name 'query_embedding' is not defined

# query function w/ MISTRAL

In [9]:
# Prompt template filled in by `ask` via str.format: `{context}` receives the retrieved
# chunks and `{query}` the user's question.
prompt = """
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query}
Answer:
"""


def ask(query: str, index, chunk_dict, k: int = 2, model: str = "mistral-medium"):
    """Answer `query` from the `k` chunks nearest to it in `index`.

    Args:
        query: The user's question.
        index: Object whose `search(vectors, k=...)` returns `(distances, ids)`, e.g. the
            faiss index loaded above.
        chunk_dict: Mapping from chunk id, as a string, to chunk text.
        k: Number of nearest chunks to put in the prompt (default 2, as before).
        model: Chat model name passed to `client.chat` (default "mistral-medium", as before).

    Returns:
        The `content` of the first choice's message in the chat response.

    NOTE(review): `ChatMessage` is not imported in the import cell at the top of this
    notebook — confirm the right import and add it there.
    """
    # faiss searches a batch of float32 vectors, hence the extra list level and the dtype.
    embedding = np.array([embed(query)], dtype="float32")

    _, indexes = index.search(embedding, k=k)
    # faiss pads the ids with -1 when it finds fewer than k neighbours; skip those
    # instead of failing on a "-1" key.
    retrieved = [chunk_dict[f"{i}"] for i in indexes.tolist()[0] if i != -1]

    # Join the chunks into plain text; formatting the list itself would put its Python
    # repr (brackets, quotes, escaped newlines) into the prompt.
    context = "\n\n".join(retrieved)

    user_message = prompt.format(context=context, query=query)

    messages = [ChatMessage(role="user", content=user_message)]
    chat_response = client.chat(model=model, messages=messages)
    return chat_response.choices[0].message.content

In [10]:
# Example question answered from the indexed chunks; the answer is displayed as the cell output.
ask("What work experience does he have?", index, chunk_dict)

MistralAPIException: Status: 403. Message: {"message":"Inactive subscription or usage limit reached"}