In [5]:
import openai
import faiss
import numpy as np
from langchain.document_loaders import PyPDFLoader


# Load environment variables (assuming .env is set up with OpenAI API key)
from dotenv import load_dotenv
import os

# Read the .env file first so the key lookup below can see it, then configure the OpenAI client.
# The key is None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Load a PDF and return its pages.

    Args:
        pdf_path: Location of the PDF to read; passed straight to ``PyPDFLoader``.

    Returns:
        list: The items produced by ``PyPDFLoader(pdf_path).load()``, in order.
        Later cells read ``.page_content`` from each item.
    """
    # Use the caller's path (previously a hard-coded file was always loaded) and a
    # single loader instance instead of an accidental one-element tuple of loaders.
    loader = PyPDFLoader(pdf_path)
    return list(loader.load())

# Example function to get embeddings
def get_embeddings(texts, model="text-embedding-ada-002"):
    """Request embeddings for ``texts`` and return them as a float32 matrix.

    Args:
        texts: Input forwarded as ``input`` to ``openai.Embedding.create``.
        model: Name of the embedding model to use.

    Returns:
        np.ndarray: ``float32`` array built from the ``"embedding"`` field of every
        entry in the response's ``"data"`` list, in response order.
    """
    api_result = openai.Embedding.create(input=texts, model=model)
    vectors = []
    for item in api_result["data"]:
        vectors.append(item["embedding"])
    # float32 is the dtype the FAISS index in this notebook is fed with.
    return np.array(vectors, dtype=np.float32)

# Embed and index the documents
def index_documents(docs):
    """Embed ``docs`` and load the vectors into a new flat L2 FAISS index.

    Args:
        docs: Texts handed to ``get_embeddings``.

    Returns:
        tuple: ``(index, embeddings)`` - the populated ``faiss.IndexFlatL2`` and the
        embedding matrix that was added to it.
    """
    vectors = get_embeddings(docs)
    # The index dimensionality is taken from the second axis of the embedding matrix.
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)
    return flat_index, vectors

# Example query function
def search_index(index, query, k=5):
    """Embed a single query and look up its ``k`` nearest entries in ``index``.

    Args:
        index: Object exposing ``search(vectors, k)`` (built by ``index_documents``
            in this notebook).
        query: Query text; wrapped in a one-element list before embedding.
        k: Number of neighbours to request.

    Returns:
        tuple: ``(distances, indices)`` as returned by ``index.search``.
    """
    query_vector = get_embeddings([query])
    result = index.search(query_vector, k)
    distances, indices = result
    return distances, indices

# Extract text from your PDF
pdf_path = "../dataset/about_me.pdf"
document_text = extract_text_from_pdf(pdf_path)

# extract_text_from_pdf returns a list of page objects, not a string, so slicing it in
# steps of 500 only nested those objects inside another list. Build the corpus from the
# text of each page instead: one entry per page, ready to be embedded.
docs = [page.page_content for page in document_text]

# # Index the documents
# index, embeddings = index_documents(docs)

# # Query the index
# query = "Example query text"
# distances, indices = search_index(index, query)

# # Print the results
# print("Top matches:")
# for i in range(len(indices[0])):
#     print(f"Document: {docs[indices[0][i]]}, Distance: {distances[0][i]}")


In [10]:
# Keep only the text of every loaded page, in page order.
docs = [page.page_content for page in document_text]

In [13]:
# Persist the per-page texts to disk; np.save converts the Python list to an array on write.
np.save("../dataset/docs.npy", docs)

In [16]:
# Read the saved array back; as the cell's last expression, the result is shown as the cell output.
np.load("../dataset/docs.npy")

array(['Mazi Prima Reza - Data Scientist  \nMazi Prima Reza is a skilled Data Scientist with over three years of experience in the data \nfield, currently working at Metrodata in Jakarta, Indonesia. Her professional journey is \nmarked by a strong background in handling diverse and end -to-end projects, ranging from \ndata analysis to the implementation of advanced generative AI technologies. Mazi is \ndedicated to building innovative solutions that automate repetitive tasks and enhance user \nexperiences through cutting -edge data analytics.  \nMetrodata , Jakarta,  Indonesia, Apr 2024 - Present  \nMetrodata is an information IT company in Indonesia. As a Data Scientist at a consultant \ncompany, Mazi has  been working with several leading companies and industries in Indonesia \nto build automation products involving Machine  Learning and Generative AI.  \nNotable Projects  \n1. HR Chatbot Development  - Led the development of a  internal company policies \nchatbot  in an Indonesia le