In [12]:
from USOSDataLoader import USOSDataLoader
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents.base import Document
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from dotenv import load_dotenv, find_dotenv
from pydantic import BaseModel, Field
import re
import os
import time
from uuid import uuid4

# Populate os.environ from a .env file located by find_dotenv(); the Pinecone client
# below reads PINECONE_API_KEY from the environment.
load_dotenv(find_dotenv())

True

In [2]:
# Build the corpus with the project-local USOSDataLoader. Per the progress output of
# this cell it fetches links, loads web and PDF data, then preprocesses the documents.
documents = USOSDataLoader().get_documents()

Fetching links...


Fetching pdf links...: 100%|██████████| 34/34 [00:39<00:00,  1.18s/it]


Loading web data...


Loading pdf data...: 100%|██████████| 47/47 [00:30<00:00,  1.56it/s]
Preprocessing documents...: 100%|██████████| 81/81 [00:00<00:00, 3559.52it/s]


In [8]:
# Embedding model used by the vector store. trust_remote_code is enabled because the
# model is loaded with custom code from the hub; every encode call uses the
# "retrieval.query" task.
embeddings = HuggingFaceEmbeddings(
    model_name="jinaai/jina-embeddings-v3",
    model_kwargs={"trust_remote_code": True},
    encode_kwargs={"task": "retrieval.query"},
)

# Pinecone client; the API key is taken from the environment (None if it is not set).
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [11]:
index_name = "usos-bot-questions"
existing_indexes = [info["name"] for info in pc.list_indexes()]

# Create the serverless index on first use only, then block until Pinecone reports it ready.
if index_name not in existing_indexes:
    # NOTE(review): dimension must equal the embedding size — presumably 1024 for
    # jina-embeddings-v3; confirm if the model is ever swapped.
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

# Handle to the index plus the LangChain wrapper that embeds documents on insert.
index = pc.Index(index_name)
vectorstore = PineconeVectorStore(index=index, embedding=embeddings)

In [13]:
# Structured-output schema passed to llm.with_structured_output() in generate_questions.
# NOTE(review): documented with comments rather than a class docstring on purpose — a
# docstring would presumably become part of the schema sent to the model; confirm
# before adding one.
class QuestionList(BaseModel):
    # Required field (Field(...)) holding the generated questions as plain strings.
    question_list: list[str] = Field(..., title="List of questions generated for the document or fragment")


def clean_and_filter_questions(questions: list[str]) -> list[str]:
    """Normalise raw LLM questions and drop anything that is not a question.

    Each entry is stripped of surrounding whitespace and of a leading
    ``"<digits>."`` enumeration prefix; only entries that then end with ``?``
    are kept. Input order is preserved.

    Args:
        questions: Raw question strings as returned by the model.

    Returns:
        The cleaned questions that end with a question mark.
    """
    numbering_prefix = re.compile(r'^\d+\.\s*')
    candidates = (numbering_prefix.sub('', raw.strip()) for raw in questions)
    return [candidate for candidate in candidates if candidate.endswith('?')]


def generate_questions(text: str, n_questions: int) -> list[str]:
    """Ask the LLM for questions answerable from ``text`` and return the unique, cleaned ones.

    The prompt is piped into a Groq chat model configured for structured output
    with the ``QuestionList`` schema; the returned questions are passed through
    ``clean_and_filter_questions`` and de-duplicated.

    Args:
        text: Context the questions must be answerable from.
        n_questions: Minimum number of questions requested in the prompt
            (the model may return more or fewer).

    Returns:
        Unique questions in the order the model produced them.
    """
    llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0)
    prompt = PromptTemplate(
        input_variables=["context", "num_questions"],
        template="Using the context data: {context}\n\nGenerate a list of at least {num_questions} "
                 "possible questions that can be asked about this context. Ensure the questions are "
                 "directly answerable within the context and do not include any answers or headers. "
                 "The questions should be in the same language as the context. "
                 "Separate the questions with a new line character."
    )
    chain = prompt | llm.with_structured_output(QuestionList)
    result = chain.invoke({"context": text, "num_questions": n_questions})

    filtered_questions = clean_and_filter_questions(result.question_list)
    # dict.fromkeys removes duplicates while keeping first-seen order. list(set(...))
    # returned the questions in hash-randomised order, so the indexes assigned by the
    # caller differed from run to run.
    return list(dict.fromkeys(filtered_questions))


def split_document(document: str, chunk_size: int, chunk_overlap: int) -> list[str]:
    """Split ``document`` into overlapping chunks of word tokens.

    Tokens are runs of word characters (``\\w+``); punctuation is discarded and
    the tokens of each chunk are re-joined with single spaces. Consecutive
    chunks share ``chunk_overlap`` tokens. Splitting stops as soon as a chunk
    reaches the end of the token list, so no trailing chunk that is fully
    contained in the previous one is emitted.

    Args:
        document: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Tokens shared between neighbouring chunks; must satisfy
            ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The chunk strings; empty when ``document`` contains no word tokens.

    Raises:
        ValueError: If the size/overlap combination cannot advance through the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A stride <= 0 used to yield either an opaque range() error or, worse, an empty
    # result that silently dropped the whole document; a negative overlap skips tokens.
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} for chunk_size {chunk_size}"
        )

    tokens = re.findall(r'\b\w+\b', document)
    stride = chunk_size - chunk_overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        if start + chunk_size >= len(tokens):
            break
    return chunks

def print_document(comment: str, document: Document) -> None:
    """Print a one-line summary of ``document`` prefixed with ``comment``.

    The line shows the ``type`` and ``index`` metadata entries followed by the
    page content; a missing metadata key raises ``KeyError``.
    """
    summary = f'{comment} (type: {document.metadata["type"]}, index: {document.metadata["index"]}): {document.page_content}'
    print(summary)

In [18]:
def add_document(document, chunk_size, chunk_overlap, i, counter):
    """Chunk one document, generate questions per chunk, and store everything in the vector store.

    For every chunk an ``ORIGINAL`` document (the chunk itself) and one
    ``AUGMENTED`` document per generated question are created; all of them
    carry the chunk in ``metadata["text"]``. The batch is printed and then
    added to ``vectorstore`` under fresh UUIDs.

    Args:
        document: Source document; ``page_content`` and ``metadata["source"]`` are read.
        chunk_size: Tokens per chunk, forwarded to ``split_document``.
        chunk_overlap: Overlapping tokens between chunks, forwarded to ``split_document``.
        i: Position of the document in the corpus (used for log lines only).
        counter: First free value for the running ``index`` metadata field.

    Returns:
        The next free ``index`` value, to be passed in for the following document.
    """
    knowledge_base = []
    text_fragments = split_document(document.page_content, chunk_size, chunk_overlap)
    print(f"Document {i} - split into {len(text_fragments)}")
    for j, fragment in enumerate(text_fragments):
        knowledge_base.append(
            Document(
                page_content=fragment,
                metadata=dict(type="ORIGINAL", index=counter, source=document.metadata["source"], text=fragment)
            )
        )
        # Reserve the index just used by the ORIGINAL chunk; without this the first
        # question below would be stored under the same index.
        counter += 1
        # Generate from the fragment, not the whole document: the questions are stored
        # with text=fragment, so they must be answerable from that fragment.
        questions = generate_questions(fragment, n_questions=20)
        knowledge_base.extend(
            Document(page_content=question,
                     metadata={"type": "AUGMENTED", "index": counter + idx, "text": fragment})
            for idx, question in enumerate(questions)
        )
        counter += len(questions)
        print(f'Text document {i} Text fragment {j} - generated: {len(questions)} questions')
    for doc in knowledge_base:
        print_document("Dataset", doc)

    # A document without any word tokens yields no chunks; skip the empty upsert.
    if knowledge_base:
        uuids = [str(uuid4()) for _ in range(len(knowledge_base))]
        vectorstore.add_documents(documents=knowledge_base, ids=uuids)

    # The caller rebinds its counter to this value; returning nothing left it as None.
    return counter


def process_documents(documents: list[Document], chunk_size: int, chunk_overlap: int,
                      retry_delay: float = 120, max_retries: int = 1):
    """Run ``add_document`` over every document, retrying failures after a pause.

    The running ``counter`` starts at 0 and is rebound to whatever
    ``add_document`` returns after each document.

    Args:
        documents: Documents to chunk, augment and index.
        chunk_size: Tokens per chunk, forwarded to ``add_document``.
        chunk_overlap: Overlapping tokens between chunks, forwarded to ``add_document``.
        retry_delay: Seconds to wait before retrying a failed document.
        max_retries: Number of retries per document after the first failure.

    Raises:
        Exception: Whatever ``add_document`` raised on the final attempt.
    """
    counter = 0
    for i, document in enumerate(documents):
        for attempt in range(max_retries + 1):
            try:
                counter = add_document(document, chunk_size, chunk_overlap, i, counter)
                break
            except Exception as e:
                if attempt == max_retries:
                    raise
                # Report the failure instead of swallowing it, so a long pause in the
                # output can be traced back to its cause.
                print(f"Document {i} failed on attempt {attempt + 1}: {e!r} - retrying in {retry_delay}s")
                time.sleep(retry_delay)

In [None]:
process_documents(documents, 1000, 200)

Document 0 - split into 2
Text document 0 Text fragment 0 - generated: 21 questions
