In [3]:
from USOSDataLoader import USOSDataLoader
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.documents.base import Document
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from dotenv import load_dotenv, find_dotenv
from pydantic import BaseModel, Field
import re
import os
import time
from uuid import uuid4
import math

load_dotenv(find_dotenv())

True

In [4]:
# Collect the source documents through the project-local USOSDataLoader.
# Later cells read each item's `.page_content` and `.metadata["source"]`.
documents = USOSDataLoader().get_documents()

Fetching links...


Fetching pdf links...: 100%|██████████| 34/34 [01:40<00:00,  2.95s/it]


Loading web data...


Loading pdf data...: 100%|██████████| 47/47 [00:27<00:00,  1.69it/s]
Preprocessing documents...: 100%|██████████| 81/81 [00:00<00:00, 3708.61it/s]


In [5]:
# Embedding model used by the vector store for everything it indexes and queries.
# NOTE(review): the Pinecone index in this notebook is created with dimension=1024 —
# confirm that matches this model's output size.
embeddings = HuggingFaceEmbeddings(
    model_name="jinaai/jina-embeddings-v3",
    model_kwargs=dict(trust_remote_code=True),
    encode_kwargs=dict(task="retrieval.query"),
)

# The API key is taken from the environment (populated from .env by load_dotenv).
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [6]:
index_name = "usos-bot-questions"
existing_indexes = [description["name"] for description in pc.list_indexes()]

# Create the serverless index on first use only; later runs reuse the existing one.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll once per second until the freshly created index reports itself ready.
    while True:
        if pc.describe_index(index_name).status["ready"]:
            break
        time.sleep(1)

index = pc.Index(index_name)

# LangChain wrapper that embeds documents with `embeddings` and stores them in `index`.
vectorstore = PineconeVectorStore(index=index, embedding=embeddings)

In [7]:
# Structured-output schema: `llm_chain` passes this model to
# `llm.with_structured_output`, and `generate_questions` reads `.question_list`
# from the returned instance.
class QuestionList(BaseModel):
    # Required field (no default) holding the generated questions as plain strings.
    question_list: list[str] = Field(..., title="List of questions generated for the document or fragment")


def clean_and_filter_questions(questions: list[str]) -> list[str]:
    """Normalise generated questions and drop anything that is not a question.

    Each entry is stripped of surrounding whitespace and of a leading
    ``"<digits>."`` list marker; only entries that then end with ``?`` are
    returned, in their original order.
    """
    leading_number = re.compile(r'^\d+\.\s*')
    candidates = (leading_number.sub('', raw.strip()) for raw in questions)
    return [candidate for candidate in candidates if candidate.endswith('?')]


def llm_chain(llm, text, n_questions, prompt):
    """Pipe `prompt` into `llm` (constrained to the `QuestionList` schema) and invoke it.

    `text` is supplied to the prompt as ``context`` and `n_questions` as
    ``num_questions``. Returns whatever the chain's ``invoke`` returns.
    """
    structured_llm = llm.with_structured_output(QuestionList)
    payload = dict(context=text, num_questions=n_questions)
    return (prompt | structured_llm).invoke(payload)


def generate_questions(text: str, n_questions) -> list[str]:
    """Ask the LLM for questions that `text` can answer.

    The first attempt asks for questions in the language of the context. If
    that call raises, the failure is reported and a second attempt is made with
    a prompt that explicitly requests Polish. An exception from the second
    attempt propagates to the caller.

    Args:
        text: Context passage the questions must be answerable from.
        n_questions: Minimum number of questions requested from the model.

    Returns:
        The cleaned questions (see `clean_and_filter_questions`) with
        duplicates removed, in the order the model produced them.
    """

    def build_prompt(language_rule: str) -> PromptTemplate:
        # The two prompts differ only in the sentence that fixes the output language.
        return PromptTemplate(
            input_variables=["context", "num_questions"],
            template=(
                "Using the context data: {context}\n\nGenerate a list of at least {num_questions} "
                "possible questions that can be asked about this context. Ensure the questions are "
                "directly answerable within the context and do not include any answers or headers. "
                + language_rule
                + " Separate the questions with a new line character."
            ),
        )

    llm = ChatGroq(model="llama-3.1-70b-versatile", temperature=0)

    try:
        result = llm_chain(
            llm, text, n_questions,
            build_prompt("The questions should be in the same language as the context."),
        )
    except Exception as e:
        # Report the failure instead of discarding it, then retry with the Polish prompt.
        print(f"Question generation with the primary prompt failed ({e!r}); "
              "retrying with the Polish-language prompt")
        result = llm_chain(
            llm, text, n_questions,
            build_prompt("The questions should be in the Polish language."),
        )

    filtered_questions = clean_and_filter_questions(result.question_list)
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is reproducible (a set would return the questions in arbitrary order).
    return list(dict.fromkeys(filtered_questions))


def split_document(document: str, chunk_size: int, chunk_overlap: int) -> list[str]:
    """Split `document` into overlapping chunks of whitespace-joined words.

    Words are extracted with the regex ``\\b\\w+\\b``, so punctuation and the
    original spacing are not preserved in the chunks.

    Args:
        document: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        chunk_overlap: Number of words shared by consecutive chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        The chunks in document order; an empty list when `document` contains
        no words.

    Raises:
        ValueError: If the size/overlap combination cannot produce a forward
            moving window.
    """
    if chunk_size <= 0 or not 0 <= chunk_overlap < chunk_size:
        # A non-positive step would either make range() fail or silently yield
        # no chunks at all, and a negative overlap would skip words.
        raise ValueError(
            f"chunk_size must be positive and 0 <= chunk_overlap < chunk_size "
            f"(got chunk_size={chunk_size}, chunk_overlap={chunk_overlap})"
        )

    tokens = re.findall(r'\b\w+\b', document)
    step = chunk_size - chunk_overlap
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(" ".join(tokens[start:start + chunk_size]))
        # Stop once a chunk reaches the end so no trailing chunk made only of
        # overlap words is emitted.
        if start + chunk_size >= len(tokens):
            break
    return chunks


def print_document(comment: str, document: Document) -> None:
    """Print one line with `comment`, the document's type/index metadata and its text."""
    metadata = document.metadata
    doc_type, doc_index = metadata["type"], metadata["index"]
    print(f'{comment} (type: {doc_type}, index: {doc_index}): {document.page_content}')

In [2]:
def parse_sleep_time(raw_sleep):
    """Convert a wait-time string such as ``"1m26.4s"``, ``"7.5s"``, ``"2m"`` or ``"350ms"`` to seconds.

    Args:
        raw_sleep: Text containing minutes (``<int>m``), seconds
            (``<number>s``) and/or milliseconds (``<number>ms``).

    Returns:
        The total wait time in seconds as a float.

    Raises:
        ValueError: If `raw_sleep` contains none of the recognised parts.
    """
    SECONDS_IN_MINUTE = 60
    MILLISECONDS_IN_SECOND = 1000
    number = r"\d+(?:\.\d+)?"

    # "m" only means minutes when it is not the start of "ms".
    mins_match = re.search(r"\d+(?=m(?!s))", raw_sleep)
    # Digits directly followed by "s"; in "350ms" the digits are followed by "m",
    # so milliseconds are not mistaken for seconds.
    seconds_match = re.search(number + r"(?=s)", raw_sleep)
    millis_match = re.search(number + r"(?=ms)", raw_sleep)

    if not (mins_match or seconds_match or millis_match):
        raise ValueError(f"No wait time found in {raw_sleep!r}")

    # Every part is optional, so a missing one contributes nothing to the total.
    total_seconds = 0.0
    if mins_match:
        total_seconds += SECONDS_IN_MINUTE * int(mins_match.group(0))
    if seconds_match:
        total_seconds += float(seconds_match.group(0))
    if millis_match:
        total_seconds += float(millis_match.group(0)) / MILLISECONDS_IN_SECOND
    return total_seconds

In [8]:
def add_document(fragment, i, j, counter, source):
    """Index one text fragment together with LLM-generated questions about it.

    The fragment is stored as an ``ORIGINAL`` entry and every generated
    question as an ``AUGMENTED`` entry whose ``orig_text`` metadata carries the
    fragment. All entries are printed and then written to `vectorstore` under
    fresh UUIDs.

    Args:
        fragment: Text chunk to index.
        i: Position of the parent document (used in the progress message only).
        j: Position of the fragment within its document (progress message only).
        counter: Running index value assigned to the entries' ``index`` metadata.
        source: Stored as the ``source`` metadata of the ``ORIGINAL`` entry.

    Returns:
        `counter` advanced by the number of generated questions.
    """
    original_entry = Document(
        page_content=fragment,
        metadata={"type": "ORIGINAL", "index": counter, "source": source, "orig_text": fragment},
    )

    generated = generate_questions(fragment, n_questions=20)
    # Fixed pause after every generation call, before anything is written.
    print("SLEEPING AFTER QUESTION GENERATION")
    time.sleep(60)

    # NOTE(review): the first AUGMENTED entry gets the same `index` as the ORIGINAL
    # entry, and AUGMENTED entries carry no `source` — confirm both are intended.
    entries = [original_entry]
    for offset, question in enumerate(generated):
        entries.append(Document(
            page_content=question,
            metadata={"type": "AUGMENTED", "index": counter + offset, "orig_text": fragment},
        ))

    print(f'Text document {i} Text fragment {j} - generated: {len(generated)} questions')
    for entry in entries:
        print_document("Dataset", entry)

    entry_ids = [str(uuid4()) for _ in entries]
    vectorstore.add_documents(documents=entries, ids=entry_ids)

    return counter + len(generated)


def process_documents(documents: list[Document], chunk_size: int, chunk_overlap: int):
    """Split every document into fragments and index each fragment via `add_document`.

    When indexing a fragment raises, the error is printed and the fragment is
    retried once. If the error text contains a wait time such as ``1m26.4s``,
    ``8.09s``, ``2m`` or ``939ms``, the retry is delayed by that amount first.
    An exception raised by the retry propagates to the caller.

    Args:
        documents: Documents to process; each needs ``page_content`` and a
            ``source`` entry in its ``metadata``.
        chunk_size: Passed to `split_document`.
        chunk_overlap: Passed to `split_document`.
    """
    # Back-off used when a wait time is present but cannot be converted to seconds.
    default_retry_delay = 60
    # A standalone duration token. The lookbehind/lookahead stop it from matching
    # inside longer identifiers or numbers (e.g. "3m" inside "q3m7").
    wait_time_pattern = re.compile(
        r"(?<![\w.])(?:\d+m(?!s)(?:\d+(?:\.\d+)?s)?|\d+(?:\.\d+)?m?s)(?!\w)"
    )

    counter = 0
    for i, document in enumerate(documents):
        text_fragments = split_document(document.page_content, chunk_size, chunk_overlap)
        # Reported once per document rather than once per fragment.
        print(f"Document {i} - split into {len(text_fragments)}")
        for j, fragment in enumerate(text_fragments):
            try:
                counter = add_document(fragment, i, j, counter, source=document.metadata["source"])
            except Exception as e:
                print(e)
                # Exceptions are not subscriptable; search their text form instead.
                wait_time = wait_time_pattern.search(str(e))
                if wait_time:
                    try:
                        delay = parse_sleep_time(wait_time.group(0))
                    except (TypeError, ValueError):
                        # The wait time could not be converted; use a fixed
                        # back-off rather than aborting the whole run.
                        delay = default_retry_delay
                    time.sleep(delay)
                # if no match - let it crash
                counter = add_document(fragment, i, j, counter, source=document.metadata["source"])

In [10]:
# Index the documents from position 37 onward, using 1000-word chunks that overlap by 200 words.
process_documents(documents[37:], chunk_size=1000, chunk_overlap=200)

Document 23 - split into 1
Text document 23 Text fragment 0 - generated: 20 questions
Dataset (type: ORIGINAL, index: 0): Instrukcja uzupełniania dokumentów tożsamości osoby w USOS Java Aby uzupełnić dokumenty tożsamości osoby należy wyjeść do formularza Osoby Osoby odnaleźć szukaną osobę a następnie przejść do zakładki dokumenty W zakładce mogą znaleźć się już wcześniej wprowadzone dokumenty aby dodać kolejny wpis należy kliknąć przycisk po prawej stronie Dodaj Uzupełniamy wymagane pola i klikamy ikonę dyskietki aby zapisać zmiany
Dataset (type: AUGMENTED, index: 0): Jak zapisać zmiany w dokumentach tożsamości?
Dataset (type: AUGMENTED, index: 1): Czy mogę wydrukować dokumenty tożsamości?
Dataset (type: AUGMENTED, index: 2): Co się dzieje, gdy nie wypełnię wymaganych pól?
Dataset (type: AUGMENTED, index: 3): Czy mogę zapisać dokumenty bez klikania ikony dyskietki?
Dataset (type: AUGMENTED, index: 4): Co się dzieje, gdy kliknię przycisk Dodaj?
Dataset (type: AUGMENTED, index: 5): Jak z

KeyboardInterrupt: 