In [None]:
%pip install tqdm
%pip install -q langchain langchain_community 
%pip install -q pdfplumber




In [None]:
from tqdm.autonotebook import tqdm
import pdfplumber
import os 
import re
import json

  from tqdm.autonotebook import tqdm


## Utility Functions

In [None]:

def extract_text_from_pdf(file_path, start_page=0):
    """Extract the text of a PDF file, starting at a given page.

    Args:
        file_path: Path of the PDF file; passed straight to ``pdfplumber.open``.
        start_page: Zero-based index of the first page to include. Pages with
            a lower index are skipped.

    Returns:
        A single string holding the text of every included page, each page
        followed by a newline. A page without extractable text contributes
        only that newline.
    """
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for index, page in enumerate(pdf.pages):
            if index < start_page:
                continue
            # extract_text() can yield None for a page with no text layer
            # (e.g. a scanned image); fall back to "" so the concatenation
            # below never raises TypeError.
            page_texts.append(page.extract_text() or "")

    # Join once at the end instead of growing a string page by page.
    return "".join(page_text + "\n" for page_text in page_texts)

def clean_data(text):
    """Normalise raw text extracted from the PDF.

    The following steps run in order:

    1. Drop every line that holds nothing but digits (optionally surrounded
       by whitespace) -- most likely page numbers.
    2. Drop a trailing number at the very end of the text.
    3. Remove parenthesised spans that contain no nested parentheses.
    4. Turn a newline into a space unless it follows ``?``, ``.``, ``!`` or
       an em dash, or is directly followed by a digit.
    5. Delete every ``"- "`` (hyphen + space). After step 4 this rejoins
       words hyphenated across a line break; note that it also removes any
       other hyphen that is followed by a space.

    Args:
        text: The raw extracted text.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement, flags) triples, applied strictly in this order.
    substitutions = (
        (r"^\s*\d+\s*\n", "", re.MULTILINE),
        (r"\s*\d+\s*$", "", 0),
        (r'\([^()]*\)', '', 0),
        (r"(?<![?.!—])(\n)(?!\d)", " ", 0),
    )

    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.replace("- ", "")

def split_text_by_questions_answers(text):
    """Pair up questions and answers found in dash-prefixed paragraphs.

    The text is split on newlines. Only paragraphs that start with an em dash
    (after stripping surrounding whitespace) are considered:

    * a paragraph containing ``?`` becomes the pending question, replacing
      any earlier question that has not been answered yet;
    * any other dash-prefixed paragraph is taken as the answer to the
      pending question, if there is one.

    Only the leading em dash is removed from each paragraph, so em dashes
    inside the question or answer text are preserved.

    Args:
        text: Cleaned text with one paragraph per line.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in document order.
    """
    marker = "—"
    result = []
    question = ""

    for paragraph in text.split("\n"):
        line = paragraph.strip()
        if not line.startswith(marker):
            continue

        # Strip just the leading marker (with or without a following space);
        # a blanket replace would also delete dashes used as punctuation.
        content = line[len(marker):].strip()

        if "?" in line:
            question = content
        elif question:
            result.append({"question": question, "answer": content})
            # Reset so a later answer cannot be attached to a stale question.
            question = ""

    return result
            
    
def convert_to_jsonl(qa_pairs, filename):
    """Write question-answer pairs to a JSON Lines file.

    The first line is a fixed record holding a system message plus a greeting
    exchange. Every following line is one ``{"messages": [...]}`` record with
    a "User" message (the question) and a "Chatbot" message (the answer).
    Non-ASCII characters are written as-is (UTF-8), not escaped.

    NOTE(review): the "System" / "User" / "Chatbot" role names look like a
    chat fine-tuning format -- confirm against the consumer of this file.

    Args:
        qa_pairs: Iterable of dicts with "question" and "answer" keys.
        filename: Destination path. Missing parent directories are created;
            an existing file is overwritten.
    """
    # Create the output directory up front so a fresh checkout does not fail
    # with FileNotFoundError. A bare file name has no directory part.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    initial_message = {
        "messages": [
            {
                "role": "System",
                "content": "Jesteś katolickim teologiem uznającym jedynie naukę Kościoła sprzed Soboru Watykańskiego II. Odpowiedz zwieźle na pytania doktrynalne podając odpowiednie źródła."
            },
            {"role": "User", "content": "Laudatur Jesus Christus!"},
            {"role": "Chatbot", "content": "In saecula saeculorum!"}
        ]
    }

    with open(filename, "w", encoding="utf-8") as f:
        # The system message record always comes first.
        json.dump(initial_message, f, ensure_ascii=False)
        f.write("\n")

        # One record per question-answer pair, in input order.
        for qa_pair in qa_pairs:
            messages = [
                {"role": "User", "content": qa_pair["question"]},
                {"role": "Chatbot", "content": qa_pair["answer"]}
            ]
            json.dump({"messages": messages}, f, ensure_ascii=False)
            f.write("\n")

In [18]:
# Location of the source PDF, relative to the notebook's working directory.
file_path = "source-data/Katechizm według Summy Teologicznej św. Tomasza z Akwinu - o. Tomasz Pegues OP (1919).pdf"
# Pages with an index below 9 are skipped -- presumably front matter
# (title page, preface); confirm against the PDF.
pdf_text = extract_text_from_pdf(file_path,start_page=9)



In [None]:

# Normalise the raw PDF text, then pair each question with its answer.
clean_text = clean_data(pdf_text)
qa_list = split_text_by_questions_answers(clean_text)

# Print every extracted pair for a manual sanity check.
for qa in qa_list:
    print(f"Question: {qa['question']}\nAnswer: {qa['answer']}\n")

In [28]:
# Serialize the question-answer pairs to a JSON Lines file: one JSON object
# per line, preceded by the initial system-message record.
convert_to_jsonl(qa_list, "data-sets/aquinus_data_set.jsonl")