In [None]:
%pip install tqdm
%pip install -q langchain langchain_community 
%pip install -q pdfplumber




In [1]:
from tqdm.autonotebook import tqdm
import pdfplumber
import os 
import re
import json

  from tqdm.autonotebook import tqdm


## Util Functions

In [28]:

def extract_text_from_pdf(file_path, start_page=0):
    """Extract the text of a PDF, page after page, as a single string.

    Args:
        file_path: Path of the PDF file; passed straight to ``pdfplumber.open``.
        start_page: Zero-based index of the first page to include. Pages with
            a smaller index are skipped.

    Returns:
        The text of every included page, each page followed by a newline.
        A page for which ``extract_text()`` yields ``None`` contributes only
        its newline.
    """
    page_texts = []
    with pdfplumber.open(file_path) as pdf:
        for index, page in enumerate(pdf.pages):
            if index < start_page:
                continue
            # extract_text() may return None for a page without extractable
            # text (older pdfplumber releases do); concatenating None to a
            # str would raise TypeError, so treat it as an empty page.
            page_texts.append(page.extract_text() or "")
    # Join once instead of growing a string inside the loop.
    return "".join(f"{page_text}\n" for page_text in page_texts)

def clean_data(text):
    """Strip page-number residue from extracted text and re-join wrapped lines.

    Three substitutions are applied in order:

    1. Lines consisting only of digits (with optional surrounding whitespace)
       are deleted; these are most likely page numbers.
    2. A number at the very end of the text, together with the whitespace
       around it, is deleted.
    3. A newline is turned into a space unless it directly follows ``?``,
       ``.`` or ``!`` or is directly followed by a digit, so that lines
       broken in mid-sentence are merged back together.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned text.
    """
    number_only_line = re.compile(r"^\s*\d+\s*\n", flags=re.MULTILINE)
    trailing_number = re.compile(r"\s*\d+\s*$")
    soft_line_break = re.compile(r"(?<![?.!])(\n)(?!\d)")

    without_number_lines = number_only_line.sub("", text)
    without_trailing_number = trailing_number.sub("", without_number_lines)
    return soft_line_break.sub(" ", without_trailing_number)

def split_text_by_questions_answers(text):
    """Pair up numbered questions with the answers that follow them.

    The text is split on newlines. A line of the form ``"<1-4 digits> Q. ...?"``
    is remembered as the pending question; a line starting with ``"A. "`` is
    an answer. An answer is paired with the pending question, after which the
    pending question is cleared. Answers with no pending question, and lines
    that are neither questions nor answers, are ignored. If two questions
    occur in a row, the later one replaces the earlier one.

    Args:
        text: Cleaned text with one question or one answer per line.

    Returns:
        A list of ``{"question": ..., "answer": ...}`` dicts in document
        order, with the ``"<number> Q. "`` and ``"A. "`` markers removed.
    """
    # The same pattern both detects a question and captures its text, so the
    # numeric prefix is always stripped no matter how many digits (1-4) it has.
    question_pattern = re.compile(r"^\d{1,4}\sQ\.\s(.*\?)$")
    answer_prefix = "A. "

    result = []
    question = ""

    for paragraph in text.split("\n"):
        question_match = question_pattern.match(paragraph)
        if question_match:
            question = question_match.group(1).strip()
        elif paragraph.startswith(answer_prefix):
            # Drop only the leading marker; an "A." occurring later in the
            # answer (e.g. "A.D.") is part of the content and must be kept.
            answer = paragraph[len(answer_prefix):].strip()
            if question:
                result.append({"question": question, "answer": answer})
                question = ""

    return result
    
def convert_to_jsonl(qa_pairs, filename):
    """Write question-answer pairs to ``filename`` as JSON Lines chat records.

    The first line holds a fixed conversation made of a "System" prompt and a
    "Chatbot" greeting. Every following line holds one conversation with a
    "User" message (the question) and a "Chatbot" message (the answer).
    Non-ASCII characters are written as-is in UTF-8.

    Args:
        qa_pairs: Iterable of dicts with ``"question"`` and ``"answer"`` keys.
        filename: Path of the output file. It is overwritten if it exists;
            a missing parent directory is created.
    """
    # open(..., "w") raises FileNotFoundError when the target directory is
    # missing (e.g. on a fresh checkout), so create it up front.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Fixed opening record: system prompt followed by the chatbot's greeting.
    initial_message = {
        "messages": [
            {
                "role": "System",
                "content": "Jesteś katolickim teologiem uznającym jedynie naukę Kościoła sprzed Soboru Watykańskiego II. Odpowiedz zwieźle na pytania doktrynalne podając odpowiednie źródła."
            },
            {"role": "Chatbot", "content": "Laudatur Jesus Christus!"}
        ]
    }

    records = [initial_message]
    records.extend(
        {
            "messages": [
                {"role": "User", "content": qa_pair["question"]},
                {"role": "Chatbot", "content": qa_pair["answer"]},
            ]
        }
        for qa_pair in qa_pairs
    )

    with open(filename, "w", encoding="utf-8") as f:
        for record in records:
            # One JSON document per line; keep Polish characters readable.
            json.dump(record, f, ensure_ascii=False)
            f.write("\n")

In [15]:
# Source PDF; the relative path is resolved against the current working directory.
file_path = "source-data/The Catechism of Saint Pope Pius X.pdf"
# start_page is a zero-based page index, so 9 skips the first nine pages
# (presumably front matter before the questions begin - confirm against the PDF).
pdf_text = extract_text_from_pdf(file_path,start_page=9)



In [None]:

# Clean the extracted text, then pair each question with its answer.
clean_text = clean_data(pdf_text)
qa_list = split_text_by_questions_answers(clean_text)

# Show every extracted pair for a visual check, one blank line between pairs.
for qa in qa_list:
    print(f"Question: {qa['question']}", f"Answer: {qa['answer']}\n", sep="\n")

In [30]:
# Save the question-answer pairs as a JSON Lines file (one JSON document per line).
convert_to_jsonl(qa_list, "data-sets/pius_x_data_set.jsonl")