In [None]:
import spacy

In [None]:
import logging

# Root logger: WARNING and above, rendered as "<level> - <logger name> -  <message>".
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
# The "haystack" logger is set to the more verbose INFO level.
logging.getLogger("haystack").setLevel(logging.INFO)

In [None]:
import PyPDF2
import os

In [None]:
# Load the stop-word list: one entry per line of stop.txt.
# Whitespace is stripped (rather than slicing off the last character) so the
# final word stays intact when the file does not end with a newline; blank
# lines are skipped so the set never contains an empty string.
with open('stop.txt', encoding='utf-8') as stopwords_file:
    stopwords = {line.strip() for line in stopwords_file if line.strip()}

In [None]:
class LemmaMaker:
    """Wrapper around the spaCy ``pl_core_news_lg`` pipeline for lemmatising single words."""

    def __init__(self):
        # Load the model once and keep it on the instance.
        self.spacy = spacy.load('pl_core_news_lg')

    def lemmatise(self, word):
        """Return the lemma of the first token spaCy produces for ``word``.

        Args:
            word: Text to lemmatise; callers in this notebook pass single
                whitespace-delimited words.

        Returns:
            The ``lemma_`` of the first token, or ``word`` unchanged when the
            pipeline produces no tokens (e.g. an empty or whitespace-only
            string), instead of raising ``IndexError``.
        """
        doc = self.spacy(word)
        if len(doc) == 0:
            return word
        return doc[0].lemma_

In [None]:
# Shared lemmatiser instance; read as a global by pdf_txt_pages and run_queries.
lemmatiser = LemmaMaker()

In [None]:
from haystack.document_stores import InMemoryDocumentStore

# In-memory haystack document store with BM25 enabled.
# NOTE(review): embedding_dim=1024 is set, but the only retriever attached in the
# visible cells is a BM25Retriever - confirm whether it is still needed.
document_store_pages = InMemoryDocumentStore(use_bm25=True, embedding_dim=1024)

In [None]:
# Directory that receives one lemmatised text file per PDF page; created if missing.
processed_directory_pages = 'processed_documents_pages_intfloat'
os.makedirs(processed_directory_pages, exist_ok=True)

In [None]:
def pdf_txt_pages(directory_path):
    """Convert every PDF in ``directory_path`` into one lemmatised text file per page.

    For each page the extracted text is split on whitespace, every word is
    lemmatised with the module-level ``lemmatiser``, lemmas found in
    ``stopwords`` are dropped, and the result (prefixed with ``"passage: "``)
    is written to ``<processed_directory_pages>/<pdf name>_<page index>.txt``.

    Args:
        directory_path: Directory containing the source ``.pdf`` files.
    """
    pdf_suffix = ".pdf"
    for filename in os.listdir(directory_path):
        # Skip anything that is not a PDF (sub-directories, hidden files, ...)
        # instead of letting PdfReader fail on it. get_files later rebuilds
        # the source name as "<name>.pdf", so only that exact suffix is handled.
        if not filename.endswith(pdf_suffix):
            continue
        reader = PyPDF2.PdfReader(os.path.join(directory_path, filename))
        # Swap only the trailing extension for "_"; str.replace would also
        # rewrite a ".pdf" occurring in the middle of the name.
        new_file_name = filename[:-len(pdf_suffix)] + "_"
        for index, page in enumerate(reader.pages):
            # Guard against extract_text() giving back nothing for a page.
            text = page.extract_text() or ""
            lemmas = (lemmatiser.lemmatise(word) for word in text.split())
            text = "passage: " + " ".join(lemma for lemma in lemmas if lemma not in stopwords)
            output_path = os.path.join(processed_directory_pages, f"{new_file_name}{index}.txt")
            # The file is opened only after the text is ready, so a failure
            # during extraction does not leave an empty page file behind.
            with open(output_path, 'w', encoding='utf-8') as output_file:
                output_file.write(text.replace('\n', ''))

In [None]:
# Convert the PDFs found in the "input" directory into per-page text files.
pdf_txt_pages('input')

In [None]:
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.nodes import PreProcessor

# Pre-processing applied to each page file by the indexing pipeline:
# empty-line, whitespace and header/footer cleaning are switched on, and the
# text is split by word count (split_length=1000, no overlap) while respecting
# sentence boundaries.
preprocessor_pages = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    remove_substrings=None,
    split_by="word",
    split_length=1000,
    split_respect_sentence_boundary=True,
    split_overlap=0,
    max_chars_check=10_000
)


In [None]:
# Index every entry of the processed-pages directory into document_store_pages.
# NOTE(review): os.listdir also returns files left over from earlier runs - confirm
# the directory is clean before indexing.
files_to_index_pages = [os.path.join(processed_directory_pages,f) for f in os.listdir(processed_directory_pages)]
indexing_pipeline_pages = TextIndexingPipeline(document_store_pages, preprocessor=preprocessor_pages)
# The result's 'documents' and 'file_paths' entries are read later to build id_dict.
return_val = indexing_pipeline_pages.run_batch(file_paths=files_to_index_pages)

In [None]:
# Lookup from document id to the file path paired with it in the indexing result.
# NOTE(review): documents and file paths are paired purely by position - this
# assumes the pipeline returned exactly one document per file, in the same
# order; confirm (a page split into several chunks would shift the pairing).
id_dict = {
    document.id: file_path
    for document, file_path in zip(return_val['documents'], return_val['file_paths'])
}

In [None]:
from haystack.nodes import BM25Retriever

# BM25 retriever reading from the page-level document store.
retriever_pages = BM25Retriever(document_store=document_store_pages)

In [None]:
from haystack import Pipeline


# Query pipeline with a single node: the BM25 retriever, fed directly from "Query".
p_p = Pipeline()
p_p.add_node(component=retriever_pages, name="ESRetriever1", inputs=["Query"])

In [None]:
# NOTE(review): this module-level value is not passed to process_query in the
# visible cells; process_query falls back to its own default (cutoff_t=0.93).
cutoff_t = 0.9999

In [None]:
# NOTE(review): this module-level value is not passed to process_query in the
# visible cells; process_query falls back to its own default (accept_t=0.8).
accept_t = 0.9999

In [None]:
# Ad-hoc single query against the pipeline.
# NOTE(review): the module-level `docs` is not read by any later visible cell -
# looks like a leftover smoke test.
docs = p_p.run(query="ROLAP")["documents"]

In [None]:
def process_query(pipeline, query, cutoff_t = 0.93, accept_t = 0.8):
    """Retrieve documents for ``query`` and top up with results for uncovered query parts.

    The full query is run first. The query is then split into parts on the
    Polish conjunctions " i " / "oraz", commas, full stops and parentheses;
    every part whose text does not occur in the documents kept so far is run
    as a query of its own and its best results are appended.

    Args:
        pipeline: Object whose ``run(query=...)`` returns a dict with a
            ``"documents"`` list; each document exposes ``score`` and ``content``.
        query: Query text.
        cutoff_t: Score a document must exceed to be kept for the full query.
            If none qualifies, the two highest-scoring documents are kept.
        accept_t: Score a document must exceed to be kept for a query part.
            If none qualifies, the single highest-scoring document is kept.

    Returns:
        List of documents; the same document may occur more than once.
    """
    docs = pipeline.run(query=query)["documents"]

    selected_docs = [doc for doc in docs if doc.score > cutoff_t]
    docs = selected_docs if selected_docs else sorted(docs, key=lambda doc: doc.score)[-2:]

    # Non-capturing group and no group around "oraz": with capturing groups
    # re.split() also returns the delimiters themselves (" ", "oraz") and None
    # placeholders, which were then treated as phrases and queried. The word
    # boundaries keep "oraz" from matching inside a longer word.
    split_pattern = r'(?: |\.|,)i |\boraz\b|,|\.|\(|\)'
    stripped_parts = (part.strip() for part in re.split(split_pattern, query))
    phrase_parts = [part for part in stripped_parts if part]
    logging.getLogger(__name__).debug("query parts: %s", phrase_parts)

    # Compare case-insensitively so a lower-cased query part still counts as
    # covered when the document text contains it in a different case.
    # The text is built once, from the full-query results only.
    combined_text = ' '.join(doc.content for doc in docs).casefold()

    for phrase_part in phrase_parts:
        if phrase_part.casefold() in combined_text:
            continue
        new_docs = pipeline.run(query=phrase_part)["documents"]
        accepted_docs = [doc for doc in new_docs if doc.score > accept_t]
        docs.extend(accepted_docs if accepted_docs else sorted(new_docs, key=lambda doc: doc.score)[-1:])

    return docs


In [None]:
import re

def run_queries(pipeline, query_dir, top_answers=1):
    """Run every query found in the files of ``query_dir`` and collect the matching page files.

    Each non-blank line of each file is one query. A leading enumeration such
    as ``"12. "`` is removed, the remaining words are lemmatised with the
    module-level ``lemmatiser``, lower-cased, prefixed with ``"query: "`` and
    passed to ``process_query``.

    Args:
        pipeline: Retrieval pipeline forwarded to ``process_query``.
        query_dir: Directory whose regular files contain one query per line.
        top_answers: Currently unused; kept so existing callers keep working.

    Returns:
        Dict mapping each query (numbering and surrounding whitespace removed)
        to the set of ``id_dict`` file paths of the retrieved documents.
    """
    # Anchored at the start of the line: an unanchored pattern would also eat
    # digits followed by a dot inside the query (e.g. "3." in "wersja 3.5").
    numbering = re.compile(r'^\s*[0-9]+\.\s*')
    res = {}
    # Sorted so the order of queries (and of the generated PDF sections) does
    # not depend on the arbitrary order returned by os.listdir.
    for entry in sorted(os.listdir(query_dir)):
        file_path = os.path.join(query_dir, entry)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as queries:
            for query_line in queries:
                # strip() drops the trailing newline that used to end up in
                # the dictionary key.
                query = numbering.sub('', query_line).strip()
                if not query:
                    continue  # blank line: nothing to search for
                lemmas = " ".join(lemmatiser.lemmatise(word) for word in query.split())
                modified_query = "query: " + lemmas.lower()
                docs = process_query(pipeline, modified_query)
                res[query] = {id_dict[doc.id] for doc in docs}

    return res

In [None]:
def print_dict(dictionary):
    """Print each key and its value on separate lines, followed by a blank line."""
    for name, value in dictionary.items():
        # Three newline-separated fields: key, value, and an empty spacer line.
        print(name, value, "", sep="\n")

In [None]:
# Run every query file under "queries" and print the matched page files per query.
print_dict(run_queries(p_p,'queries'))

In [None]:
import PyPDF2
from PyPDF2 import PdfReader, PdfWriter
from PyPDF2.generic import AnnotationBuilder

In [None]:
# Query results consumed by gen_pdf below.
# NOTE(review): this runs the same queries again that an earlier cell already
# ran for printing - consider reusing one result.
answers = run_queries(p_p,'queries')

In [None]:
from reportlab.lib.pagesizes import landscape
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib.units import inch

# Create a function to generate the title page
def create_title_page(pdf_file, text):
    """Write a PDF containing ``text`` as a title to ``pdf_file``.

    Args:
        pdf_file: Output target handed to ``SimpleDocTemplate``.
        text: Title text. It is treated as plain text: ``&``, ``<`` and ``>``
            are escaped so they appear literally instead of being parsed as
            Paragraph markup (which fails on unbalanced ``<``).
    """
    # Local import keeps the block self-contained.
    from xml.sax.saxutils import escape

    # landscape() orders the two dimensions so the page is wider than tall.
    doc = SimpleDocTemplate(pdf_file, pagesize=landscape((5 * inch, 7.5 * inch)))

    title_style = getSampleStyleSheet()["Title"]

    # Page content: the title followed by half an inch of vertical space.
    elements = [
        Paragraph(escape(text), title_style),
        Spacer(1, 0.5 * inch),
    ]

    doc.build(elements)

In [None]:
def create_page(output_pdf_writer, welcome_text):
    """Append a blank 500x500 page with ``welcome_text`` stamped on it at half scale.

    Uses the snake_case PyPDF2 API (``add_transformation`` / ``merge_page`` /
    ``add_page``) like the rest of this notebook; the camelCase
    ``mergeScaledTranslatedPage`` and ``addPage`` were removed in PyPDF2 3.x.

    Args:
        output_pdf_writer: ``PyPDF2.PdfWriter`` that receives the new page.
        welcome_text: ``PyPDF2.PageObject`` to merge onto the blank page,
            scaled by 0.5 and shifted by (100, 200).

    Note:
        ``add_transformation`` applies the scale/shift to ``welcome_text``
        itself, so the page passed in is modified.
    """
    page = PyPDF2.PageObject.create_blank_page(width=500, height=500)
    # Same order as the old helper: scale first, then translate.
    placement = PyPDF2.Transformation().scale(0.5, 0.5).translate(100, 200)
    welcome_text.add_transformation(placement)
    page.merge_page(welcome_text)
    output_pdf_writer.add_page(page)

def get_files(file_set, input_directory="input"):
    """Translate processed page-file paths back to ``(source PDF path, page index)`` pairs.

    A page file is named ``<pdf name>_<page index>.txt``; the matching source
    is ``<input_directory>/<pdf name>.pdf``.

    Args:
        file_set: Iterable of paths to processed page files.
        input_directory: Directory holding the source PDFs (default ``"input"``).

    Returns:
        Sorted list of ``(pdf_path, page_index)`` tuples.
    """
    pages = []
    for processed_path in file_set:
        # Work on the file name only, so the directory part of the path can
        # never be mistaken for (or rewritten as) part of the PDF name.
        stem = os.path.splitext(os.path.basename(processed_path))[0]
        # The page index is whatever follows the last underscore.
        pdf_stem, _, page_index = stem.rpartition("_")
        pages.append((os.path.join(input_directory, pdf_stem + ".pdf"), int(page_index)))
    return sorted(pages)
    
def _open_pdf_in_memory(path):
    """Return a PdfReader backed by an in-memory copy of the file at ``path``."""
    import io

    with open(path, 'rb') as pdf_file:
        return PyPDF2.PdfReader(io.BytesIO(pdf_file.read()))


def gen_pdf(queries, output_pdf_name):
    """Build one PDF with a title page per query followed by its matched source pages.

    Args:
        queries: Dict mapping a query string to the set of processed page-file
            paths returned for it (the shape produced by ``run_queries``).
        output_pdf_name: Path of the PDF to write.
    """
    output_pdf_writer = PyPDF2.PdfWriter()

    for query, answer_files in queries.items():
        # Every PDF is read through an in-memory copy: pages added to the
        # writer may still be resolved from their reader when write() runs,
        # by which time a file handle opened here would be closed (and
        # temp.pdf overwritten by the next query's title page).
        create_title_page("temp.pdf", query)
        output_pdf_writer.add_page(_open_pdf_in_memory("temp.pdf").pages[0])

        # One reader per source PDF within a query, so a document with many
        # matched pages is read once. The cache is deliberately not shared
        # across queries, so a page repeated under another query comes from a
        # fresh reader, as before.
        source_readers = {}
        for input_pdf, page in get_files(answer_files):
            print(input_pdf, page)
            if input_pdf not in source_readers:
                source_readers[input_pdf] = _open_pdf_in_memory(input_pdf)
            output_pdf_writer.add_page(source_readers[input_pdf].pages[page])

    with open(output_pdf_name, 'wb') as output_file:
        output_pdf_writer.write(output_file)

In [None]:
# Write the collected answers to "res.pdf".
gen_pdf(answers, "res.pdf")