In [3]:
import joblib
import os
import PyPDF2
import glob
import fitz  # PyMuPDF
import tabula
import pandas as pd
from nltk import sent_tokenize
from fuzzywuzzy import process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os

In [4]:
# Folder containing the source PDFs.
# Set the CHATBOT_PDF_DIR environment variable to point the notebook at a
# different folder; when it is unset, the original hard-coded location is used.
path = os.environ.get(
    "CHATBOT_PDF_DIR",
    r"C:\Users\maxim.oweyssi\Energy Saving Trust\Lutz Lemmer - Sample_AI_Chatbot_Documents",
)

# Function to extract text from a PDF using PyPDF2
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of ``pdf_path`` as one concatenated string.

    The file is opened in binary mode, read with ``PyPDF2.PdfReader`` and the
    per-page ``extract_text()`` results are joined in page order with no
    separator between pages.

    NOTE: a later cell in this notebook defines another function with the same
    name (PyMuPDF-based, returning a dict keyed by page number). Once that cell
    has run, it replaces this definition.
    """
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        page_texts = [page.extract_text() for page in reader.pages]
        return "".join(page_texts)

# Use glob to get all PDF files in the specified folder.
# glob returns matches in arbitrary (filesystem-dependent) order, so the list is
# sorted to keep `pdf_files[0]` and the order of the extracted corpus
# reproducible between runs and machines.
pdf_files = sorted(glob.glob(os.path.join(path, '*.pdf')))

In [5]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of each page of a PDF using PyMuPDF (``fitz``).

    NOTE: this redefines the PyPDF2-based ``extract_text_from_pdf`` from the
    earlier cell and returns a different type (dict instead of str).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        dict mapping the 1-based page number to the value returned by
        ``page.get_text()`` for that page.
    """
    # The context manager closes the document even if get_text() raises,
    # which the previous explicit doc.close() did not guarantee.
    with fitz.open(pdf_path) as doc:
        return {
            page_num + 1: doc[page_num].get_text()
            for page_num in range(doc.page_count)
        }

def _sliding_sentence_windows(sentences, window_size):
    """Join every run of ``window_size`` consecutive sentences into one paragraph.

    When there is at least one sentence but fewer than ``window_size``, a single
    paragraph holding all of them is returned so that short pages still
    contribute searchable text instead of being silently dropped.
    """
    if not sentences:
        return []
    if len(sentences) < window_size:
        return [" ".join(sentences).strip()]
    return [
        " ".join(sentences[start:start + window_size]).strip()
        for start in range(len(sentences) - window_size + 1)
    ]


def identify_paragraphs_and_tables(pdf_path, context_window_size=5):
    """Extract overlapping sentence-window paragraphs and tables from a PDF.

    For each page, the text blocks reported by PyMuPDF are filtered to drop
    likely headers/footers, the remaining text is split into sentences with
    ``sent_tokenize`` and every run of ``context_window_size`` consecutive
    sentences becomes one "paragraph" (consecutive paragraphs overlap by
    ``context_window_size - 1`` sentences). Tables are read per page with
    ``tabula.read_pdf``.

    Args:
        pdf_path: Path of the PDF file to process.
        context_window_size: Number of consecutive sentences per paragraph.

    Returns:
        Tuple ``(paragraphs_by_page, tables_by_page)``; both are dicts keyed by
        1-based page number. The first maps to a list of paragraph strings, the
        second to the list of tables returned by tabula (empty if extraction
        failed for that page).

    Raises:
        ValueError: If ``context_window_size`` is smaller than 1.
    """
    if context_window_size < 1:
        raise ValueError("context_window_size must be at least 1")

    paragraphs_by_page = {}
    tables_by_page = {}

    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page_num in range(doc.page_count):
            page = doc[page_num]

            # Calculate the threshold as 1/10th of the page length from the top and bottom
            page_height = page.rect.height
            threshold = page_height / 10

            # Each block is (x0, y0, x1, y1, text, block_no, block_type).
            blocks = page.get_text("blocks")

            # Keep text blocks only (block_type 0; image blocks carry image
            # metadata rather than page text) and drop blocks whose bottom edge
            # (y1) lies in the top or bottom tenth of the page, which are
            # likely to be headers or footers.
            body_texts = [
                block[4]
                for block in blocks
                if block[6] == 0 and threshold < block[3] < (page_height - threshold)
            ]
            page_text = " ".join(body_texts)

            # Use sent_tokenize to split the text into sentences
            sentences = sent_tokenize(page_text)
            paragraphs_by_page[page_num + 1] = _sliding_sentence_windows(
                sentences, context_window_size
            )

            # Table extraction using tabula-py (best effort: a failure on one
            # page is reported and recorded as "no tables" for that page).
            try:
                tables = tabula.read_pdf(pdf_path, pages=page_num + 1, multiple_tables=True, encoding='latin1')
                tables_by_page[page_num + 1] = list(tables)
            except Exception as e:
                print(f"Error extracting tables on page {page_num + 1}: {e}")
                tables_by_page[page_num + 1] = []

    return paragraphs_by_page, tables_by_page

In [7]:
# Example usage: parse the first PDF found in `path`.
# Fail with an explicit message instead of a bare IndexError when the folder
# contains no PDFs (e.g. the path is wrong on this machine).
if not pdf_files:
    raise FileNotFoundError(f"No '*.pdf' files were found in {path!r}")
pdf_filepath = pdf_files[0]
paragraphs_by_page, tables_by_page = identify_paragraphs_and_tables(pdf_filepath)

In [None]:
def compute_tfidf_vectors(texts):
    """Fit a fresh ``TfidfVectorizer`` on ``texts``.

    Returns:
        Tuple ``(vectorizer, tfidf_matrix)``: the fitted vectorizer and the
        result of ``fit_transform`` on ``texts``.
    """
    fitted = TfidfVectorizer()
    matrix = fitted.fit_transform(texts)
    return fitted, matrix

def preprocess_text(text):
    """Normalise ``text`` before vectorisation.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted (not replaced by a space), runs of whitespace collapse to a single
    space, surrounding whitespace is trimmed and the result is lower-cased.
    """
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', alphanumeric_only)
    return single_spaced.strip().lower()

def search_term_cosine_similarity(query, documents, vectorizer):
    """Score ``documents`` against ``query`` by cosine similarity.

    Both the query and the documents are encoded with the supplied,
    already-fitted ``vectorizer`` (via ``transform``), and the similarity
    matrix is flattened before being returned.

    Args:
        query: Query string.
        documents: Iterable of document strings.
        vectorizer: Fitted object exposing ``transform``.

    Returns:
        Flattened array of similarity scores.
    """
    encoded_query = vectorizer.transform([query])
    encoded_documents = vectorizer.transform(documents)
    scores = cosine_similarity(encoded_query, encoded_documents)
    return scores.flatten()

# Corpus accumulators. The first three lists are parallel: entry i of each one
# describes the same paragraph (its text, page number and source file name).
all_paragraphs, all_page_numbers, all_document_names, all_tables = [], [], [], []

for pdf_filepath in pdf_files:
    print(pdf_filepath)
    paragraphs_by_page, tables_by_page = identify_paragraphs_and_tables(pdf_filepath)

    # Flatten this document's paragraphs in page order
    doc_paragraphs = []
    for page_paragraphs in paragraphs_by_page.values():
        doc_paragraphs.extend(page_paragraphs)
    all_paragraphs += doc_paragraphs

    # One page number per paragraph, in the same order as above
    for page_num, page_paragraphs in paragraphs_by_page.items():
        all_page_numbers += [page_num] * len(page_paragraphs)

    # One document name per paragraph
    all_document_names += [os.path.basename(pdf_filepath)] * len(doc_paragraphs)

    # Tables are stored as their plain-text rendering
    for page_tables in tables_by_page.values():
        all_tables += [table.to_string(index=False) for table in page_tables]


# Convert paragraphs to TF-IDF vectors
paragraphs_vectorizer, paragraphs_tfidf_matrix = compute_tfidf_vectors([preprocess_text(paragraph) for paragraph in all_paragraphs])

# Convert tables to TF-IDF vectors.
# TfidfVectorizer cannot be fitted on an empty corpus (it raises ValueError),
# which happens whenever tabula found no tables or failed on every page.
# Skip the table index in that case instead of aborting the whole cell.
if all_tables:
    tables_vectorizer, tables_tfidf_matrix = compute_tfidf_vectors([preprocess_text(table) for table in all_tables])
else:
    print("No tables were extracted; skipping table TF-IDF vectors.")
    tables_vectorizer, tables_tfidf_matrix = None, None


In [9]:
# Print every paragraph extracted for one page, each followed by a separator
# line. `paragraphs_by_page` is whatever the most recently run extraction cell
# left behind.
pagenum = 4
for paragraph in paragraphs_by_page[pagenum]:
    print(paragraph)
    print("---------------------------------------")

The EPC is populated using results from a SAP calculation. This is a static 
building physics modelling method derived from the Building Research 
Establishment’s Domestic Energy Model (BREDEM). It is less flexible 
than the BREDEM model itself as certain parameters are either fixed or 
restricted to a specific range. A SAP calculation will model heat loss, 
internal gains, solar gains, energy balance, carbon emissions, heating, 
ventilation, internal lighting, cooling and renewable energy sources. Building Regulations Part L1A sets out how SAP should be used to 
determine whether a new building will meet current building 
regulations.
---------------------------------------
This is a static 
building physics modelling method derived from the Building Research 
Establishment’s Domestic Energy Model (BREDEM). It is less flexible 
than the BREDEM model itself as certain parameters are either fixed or 
restricted to a specific range. A SAP calculation will model heat loss, 
internal gains

: 

In [222]:
# Dump results into files.
# Persists the fitted paragraph vectorizer together with the three parallel
# per-paragraph lists (text, page number, source document name) as compressed
# joblib files. The file names are relative, so they are written relative to
# the current working directory. The table vectorizer/matrix and the paragraph
# TF-IDF matrix are not saved here.
joblib.dump(paragraphs_vectorizer, 'vectorizer.pkl', compress=True)
joblib.dump(all_paragraphs, 'all_paragraphs.pkl', compress=True)
joblib.dump(all_page_numbers, 'all_page_numbers.pkl', compress=True)
joblib.dump(all_document_names, 'all_document_names.pkl', compress=True)


['all_document_names.pkl']