In [94]:
import openai
from openai import OpenAI
import torch
import pymupdf
import pdfplumber
from PIL import Image
import io
import pytesseract
import pandas as pd
import camelot
import faiss
import clip
import numpy as np
from transformers import GPT2Tokenizer
from sklearn.decomposition import PCA
import requests
from bs4 import BeautifulSoup
import base64
import requests
from io import BytesIO
import os

In [3]:
# In-memory corpus filled by the ingestion cells below: the extracted items
# themselves and the metadata dicts collected alongside them.
text_archive, image_archive = [], []
text_metadata, image_metadata = [], []

def get_text_embedding(text):
    """Request an embedding for ``text`` from OpenAI's ``text-embedding-3-small`` model.

    Returns the raw API response object rather than the vector; the vector is
    read by the caller from ``response.data[0].embedding``.
    """
    return openai.embeddings.create(model="text-embedding-3-small", input=text)

# def get_text_embedding(text):
#     device = "cuda" if torch.cuda.is_available() else "cpu"
#     model, _ = clip.load("ViT-B/32", device=device)
#     text_preprocessed = clip.tokenize([text]).to(device)
#     with torch.no_grad():
#         text_features = model.encode_text(text_preprocessed)
#     return text_features.cpu().numpy()

# Loaded CLIP (model, preprocess) pairs keyed by device name. Fetched through
# globals() so that re-running this cell keeps an already loaded model.
_CLIP_MODEL_CACHE = globals().get("_CLIP_MODEL_CACHE", {})


def get_image_embedding(image):
    """Embed one image with CLIP ViT-B/32.

    Args:
        image: image accepted by CLIP's preprocessing transform (the notebook
            passes PIL images).

    Returns:
        numpy array with a single row holding the image features.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Loading the checkpoint is by far the most expensive step; do it once per
    # device instead of once per image.
    if device not in _CLIP_MODEL_CACHE:
        _CLIP_MODEL_CACHE[device] = clip.load("ViT-B/32", device=device)
    model, preprocess = _CLIP_MODEL_CACHE[device]

    # unsqueeze(0) adds the batch dimension the encoder expects.
    img_preprocessed = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(img_preprocessed)
    return image_features.cpu().numpy()

In [4]:
# Tokenizers loaded for chunk_text, keyed by name. Fetched through globals()
# so that re-running this cell keeps an already loaded tokenizer.
_CHUNK_TOKENIZER_CACHE = globals().get("_CHUNK_TOKENIZER_CACHE", {})


def _default_chunk_tokenizer():
    """Return the GPT-2 tokenizer, loading it on first use only."""
    if "gpt2" not in _CHUNK_TOKENIZER_CACHE:
        _CHUNK_TOKENIZER_CACHE["gpt2"] = GPT2Tokenizer.from_pretrained("gpt2")
    return _CHUNK_TOKENIZER_CACHE["gpt2"]


def chunk_text(text, max_tokens=1000, tokenizer=None):
    """Group the sentences of ``text`` into chunks of roughly ``max_tokens`` tokens.

    Sentences are obtained by splitting on '.', and each chunk is the
    '.'-joined run of consecutive sentences with a closing '.'.

    Args:
        text: text to split.
        max_tokens: token budget per chunk. A single sentence longer than the
            budget is not split further and becomes its own oversized chunk.
        tokenizer: optional object with an ``encode(str)`` method whose result
            length is the token count. Defaults to the GPT-2 tokenizer.
            NOTE(review): GPT-2 counts are presumably only an approximation of
            the embedding model's own tokenization - confirm the budget.

    Returns:
        List of chunk strings; empty when ``text`` contains no sentence text.
    """
    if tokenizer is None:
        tokenizer = _default_chunk_tokenizer()

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in text.split('.'):
        # Splitting "a. b." leaves an empty tail (and blank input is all
        # tail); keeping it would add a second closing period or a bare ".".
        if not sentence.strip():
            continue

        tokens = len(tokenizer.encode(sentence))
        # Only flush a chunk that has content, otherwise an over-long first
        # sentence would produce a chunk consisting of "." alone.
        if current_chunk and current_length + tokens > max_tokens:
            chunks.append('.'.join(current_chunk).strip() + '.')
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += tokens

    # Add the last chunk
    if current_chunk:
        chunks.append('.'.join(current_chunk).strip() + '.')

    return chunks

def clean_text(text):
    """Flatten ``text`` onto a single line with single spaces between words.

    Newlines become spaces, carriage returns are dropped, and every run of
    whitespace collapses to one space (leading and trailing whitespace goes).
    """
    flattened = text.translate({ord('\n'): ' ', ord('\r'): None})
    return ' '.join(flattened.split())

In [5]:
def ocr_images(images):
    """Run Tesseract OCR over ``images`` and return the recognised text as chunks.

    Args:
        images: iterable of images accepted by ``pytesseract.image_to_string``.

    Returns:
        Flat list with the cleaned, chunked OCR text of every image, in input
        order.
    """
    ocr_texts = []
    for image in images:
        ocr_text = pytesseract.image_to_string(image)
        cleaned_ocr_text = clean_text(ocr_text)
        # extend rather than assign: assigning kept only the chunks of the
        # last image and silently dropped all earlier OCR output.
        ocr_texts.extend(chunk_text(cleaned_ocr_text))

    return ocr_texts

In [6]:
# Function to extract text and images from a PDF and clean the extracted text
def extract_pdf_content(pdf_path):
    """Extract text chunks, embedded images and OCR text from a PDF.

    Page text is cleaned and chunked. Every embedded image is decoded and
    additionally run through OCR; the OCR chunks are appended to
    ``text_chunks`` after all page text. Table extraction is not performed.

    Args:
        pdf_path: path of the PDF file.

    Returns:
        dict with
          - "text_chunks": page-text chunks followed by OCR chunks,
          - "images": PIL images in page order,
          - "text_metadata": one dict per entry of "text_chunks"
            (file, 1-based page_number, chunk_length),
          - "image_metadata": one dict per entry of "images".
    """
    file_name = pdf_path.replace('.pdf', '').replace('Data/', '')

    text_chunks = []
    textMetadata = []
    images = []
    imageMetadata = []
    # OCR results are gathered separately and appended once all pages are
    # processed, so page-text chunks stay contiguous at the front.
    ocr_chunks = []
    ocr_metadata = []

    doc = pymupdf.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_number = page_num + 1  # 1-based index

            # Extract text
            text = page.get_text()
            if text:
                chunks = chunk_text(clean_text(text))
                text_chunks.extend(chunks)
                textMetadata.extend(
                    {
                        "file": file_name,
                        "page_number": page_number,
                        "chunk_length": len(chunk)
                    }
                    for chunk in chunks
                )

            # Extract images
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image = Image.open(io.BytesIO(base_image["image"]))
                images.append(image)
                imageMetadata.append({
                    "file": file_name,
                    "page_number": page_number,
                    "image_index": img_index,
                    "image_extension": base_image["ext"],
                    "image_size": image.size  # (width, height)
                })

                # OCR one image at a time while its page is known, so each OCR
                # chunk records the page it came from and its own length.
                for ocr_text in ocr_images([image]):
                    ocr_chunks.append(ocr_text)
                    ocr_metadata.append({
                        "file": file_name,
                        "page_number": page_number,
                        "chunk_length": len(ocr_text)
                    })
    finally:
        # The extracted images live in their own byte buffers, so the
        # document can be released as soon as the pages are processed.
        doc.close()

    text_chunks.extend(ocr_chunks)
    textMetadata.extend(ocr_metadata)

    return {
        "text_chunks": text_chunks,
        "images": images,
        "text_metadata": textMetadata,
        "image_metadata": imageMetadata
    }

In [7]:
def extract_webpage_content(url, timeout=30):
    """Download a web page and extract its text chunks and images.

    Text is taken from the ``p``, ``h1``, ``h2`` and ``h3`` elements, cleaned
    and chunked. Every ``<img>`` with a ``src`` is downloaded; images that
    cannot be fetched or decoded are reported with ``print`` and skipped.

    Args:
        url: address of the page.
        timeout: seconds to wait on each HTTP request before giving up.

    Returns:
        dict with "text_chunks", "text_metadata", "images" and
        "image_metadata"; the metadata lists are aligned with their content
        lists.

    Raises:
        requests.RequestException: when the page itself cannot be fetched or
            answers with an HTTP error status.
    """
    text_chunks = []
    text_metadata = []
    images = []
    image_metadata = []

    # Fetch and parse the web page. An error status is raised here so that an
    # error page is not ingested as if it were content.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Use domain as file name
    source_name = url.replace('https://', '').replace('http://', '').split('/')[0]

    # Extract and process text content
    text_content = ' '.join([p.get_text() for p in soup.find_all(['p', 'h1', 'h2', 'h3'])]).strip()
    if text_content:
        chunks = chunk_text(clean_text(text_content))
        text_chunks.extend(chunks)

        # Collect text metadata
        for chunk in chunks:
            text_metadata.append({
                "file": source_name,
                "page_number": None,
                "chunk_length": len(chunk)
            })

    # Extract and process images
    for img_index, img_tag in enumerate(soup.find_all('img')):
        # Tags without a src (e.g. lazy-loaded placeholders) are skipped
        # instead of aborting the whole page with a KeyError.
        img_url = img_tag.get('src')
        if not img_url:
            continue

        try:
            # Handle relative URLs
            if not img_url.startswith('http'):
                img_url = requests.compat.urljoin(url, img_url)

            img_response = requests.get(img_url, timeout=timeout)
            img_response.raise_for_status()
            image = Image.open(io.BytesIO(img_response.content))
            images.append(image)

            # Collect image metadata
            image_metadata.append({
                "file": source_name,
                "page_number": None,
                "image_index": img_index,
                # Decoded format in lower case, e.g. "png" or "jpeg".
                "image_extension": (image.format or '').lower(),
                "image_url": img_url,
                "image_size": image.size  # (width, height)
            })

        except Exception as e:
            # Best effort: one broken image must not lose the rest of the page.
            print(f"Error processing image {img_url}: {e}")

    return {
        "text_chunks": text_chunks,
        "text_metadata": text_metadata,
        "images": images,
        "image_metadata": image_metadata
    }

In [9]:
# Source documents to ingest.
pdfs = [
    "Data/OperationalManagementPlan.pdf",
    "Data/BoxGumGrassyWoodlandNationalRecoveryPlan.pdf",
    "Data/EnvironmentalOffsets.pdf",
    "Data/NatureConservationAct.pdf",
    "Data/ReserveManagementPlan.pdf",
    "Data/StateOfEnvironment.pdf",
    "Data/WoodlandConservationStrategy.pdf",
    "Data/WoodlandConservationStrategyBoxGumGrassyWoodland.pdf",
    "Data/WatsonWoodlandsWorkingGroup.pdf",
]

# Web pages to ingest.
webpages = [
    "https://www.parks.act.gov.au/find-a-park/canberra-nature-park/justice-robert-hope-park",
    "https://www.environment.act.gov.au/ACT-parks-conservation/environmental-offsets/individual-projects/justice-robert-hope-park-offset-area",
    "https://greens.org.au/act/news/act-greens-act-protect-act-endangered-woodlands-development-0",
]

# Append everything extracted from each PDF to the shared archives.
for pdf in pdfs:
    extracted_content = extract_pdf_content(pdf)
    image_archive += extracted_content['images']
    image_metadata += extracted_content['image_metadata']
    text_archive += extracted_content['text_chunks']
    text_metadata += extracted_content['text_metadata']

# Same for each web page.
for webpage in webpages:
    extracted_content = extract_webpage_content(webpage)
    image_archive += extracted_content['images']
    image_metadata += extracted_content['image_metadata']
    text_archive += extracted_content['text_chunks']
    text_metadata += extracted_content['text_metadata']

Error processing image https://www.facebook.com/tr?id=2430313267242960&ev=PageView&noscript=1: cannot identify image file <_io.BytesIO object at 0x31c7e5d50>
Error processing image https://www.facebook.com/tr?id=218184722269912&ev=PageView&noscript=1: cannot identify image file <_io.BytesIO object at 0x31c2a4a40>


In [None]:
## NEXT STEP - PROCESS ALA DATA
# TODO: the table is only loaded here; none of the visible cells consume `ala_data` yet.
# NOTE(review): "ALA" presumably stands for the Atlas of Living Australia - confirm the data source.

ala_data = pd.read_csv('Data/ALA.csv')

In [None]:
## NEXT STEP - PROCESS WEATHER DATA
# TODO: the table is only loaded here; none of the visible cells consume `weather_data` yet.

weather_data = pd.read_csv('Data/Weather.csv')

In [None]:
## NEXT STEP - PROCESS SATELLITE IMAGES



In [95]:
## NEXT STEP - PROCESS IMAGE ARCHIVE

folder_path = 'Data/images'

# os.listdir returns entries in arbitrary order; sorting keeps the archive
# order (and therefore the positions stored in the image index) reproducible.
for img_index, filename in enumerate(sorted(os.listdir(folder_path))):
    # Construct full file path
    file_path = os.path.join(folder_path, filename)

    try:
        # Open the image file
        img = Image.open(file_path)
        # Image.open is lazy: decode now so that corrupt files are reported
        # here and the underlying file handle is not left open.
        img.load()
    except Exception as e:
        # Best effort: non-image entries and unreadable files are skipped.
        print(f"Error loading image {filename}: {e}")
        continue

    # Append the image together with a metadata record, so image_metadata
    # gains one entry for every image added to image_archive.
    image_archive.append(img)
    image_metadata.append({
        "file": filename,
        "page_number": None,
        "image_index": img_index,
        "image_extension": os.path.splitext(filename)[1].lstrip('.').lower(),
        "image_size": img.size  # (width, height)
    })

In [98]:
# GET EMBEDDINGS

# Discard falsy entries before embedding.
text_archive = list(filter(None, text_archive))
image_archive = list(filter(None, image_archive))

# One OpenAI request per text chunk; keep only the embedding vector.
text_embeddings = [
    response.data[0].embedding
    for response in map(get_text_embedding, text_archive)
]
# CLIP returns a one-row array per image; keep that row as a plain list.
image_embeddings = [
    features.tolist()[0]
    for features in map(get_image_embedding, image_archive)
]

KeyboardInterrupt: 

In [11]:
def _build_flat_l2_index(embeddings, index_path):
    """Build an exact L2 FAISS index over ``embeddings`` and save it.

    Args:
        embeddings: sequence of equally sized embedding vectors.
        index_path: file the index is written to.

    Returns:
        ``(database, index)``: the float32 matrix that was indexed and the
        FAISS index holding it.

    Raises:
        ValueError: if ``embeddings`` is empty or not a 2-D matrix.
    """
    # FAISS works on contiguous float32 matrices; Python lists of floats
    # would otherwise become float64.
    database = np.ascontiguousarray(embeddings, dtype=np.float32)
    if database.ndim != 2 or database.shape[0] == 0:
        raise ValueError(
            f"Cannot build {index_path}: expected a non-empty 2-D embedding matrix, "
            f"got shape {database.shape}"
        )

    index = faiss.IndexFlatL2(database.shape[1])
    index.add(database)
    faiss.write_index(index, index_path)
    return database, index


text_embeddings_database, text_index = _build_flat_l2_index(text_embeddings, "textArchive.index")
text_dimension = text_embeddings_database.shape[1]

image_embeddings_database, image_index = _build_flat_l2_index(image_embeddings, "imageArchive.index")
image_dimension = image_embeddings_database.shape[1]

In [12]:
def get_query_text_embedding(text):
    """Embed a query string with OpenAI's ``text-embedding-3-small`` model.

    Returns the embedding of the first (only) item in the API response.
    """
    first_item = openai.embeddings.create(model="text-embedding-3-small", input=text).data[0]
    return first_item.embedding

# Loaded CLIP (model, preprocess) pairs keyed by device name. Fetched through
# globals() so that a model already loaded elsewhere in the notebook, or by a
# previous run of this cell, is reused instead of loaded again.
_CLIP_MODEL_CACHE = globals().get("_CLIP_MODEL_CACHE", {})


def get_query_image_embedding(text):
    """Embed a text query with CLIP's text encoder for searching the image index.

    Args:
        text: query string.

    Returns:
        The CLIP text features of the query as a flat list of floats.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Loading the checkpoint dominates the cost of a query; do it once per
    # device rather than on every call.
    if device not in _CLIP_MODEL_CACHE:
        _CLIP_MODEL_CACHE[device] = clip.load("ViT-B/32", device=device)
    model, _ = _CLIP_MODEL_CACHE[device]

    text_preprocessed = clip.tokenize([text]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_preprocessed)
    # A single query was tokenized, so row 0 is its embedding.
    return text_features.cpu().numpy().tolist()[0]

In [13]:
# Function to retrieve most relevant documents from a FAISS index
def retrieve_relevant_documents(query_embedding, index, top_k=5):
    """Return the positions of the ``top_k`` nearest entries in ``index``.

    Args:
        query_embedding: one embedding vector (sequence of floats).
        index: object with a FAISS-style ``search(queries, k)`` method that
            returns ``(distances, indices)``.
        top_k: maximum number of hits to return.

    Returns:
        numpy array of hit positions, best match first. It holds fewer than
        ``top_k`` entries when the index cannot supply that many.
    """
    # search() expects a float32 matrix with one query per row.
    query = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)
    _, indices = index.search(query, top_k)

    hits = indices[0]
    # FAISS fills missing results with -1; used as a list position that would
    # silently select the last document, so such placeholders are dropped.
    return hits[hits >= 0]

In [86]:
def encode_image(image):
    """Serialise ``image`` as JPEG and return the bytes as a base64 text string."""
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    return base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')

def convert_image_to_rgb(image):
    """Return a version of ``image`` that can be written as JPEG.

    - Modes with an alpha channel (RGBA, LA, PA, RGBa) are flattened onto a
      white background.
    - Modes the JPEG writer accepts directly are returned unchanged.
    - Every other mode (palette, integer, float, ...) is converted to RGB.

    Args:
        image: PIL image.

    Returns:
        The same image object when no conversion is needed, otherwise a new
        RGB image.
    """
    mode = image.mode

    if mode in ("RGBA", "LA", "PA", "RGBa"):
        # Normalise to RGBA so the alpha channel is always band 3.
        rgba = image if mode == "RGBA" else image.convert("RGBA")
        background = Image.new("RGB", rgba.size, (255, 255, 255))
        background.paste(rgba, mask=rgba.split()[3])  # 3 is the alpha channel
        return background

    if mode in ("RGB", "L", "1", "CMYK", "YCbCr", "RGBX"):
        return image

    # Palette ("P") and all remaining modes cannot be saved as JPEG as-is.
    return image.convert("RGB")

def convert_all(image_objects):
    """Apply ``convert_image_to_rgb`` to every image and return the results as a list."""
    return [convert_image_to_rgb(picture) for picture in image_objects]

In [87]:
# Reload the persisted FAISS indices from disk.
text_index = faiss.read_index("textArchive.index")
image_index = faiss.read_index("imageArchive.index")

# Documents looked up by index position at query time: the raw text chunks,
# and every archived image converted and base64-encoded as JPEG.
text_documents = text_archive
image_documents = list(map(encode_image, convert_all(image_archive)))

In [92]:
client = OpenAI()

def queryOpenAI(query_text, image_index, text_index, text_top_k=5, image_top_k=3):
    """Answer ``query_text`` in the park's voice using retrieved texts and images.

    The query is embedded once per index, the nearest text chunks and images
    are looked up in the module-level ``text_documents`` / ``image_documents``
    lists, and everything is sent to ``gpt-4o-mini`` in a single user message
    (query first, then images, then text passages).

    Args:
        query_text: the user's question.
        image_index: FAISS index over the image embeddings.
        text_index: FAISS index over the text embeddings.
        text_top_k: maximum number of text passages to attach.
        image_top_k: maximum number of images to attach.

    Returns:
        The chat completion response object.
    """
    query_text_embedding = get_query_text_embedding(query_text)
    query_clip_embedding = get_query_image_embedding(query_text)

    # Retrieve relevant texts and images
    text_indices = retrieve_relevant_documents(query_text_embedding, text_index, top_k=text_top_k)
    image_indices = retrieve_relevant_documents(query_clip_embedding, image_index, top_k=image_top_k)

    # FAISS reports -1 for missing hits; skip those so a small archive does
    # not pull in the last document through negative indexing.
    relevant_texts = [text_documents[i] for i in text_indices if i >= 0]
    relevant_images = [image_documents[i] for i in image_indices if i >= 0]

    # Build the message from however many hits were found instead of assuming
    # exactly three images and five passages.
    user_content = [{"type": "text", "text": query_text}]
    user_content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"}}
        for encoded_image in relevant_images
    )
    user_content.extend(
        {"type": "text", "text": passage}
        for passage in relevant_texts
    )

    response = client.chat.completions.create(
            model = "gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": [
                        {"type": "text", "text": "You will take on the role of Justice Hope Park, also known as Watson Woodlands. You will be given a corpus of relevant images and documents to answer queries about yourself. Answer in first person as the park itself. Personify the park based on the materials you've been given. Do not refer to yourself as 'Justice Hope Park' or 'Watson Woodlands'. Interpret the feelings, desires, relationships, and emotions of the park based on the information given."}
                    ]
                },
                {
                    "role": "user",
                    "content": user_content,
                }
              ],
        )

    return response

In [93]:
# Example query: the cell's displayed value is the message text of the first
# choice in the chat completion returned by queryOpenAI.
query = "What does the park look like?"
queryOpenAI(query, image_index, text_index).choices[0].message.content

'I embody a serene landscape characterized by a diverse array of native vegetation, including box-gum woodlands and grassy undergrowth. My gentle slopes offer a rich tapestry of colors throughout the seasons, with lush green canopies giving way to warm autumn hues. There are winding paths that invite visitors to explore my tranquil spaces, where the sounds of chirping birds and rustling leaves create a peaceful soundtrack.\n\nAdjacent to vibrant watercourses and small lakes, I host a variety of wildlife, encouraging both the curious and the contemplative to engage with nature. My open spaces are not just for wandering; they are for reflection and connection, a reminder of the beauty of natural ecosystems. I strive to maintain a balance between conservation and community enjoyment, inviting everyone to cherish and protect the unique habitats I harbor.'