In [61]:
import os, re, json
from math import floor
from datetime import datetime
from h2ogpte import H2OGPTE
from loguru import logger
import docx2txt
import pandas as pd

In [3]:
def extract_doc(chunks, h2ogpte_url, api_key, prompt_input, llm):
    logger.info("")

    client = H2OGPTE(address=h2ogpte_url, 
                 api_key=api_key
                 )

    # Engineer the Prompt
    
    t_system_prompt = "You are a satellite imagery analyst."
    t_pre_prompt_summary = "Look at the satellite image and count the number of distinct buildings in the picture. Do not count landscape features, only manmade buildings such as houses or other roofed structures. Respond with only one number."
    t_prompt_summary = prompt_input
    t_llm = llm

    extraction = client.summarize_content(
            system_prompt=t_system_prompt,
            pre_prompt_summary=t_pre_prompt_summary,
            text_context_list=chunks,
            prompt_summary=t_prompt_summary,
            llm=t_llm,
        )
    extraction = client.process_document()
    
    return extraction #.content

In [15]:
def extract_number_from_text(text):
    # Find all numbers in the text
    numbers = re.findall(r'\d+', text)
    if not numbers:
        return None
    # Return the first number found
    return int(numbers[0])

def process_files(folder_path):
    data = []
    
    # Get all .txt files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.txt'):
            # Get the base name before "_pre_disaster.pdf.txt"
            base_name = filename.split('_pre_disaster.pdf.txt')[0]
            
            # Read the file content
            file_path = os.path.join(folder_path, filename)
            try:
                with open(file_path, 'r') as file:
                    content = file.read().strip()
                    number = extract_number_from_text(content)
                    
                data.append({
                    'filename_base': base_name,
                    'extracted_number': number
                })
            except Exception as e:
                print(f"Error processing {filename}: {str(e)}")
    
    # Create DataFrame
    df = pd.DataFrame(data)
    return df

In [35]:
# Login to the Environment
#h2ogpte_url = "https://h2ogpte.internal-genai.dedicated.h2o.ai"
#api_key = open("h2ogpte_key.txt", "r").read().strip("\n")

h2ogpte_url = "https://h2ogpte.dev.h2o.ai"
api_key = open("mlandry-personal-api-key-dev2.txt", "r").read().strip("\n")

client = H2OGPTE(address=h2ogpte_url, api_key=api_key)

Please install the correct version of H2OGPTE with `pip install h2ogpte==1.6.13`.
You can enable strict version checking by passing strict_version_check=True.


In [36]:
system_prompt = "You are a satellite imagery analyst."
pre_prompt_summary = "Look at the satellite image and count the number of distinct buildings in the picture. Do not count landscape features or vehicles, only manmade buildings such as houses or other roofed structures. Respond with only one number."
prompt_summary = "Combine separate answers into one by adding them, if more than one exists."
image_prompt = "<response_instructions>You are a satellite imagery analyst. Look at the satellite image and count the number of distinct buildings in the picture. Do not count landscape features or vehicles, only manmade buildings such as houses or other roofed structures.</response_instructions>"
image_final = "<response_instructions>Combine separate answers into one by adding them, if more than one exists. Respond with only one number.</response_instructions>"

In [62]:
document_info = []
collections_to_get = ['z-malawi-200', 'z-malawi-300']

for collection in collection_info:
    if collection.name in collections_to_get:
        collection_id = collection.id
        document_info.extend(client.list_documents_in_collection(collection_id, 0, 500))

In [63]:
llms = ['Qwen/Qwen2-VL-7B-Instruct','mistralai/Pixtral-12B-2409','meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo'
        ,'meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo','Qwen/Qwen2-VL-72B-Instruct'
       ,'h2oai/h2ovl-mississippi-2b']
llm_shorts = ['Qwen2-VL-7B-Instruct','Pixtral-12B-2409','Llama-3.2-11B-Vision-Instruct-Turbo'
              ,'Llama-3.2-90B-Vision-Instruct-Turbo','Qwen2-VL-72B-Instruct'
              ,'h2ovl-mississippi-2b']

In [64]:
for i in range(0, 5):
    llm = llms[i]
    llm_short = llm_shorts[i]
    print(llm + " //// "+llm_short)

Qwen/Qwen2-VL-7B-Instruct //// Qwen2-VL-7B-Instruct
mistralai/Pixtral-12B-2409 //// Pixtral-12B-2409
meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo //// Llama-3.2-11B-Vision-Instruct-Turbo
meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo //// Llama-3.2-90B-Vision-Instruct-Turbo
Qwen/Qwen2-VL-72B-Instruct //// Qwen2-VL-72B-Instruct


In [65]:
for j in range(0, 5):
    llm = llms[j]
    llm_short = llm_shorts[j]
    print(f"Model {llm_short}: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    folder_path = 'malawi-cyclone/' + llm_short
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    i = 0
    for document in document_info:
        i=i+1
        if i%floor(len(document_info)/8) == 0:
            print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - Iteration {i}")
        doc_id = document.id
        doc_name = document.name
        response = client.process_document(
            document_id=doc_id,
            system_prompt=system_prompt,
            pre_prompt_summary=pre_prompt_summary,
            prompt_summary=prompt_summary,
            image_batch_image_prompt=image_prompt,
            image_batch_final_prompt=image_final,
            llm=llm
        )
        if response:
            with open('malawi-cyclone/'+llm_short+'/'+doc_name+'.txt', 'w') as file:
                file.write(response.content)

Model Qwen2-VL-7B-Instruct: 2024-12-25 12:41:03
2024-12-25 12:44:48 - Iteration 37
2024-12-25 12:48:36 - Iteration 74
2024-12-25 12:52:29 - Iteration 111
2024-12-25 12:56:22 - Iteration 148
2024-12-25 13:00:13 - Iteration 185
2024-12-25 13:04:02 - Iteration 222
2024-12-25 13:07:51 - Iteration 259
2024-12-25 13:11:43 - Iteration 296
Model Pixtral-12B-2409: 2024-12-25 13:11:49
2024-12-25 13:16:10 - Iteration 37
2024-12-25 13:20:44 - Iteration 74
2024-12-25 13:25:13 - Iteration 111
2024-12-25 13:29:42 - Iteration 148
2024-12-25 13:34:08 - Iteration 185
2024-12-25 13:38:35 - Iteration 222
2024-12-25 13:43:08 - Iteration 259
2024-12-25 13:47:37 - Iteration 296
Model Llama-3.2-11B-Vision-Instruct-Turbo: 2024-12-25 13:47:45
2024-12-25 13:54:20 - Iteration 37
2024-12-25 14:00:59 - Iteration 74
2024-12-25 14:07:40 - Iteration 111
2024-12-25 14:14:23 - Iteration 148
2024-12-25 14:21:00 - Iteration 185
2024-12-25 14:27:34 - Iteration 222
2024-12-25 14:34:14 - Iteration 259
2024-12-25 14:40:49 - I

In [16]:
# Use the function
folder_path = 'malawi-cyclone/Qwen2-VL-72B-Instruct'
result_df = process_files(folder_path)

# Display first few rows to verify
print(result_df.head())

export_df = pd.DataFrame({
    'id': result_df['filename_base'] + '_X_no_damage',
    'target': result_df['extracted_number']
})

# Export to CSV
export_df.to_csv('malawi-cyclone/sub_qwen_nd.csv', index=False)

             filename_base  extracted_number
0  malawi-cyclone_00000051                35
1  malawi-cyclone_00000108               150
2  malawi-cyclone_00000222               120
3  malawi-cyclone_00000181                15
4  malawi-cyclone_00000174                15


In [18]:
# Use the function
folder_path = 'malawi-cyclone/Pixtral-12B-2409/'
result_df = process_files(folder_path)

# Display first few rows to verify
print(result_df.head())

export_df = pd.DataFrame({
    'id': result_df['filename_base'] + '_X_no_damage',
    'target': result_df['extracted_number']
})

# Export to CSV
export_df.to_csv('malawi-cyclone/presub_pixtral_nd.csv', index=False)

             filename_base  extracted_number
0  malawi-cyclone_00000051                15
1  malawi-cyclone_00000108                35
2  malawi-cyclone_00000222                35
3  malawi-cyclone_00000181                15
4  malawi-cyclone_00000174                15


In [19]:
# Use the function
folder_path = 'malawi-cyclone/Llama-3.2-11B-Vision-Instruct-Turbo/'
result_df = process_files(folder_path)

# Display first few rows to verify
print(result_df.head())

export_df = pd.DataFrame({
    'id': result_df['filename_base'] + '_X_no_damage',
    'target': result_df['extracted_number']
})

# Export to CSV
export_df.to_csv('malawi-cyclone/presub_llama_nd.csv', index=False)

             filename_base  extracted_number
0  malawi-cyclone_00000051              30.0
1  malawi-cyclone_00000108             100.0
2  malawi-cyclone_00000222              63.0
3  malawi-cyclone_00000181              25.0
4  malawi-cyclone_00000174              25.0


In [25]:
# Use the function
folder_path = 'malawi-cyclone/Llama-3.2-90B-Vision-Instruct-Turbo/'
result_df = process_files(folder_path)

# Display first few rows to verify
print(result_df.head())

export_df = pd.DataFrame({
    'id': result_df['filename_base'] + '_X_no_damage',
    'target': result_df['extracted_number']
})

# Export to CSV
export_df.to_csv('malawi-cyclone/presub_llama90_nd.csv', index=False)

             filename_base  extracted_number
0  malawi-cyclone_00000051              37.0
1  malawi-cyclone_00000108             123.0
2  malawi-cyclone_00000222             123.0
3  malawi-cyclone_00000181              23.0
4  malawi-cyclone_00000174              23.0
