In [40]:
import base64
import mimetypes
import os
import time
from pathlib import Path
from urllib.parse import urlparse

import openai
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type, RetryError
from tqdm import tqdm


# OpenAI key: taken from the OPENAI_API_KEY environment variable so the real secret
# never has to be typed into (and saved with) the notebook. The original placeholder
# is kept as the fallback so this cell still runs when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'my_key')

In [None]:
# Load the file containing the 300 randomly selected URLs, sampled from all the URLs
# returned by the "jerusalem" search on the LOC website.
with open('./random_jerusalem.csv', 'r') as file:
    df = pd.read_csv(file)

In [None]:
# Pull the 'URLs' column out of the DataFrame as a plain Python list
random_urls = df['URLs'].tolist()

In [10]:
# Function to scrape an item page on the LOC website and download its JPEG,
# taking the item page URL as input
def download_image(web_page_url, output_dir='images', timeout=30):
    """Download the JPEG offered on an item page and return its local path.

    The page is fetched and parsed, the ``<option data-file-download="JPEG">``
    element is located, and the file its ``value`` attribute points to is
    streamed into ``output_dir``. The file is named after the last path segment
    of ``web_page_url`` (the item id for ``.../item/<id>/`` URLs) plus ``.jpeg``.

    Args:
        web_page_url: URL of the item page to scrape.
        output_dir: Directory the image is written to; created when missing.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        The path of the downloaded file, or None when the page could not be
        fetched, no JPEG link was found, or the image download failed.
    """
    # Fetch the item page. A timeout keeps one stalled connection from hanging
    # the whole run; network errors are reported like any other failed fetch.
    try:
        response = requests.get(web_page_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch the web page: {exc}")
        return None
    if response.status_code != 200:
        print("Failed to fetch the web page.")
        return None

    # Find the element containing the image URL
    # Adjust the selector based on the actual HTML structure
    soup = BeautifulSoup(response.content, 'html.parser')
    image_element = soup.find('option', {'data-file-download': 'JPEG'})
    if not image_element or 'value' not in image_element.attrs:
        print("Image URL not found.")
        return None
    image_url = image_element['value']

    # Build the filename from the URL *path* only, so a trailing slash or a
    # query string never ends up in the name.
    item_name = urlparse(web_page_url).path.rstrip('/').split('/')[-1] or 'image'
    filepath = os.path.join(output_dir, f"{item_name}.jpeg")

    # open() does not create missing directories, so make sure the target exists.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Stream the image to disk; the context manager releases the connection.
    try:
        with requests.get(image_url, stream=True, timeout=timeout) as img_response:
            if img_response.status_code != 200:
                print(f"Failed to download image from {image_url}")
                return None
            with open(filepath, 'wb') as f:
                for chunk in img_response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except requests.RequestException as exc:
        print(f"Failed to download image from {image_url}: {exc}")
        # Do not leave a truncated file behind for later steps to pick up.
        if os.path.exists(filepath):
            os.remove(filepath)
        return None

    print(f"Downloaded image to {filepath}")
    return filepath

def get_selected_metadata(image_url, timeout=30):
    """Fetch an item's JSON record and return a fixed selection of its fields.

    The record is requested from ``image_url + "?fo=json"`` and the fields are
    read from its ``"item"`` object; a field missing from the record is
    returned as None.

    Args:
        image_url: Item URL that the ``?fo=json`` suffix is appended to.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the selected fields, or None when the response status is
        not 200 or any error occurs (the error is reported via ``tqdm.write``).
    """
    metadata_url = image_url + "?fo=json"
    # Fields copied from the "item" object, in the order they appear in the result.
    field_names = (
        "id",
        "contributor_names",
        "created_published",
        "created_published_date",
        "description",
        "format",
        "location",
        "title",
        "notes",
        "number_lccn",
    )
    try:
        # Without a timeout a stalled connection would block the run indefinitely.
        response = requests.get(metadata_url, timeout=timeout)
        if response.status_code == 200:
            item = response.json().get("item", {})
            return {name: item.get(name) for name in field_names}
    except Exception as e:
        # Best effort: any failure (network, bad JSON, unexpected shape) is
        # reported and treated as "no metadata".
        tqdm.write(f"An error occurred while fetching metadata for {image_url}: {e}")
    return None

def _build_image_content(image_path):
    """Return the chat content part that carries the image itself.

    An http(s) URL is passed through unchanged; anything else is treated as a
    local file, which is read and embedded as a base64 ``data:`` URL.
    """
    image_ref = str(image_path)
    if not image_ref.startswith(("http://", "https://")):
        # Guess the MIME type from the file extension; fall back to JPEG.
        mime_type = mimetypes.guess_type(image_ref)[0] or "image/jpeg"
        with open(image_ref, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode("ascii")
        image_ref = f"data:{mime_type};base64,{encoded}"
    return {"type": "image_url", "image_url": {"url": image_ref}}


@retry(wait=wait_exponential(multiplier=1, min=4, max=10),
       stop=stop_after_attempt(3),
       retry=retry_if_exception_type(openai.error.RateLimitError))
def call_openai_api(prompt, metadata, image_path):
    """Ask the vision model about one image and return the text of its reply.

    Rate-limit errors are retried with exponential backoff, up to 3 attempts in
    total; once those are exhausted tenacity raises ``RetryError``. Any other
    exception propagates immediately.

    Args:
        prompt: Template containing a ``{metadata}`` placeholder.
        metadata: Value substituted for ``{metadata}`` in the prompt.
        image_path: Path of a local image file, or an http(s) image URL.

    Returns:
        The content of the first choice in the model's response.
    """
    # The image has to travel as an ``image_url`` content part. Sending the
    # file path as plain text (as before) only shows the model a filename.
    request_message = [
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt.format(metadata=metadata)},
                _build_image_content(image_path),
            ],
        },
    ]

    response = openai.ChatCompletion.create(
        model="gpt-4-vision-preview",
        messages=request_message,
        max_tokens=300,
    )

    return response.choices[0].message['content']

def save_results(results_list):
    """Overwrite ``results.txt`` (UTF-8) with the given results, one entry per line."""
    with open('results.txt', 'w', encoding='utf-8') as out_file:
        out_file.writelines(entry + '\n' for entry in results_list)

def batch_process(urls, batch_size):
    """Yield consecutive slices of ``urls``, each at most ``batch_size`` items long."""
    total = len(urls)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield urls[start:stop]

def _process_single_image(prompt, url, results_list, delay_seconds, retry_pause_seconds):
    """Run download -> metadata -> model for one URL, appending any answer to ``results_list``.

    The downloaded image file is removed afterwards, even when an error occurs.
    """
    image_path = None
    try:
        # Download the image and process it
        image_path = download_image(url)
        if not image_path:
            tqdm.write(f"Failed to download image: {url}")
            return

        # Retrieve metadata
        metadata = get_selected_metadata(url)
        if not metadata:
            tqdm.write(f"No metadata available for image {url}.")
            return

        tqdm.write(f"Processing image {url}...")

        # Only the API call is repeated after a rate-limit give-up; the image
        # and metadata fetched above are reused instead of being fetched again.
        while True:
            try:
                result = call_openai_api(prompt, metadata, image_path)
                break
            except RetryError:
                tqdm.write(
                    "Maximum retry attempts reached. "
                    f"Pausing for {retry_pause_seconds / 60:g} minutes before retrying."
                )
                save_results(results_list)  # Save progress
                time.sleep(retry_pause_seconds)

        if result:
            tqdm.write(f"Result: {result}")
            results_list.append(result)

        time.sleep(delay_seconds)  # Fixed delay between each API call
    finally:
        # Remove the downloaded image file
        if image_path and os.path.exists(image_path):
            os.remove(image_path)


def analyze_image_metadata(prompt, image_urls, batch_size=20, delay_seconds=5,
                           batch_pause_seconds=900, retry_pause_seconds=900):
    """Analyze every URL in batches and write the collected answers to disk.

    For each URL the image is downloaded, its metadata fetched, and both are
    sent to the model; non-empty answers are collected. The answers are passed
    to ``save_results`` when the run ends -- including when it is interrupted
    or fails part-way -- and whenever the rate-limit retries are exhausted.

    Args:
        prompt: Prompt template handed to ``call_openai_api``.
        image_urls: Sequence of item URLs to process.
        batch_size: Number of URLs per batch.
        delay_seconds: Pause after each API call.
        batch_pause_seconds: Pause between two batches (not after the last one).
        retry_pause_seconds: Pause after the API retries are exhausted, before
            trying the same URL again. This is repeated without an upper bound.
    """
    start_time = time.time()
    results_list = []
    # Materialized so the final batch can be recognized and its pause skipped.
    batches = list(batch_process(image_urls, batch_size))

    try:
        for batch_number, batch in enumerate(batches, start=1):
            for url in tqdm(batch, desc='Processing Batch', unit='image'):
                _process_single_image(
                    prompt, url, results_list, delay_seconds, retry_pause_seconds
                )

            # Pause between batches; waiting after the last one only wastes time.
            if batch_number < len(batches):
                tqdm.write(
                    "Completed a batch. "
                    f"Pausing for {batch_pause_seconds / 60:g} minutes before starting the next batch."
                )
                time.sleep(batch_pause_seconds)

        end_time = time.time()
        tqdm.write(f"Total time taken: {end_time - start_time:.2f} seconds")
    finally:
        # Runs on success, on KeyboardInterrupt and on unexpected errors, so the
        # answers gathered so far are never lost.
        save_results(results_list)


# Usage: build the prompt template, then run the analysis over the sampled URLs.
# The doubled braces ({{ and }}) are literal braces once str.format() has filled
# in the single {metadata} placeholder.
prompt = """
I am going to give you a historical photograph and its metadata. From it, extract the city and landmark(s) represented in
the picture.Your response should be in JSON format.
Don’t give any polite introduction on your response, just JSON format, following this template: 
{{
  "id": "2019696982",
  "city": "Jerusalem",
  "landmarks": ["Hebrew University"]
}}
where, id refers to a field already provided in the metadata. city refers to the city where the photo was shot. landmark refers 
to any specific building or location inside of the city.
If it is impossible to detect a specific landmark or city, use the word "Unknown".
If there are multiple landmarks depicted, list them.

Metadata: {metadata}
"""

# URLs to analyze: the randomly sampled list loaded earlier in the notebook
image_urls = random_urls

analyze_image_metadata(prompt=prompt, image_urls=image_urls)


Processing Batch:   0%|                                                                      | 0/20 [00:00<?, ?image/s]

Downloaded image to images\2004663247.jpeg


Processing Batch:   0%|                                                                      | 0/20 [00:02<?, ?image/s]

Processing image http://www.loc.gov/item/2004663247/...


Processing Batch:   0%|                                                                      | 0/20 [00:03<?, ?image/s]

Result: ```json
{
  "id": "2004663247",
  "city": "Unknown",
  "landmarks": ["Unknown"]
}
```


Processing Batch:   5%|███                                                           | 1/20 [00:08<02:47,  8.81s/image]

Downloaded image to images\2019694539.jpeg


Processing Batch:   5%|███                                                           | 1/20 [00:11<02:47,  8.81s/image]

Processing image http://www.loc.gov/item/2019694539/...


Processing Batch:   5%|███                                                           | 1/20 [00:12<02:47,  8.81s/image]

Result: ```json
{
  "id": "2019694539",
  "city": "Jerusalem",
  "landmarks": ["Dome of the Rock"]
}
```


Processing Batch:  10%|██████▏                                                       | 2/20 [00:17<02:42,  9.01s/image]

Downloaded image to images\2019693935.jpeg


Processing Batch:  10%|██████▏                                                       | 2/20 [00:21<02:42,  9.01s/image]

Processing image http://www.loc.gov/item/2019693935/...


Processing Batch:  10%|██████▏                                                       | 2/20 [00:28<02:42,  9.01s/image]

Result: ```json
{
  "id": "2019693935",
  "city": "Tiberias",
  "landmarks": ["Scots Mission Hospital"]
}
```


Processing Batch:  15%|█████████▎                                                    | 3/20 [00:33<03:12, 11.30s/image]


KeyboardInterrupt: 

In [None]:
# To read the saved results later.
# results.txt is written as UTF-8, so it must be decoded as UTF-8 here as well;
# relying on the platform default encoding can garble or reject non-ASCII text.
with open('results.txt', 'r', encoding='utf-8') as file:
    saved_results = file.readlines()
    for result in saved_results:
        # Each line still ends with its newline; drop it so print() does not
        # put an empty line between every two lines of output.
        print(result.rstrip('\n'))