In [4]:
import os
import time

import pandas as pd
import openai
import requests
from tqdm import tqdm
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type, RetryError

# Read the OpenAI key from the environment instead of an undefined notebook
# variable, so the cell works on a fresh kernel and no secret is stored in the
# notebook. Raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [9]:
# Load the CSV holding the 300 URLs selected at random from the full set of
# Library of Congress (LOC) search results for "jerusalem".
with open('./random_jerusalem.csv', 'r') as csv_handle:
    df = pd.read_csv(csv_handle)

In [13]:
# Pull the 'URLs' column out of the DataFrame as a plain Python list.
random_urls = df['URLs'].tolist()

In [None]:
# Function to fetch metadata from a given URL
def get_metadata(image_url, timeout=30):
    """Fetch the JSON metadata record for an item URL.

    The metadata endpoint is the item URL followed by ``/?fo=json``.

    Args:
        image_url: Item URL as a string. Surrounding whitespace and a trailing
            slash are tolerated.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the whole run.

    Returns:
        The decoded JSON payload, or ``None`` when the URL is not a usable
        string, the request fails, the status code is not 200, or the body
        cannot be decoded. Every failure is reported through ``tqdm.write``.
    """
    # A blank CSV cell arrives as NaN (a float); skip it instead of crashing.
    if not isinstance(image_url, str) or not image_url.strip():
        tqdm.write(f"Skipping invalid image URL: {image_url!r}")
        return None

    # Drop any trailing slash first so the result never contains "//?fo=json".
    metadata_url = image_url.strip().rstrip("/") + "/?fo=json"
    try:
        response = requests.get(metadata_url, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        tqdm.write(f"Metadata request for {image_url} returned HTTP {response.status_code}.")
    except Exception as e:
        # Best-effort fetch: log and move on so one bad URL does not end the run.
        tqdm.write(f"An error occurred while fetching metadata for {image_url}: {e}")
    return None

# Function to call the OpenAI API with retry mechanism
@retry(
    retry=retry_if_exception_type(openai.error.RateLimitError),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    stop=stop_after_attempt(3),
)
def call_openai_api(prompt, metadata, results_list):
    """Send one metadata record to the chat model and record its reply.

    The call is retried only on ``openai.error.RateLimitError``, for at most
    three attempts, with an exponential wait bounded between 4 and 10 seconds.
    Once the attempts are used up, tenacity raises ``RetryError``, which the
    caller in this notebook catches.

    Args:
        prompt: Template string containing a ``{metadata}`` placeholder.
        metadata: Value substituted into the placeholder via ``str.format``.
        results_list: List that receives the reply text on success.

    Returns:
        The content of the first choice's message.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt.format(metadata=metadata)},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4-1106-preview",
        messages=conversation,
    )
    answer = completion.choices[0].message['content']
    # Appended only after a successful call, so retries never add duplicates.
    results_list.append(answer)
    return answer

# Function to save the results to a file
def save_results(results_list):
    """Overwrite ``results.txt`` with the given results, one per write.

    Each entry is written as UTF-8 text followed by a newline. The file is
    opened in ``'w'`` mode, so any previous content is replaced.

    Args:
        results_list: Iterable of strings to write.
    """
    with open('results.txt', 'w', encoding='utf-8') as out:
        out.writelines(entry + '\n' for entry in results_list)

# Generator function to create batches from a list of URLs
def batch_process(urls, batch_size):
    """Yield consecutive slices of ``urls`` holding up to ``batch_size`` items.

    The final slice is shorter when ``len(urls)`` is not a multiple of
    ``batch_size``. An empty ``urls`` yields nothing.
    """
    yield from (
        urls[start:start + batch_size]
        for start in range(0, len(urls), batch_size)
    )

# Helper: process a single URL (metadata fetch + model call with long back-off)
def _analyze_single_url(prompt, url, results_list, delay_seconds, pause_seconds):
    """Fetch one URL's metadata and send it to the model until the call succeeds."""
    # Fetched once, outside the retry loop: a rate-limit pause must not
    # trigger another metadata download for the same URL.
    metadata = get_metadata(url)
    if not metadata:
        tqdm.write(f"No metadata available for image {url}.")
        return

    tqdm.write(f"Processing image {url}...")
    while True:
        try:
            result = call_openai_api(prompt, metadata, results_list)
        except RetryError:
            tqdm.write(f"Maximum retry attempts reached. Pausing for {pause_seconds / 60:g} minutes before retrying.")
            save_results(results_list)  # Save progress before the long pause
            time.sleep(pause_seconds)
            continue
        if result:
            tqdm.write(f"Result: {result}")
        time.sleep(delay_seconds)  # Fixed delay between each API call
        return


# Main function to analyze image metadata
def analyze_image_metadata(prompt, image_urls, batch_size=20, delay_seconds=5,
                           pause_seconds=900):
    """Run the metadata-to-model extraction over every URL, batch by batch.

    For each URL the metadata is fetched with ``get_metadata`` and, when
    available, passed to ``call_openai_api``. Results are accumulated in a
    list and written with ``save_results``.

    Args:
        prompt: Template string with a ``{metadata}`` placeholder.
        image_urls: Sequence of item URLs to process.
        batch_size: Number of URLs per batch.
        delay_seconds: Pause after each successful API call.
        pause_seconds: Pause between batches and after rate-limit retries are
            exhausted. Defaults to 900 seconds (15 minutes).

    Returns:
        None. Results are written to disk by ``save_results``.

    Raises:
        Any exception other than ``RetryError`` raised while processing is
        re-raised after the results gathered so far have been saved.
    """
    start_time = time.time()
    results_list = []
    batches = list(batch_process(image_urls, batch_size))
    last_index = len(batches) - 1

    try:
        for index, batch in enumerate(batches):
            for url in tqdm(batch, desc='Processing Batch', unit='image'):
                _analyze_single_url(prompt, url, results_list, delay_seconds, pause_seconds)

            # Pause only between batches: nothing follows the last one, so
            # sleeping there would just delay the final save.
            if index < last_index:
                save_results(results_list)  # Checkpoint before the long pause
                tqdm.write(f"Completed a batch. Pausing for {pause_seconds / 60:g} minutes before starting the next batch.")
                time.sleep(pause_seconds)
    except BaseException:
        # Includes KeyboardInterrupt (kernel interrupt). Keep what was gathered,
        # but do not overwrite an earlier results file with nothing.
        if results_list:
            save_results(results_list)
        raise

    end_time = time.time()
    tqdm.write(f"Total time taken: {end_time - start_time:.2f} seconds")
    save_results(results_list)
    

In [None]:
# Usage: prompt template sent with every item. Doubled braces ({{ }}) are
# literal braces for str.format; {metadata} is the only substituted field.
prompt = """I am going to give you the metadata of a historical photograph. From it, extract the city and landmark(s) represented in
the picture.Your response should be in JSON format.
Don’t give any polite introduction on your response, just JSON format, following this template: 
{{
  "id": "2019696982",
  "city": "Jerusalem",
  "landmarks": ["Hebrew University"]
}}
where, id refers to a field already provided in the metadata. city refers to the city where the photo was shot. landmark refers 
to any specific building or location inside of the city.
If it is impossible to detect a specific landmark or city, use the word "Unknown".
If there are multiple landmarks depicted, list them.

Metadata: {metadata}"""

# URLs to analyse (the random sample loaded earlier in the notebook)
image_urls = random_urls

# Run the analysis over the whole sample
analyze_image_metadata(prompt=prompt, image_urls=image_urls, delay_seconds=5)

In [None]:
# Read the saved results back. The file is written as UTF-8 by save_results,
# so it must be decoded as UTF-8 here rather than with the platform default.
with open('results.txt', 'r', encoding='utf-8') as file:
    saved_results = file.readlines()

# Each line already ends with '\n'; strip it so print does not double-space.
for result in saved_results:
    print(result.rstrip('\n'))