In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from PIL import Image
from PIL.ExifTags import TAGS
from io import BytesIO

In [5]:
# Step 1: Download the bucket listing.
# A GET on the bucket root returns an S3 ListBucketResult XML document.
url = 'https://urbanriverrangers.s3.us-east-2.amazonaws.com/'  # Update with actual URL if needed

# Define the namespace used by every element of the S3 listing
namespace = {'ns': 'http://s3.amazonaws.com/doc/2006-03-01/'}

# Steps 2-3: Parse each page of the listing and collect one row per object
data = []
marker = None  # Key to resume listing after; None requests the first page
while True:
    response = requests.get(url, params={'marker': marker} if marker else None, timeout=30)
    response.raise_for_status()  # Fail loudly rather than parsing an S3 error document
    xml_content = response.content
    root = ET.fromstring(xml_content)

    page_items = root.findall('.//ns:Contents', namespace)
    for item in page_items:
        key = item.find('ns:Key', namespace).text
        data.append({
            'Key': key,
            # Spreadsheet IMAGE() formula: the whole URL must sit inside one
            # pair of double quotes, i.e. =Image("<bucket-url><key>",1)
            'URL': f'=Image("{url}{key}",1)',
            'ETag': item.find('ns:ETag', namespace).text,
            'Size': item.find('ns:Size', namespace).text,
        })

    # S3 returns the listing in pages and sets IsTruncated to "true" when more
    # keys remain. Without a delimiter there is no NextMarker element, so the
    # last key of the current page is used as the marker for the next request.
    is_truncated = root.findtext('ns:IsTruncated', default='false', namespaces=namespace)
    if is_truncated.strip().lower() != 'true' or not page_items:
        break
    marker = root.findtext('ns:NextMarker', namespaces=namespace) or data[-1]['Key']

# Explicit columns keep the frame usable (media_df['Key']) even for an empty bucket
media_df = pd.DataFrame(data, columns=['Key', 'URL', 'ETag', 'Size'])

# Save the DataFrame to a CSV file
csv_filename = 'media.csv'
media_df.to_csv(csv_filename, index=False)
print(f"Media data saved to {csv_filename}")

Media data saved to media.csv


In [None]:

# Root URL of the public S3 bucket; an object's key is appended to it to form
# that object's download URL.
base_url = "https://urbanriverrangers.s3.us-east-2.amazonaws.com/"


def extract_metadata_from_image(image):
    """Return the EXIF metadata of a Pillow image as a plain dict.

    Args:
        image: An opened image object. Only its ``_getexif()`` method is used,
            and only if the object has one.

    Returns:
        A new dict mapping each EXIF tag to its value. Tags are translated to
        their names via ``PIL.ExifTags.TAGS``; a tag with no entry there is
        kept under its original key. The dict is empty when the image has no
        ``_getexif`` method or when that method returns ``None``.
    """
    # Formats without EXIF support do not expose the accessor at all.
    if not hasattr(image, '_getexif'):
        return {}

    raw_exif = image._getexif()
    if raw_exif is None:  # Accessor exists but the file carries no EXIF block
        return {}

    # Swap numeric tag ids for readable names, falling back to the id itself.
    return {TAGS.get(tag_id, tag_id): tag_value for tag_id, tag_value in raw_exif.items()}

# Extract metadata for each image.
# Processing is best-effort: any object that cannot be downloaded or decoded is
# reported and skipped, so one bad key does not discard the metadata already
# collected for the others.
metadata_list = []
# A single session reuses the HTTPS connection for every download from the bucket
with requests.Session() as session:
    for key in media_df['Key']:
        try:
            # Download the image from the public S3 URL into memory
            response = session.get(base_url + key, timeout=30)
            response.raise_for_status()  # Ensure the request was successful
            file_content = response.content

            # Open the image using Pillow; the context manager releases the
            # decoder's resources once the metadata has been read
            with Image.open(BytesIO(file_content)) as image:
                # Check if the image format is supported
                if image.format is None:
                    raise ValueError("Unsupported image format")

                # Extract metadata
                metadata = extract_metadata_from_image(image)

            metadata['Key'] = key  # Include the Key in the metadata
            metadata_list.append(metadata)
        except (requests.RequestException, IOError, ValueError) as e:
            # Covers HTTP/network failures as well as unreadable image data
            print(f"Error processing image '{key}': {e}")

# Create a DataFrame from the metadata (one row per successfully read image)
metadata_df = pd.DataFrame(metadata_list)

# Save the DataFrame to a CSV file
csv_filename = 'metadata.csv'
metadata_df.to_csv(csv_filename, index=False)
print(f"Metadata saved to {csv_filename}")