# Extract Data

In [6]:
import osxphotos
from geopy.geocoders import Nominatim
import csv
import time

In [7]:
geolocator = Nominatim(user_agent="photorag-dm")

def geocode_location(latitude, longitude):
    try:
        location = geolocator.reverse((latitude, longitude), exactly_one=True)
        if location:
            address = location.raw.get('address', {})
            # Attempt to find the most specific location information available
            city = address.get('city', '')
            state = address.get('state', '')
            country = address.get('country', '')

            location_detail = f"{city}, {state}, {country}" if city else f"{state}, {country}"
            
            return location_detail if location_detail.strip() else "Unknown Location"
        else:
            return "Unknown Location"
    except Exception as e:
        print(f"Error during geocoding: {e}")
        return "Error"
    finally:
        # Wait for 1 second to respect Nominatim's usage policy
        time.sleep(1)


In [8]:
def get_keywords_location(folder_name):
    photosdb = osxphotos.PhotosDB()
    photos = photosdb.photos(albums=[folder_name])
    
    photo_metadata = {}
    
    for photo in photos:
        filename = photo.original_filename
        # if HEIC in filename, replace with jpeg
        if filename.endswith(".HEIC"):
            filename = filename.replace(".HEIC", ".jpeg")
        keywords = photo.keywords
        date = photo.date
        location = ""
        if photo.location and photo.location != (None, None):
            latitude, longitude = photo.location
            location = geocode_location(latitude, longitude)
        
        photo_metadata[filename] = (keywords, location, date)
    
    return photo_metadata

In [9]:
def get_metadata(folder_name, output_file):
    photo_metadata = get_keywords_location(folder_name)
    
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["filename", "people", "location", "date"])
        
        for filename, (keywords, location, date) in photo_metadata.items():
            # Convert keywords list to a string separated by commas
            keywords_str = ', '.join(keywords)
            # Ensure location is treated as a single string
            # Assuming location is a string already formatted correctly (e.g., "City, State, Country")
            writer.writerow([filename, keywords_str, location, date])

In [10]:
folder_name = "RAG App"  # Change to your folder name
output_file = "output/photo_metadata.csv"
get_metadata(folder_name, output_file)



Error during geocoding: HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Max retries exceeded with url: /reverse?lat=48.8567879&lon=2.3510768&format=json&addressdetails=1 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='nominatim.openstreetmap.org', port=443): Read timed out. (read timeout=1)"))
