# **Extracting metadata and creating a dataframe**

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive; the image
# folder read later in this notebook lives under /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted, which keeps this cell safe to re-run - confirm against the
# google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# Helper functions that read EXIF metadata (GPS position and timestamp) from images and collect it into a dataframe

from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
import os
import pandas as pd
import shutil

def get_exif_data(image_path):
    """Read the EXIF tags of an image into a dict keyed by tag name.

    Args:
        image_path: Path of the image file to open.

    Returns:
        A dict mapping each EXIF tag to its value. Numeric tag ids are
        translated to their names through ``PIL.ExifTags.TAGS``; ids that are
        not in that table are kept as-is. The dict is empty when the image
        exposes no ``_getexif`` method or carries no EXIF data.

    Raises:
        Whatever ``Image.open`` raises for a missing or unreadable file.
    """
    exif_data = {}

    # Open the image in a context manager so the underlying file handle is
    # released as soon as the metadata has been read; otherwise one handle per
    # image stays open until garbage collection while walking large folders.
    with Image.open(image_path) as image:
        # Not every image class provides `_getexif`, hence the guarded lookup.
        read_exif = getattr(image, '_getexif', None)
        exif_info = read_exif() if read_exif is not None else None

    if exif_info:
        for tag, value in exif_info.items():
            exif_data[TAGS.get(tag, tag)] = value

    return exif_data

def get_gps_info(exif_data):
    """Return the 'GPSInfo' entry of ``exif_data`` with human-readable keys.

    Each key of the GPS sub-dictionary is translated through ``GPSTAGS``;
    keys missing from that table are kept unchanged. Returns ``None`` when
    ``exif_data`` has no 'GPSInfo' entry.
    """
    if 'GPSInfo' not in exif_data:
        return None

    raw_gps = exif_data['GPSInfo']
    return {GPSTAGS.get(tag_id, tag_id): raw_gps[tag_id] for tag_id in raw_gps}

def gps_info_to_decimal(gps_info):
    """Convert a decoded GPSInfo dict to signed decimal latitude/longitude.

    Args:
        gps_info: Dict with the keys 'GPSLatitude', 'GPSLatitudeRef',
            'GPSLongitude' and 'GPSLongitudeRef', or ``None``. Each
            coordinate is a (degrees, minutes, seconds) triple whose
            components are either numbers or (numerator, denominator) pairs.

    Returns:
        ``(latitude, longitude)`` in decimal degrees; latitude is negated for
        the 'S' reference and longitude for the 'W' reference. Returns
        ``(None, None)`` when ``gps_info`` is empty or any of the four
        entries is missing.

    Raises:
        ValueError: If a coordinate does not have exactly three components.
        ZeroDivisionError: If a (numerator, denominator) pair has a zero
            denominator.
    """
    def to_number(component):
        """Return one degree/minute/second component as a plain number."""
        # Older Pillow releases expose EXIF rationals as
        # (numerator, denominator) pairs rather than number-like objects;
        # dividing them here lets the arithmetic below work for both forms.
        if isinstance(component, (tuple, list)) and len(component) == 2:
            numerator, denominator = component
            return numerator / denominator
        return component

    def convert_to_degrees(value):
        """Converts a (degrees, minutes, seconds) triple to decimal degrees."""
        d, m, s = (to_number(part) for part in value)
        return d + (m / 60.0) + (s / 3600.0)

    if not gps_info:
        return None, None

    lat = gps_info.get('GPSLatitude')
    lat_ref = gps_info.get('GPSLatitudeRef')
    lon = gps_info.get('GPSLongitude')
    lon_ref = gps_info.get('GPSLongitudeRef')

    # All four entries are required to produce a signed coordinate pair.
    if not (lat and lat_ref and lon and lon_ref):
        return None, None

    lat_decimal = convert_to_degrees(lat)
    lon_decimal = convert_to_degrees(lon)

    # Southern and western hemispheres are represented as negative values.
    if lat_ref == 'S':
        lat_decimal = -lat_decimal
    if lon_ref == 'W':
        lon_decimal = -lon_decimal

    return lat_decimal, lon_decimal


def extract_date_time(exif_data):
    """Return the 'DateTime' entry of the EXIF dict, or None if it is absent."""
    return exif_data.get('DateTime')

def process_images(folder_path):
    """Collect EXIF position and timestamp for every JPEG under a folder.

    Walks ``folder_path`` and all of its subfolders, and for each file whose
    name ends in '.jpg' or '.jpeg' (case-insensitive) reads its EXIF data.

    Args:
        folder_path: Root directory to scan.

    Returns:
        A DataFrame with the columns "Id" (file name), "Latitude",
        "Longitude" and "Date and Time", one row per successfully processed
        image. The columns are present even when no image was found, so
        downstream column lookups keep working on an empty result.

    Notes:
        A failure on an individual image is reported with ``print`` and that
        image is skipped; it does not abort the scan.
    """
    columns = ["Id", "Latitude", "Longitude", "Date and Time"]
    data = []
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            if not file.lower().endswith(('.jpg', '.jpeg')):
                continue
            try:
                image_path = os.path.join(root, file)
                exif_data = get_exif_data(image_path)
                # gps_info_to_decimal already returns (None, None) when the
                # image carries no GPS block, so no extra guard is needed.
                latitude, longitude = gps_info_to_decimal(get_gps_info(exif_data))
                data.append({
                    "Id": file,
                    "Latitude": latitude,
                    "Longitude": longitude,
                    "Date and Time": extract_date_time(exif_data)
                })
            except Exception as e:
                # Best effort: one unreadable image must not stop the batch.
                print(f"Error processing {file}: {e}")

    # Passing the column list fixes the schema; without it an empty scan
    # yields a frame with no columns at all.
    return pd.DataFrame(data, columns=columns)

In [3]:
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# durations; time.time() follows the wall clock and can jump if it is adjusted.
start_time = time.perf_counter()

folder_path = "/content/drive/MyDrive/Data 298B Project Data/Test Dataset - Workbook 2"  # Folder that has the subfolder(s) containing the images
df = process_images(folder_path)  # One row per image; the next cell derives the 'Date' column from it

elapsed_time = time.perf_counter() - start_time
print(f"It took {elapsed_time:.4f} seconds to extract the metadata from the images.")

It took 0.1031 seconds to extract the metadata from the images.


In [4]:
# Adding a 'Date' column and exporting the df

# Parsing the EXIF timestamp strings ('YYYY:MM:DD HH:MM:SS'). errors='coerce'
# turns values that do not match the format into NaT instead of aborting the
# whole cell on a single bad image.
raw_timestamps = df['Date and Time']
parsed_timestamps = pd.to_datetime(raw_timestamps, format='%Y:%m:%d %H:%M:%S', errors='coerce')

# Reporting coerced values so that malformed timestamps are not dropped silently
unparseable_count = int((parsed_timestamps.isna() & raw_timestamps.notna()).sum())
if unparseable_count:
    print(f"Warning: {unparseable_count} image(s) have an unparseable 'Date and Time' value; their 'Date' is left empty.")

# Formatting the date part as a 'YYYY-MM-DD' string. strftime keeps missing
# timestamps as missing values, so they are written as empty CSV cells rather
# than as the literal text 'NaT'.
df['Date'] = parsed_timestamps.dt.strftime('%Y-%m-%d')

df.to_csv(f"{folder_path}/image_metadata.csv", index=False)