# **Extracting metadata and creating a dataframe**

In [None]:
# Mounting Google Drive at /content/drive so the dataset folders used below are reachable
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install ImageHash

Collecting ImageHash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/296.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/296.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ImageHash
Successfully installed ImageHash-4.3.1


In [None]:
# Function to create a dataframe with the metadata

from PIL import Image
from PIL.ExifTags import TAGS, GPSTAGS
import os
import pandas as pd
import shutil

def get_exif_data(image_path):
    """Extract EXIF metadata from an image file.

    Args:
        image_path: Path of the image file to read.

    Returns:
        dict: EXIF values keyed by tag name (looked up in ``PIL.ExifTags.TAGS``);
        tags without a known name keep their original tag id as the key. The
        dict is empty when the image exposes no EXIF data.
    """
    exif_data = {}

    # The context manager closes the underlying file handle. A bare Image.open()
    # leaves it open, which adds up when walking a folder with many images.
    with Image.open(image_path) as image:
        # Not every image class provides _getexif(), hence the hasattr guard.
        if hasattr(image, '_getexif'):
            exif_info = image._getexif()
            if exif_info:
                for tag, value in exif_info.items():
                    exif_data[TAGS.get(tag, tag)] = value

    return exif_data

def get_gps_info(exif_data):
    """Return the 'GPSInfo' entry of an EXIF dict with readable tag names.

    Args:
        exif_data: Dict of EXIF values keyed by tag name.

    Returns:
        dict | None: The GPS entries re-keyed through ``GPSTAGS`` (ids without
        a known name are kept as-is), or None when there is no 'GPSInfo' key.
    """
    if 'GPSInfo' not in exif_data:
        return None

    raw_gps = exif_data['GPSInfo']
    return {GPSTAGS.get(tag_id, tag_id): raw_gps[tag_id] for tag_id in raw_gps}

def gps_info_to_decimal(gps_info):
    """Convert a decoded GPSInfo dict to signed decimal-degree coordinates.

    Args:
        gps_info: Dict with 'GPSLatitude', 'GPSLatitudeRef', 'GPSLongitude'
            and 'GPSLongitudeRef' entries; may be None or empty.

    Returns:
        tuple: ``(latitude, longitude)`` in decimal degrees, negative for the
        'S' and 'W' references, or ``(None, None)`` when ``gps_info`` is
        empty or any of the four entries is missing/falsy.
    """
    def convert_to_degrees(value):
        """Turn a (degrees, minutes, seconds) triple into decimal degrees."""
        degrees, minutes, seconds = value
        return degrees + (minutes / 60.0) + (seconds / 3600.0)

    if not gps_info:
        return None, None

    lat = gps_info.get('GPSLatitude')
    lat_ref = gps_info.get('GPSLatitudeRef')
    lon = gps_info.get('GPSLongitude')
    lon_ref = gps_info.get('GPSLongitudeRef')

    # All four pieces are required to produce a usable coordinate pair.
    if not (lat and lat_ref and lon and lon_ref):
        return None, None

    latitude = convert_to_degrees(lat)
    longitude = convert_to_degrees(lon)

    # Southern and western hemispheres are expressed as negative values.
    if lat_ref == 'S':
        latitude = -latitude
    if lon_ref == 'W':
        longitude = -longitude

    return latitude, longitude


def extract_date_time(exif_data):
    """Return the EXIF 'DateTime' value, or None when the key is absent.

    Args:
        exif_data: Dict of EXIF values keyed by tag name.
    """
    return exif_data['DateTime'] if 'DateTime' in exif_data else None

def process_images(folder_path):
    """Collect EXIF-derived metadata for every JPEG under ``folder_path``.

    The folder is walked recursively. Each '.jpg'/'.jpeg' file (matched
    case-insensitively) contributes one row; the name of the directory that
    directly contains the file is used as its class label. A file that fails
    to process is reported with a printed message and skipped.

    Args:
        folder_path: Top-level folder whose subfolder(s) contain the images.

    Returns:
        pandas.DataFrame: One row per processed image with the columns
        'Id', 'Latitude', 'Longitude', 'Date and Time' and 'Class'.
    """
    jpeg_extensions = ('.jpg', '.jpeg')
    records = []

    for root, _dirs, files in os.walk(folder_path):
        label = os.path.basename(root)
        for file in files:
            if not file.lower().endswith(jpeg_extensions):
                continue
            try:
                image_path = os.path.join(root, file)
                exif_data = get_exif_data(image_path)
                gps_info = get_gps_info(exif_data)
                if gps_info:
                    latitude, longitude = gps_info_to_decimal(gps_info)
                else:
                    latitude, longitude = None, None
                date_time = extract_date_time(exif_data)
                record = {
                    "Id": file,
                    "Latitude": latitude,
                    "Longitude": longitude,
                    "Date and Time": date_time,
                    "Class": label,
                }
                records.append(record)
            except Exception as e:
                # Best effort: report the problem file and keep going.
                print(f"Error processing {file}: {e}")

    return pd.DataFrame(records)

# Showing an example usage
#folder_path = 'path_to_main_folder'  # Folder that has the subfolder(s) containing the images
#df = process_images(folder_path) # After this point, can run the following code to obtain a 'Date' column (used later)

In [None]:
# NOTE: everything below is a bare triple-quoted string, so none of the code inside it runs.
# It is kept as a template for adding a 'Date' column to a `df` returned by process_images().
'''
# Adding a 'Date' column and exporting the df

# Converting the datetime string to a pandas datetime object
df['Date'] = pd.to_datetime(df['Date and Time'], format='%Y:%m:%d %H:%M:%S')

# Extracting the date part and assigning it to a new column
df['Date'] = df['Date'].dt.date

# Converting the date column to a string type in the 'YYYY-MM-DD' format
df['Date'] = df['Date'].astype(str)

df.to_csv(f"{folder_path}/image_metadata.csv", index=False)
'''

# **Filtering the Taiwan Dataset (Removing Duplicates, Keeping Images with Location and Date Information)**

In [None]:
# Root folder on the mounted Drive that contains the image datasets
base_dir = "/content/drive/MyDrive/Data 298B Project Data/Rice Image Datasets - with Location and Time"

# Destination folder for the filtered Taiwan dataset and its metadata CSVs
new_path_taiwan = f"{base_dir}/Rice Leaf Diseases - Taiwan Filtered"

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
os.makedirs(new_path_taiwan, exist_ok=True)

In [None]:
# Taiwan Data: extracting metadata for every JPEG under the original (unfiltered) Taiwan folder

folder_path_taiwan = f"{base_dir}/Rice Leaf Diseases - Taiwan"
df_taiwan = process_images(folder_path_taiwan)

# Saving the full metadata table (before any duplicate removal) into the filtered-dataset folder
df_taiwan.to_csv(f"{new_path_taiwan}/image_metadata_taiwan_full.csv", index=False)

In [None]:
# Removing duplicates from the Taiwan data (and keeping the file with the shortest name)

import os
import imagehash
from PIL import Image

def find_and_delete_duplicates(root_folder):
    """Delete duplicate images under ``root_folder``, keeping the shortest filename.

    Walks ``root_folder`` recursively and hashes every image file with
    ``imagehash.average_hash``. When two files share a hash, the one with the
    longer filename is deleted; on a tie the file encountered first is kept.

    Warning:
        Files are removed permanently with ``os.remove``. Hashes are compared
        across the whole tree, so a file can be deleted as the duplicate of a
        file that lives in a different subfolder.
        NOTE(review): average_hash is a perceptual hash, so images that merely
        look alike may share a hash -- confirm this is the intended notion of
        "duplicate" before running on data that cannot be restored.

    Args:
        root_folder: Folder containing the subfolders with images.

    Returns:
        None.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    # Maps image hash -> (path of the file currently kept, length of its filename)
    hash_dict = {}

    for subdir, dirs, files in os.walk(root_folder):
        for filename in files:
            if not filename.lower().endswith(image_extensions):
                continue
            file_path = os.path.join(subdir, filename)

            # Only the hashing happens while the file is open. Deleting is done
            # after the context manager has closed the handle, because removing
            # a file that is still open fails on platforms that lock open files.
            with Image.open(file_path) as img:
                img_hash = imagehash.average_hash(img)

            if img_hash not in hash_dict:
                # First time this hash is seen: remember the file as the keeper
                hash_dict[img_hash] = (file_path, len(filename))
                continue

            existing_path, existing_name_length = hash_dict[img_hash]
            if len(filename) < existing_name_length:
                # The current file has the shorter name: it replaces the keeper
                os.remove(existing_path)
                hash_dict[img_hash] = (file_path, len(filename))
            else:
                # The current file's name is longer (or equally long): drop it
                os.remove(file_path)

# WARNING: destructive step -- duplicate files are permanently removed (os.remove) from the original Taiwan folder
find_and_delete_duplicates(folder_path_taiwan) # Folder containing the subfolders with images

In [None]:
# Creating a new CSV with image metadata after duplicates are removed
# (process_images walks the folder again, so it only sees the files that are still on disk)

df_taiwan_no_duplicates = process_images(folder_path_taiwan)
df_taiwan_no_duplicates.to_csv(f"{new_path_taiwan}/image_metadata_taiwan_no_duplicates.csv", index=False)

In [None]:
# Keeping only the images that have a 'Latitude', 'Longitude', and 'Date and Time'
# (rows missing any of the three are dropped; no CSV is written in this cell)

filtered_df_taiwan = df_taiwan_no_duplicates.dropna(subset=['Latitude', 'Longitude', 'Date and Time'])

In [None]:
# Adding a 'Date' column and exporting the df

# The source frame is `filtered_df_taiwan` (the rows that have location and date information).
# The 'Date and Time' strings look like 'YYYY:MM:DD HH:MM:SS': they are parsed to datetimes,
# reduced to their date part, and stored as 'YYYY-MM-DD' strings.
# .assign() returns a new frame, so `filtered_df_taiwan` is not modified and the cell can be re-run.
filtered_images_df_taiwan = filtered_df_taiwan.assign(
    Date=pd.to_datetime(
        filtered_df_taiwan['Date and Time'], format='%Y:%m:%d %H:%M:%S'
    ).dt.date.astype(str)
)

filtered_images_df_taiwan.to_csv(f"{new_path_taiwan}/image_metadata_taiwan_filtered_location.csv", index=False)

In [None]:
# Copying the images from the filtered Taiwan dataset to a new folder while maintaining the same subdirectory structure

# Loading the filtered image metadata. 'Id' and 'Class' are read as strings so that purely
# numeric file or folder names are not converted to numbers, which would break the path joins below.
df_filtered_taiwan = pd.read_csv(
    f"{new_path_taiwan}/image_metadata_taiwan_filtered_location.csv",
    dtype={'Id': str, 'Class': str},
)

original_base_path = f"{base_dir}/Rice Leaf Diseases - Taiwan"
new_base_path = f"{base_dir}/Rice Leaf Diseases - Taiwan Filtered"

# exist_ok=True makes directory creation safe to repeat when the cell is re-run
os.makedirs(new_base_path, exist_ok=True)

# Copying the images: each file goes into a subfolder named after its 'Class' value
for class_name, image_id in zip(df_filtered_taiwan['Class'], df_filtered_taiwan['Id']):
    new_subfolder_path = os.path.join(new_base_path, class_name)
    os.makedirs(new_subfolder_path, exist_ok=True)

    source_path = os.path.join(original_base_path, class_name, image_id)
    destination_path = os.path.join(new_subfolder_path, image_id)

    # copy2 copies the file contents and also attempts to preserve file metadata
    shutil.copy2(source_path, destination_path)

print("All images have been copied.")


All images have been copied.
