### Extract metadata from images

The first step is to extract as much data as we can from images metadata in order to have a dataset to start from. 

In [12]:
import os

import numpy as np
import pandas as pd

import exiftool
from tqdm import tqdm

import imagehash
from PIL import Image

In [13]:
image_folder_path = '../images/aculei-images-test/'
image_names = np.array([f for f in os.listdir(image_folder_path) if f.lower().endswith('.jpg')])

In [14]:
# defining tags we are instered in
tags_for_cam = ["Keywords", "Subject", "WeightedFlatSubject"]
tags_for_datetime = ["CreateDate", "DateCreated"]

tags = tags_for_cam + tags_for_datetime

df = pd.DataFrame()
for image in tqdm(image_names[:50], desc="Extracting metadata"):
    file_path = os.path.join(image_folder_path, image)
    image_hash = imagehash.average_hash(Image.open(file_path))
    
    with exiftool.ExifToolHelper() as et:
        try:
            metadata = {}

            # caluclating the hash to have a unique identifier
            metadata['hash'] = image_hash
            metadata['image_name'] = image
            for d in et.get_tags([file_path], tags=tags):
                for k, v in d.items():
                    metadata[k] = v
            df = pd.concat([df, pd.DataFrame.from_records([metadata])], ignore_index=True)
        except Exception as e:
            print(e)

Extracting metadata: 100%|██████████| 50/50 [00:07<00:00,  6.31it/s]


In [15]:
df.shape

(50, 10)

In [16]:
df.isnull().sum()


hash                        0
image_name                  0
SourceFile                  0
IPTC:Keywords              13
XMP:Subject                13
XMP:WeightedFlatSubject    13
EXIF:CreateDate             0
XMP:CreateDate              0
IPTC:DateCreated            0
XMP:DateCreated             0
dtype: int64

In [17]:
# clean some columns
df.drop(columns=['SourceFile', 'XMP:WeightedFlatSubject', 'IPTC:Keywords', 'EXIF:CreateDate', 'IPTC:DateCreated', 'XMP:DateCreated'], 
        inplace=True)

In [18]:
df['date_time'] = pd.to_datetime(df['XMP:CreateDate'], format='%Y:%m:%d %H:%M:%S')

# split into 'date' and 'time' columns
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time

# Drop the original 'DateTime' column if not needed
df.drop(columns=['XMP:CreateDate'], inplace=True)

In [19]:
# rename some columns
df.rename(columns={"XMP:Subject": "camera"}, inplace=True)

In [20]:
df.head(10)

Unnamed: 0,hash,image_name,camera,date_time,date,time
0,0000001e1e3cfe7f,TF_ACULEI_11374_DSCF0064.jpg,CAM_6,2022-06-09 21:29:59,2022-06-09,21:29:59
1,080808587cfefff8,TF_ACULEI_6224_DSCF6567.jpg,CAM_1,2021-06-25 00:31:53,2021-06-25,00:31:53
2,000818383e3e3e3e,TF_ACULEI_16003_DSCF0614.jpg,,2023-06-01 20:57:55,2023-06-01,20:57:55
3,000808187ffffffb,TF_ACULEI_3530_DSCF3748.jpg,CAM_1,2021-06-15 23:14:11,2021-06-15,23:14:11
4,00080813f97cfe3f,TF_ACULEI_5001_DSCF5288.jpg,CAM_1,2021-06-20 02:03:59,2021-06-20,02:03:59
5,000000383e7efe7c,TF_ACULEI_8764_DSCF0840.jpg,CAM_1,2021-07-30 01:02:17,2021-07-30,01:02:17
6,18180838fcfefefc,TF_ACULEI_6126_DSCF6442.jpg,CAM_1,2021-06-24 05:05:27,2021-06-24,05:05:27
7,f8f0f0c0c0800000,TF_ACULEI_10101_IMAG0148.jpg,CAM_3,2021-12-15 22:58:47,2021-12-15,22:58:47
8,000000003c7e7f7f,TF_ACULEI_13117_DSCF0020.jpg,,2023-01-01 23:42:51,2023-01-01,23:42:51
9,00000000107e7f7f,TF_ACULEI_13975_DSCF0742.jpg,,2023-01-23 20:37:53,2023-01-23,20:37:53


In [21]:
from moon_phase import get_moon_phase

# extract moon phase
df['moon'] = df['date'].apply(lambda x: get_moon_phase(x))

In [22]:
df.sample(10)

Unnamed: 0,hash,image_name,camera,date_time,date,time,moon
31,00000000107e7f7f,TF_ACULEI_13145_DSCF0023.jpg,,2023-01-03 22:56:28,2023-01-03,22:56:28,Waning Crescent
36,0008183c3e3e7e3e,TF_ACULEI_14677_DSCF0167.jpg,,2023-02-13 05:30:04,2023-02-13,05:30:04,Waning Crescent
23,0008000cfcfcfefe,TF_ACULEI_1113_DSCF0969.jpg,CAM_1,2021-05-27 22:32:57,2021-05-27,22:32:57,Waning Crescent
12,0008183c3e3e3e3e,TF_ACULEI_15771_DSCF0249.jpg,,2023-05-08 22:09:26,2023-05-08,22:09:26,Waning Crescent
21,000800b8f8fcfefe,TF_ACULEI_2205_DSCF2215.jpg,CAM_1,2021-06-08 00:40:24,2021-06-08,00:40:24,Waxing Crescent
49,000038787efefe1c,TF_ACULEI_9100_DSCF0013.jpg,CAM_1,2021-08-27 02:17:58,2021-08-27,02:17:58,Waning Crescent
8,000000003c7e7f7f,TF_ACULEI_13117_DSCF0020.jpg,,2023-01-01 23:42:51,2023-01-01,23:42:51,Waning Crescent
42,000000163efffcfc,TF_ACULEI_3818_DSCF4036.jpg,CAM_1,2021-06-16 00:08:35,2021-06-16,00:08:35,Full Moon
29,000808187c7efe7e,TF_ACULEI_4370_DSCF4634.jpg,CAM_1,2021-06-17 23:29:08,2021-06-17,23:29:08,Last Quarter
15,0e0e0e183cfcffff,TF_ACULEI_4902_DSCF5182.jpg,CAM_1,2021-06-20 01:43:35,2021-06-20,01:43:35,Waning Crescent


In [23]:
# save the dataframe to a csv 
csv_path = '../test.csv'
df.to_csv(csv_path)