### Extract metadata from images

The first step is to extract as much data as we can from images metadata in order to have a dataset to start from. 

In [10]:
import os

import numpy as np
import pandas as pd

import exiftool
from tqdm import tqdm

import imagehash
from PIL import Image

In [11]:
image_folder_path = '../images/aculei-images/'
image_names = np.array([f for f in os.listdir(image_folder_path) if f.lower().endswith('.jpg')])

In [12]:
# defining tags we are interested in
tags_for_cam = ["Keywords", "Subject", "WeightedFlatSubject"]
tags_for_datetime = ["CreateDate", "DateCreated"]

tags = tags_for_cam + tags_for_datetime

df = pd.DataFrame()
for image in tqdm(image_names, desc="Extracting metadata"):
    file_path = os.path.join(image_folder_path, image)
    # image_hash = imagehash.average_hash(Image.open(file_path))
    
    with exiftool.ExifToolHelper() as et:
        try:
            metadata = {}
            
            # caluclating the hash to have a unique identifier
            # metadata['hash'] = image_hash
            metadata['image_name'] = image
            for d in et.get_tags([file_path], tags=tags):
                for k, v in d.items():
                    metadata[k] = v
            df = pd.concat([df, pd.DataFrame.from_records([metadata])], ignore_index=True)
        except Exception as e:
            print(e)

Extracting metadata: 100%|██████████| 16874/16874 [21:14<00:00, 13.24it/s]


In [13]:
df.shape

(16874, 9)

In [14]:
df.isnull().sum()

image_name                    0
SourceFile                    0
IPTC:Keywords              4451
XMP:Subject                4451
XMP:WeightedFlatSubject    4451
EXIF:CreateDate             103
XMP:CreateDate              103
IPTC:DateCreated            103
XMP:DateCreated             103
dtype: int64

In [15]:
# clean some columns
df.drop(columns=['SourceFile', 'XMP:WeightedFlatSubject', 'IPTC:Keywords', 'EXIF:CreateDate', 'IPTC:DateCreated', 'XMP:DateCreated'], 
        inplace=True)

In [16]:
df['date_time'] = pd.to_datetime(df['XMP:CreateDate'], format='%Y:%m:%d %H:%M:%S')

# split into 'date' and 'time' columns
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time

# Drop the original 'DateTime' column has it is no longer needed
df.drop(columns=['XMP:CreateDate'], inplace=True)

In [17]:
# rename some columns
df.rename(columns={"XMP:Subject": "camera"}, inplace=True)

In [21]:
df.head(10)

Unnamed: 0,image_name,camera,date_time,date,time
0,TF_ACULEI_8040_DSCF0129.jpg,CAM_1,2021-07-22 23:04:07,2021-07-22,23:04:07
1,TF_ACULEI_900_DSCF0756.jpg,CAM_1,2021-05-27 22:01:44,2021-05-27,22:01:44
2,TF_ACULEI_15294_DSCF0133.jpg,,2023-03-30 20:35:58,2023-03-30,20:35:58
3,TF_ACULEI_11374_DSCF0064.jpg,CAM_6,2022-06-09 21:29:59,2022-06-09,21:29:59
4,TF_ACULEI_4106_DSCF4336.jpg,CAM_1,2021-06-16 23:41:09,2021-06-16,23:41:09
5,TF_ACULEI_1601_DSCF1490.jpg,CAM_1,2021-06-02 00:39:46,2021-06-02,00:39:46
6,TF_ACULEI_8326_DSCF0393.jpg,CAM_1,2021-07-24 23:20:01,2021-07-24,23:20:01
7,TF_ACULEI_16157_DSCF0165.jpg,,2023-06-20 07:33:59,2023-06-20,07:33:59
8,TF_ACULEI_16077_DSCF0744.jpg,,2023-06-09 04:27:57,2023-06-09,04:27:57
9,TF_ACULEI_2994_DSCF3044.jpg,CAM_1,2021-06-10 21:55:12,2021-06-10,21:55:12


In [96]:
from moon_phase import phase

# extract moon phase
df['date_time'] = pd.to_datetime(df['date_time'])
df['moon'] = df['date_time'].apply(lambda x: phase(str(x)) if not pd.isnull(x) else None)

In [116]:
df.sample(10)

Unnamed: 0,image_name,camera,date_time,date,time,moon
11752,TF_ACULEI_1420_DSCF1324.jpg,CAM_1,2021-06-01 22:17:49,2021-06-01,22:17:49,Waning Crescent
16032,TF_ACULEI_3667_DSCF3885.jpg,CAM_1,2021-06-15 23:36:44,2021-06-15,23:36:44,Waxing Gibbous
3724,TF_ACULEI_3316_DSCF3425.jpg,CAM_1,2021-06-15 00:17:11,2021-06-15,00:17:11,Waxing Gibbous
11228,TF_ACULEI_13908_DSCF0214.jpg,,NaT,NaT,NaT,
7634,TF_ACULEI_6293_DSCF6636.jpg,CAM_1,2021-06-25 00:46:03,2021-06-25,00:46:03,Waning Crescent
12191,TF_ACULEI_9875_DSCF0289.jpg,CAM_1,2021-11-24 03:34:57,2021-11-24,03:34:57,Waning Crescent
13647,TF_ACULEI_7885_DSCF0183.jpg,CAM_1,2021-07-15 04:58:22,2021-07-15,04:58:22,Waxing Gibbous
3634,TF_ACULEI_14626_IMAG0299.jpg,,2023-02-09 07:18:03,2023-02-09,07:18:03,Waning Crescent
12857,TF_ACULEI_16417_DSCF0108.jpg,,2023-07-27 22:27:23,2023-07-27,22:27:23,Waning Crescent
4112,TF_ACULEI_15060_DSCF0079.jpg,,2023-03-15 07:30:13,2023-03-15,07:30:13,Waning Crescent


In [117]:
# save the dataframe to a csv 
csv_path = '../aculei.csv'
df.to_csv(csv_path)