### Extract metadata from images 

The first step is to extract as much data as we can from images metadata in order to have a dataset to start from. 

In [1]:
import os

import numpy as np
import pandas as pd

import exiftool
from tqdm import tqdm

import imagehash
from PIL import Image

In [2]:
image_folder_path = '../images/aculei-images/'
image_names = np.array([f for f in os.listdir(image_folder_path) if f.lower().endswith('.jpg')])

Using [exiftool wrapper](https://pypi.org/project/PyExifTool/) is possible to extract tons of metadata from the images. Unfortunately some images do not come with metadata we are looking for so in the next episode we'll introduce OCR to read the metadata directly from the image. 

In [3]:
# defining tags we are interested in
tags_for_cam = ["Keywords", "Subject", "WeightedFlatSubject"]
tags_for_datetime = ["CreateDate", "DateCreated"]

tags = tags_for_cam + tags_for_datetime

df = pd.DataFrame()
for image in tqdm(image_names, desc="Extracting metadata"):
    file_path = os.path.join(image_folder_path, image)
    # image_hash = imagehash.average_hash(Image.open(file_path))
    
    with exiftool.ExifToolHelper() as et:
        try:
            metadata = {}
            
            # caluclating the hash to have a unique identifier
            # metadata['hash'] = image_hash
            metadata['image_name'] = image
            for d in et.get_tags([file_path], tags=tags):
                for k, v in d.items():
                    metadata[k] = v
            df = pd.concat([df, pd.DataFrame.from_records([metadata])], ignore_index=True)
        except Exception as e:
            print(e)

Extracting metadata:   0%|          | 0/16874 [00:00<?, ?it/s]

Extracting metadata: 100%|██████████| 16874/16874 [21:22<00:00, 13.16it/s]


In [4]:
df.shape

(16874, 9)

In [5]:
df.isnull().sum()

image_name                    0
SourceFile                    0
IPTC:Keywords              4451
XMP:Subject                4451
XMP:WeightedFlatSubject    4451
EXIF:CreateDate             103
XMP:CreateDate              103
IPTC:DateCreated            103
XMP:DateCreated             103
dtype: int64

In [6]:
# clean some columns
df.drop(columns=['SourceFile', 'XMP:WeightedFlatSubject', 'IPTC:Keywords', 'EXIF:CreateDate', 'IPTC:DateCreated', 'XMP:DateCreated'], 
        inplace=True)

In [7]:
df['date_time'] = pd.to_datetime(df['XMP:CreateDate'], format='%Y:%m:%d %H:%M:%S')

# split into 'date' and 'time' columns
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time

# Drop the original 'DateTime' column has it is no longer needed
df.drop(columns=['XMP:CreateDate'], inplace=True)

In [8]:
# rename some columns
df.rename(columns={"XMP:Subject": "camera"}, inplace=True)

In [9]:
df.head(10)

Unnamed: 0,image_name,camera,date_time,date,time
0,TF_ACULEI_8040_DSCF0129.jpg,CAM_1,2021-07-22 23:04:07,2021-07-22,23:04:07
1,TF_ACULEI_900_DSCF0756.jpg,CAM_1,2021-05-27 22:01:44,2021-05-27,22:01:44
2,TF_ACULEI_15294_DSCF0133.jpg,,2023-03-30 20:35:58,2023-03-30,20:35:58
3,TF_ACULEI_11374_DSCF0064.jpg,CAM_6,2022-06-09 21:29:59,2022-06-09,21:29:59
4,TF_ACULEI_4106_DSCF4336.jpg,CAM_1,2021-06-16 23:41:09,2021-06-16,23:41:09
5,TF_ACULEI_1601_DSCF1490.jpg,CAM_1,2021-06-02 00:39:46,2021-06-02,00:39:46
6,TF_ACULEI_8326_DSCF0393.jpg,CAM_1,2021-07-24 23:20:01,2021-07-24,23:20:01
7,TF_ACULEI_16157_DSCF0165.jpg,,2023-06-20 07:33:59,2023-06-20,07:33:59
8,TF_ACULEI_16077_DSCF0744.jpg,,2023-06-09 04:27:57,2023-06-09,04:27:57
9,TF_ACULEI_2994_DSCF3044.jpg,CAM_1,2021-06-10 21:55:12,2021-06-10,21:55:12


We can extract an important feature just from the datetime: the moon phase! it could be a relevant feature to discover some data patterns! 

In [39]:
from moon_phase import phase

# extract moon phase
df['date_time'] = pd.to_datetime(df['date_time'])
df['moon'] = df['date_time'].apply(lambda x: phase(str(x)) if not pd.isnull(x) else None)

In [40]:
df.sample(10)

Unnamed: 0,image_name,camera,date_time,date,time,moon
4237,TF_ACULEI_1195_DSCF1061.jpg,CAM_1,2021-05-27 22:56:12,2021-05-27,22:56:12,Full Moon
2186,TF_ACULEI_10934_DSCF0115.jpg,CAM_4,2022-04-09 03:05:42,2022-04-09,03:05:42,First Quarter
4188,TF_ACULEI_6959_DSCF7325.jpg,CAM_1,2021-06-28 03:49:41,2021-06-28,03:49:41,Waning Gibbous
12645,TF_ACULEI_13130_DSCF0125.jpg,,2023-01-02 17:09:11,2023-01-02,17:09:11,Waxing Gibbous
14527,TF_ACULEI_8333_DSCF0400.jpg,CAM_1,2021-07-24 23:21:03,2021-07-24,23:21:03,Full Moon
7383,TF_ACULEI_3192_DSCF3241.jpg,CAM_1,2021-06-10 22:28:32,2021-06-10,22:28:32,New Moon
3996,TF_ACULEI_10307_IMAG0105.jpg,CAM_3,2022-01-07 21:51:17,2022-01-07,21:51:17,Waxing Crescent
8792,TF_ACULEI_13981_DSCF0748.jpg,,2023-01-23 20:43:51,2023-01-23,20:43:51,Waxing Crescent
4215,TF_ACULEI_2167_DSCF2172.jpg,CAM_1,2021-06-08 00:32:33,2021-06-08,00:32:33,Waning Crescent
14131,TF_ACULEI_14864_DSCF0027.jpg,,2023-02-25 01:09:41,2023-02-25,01:09:41,Waxing Crescent


In [33]:
# save the dataframe to a csv 
csv_path = '../aculei.csv'
df.to_csv(csv_path)

We have to clean up the camera column a little bit because there are unwanted classes

In [34]:
dd = pd.read_csv('../aculei.csv', index_col=0)


In [35]:
dd['camera'].value_counts()

CAM_1                                                                            9076
CAM_4                                                                            1047
CAM_3                                                                             995
CAM_5                                                                             671
CAM_6                                                                             201
CAM_7                                                                             197
CAM_2                                                                             188
[2022, 'CAM_6', 'Laghetto']                                                        46
['11/22', 'CAM_7']                                                                  1
[2023, 'analogue shutterino', 'backstage', 'banco', 'bosco', 'laghetto gigi']       1
Name: camera, dtype: int64

In [36]:
dd['camera'] = dd['camera'].apply(lambda x: 'CAM_6' if x == "[2022, 'CAM_6', 'Laghetto']" 
                                  else x)
dd['camera'] = dd['camera'].apply(lambda x: None if x == "[2023, 'analogue shutterino', 'backstage', 'banco', 'bosco', 'laghetto gigi']" 
                                  else x)
dd['camera'] = dd['camera'].apply(lambda x: 'CAM_7' if x == "['11/22', 'CAM_7']" 
                                  else x)

In [37]:
dd['camera'].value_counts()

CAM_1    9076
CAM_4    1047
CAM_3     995
CAM_5     671
CAM_6     247
CAM_7     198
CAM_2     188
Name: camera, dtype: int64

In [38]:
# save the dataframe to a csv 
csv_path = '../aculei.csv'
dd.to_csv(csv_path)