### Extract metadata from images 

The first step is to extract as much data as we can from images metadata in order to have a dataset to start from. 

In [1]:
import os

import numpy as np
import pandas as pd

import exiftool
from tqdm import tqdm

import imagehash
from PIL import Image

In [2]:
image_folder_path = '../images/aculei-images/'
image_names = np.array([f for f in os.listdir(image_folder_path) if f.lower().endswith('.jpg')])

Using [exiftool wrapper](https://pypi.org/project/PyExifTool/) is possible to extract tons of metadata from the images. Unfortunately some images do not come with metadata we are looking for so in the next episode we'll introduce OCR to read the metadata directly from the image. 

In [3]:
# defining tags we are interested in
tags_for_cam = ["Keywords", "Subject", "WeightedFlatSubject"]
tags_for_datetime = ["CreateDate", "DateCreated"]

tags = tags_for_cam + tags_for_datetime

df = pd.DataFrame()
for image in tqdm(image_names, desc="Extracting metadata"):
    file_path = os.path.join(image_folder_path, image)
    image_hash = imagehash.average_hash(Image.open(file_path))
    
    with exiftool.ExifToolHelper() as et:
        try:
            metadata = {}
            
            # caluclating the hash to have a unique identifier
            metadata['hash'] = image_hash
            metadata['image_name'] = image
            for d in et.get_tags([file_path], tags=tags):
                for k, v in d.items():
                    metadata[k] = v
            df = pd.concat([df, pd.DataFrame.from_records([metadata])], ignore_index=True)
        except Exception as e:
            print(e)

Extracting metadata:   0%|          | 0/16874 [00:00<?, ?it/s]

Extracting metadata: 100%|██████████| 16874/16874 [46:23<00:00,  6.06it/s]


In [4]:
df.shape

(16874, 10)

In [5]:
df.isnull().sum()

hash                          0
image_name                    0
SourceFile                    0
IPTC:Keywords              4451
XMP:Subject                4451
XMP:WeightedFlatSubject    4451
EXIF:CreateDate             103
XMP:CreateDate              103
IPTC:DateCreated            103
XMP:DateCreated             103
dtype: int64

In [6]:
# clean some columns
df.drop(columns=['SourceFile', 'XMP:WeightedFlatSubject', 'IPTC:Keywords', 'EXIF:CreateDate', 'IPTC:DateCreated', 'XMP:DateCreated'], 
        inplace=True)

In [7]:
df['date_time'] = pd.to_datetime(df['XMP:CreateDate'], format='%Y:%m:%d %H:%M:%S')

# split into 'date' and 'time' columns
df['date'] = df['date_time'].dt.date
df['time'] = df['date_time'].dt.time

# Drop the original 'DateTime' column has it is no longer needed
df.drop(columns=['XMP:CreateDate'], inplace=True)

In [8]:
# rename some columns
df.rename(columns={"XMP:Subject": "camera"}, inplace=True)

In [9]:
df.head(10)

Unnamed: 0,hash,image_name,camera,date_time,date,time
0,00100018fdffffff,TF_ACULEI_8040_DSCF0129.jpg,CAM_1,2021-07-22 23:04:07,2021-07-22,23:04:07
1,000000b8fcfcffff,TF_ACULEI_900_DSCF0756.jpg,CAM_1,2021-05-27 22:01:44,2021-05-27,22:01:44
2,0000001000787f7f,TF_ACULEI_15294_DSCF0133.jpg,,2023-03-30 20:35:58,2023-03-30,20:35:58
3,0000001e1e3cfe7f,TF_ACULEI_11374_DSCF0064.jpg,CAM_6,2022-06-09 21:29:59,2022-06-09,21:29:59
4,00080818fcfeffff,TF_ACULEI_4106_DSCF4336.jpg,CAM_1,2021-06-16 23:41:09,2021-06-16,23:41:09
5,000800b8fcfdfffd,TF_ACULEI_1601_DSCF1490.jpg,CAM_1,2021-06-02 00:39:46,2021-06-02,00:39:46
6,00100c3cfcfdffff,TF_ACULEI_8326_DSCF0393.jpg,CAM_1,2021-07-24 23:20:01,2021-07-24,23:20:01
7,ff0f000000000fff,TF_ACULEI_16157_DSCF0165.jpg,,2023-06-20 07:33:59,2023-06-20,07:33:59
8,e6e6f4faf2808000,TF_ACULEI_16077_DSCF0744.jpg,,2023-06-09 04:27:57,2023-06-09,04:27:57
9,fc78080038fcfc78,TF_ACULEI_2994_DSCF3044.jpg,CAM_1,2021-06-10 21:55:12,2021-06-10,21:55:12


We can extract an important feature just from the datetime: the moon phase! it could be a relevant feature to discover some data patterns! 

In [10]:
from moon_phase import phase

# extract moon phase
df['date_time'] = pd.to_datetime(df['date_time'])
df['moon'] = df['date_time'].apply(lambda x: phase(str(x)) if not pd.isnull(x) else None)

In [11]:
df.sample(10)

Unnamed: 0,hash,image_name,camera,date_time,date,time,moon
4419,e0180206367c7e1c,TF_ACULEI_9333_DSCF0141.jpg,CAM_1,2021-09-30 04:32:42,2021-09-30,04:32:42,Last Quarter
4097,00080038fcfcfffe,TF_ACULEI_7220_DSCF7566.jpg,CAM_1,2021-07-03 02:24:25,2021-07-03,02:24:25,Last Quarter
12639,00080038fcfefefe,TF_ACULEI_7159_DSCF7505.jpg,CAM_1,2021-07-03 02:00:47,2021-07-03,02:00:47,Last Quarter
14087,0000001038383e7f,TF_ACULEI_9161_IMAG0042.jpg,CAM_3,2021-09-09 21:40:54,2021-09-09,21:40:54,Waxing Crescent
4030,00081098fcfcfebe,TF_ACULEI_2414_DSCF2424.jpg,CAM_1,2021-06-08 01:16:52,2021-06-08,01:16:52,Waning Crescent
1309,00080030f8fcfffc,TF_ACULEI_848_DSCF0704.jpg,CAM_1,2021-05-27 21:50:52,2021-05-27,21:50:52,Full Moon
3159,000808387cfeffff,TF_ACULEI_4900_DSCF5177.jpg,CAM_1,2021-06-20 00:25:41,2021-06-20,00:25:41,Waxing Gibbous
2863,00000038fcfeffff,TF_ACULEI_7434_DSCF7772.jpg,CAM_1,2021-07-06 05:17:48,2021-07-06,05:17:48,Waning Crescent
13761,000808387cfcfefe,TF_ACULEI_2053_DSCF2058.jpg,CAM_1,2021-06-08 00:08:25,2021-06-08,00:08:25,Waning Crescent
2387,001000387cfefeff,TF_ACULEI_8623_DSCF0704.jpg,CAM_1,2021-07-28 04:14:27,2021-07-28,04:14:27,Waning Gibbous


In [12]:
# save the dataframe to a csv 
csv_path = '../datasets/metadata.csv'
df.to_csv(csv_path)

We have to clean up the camera column a little bit because there are unwanted classes

In [13]:
dd = pd.read_csv(csv_path, index_col=0)


In [14]:
dd['camera'].value_counts()

CAM_1                                                                            9076
CAM_4                                                                            1047
CAM_3                                                                             995
CAM_5                                                                             671
CAM_6                                                                             201
CAM_7                                                                             197
CAM_2                                                                             188
[2022, 'CAM_6', 'Laghetto']                                                        46
['11/22', 'CAM_7']                                                                  1
[2023, 'analogue shutterino', 'backstage', 'banco', 'bosco', 'laghetto gigi']       1
Name: camera, dtype: int64

In [15]:
dd['camera'] = dd['camera'].apply(lambda x: 'CAM_6' if x == "[2022, 'CAM_6', 'Laghetto']" 
                                  else x)
dd['camera'] = dd['camera'].apply(lambda x: None if x == "[2023, 'analogue shutterino', 'backstage', 'banco', 'bosco', 'laghetto gigi']" 
                                  else x)
dd['camera'] = dd['camera'].apply(lambda x: 'CAM_7' if x == "['11/22', 'CAM_7']" 
                                  else x)

In [16]:
dd['camera'].value_counts()

CAM_1    9076
CAM_4    1047
CAM_3     995
CAM_5     671
CAM_6     247
CAM_7     198
CAM_2     188
Name: camera, dtype: int64

In [17]:
# save the dataframe to a csv 
dd.to_csv(csv_path)