### OCR

In [26]:
import pandas as pd
import numpy as np
from datetime import datetime

import os

import pytesseract
import PIL.Image
if not hasattr(PIL.Image, 'Resampling'):
    PIL.Image.Resampling = PIL.Image
from PIL import Image
from tqdm import tqdm
import re

In [27]:
# image directory: each row's `image_name` is joined onto this path to locate the file
image_dir = '../images/aculei-images/'

In [28]:
# reading the dataframe; the first CSV column is used as the row index
df = pd.read_csv('../datasets/metadata.csv', index_col=0)

We want to use OCR techniques to fill in the data that were not available through the image metadata

The missing data are:

- some **cameras**
- some **date(time)**
- all **temperatures**

So we have to filter the dataframe and try to fill in the missing data for the camera column, and then we'll do the same thing for the temperature column (which is not present yet)


In [29]:
df.head()

Unnamed: 0,image_name,camera,date_time,date,time,moon
0,TF_ACULEI_8040_DSCF0129.jpg,CAM_1,2021-07-22 23:04:07,2021-07-22,23:04:07,Full Moon
1,TF_ACULEI_900_DSCF0756.jpg,CAM_1,2021-05-27 22:01:44,2021-05-27,22:01:44,Full Moon
2,TF_ACULEI_15294_DSCF0133.jpg,,2023-03-30 20:35:58,2023-03-30,20:35:58,First Quarter
3,TF_ACULEI_11374_DSCF0064.jpg,CAM_6,2022-06-09 21:29:59,2022-06-09,21:29:59,Waxing Gibbous
4,TF_ACULEI_4106_DSCF4336.jpg,CAM_1,2021-06-16 23:41:09,2021-06-16,23:41:09,First Quarter


In [30]:
df.shape

(16874, 6)

In [31]:
df.isnull().sum()

image_name       0
camera        4452
date_time      103
date           103
time           103
moon           103
dtype: int64

From the output above we can see we are missing

- 4452 cameras
- 103 dates
- 16874 temperatures

We'll split the jobs to keep it simple

### Cameras

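We'll work on a copy of the whole dataset rather than only the rows with an empty camera (4452 rows): every image has to go through OCR anyway to read its temperature, and the camera is extracted only for the rows where it is missing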

In [32]:
# work on a copy so the original `df` is left unchanged by the OCR step
df_ocr = df.copy()

In [33]:
df_ocr.shape

(16874, 6)

In [34]:
df_ocr.sample(5)

Unnamed: 0,image_name,camera,date_time,date,time,moon
16723,TF_ACULEI_4695_DSCF4959.jpg,CAM_1,2021-06-18 03:32:23,2021-06-18,03:32:23,First Quarter
10826,TF_ACULEI_9563_DSCF0378.jpg,CAM_1,2021-10-22 02:45:10,2021-10-22,02:45:10,Full Moon
3022,TF_ACULEI_13878_DSCF0646.jpg,,2023-01-22 04:22:01,2023-01-22,04:22:01,New Moon
10026,TF_ACULEI_7350_DSCF7696.jpg,CAM_1,2021-07-03 02:53:55,2021-07-03,02:53:55,Last Quarter
6928,TF_ACULEI_690_DSCF0544.jpg,CAM_1,2021-05-25 23:58:40,2021-05-25,23:58:40,Full Moon


Using [pytesseract](https://pypi.org/project/pytesseract/) we can extract text from the images. We decided to crop the images to achieve better results.
The output string from pytesseract is not tokenized or anything so we used RegExp to extract relevant information from the whole string


We had to handle several date formats because the date is written differently from image to image

In [35]:
def parse_date(date_string):
    """Parse a date written year-first or day-first into a ``datetime``.

    Accepted layouts are ``YYYY/MM/DD`` and ``DD/MM/YYYY``. ``-`` is accepted
    as a separator as well as ``/``, because the date regex used on the OCR
    text matches both.

    Parameters
    ----------
    date_string : str
        Date text to parse.

    Returns
    -------
    datetime or None
        The parsed date (time set to midnight), or ``None`` when the input is
        not a string or matches none of the supported layouts.
    """
    if not isinstance(date_string, str):
        return None

    # Normalise the separator so one pair of formats covers both styles.
    normalised = date_string.strip().replace("-", "/")

    for date_format in ("%Y/%m/%d", "%d/%m/%Y"):
        try:
            return datetime.strptime(normalised, date_format)
        except ValueError:
            # Not this layout; try the next one.
            continue
    return None

In [36]:
# Patterns used to pull fields out of the raw OCR text.
cam_pattern = re.compile(r'\bCA\S*')
date_pattern = re.compile(r'\b\d{4}[-/]\d{2}[-/]\d{2}\b|\b\d{2}[-/]\d{2}[-/]\d{4}\b')
time_pattern = re.compile(r'\b\d{1,2}:\d{2}:\d{2}\b')
# Optional leading minus so sub-zero readings keep their sign.
celsius_pattern = re.compile(r'-?\b\d+\s*[°]\s*C\b')

# OCR results keyed by row index label. They are written back into df_ocr
# after the loop, so the frame is never modified while it is being iterated
# and each value lands on the row of the image it was read from.
# NOTE(review): assumes the index labels of df_ocr are unique - confirm.
ocr_records = {}

for index, row in tqdm(df_ocr.iterrows(), total=len(df_ocr)):
    image_name = row['image_name']
    image_path = os.path.join(image_dir, image_name)
    # Skip hidden entries and directories. The hidden check looks at the file
    # name: image_path itself always starts with "." because image_dir is a
    # relative "../" path.
    if image_name.startswith(".") or os.path.isdir(image_path):
        continue

    # The context manager closes the file handle once OCR is done.
    with Image.open(image_path) as image:
        width, height = image.size
        # Keep only the bottom 1/18 of the frame; OCR is run on that strip.
        strip_top = int(height * 17 / 18)
        info_strip = image.crop((0, strip_top, width, height))
        text = pytesseract.image_to_string(info_strip, config='--psm 3')

    # Only values that were actually found are recorded; anything else stays
    # missing in df_ocr.
    #   temperature: always (16874 rows)
    #   date/time:   only where date_time is missing (103 rows)
    #   camera:      only where camera is missing (4452 rows)
    record = {}

    # saving the camera conditionally
    if pd.isnull(row["camera"]):
        cam_match = cam_pattern.search(text)
        if cam_match:
            record["camera"] = cam_match.group(0)

    # saving the datetime conditionally
    if pd.isnull(row["date_time"]):
        date_match = date_pattern.search(text)
        time_match = time_pattern.search(text)

        # Reset per row so a failed parse can neither raise NameError nor
        # reuse the date of a previous image.
        formatted_date = None
        if date_match:
            # The regex accepts "-" as well as "/"; normalise to "/" before
            # parsing so both separators are handled.
            parsed_date = parse_date(date_match.group(0).replace("-", "/"))
            if parsed_date is not None:
                formatted_date = parsed_date.strftime("%Y-%m-%d")
                record["date"] = formatted_date

        if time_match:
            record["time"] = time_match.group(0)

        if formatted_date and time_match:
            record["date_time"] = formatted_date + " " + time_match.group(0)

    # saving the temp always
    celsius_match = celsius_pattern.search(text)
    if celsius_match:
        record["temp"] = celsius_match.group(0)

    ocr_records[index] = record

# Write the OCR values back onto the rows they belong to. Records hold
# camera/date/time only for rows where those were missing, so existing
# metadata is never overwritten.
if "temp" not in df_ocr.columns:
    df_ocr["temp"] = None
for index, record in ocr_records.items():
    for column, value in record.items():
        df_ocr.at[index, column] = value

 24%|██▎       | 3994/16874 [26:42<1:26:07,  2.49it/s]


KeyboardInterrupt: 

Once the process is finished, let's save the dataset and process it in another notebook in order to manage the results safely

In [None]:
# Save the OCR-enriched frame; the index is written as the first CSV column,
# the same layout metadata.csv is read from with index_col=0.
csv_path = '../datasets/ocr.csv'
df_ocr.to_csv(csv_path)