### OCR

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np

import os

import pytesseract
import PIL.Image
if not hasattr(PIL.Image, 'Resampling'):
    PIL.Image.Resampling = PIL.Image

from PIL import Image

import cv2
import easyocr

from tqdm import tqdm

import matplotlib.pyplot as plt

import re

In [2]:
# image directory: each row's `image_name` is joined to this path to locate the file passed to OCR
image_dir = '../images/aculei-images/'

In [3]:
# reading the dataframe (the first CSV column is used as the row index)
df = pd.read_csv('../aculei.csv', index_col=0)

We want to fill the missing data which were not available through metadata using ocr techniques.

The missing data are:

- some **cameras**
- all **temperatures**

So we have to filter the dataframe and try to fill in the missing data for the camera column; then we'll do the same for the temperature column (which does not exist yet).


In [4]:
# preview the first rows to see which columns are available
df.head()

Unnamed: 0,image_name,camera,date_time,date,time,moon
0,TF_ACULEI_8040_DSCF0129.jpg,CAM_1,2021-07-22 23:04:07,2021-07-22,23:04:07,Full Moon
1,TF_ACULEI_900_DSCF0756.jpg,CAM_1,2021-05-27 22:01:44,2021-05-27,22:01:44,Full Moon
2,TF_ACULEI_15294_DSCF0133.jpg,,2023-03-30 20:35:58,2023-03-30,20:35:58,First Quarter
3,TF_ACULEI_11374_DSCF0064.jpg,CAM_6,2022-06-09 21:29:59,2022-06-09,21:29:59,Waxing Gibbous
4,TF_ACULEI_4106_DSCF4336.jpg,CAM_1,2021-06-16 23:41:09,2021-06-16,23:41:09,First Quarter


In [5]:
# (number of rows, number of columns)
df.shape

(16874, 6)

In [6]:
# number of missing values per column
df.isnull().sum()

image_name       0
camera        4452
date_time      103
date           103
time           103
moon           103
dtype: int64

From the output above we can see we are missing

- 4452 cameras
- 103 dates
- 16874 temperatures

We'll split the jobs to keep it simple

### Cameras

We'll filter the dataset keeping only the empty camera rows (we expect 4452 rows)

In [7]:
# Keep only the rows whose camera is missing.
# The .copy() is applied to the filtered result so that df_camera is an independent
# frame: later assignments to its 'camera' column neither touch `df` nor raise
# SettingWithCopyWarning.
df_camera = df[df['camera'].isnull()].copy()

In [8]:
# sanity check: the row count should equal the number of missing cameras
df_camera.shape

(4452, 6)

In [9]:
# show five random rows (no random_state is passed, so the selection differs between runs)
df_camera.sample(5)

Unnamed: 0,image_name,camera,date_time,date,time,moon
15061,TF_ACULEI_14470_DSCF0124.jpg,,2023-02-02 18:13:30,2023-02-02,18:13:30,Waxing Gibbous
3819,TF_ACULEI_14541_IMAG0179.jpg,,2023-02-04 22:26:36,2023-02-04,22:26:36,Full Moon
15942,TF_ACULEI_13967_DSCF0217.jpg,,,,,
9285,TF_ACULEI_94_DSCF0204.jpg,,2021-01-22 02:23:11,2021-01-22,02:23:11,First Quarter
726,TF_ACULEI_16291_DSCF0434.jpg,,2023-07-06 05:35:02,2023-07-06,05:35:02,Waning Gibbous


Using [pytesseract](https://pypi.org/project/pytesseract/) we can extract text from the images. We decided to crop the images to achieve better results.
pytesseract returns a single raw, unstructured string, so we use a regular expression to extract the relevant information from it.


In [10]:
# Any whitespace-delimited token starting with "CA" is taken as the camera label.
cam_pattern = re.compile(r'\bCA\S*')


def ocr_camera_label(image_path):
    """Return the first 'CA...' token read by OCR from the bottom strip of an image.

    Only the bottom 1/18 of the image is passed to pytesseract.
    Returns None when no matching token is found.
    """
    # the context manager closes the underlying file once OCR is done
    with Image.open(image_path) as image:
        width, height = image.size
        # top edge of the strip: everything below 17/18 of the height is kept
        strip_top = int(height * 17 / 18)
        cropped_image = image.crop((0, strip_top, width, height))
        text = pytesseract.image_to_string(cropped_image, config='--psm 3')

    first_match = cam_pattern.search(text)
    return first_match.group(0) if first_match else None


# Results are collected in row order and assigned to the column afterwards:
# the `row` yielded by iterrows() is not guaranteed to write back to the dataframe.
recognized_cameras = []
for image_name in tqdm(df_camera['image_name'], total=len(df_camera)):
    image_path = os.path.join(image_dir, image_name)
    # skip directories (the extra startswith(".") test was always true for the relative image_dir)
    if os.path.isdir(image_path):
        recognized_cameras.append(None)
        continue

    recognized_cameras.append(ocr_camera_label(image_path))

# saving results
df_camera = df_camera.assign(camera=recognized_cameras)


  4%|▍         | 196/4452 [01:07<24:36,  2.88it/s]

In [None]:
# inspect the first rows after the OCR pass
df_camera.head(5)

It seems that the OCR recognized a lot of cameras, but some of them are not well formatted (e.g. CAME), so we have to understand what they stand for and replace them with the correct camera if possible. Otherwise we can simply drop them.

In [None]:
# frequency of each label returned by OCR
df_camera['camera'].value_counts()

In [None]:
# missing values per column after the OCR pass
df_camera.isnull().sum()

Only 537 cameras are still missing after OCR, but we decided to also drop the recognized values that are not well formatted.

In [None]:
%%capture

# drop the camera from ocr not well formatted
valid_cameras = [f'CAM{i}' for i in range(1, 8)]
df_camera['camera'] = df_camera['camera'].apply(lambda x: x if x in valid_cameras else None)

In [None]:
# missing values per column after dropping the badly formatted labels
df_camera.isnull().sum()

In [None]:
# frequency of each remaining (valid) camera label
df_camera['camera'].value_counts()

We want to format the camera recognized with the ocr (from CAM7 to CAM_7)

In [None]:
# Normalise OCR labels to the format already used in the metadata: 'CAM7' -> 'CAM_7'.
# Values that do not match (missing values, or labels already written as 'CAM_7')
# are left untouched, so re-running this cell is harmless.
ocr_label = re.compile(r'^CAM(\d+)$')
df_camera = df_camera.assign(
    camera=df_camera['camera'].map(
        lambda cam: ocr_label.sub(r'CAM_\1', cam) if isinstance(cam, str) else cam
    )
)

So, after using OCR for camera recognition, only **2395** cameras are still missing.

Now let's check some results

In [None]:
image_folder_path = '../images/aculei-images/'

# Pick up to six random rows whose camera was recognized by OCR.
# No random_state is passed, so the selection changes on every run
# (the former `random.seed()` call returned None and had the same effect).
valid_rows = df_camera.dropna(subset=['camera'])
six_random_rows = valid_rows.sample(n=min(6, len(valid_rows)))

fig, axes = plt.subplots(2, 3, figsize=(16, 9))
fig.suptitle('OCR results', fontsize=15)

# hide every panel's axes up front so unused panels stay blank when fewer than six rows exist
for ax in axes.flat:
    ax.axis('off')

for ax, (_, sample_row) in zip(axes.flat, six_random_rows.iterrows()):
    image_path = os.path.join(image_folder_path, sample_row['image_name'])
    # the context manager closes the file once the pixels are handed to matplotlib
    with Image.open(image_path) as image:
        ax.imshow(image, aspect='auto')

    camera_info = sample_row['camera']
    ax.set_title(f"Camera recognized by ocr: {camera_info}", fontsize=10)

plt.tight_layout()
plt.show()

Now we can merge the results into the original dataframe and save it! We didn't compute the hash, so we use the image name as the merge key for now.

In [None]:
# checking if image name is a safe key: it is only safe when every value is unique,
# i.e. the number of distinct names equals the number of rows
print(df['image_name'].nunique())
print(df.shape[0])

In [None]:
# Left-join the OCR results onto the original frame by image name, fill the gaps in
# 'camera' with the OCR value ('camera_camera'), then discard the helper column.
merged_camera = (
    df.merge(
        df_camera[['image_name', 'camera']],
        on='image_name',
        how='left',
        suffixes=('', '_camera'),
    )
    .assign(camera=lambda merged: merged['camera'].combine_first(merged['camera_camera']))
    .drop(columns='camera_camera')
)


In [None]:
# missing values per column after merging the OCR results
merged_camera.isnull().sum()

In [None]:
# save it!
# NOTE: this writes to the same '../aculei.csv' path that was loaded at the top of the notebook
merged_camera.to_csv('../aculei.csv')