# Import packages

In [7]:
import os
import re
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import pandas as pd

# Core functions

In [2]:
def footageExtractDateTime(doctrModel, imageFilePath):
    tokens = extractTokens(doctrModel, imageFilePath)
    return extractDateTimeToken(tokens)

def extractTokens(ocrModel, imageFilePath):
    # Function to extract the tokens (strings) from the image
    doc = DocumentFile.from_images(imageFilePath)
    result = ocrModel(doc).export()
    
    tokens = []
    
    for page in result["pages"]:
        for block in page["blocks"]:
            for line in block["lines"]:
                tokens.append(" ".join([w["value"] for w in line["words"]]))
                    
    return tokens

def extractDateTimeToken(tokens):
    # Function to extract the datetime from the list of tokens
    for token in tokens:
        DATETIME_TOKEN_REGEX_TEST = r"([0-9]{2}-[0-9]{2}-[0-9]{4})|([0-9]{2}:[0-9]{2}:[0-9]{2})"
        isDateTimeToken = re.search(DATETIME_TOKEN_REGEX_TEST, token)
        if (isDateTimeToken):
            return token
    raise Exception("No datetime parsed in the image.")

## Initialization

In [3]:
## Initialize model to be used for OCR
doctrModel = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True)

# Function usage demo

In [4]:
print(footageExtractDateTime(doctrModel, "testdata/D27.jpg"))

12-01-2025 Mon 18:27:27


## Evaluation

In [33]:
test_data_indices_df = pd.read_csv("testdata/datetime_footage_testdata_indices.csv")
correctExtractionCount = 0
print(f"Extracted\t\t\tActual\t\t\t\tCorrect?")
for _, testdata_row in test_data_indices_df.iterrows():
    ## Check per row if the datetime string is extracted correctly
    extractedStr = footageExtractDateTime(doctrModel, f"testdata/{testdata_row["File"]}")
    isCorrectlyExtracted = extractedStr == testdata_row["Actual datetime value"]

    ## Log the results accordingly
    if (extractedStr == testdata_row["Actual datetime value"]):
        correctExtractionCount += 1
    print(f"{extractedStr}\t\t{testdata_row["Actual datetime value"]}\t\t{"CORRECT" if isCorrectlyExtracted else "INCORRECT"}")

print(f"{correctExtractionCount} out of {len(test_data_indices_df)} is correctly extracted ({(correctExtractionCount/len(test_data_indices_df))*100}%)")

Extracted			Actual				Correct?
12-01-2025 Mon 15:27:15		12-01-2025 Mon 15:27:15		CORRECT
12-01-2025 Mon 15:17:13 Y		12-01-2025 Mon 15:17:13		INCORRECT
12-01-2025 Mon 16:07:19		12-01-2025 Mon 16:07:19		CORRECT
12-01-2025 Mon 18:27:27		12-01-2025 Mon 18:27:27		CORRECT
12-02-2025 Tue 12:48:22		12-02-2025 Tue 12:48:22		CORRECT
12-05-2025 T1120812:03		12-05-2025 Fri 20:42:03		INCORRECT
12-05-2025 Fri 11:02:00		12-05-2025 Fri 11:02:00		CORRECT
12-01-2025 Mon 15:17:14		12-01-2025 Mon 15:17:14		CORRECT
12-01-2025 Monk 10 :07:21		12-01-2025 Mon 10:07:21		INCORRECT
12-05-2025 Fri 11:01:48		12-05-2025 Fri 11:01:48		CORRECT
12-01-2025 Mon 15:17:12		12-01-2025 Mon 15:17:12		CORRECT
8 out of 11 is correctly extracted (72.72727272727273%)
