## `genalog.text` module
This module is responsible for:
1. Text alignment
1. NER label propagation using text alignment results

In [1]:
from genalog.text import ner_label
from genalog.text import preprocess

# Ground-truth sentence and a noisy OCR rendering of the same sentence
gt_txt = "New York is big"
ocr_txt = "New Yo rkis big"

# Inputs to the label-propagation method: one NER label per ground-truth token
gt_labels = ["B-P", "I-P", "O", "O"]
# Tokenize both strings into lists of tokens (ground truth first, then OCR)
gt_tokens, ocr_tokens = map(preprocess.tokenize, (gt_txt, ocr_txt))

In [2]:
# Show the three inputs, one per line: labels, ground-truth tokens, OCR tokens
print(gt_labels, gt_tokens, ocr_tokens, sep="\n")

['B-P', 'I-P', 'O', 'O']
['New', 'York', 'is', 'big']
['New', 'Yo', 'rkis', 'big']


In [5]:
# Method returns a tuple of 4 elements (ocr_labels, aligned_gt, aligned_ocr, gap_char)
# NOTE(review): gap_char is presumably the placeholder inserted into the aligned strings — confirm
ocr_labels, aligned_gt, aligned_ocr, gap_char = ner_label.propagate_label_to_ocr(gt_labels, gt_tokens, ocr_tokens)

In [6]:
# Outputs
print(f"OCR labels:           {ocr_labels}")
print(f"Aligned ground truth: {aligned_gt}")
print(f"Alinged OCR text:     {aligned_ocr}")

OCR labels:           ['B-P', 'I-P', 'I-P', 'O']
Aligned ground truth: New Yo@rk is big
Alinged OCR text:     New Yo rk@is big


In [9]:
# Render labels, tokens, and the aligned strings as a single text block
print(
    ner_label.format_label_propagation(
        gt_tokens, gt_labels,
        ocr_tokens, ocr_labels,
        aligned_gt, aligned_ocr,
    )
)

B-P I-P  O  O   
New York is big 
New Yo@rk is big
||||||.||.||||||
New Yo rk@is big
New Yo  rkis big 
B-P I-P I-P  O   



In [12]:
# Same display with show_alignment=False, which leaves out the alignment rows
print(
    ner_label.format_label_propagation(
        gt_tokens, gt_labels,
        ocr_tokens, ocr_labels,
        aligned_gt, aligned_ocr,
        show_alignment=False,
    )
)

B-P I-P  O  O   
New York is big 
New Yo  rkis big 
B-P I-P I-P  O   



In [14]:
# Print the OCR tokens with their propagated labels
print(
    ner_label.format_labels(ocr_tokens, ocr_labels)
)

B-P I-P I-P  O   
New Yo  rkis big 

