# Extract Raw Lines from UW3 Data

This doesn't work well because the "1ground.txt" data is somewhat inconsistent. 

Use the hOCR data instead, which was derived from the DAFS output.

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import glob
import PIL
import webdataset as wds
import os
import os.path
import io
from ocrlib import extract_seg
from random import randrange

In [3]:
for i, sample in enumerate(wds.Dataset("uw3/uw3-original.tar").decode()):
    break
sample.keys()

dict_keys(['__key__', '1cond.txt', '1ground.txt', '1pageattr.txt', '1pageboxs.txt', '1zoneattr.txt', '1zoneboxs.txt', 'dafs', 'dafs.txt', 'image.tif', 'labzones.txt', 'linebox.txt', 'wordbox.txt', 'zonebox.txt'])

In [47]:
def parse_lboxes(sample):
    r = dict(lboxes=[])
    result = {}
    for l in sample["linebox.txt"].strip().split("\n")[1:] + ["LLLLL"]:
        #print("#", l)
        if l=="LLLLL":
            assert r["ZONE_ID"] not in result
            result[r["ZONE_ID"]] = r
            r = dict(lboxes=[])
            continue
        k, v = [s.strip() for s in l.split("=", 1)]
        if k == "TEXT_LINE_BOX":
            r["lboxes"].append(tuple(int(x) for x in v.split()))
        else:
            r[k] = v
    return result
parse_lboxes(sample)["002"]

{'lboxes': [(738, 591, 2201, 645),
  (738, 639, 2203, 696),
  (738, 689, 2204, 742),
  (737, 740, 2207, 791),
  (737, 789, 2208, 840),
  (737, 839, 2206, 891),
  (738, 888, 2210, 932),
  (738, 937, 2208, 994),
  (739, 988, 2207, 1044),
  (738, 1037, 2205, 1084),
  (738, 1088, 2207, 1132),
  (738, 1136, 2209, 1182),
  (738, 1186, 2208, 1230),
  (738, 1237, 2206, 1280),
  (738, 1285, 1092, 1323)],
 'DOCUMENT_ID': 'A04P',
 'ZONE_ID': '002'}

In [21]:
def parse_gt(sample):
    r = {}
    result = {}
    lines = sample["1ground.txt"].strip().split("\n") + ["GGGGG"]
    while len(lines) > 0:
        assert lines[0] == "GGGGG"
        if len(lines) == 1:
            break
        doc = lines[1]
        zone = lines[2]
        gt = []
        del lines[:3]
        while lines[0] != "GGGGG":
            gt.append(lines[0])
            del lines[0]
        assert len(gt) > 0
        if not gt[0].startswith("non-text:"):
            result[zone] = dict(DOCUMENT_ID=doc, ZONE_ID=zone, gt=gt)
        doc = None
        zone = None
        gt = None
    return result
parse_gt(sample)["002"]

{'DOCUMENT_ID': 'A002',
 'ZONE_ID': '002',
 'gt': ['Table 1. Description of the basal friction coefficients for 3 model types. For type I models the friction coefficient \\mu depends on the spa-',
  'tial variable, for type II models \\mu is velocity dependent, whereas for type III models the friction coefficient is both, position and velocity',
  'dependent']}

In [54]:
def page_text(sample):
    gt = []
    lines = sample["1ground.txt"].strip().split("\n") + ["GGGGG"]
    while len(lines) > 0:
        assert lines[0] == "GGGGG"
        if len(lines) == 1:
            break
        del lines[:3]
        while lines[0] != "GGGGG":
            gt.append(lines[0])
            del lines[0]
    return "\n".join(gt) + "\n"
print(page_text(sample))

Continuum Mech. Thermodyn. 1 (1989) 283-303
Continuum Mechanics
and
Thermodynamics
\copyright Springer-Verlag 1989
A mathematical model for the hysteresis
in shape memory alloys
Yongzhong Huo
The Preisach Model for ferromagnets is generalized and adapted for the
description of the hysteretic behaviour of a polycrystalline specimen of shape-
memory alloys. The thermodynamical properties of the individual crystallites
are described by the Landau-Devonshire free energy which contains four
parameters. The corresponding quadruplets of parameters of a polycrystalline
body fill a region in a four-dimensional Preisach space. A thermodynamical
loading path will sweep surfaces across this region and change phases in the
process. The physical problem of the response of a specimen to applied loads
is thus converted into the geometrical problem of counting volumes between
moving surfaces. This conversion facilitates the numerical evaluation of the
effect of complicated loading paths.
Load-deformati

In [63]:
def lines_and_gt(sample):
    gt = parse_gt(sample)
    bb = parse_lboxes(sample)
    result = []
    for zone in gt.keys():
        if zone not in bb:
            print(f"{zone} zone mismatch", file=sys.stderr)
            continue
        assert zone in bb, (zone, bb.keys())
        assert gt[zone]["DOCUMENT_ID"] == bb[zone]["DOCUMENT_ID"]
        assert gt[zone]["ZONE_ID"] == bb[zone]["ZONE_ID"]
        textlines = gt[zone]["gt"]
        boxes = bb[zone]["lboxes"]
        if len(textlines) != len(boxes):
            print(f"{zone} gt mismatch {len(textlines)} != {len(boxes)}", file=sys.stderr)
            continue
        for txt, bbox in zip(textlines, boxes):
            r = dict(
                doc=gt[zone]["DOCUMENT_ID"],
                zone=gt[zone]["ZONE_ID"],
                bbox=bbox,
                txt=txt
            )
            result.append(r)
    return result

In [64]:
sink = wds.TarWriter("uw3/uw3-gtseg.tar")
for i, sample in enumerate(wds.Dataset("uw3/uw3-original.tar").decode()):
    segmentation = lines_and_gt(sample)
    pagetext = page_text(sample)
    image = np.array(PIL.Image.open(io.BytesIO(sample["image.tif"])))
    sink.write({
        "__key__": sample["__key__"],
        "image.png": image,
        "gtseg.json": segmentation,
        "page.txt": pagetext
    })
sink.close()

007 gt mismatch 3 != 4
008 gt mismatch 2 != 1
001 gt mismatch 1 != 0
002 gt mismatch 7 != 6
00C gt mismatch 5 != 4
00L gt mismatch 2 != 1
002 gt mismatch 14 != 15
00B gt mismatch 2 != 1
003 gt mismatch 1 != 0
000 gt mismatch 17 != 16
004 gt mismatch 21 != 16
005 gt mismatch 64 != 50
006 gt mismatch 9 != 8
000 gt mismatch 21 != 16
002 gt mismatch 11 != 8
007 gt mismatch 2 != 1
003 gt mismatch 18 != 17
007 gt mismatch 1 != 0
000 gt mismatch 9 != 8
009 gt mismatch 1 != 0
00A gt mismatch 5 != 4
005 gt mismatch 1 != 0
00J gt mismatch 4 != 3
000 gt mismatch 1 != 0
00B zone mismatch
007 gt mismatch 6 != 5
008 gt mismatch 3 != 6
009 gt mismatch 1 != 3
00A gt mismatch 3 != 1
00B gt mismatch 5 != 3
005 gt mismatch 1 != 0
007 gt mismatch 3 != 2
00G gt mismatch 7 != 8
00H gt mismatch 7 != 6
00A gt mismatch 17 != 18
00K gt mismatch 22 != 24
00A gt mismatch 4 != 3
00E gt mismatch 2 != 1
005 gt mismatch 5 != 6
006 gt mismatch 7 != 6
00D gt mismatch 1 != 2
00E gt mismatch 2 != 1
002 gt mismatch 3 != 2

KeyError: '1ground.txt'

In [61]:
print(sample["1ground.txt"])

GGGGG
D03C
000
non-text: ruling
GGGGG
D03C
001
Towards a Capacity-Design Assessment Procedure for Reinforced Concrete Frames
GGGGG
D03C
002
427
GGGGG
D03C
003
non-text: ruling
GGGGG
D03C
004
Fig. 5, which is similar to an approach suggested for bridge columns by the Applied
Technology Council [20] and others [21]. Three situations are identified in Fig. 5. When
the shear corresponding to flexural strength V\_{f1} exceeds V\_{u} from Eq. (14a), a brittle shear
failure is expected. The strength is V\_{u} , and the ductility is \mu = 1. When the shear
associated with flexural strength is V\_{ud} \leq V\_{f2} \leq V\_{u} , then the strength is V\_{f2} , with ductility
given by
GGGGG
D03C
005
non-text: math
GGGGG
D03C
006
When the shear corresponding to flexural strength is V\_{f3} \leq V\_{ud} , then the strength is V\_{f3}
with full ductility. That is, \mu = 6 providing "good" detailing exists.
GGGGG
D03C
007
non-text: drawing
GGGGG
D03C
008
Figure 5 Relationship between shear strength an

In [62]:
print(sample["linebox.txt"])

LLLLL
DOCUMENT_ID   = D03C
ZONE_ID       = 001
TEXT_LINE_BOX = 447 155 1810 195
LLLLL
DOCUMENT_ID   = D03C
ZONE_ID       = 002
TEXT_LINE_BOX = 1965 157 2018 182
LLLLL
DOCUMENT_ID   = D03C
ZONE_ID       = 004
TEXT_LINE_BOX = 464 283 2003 329
TEXT_LINE_BOX = 464 324 2002 367
TEXT_LINE_BOX = 462 367 2004 412
TEXT_LINE_BOX = 464 411 2004 452
TEXT_LINE_BOX = 466 451 2003 495
TEXT_LINE_BOX = 467 498 598 543
LLLLL
DOCUMENT_ID   = D03C
ZONE_ID       = 006
TEXT_LINE_BOX = 468 734 2004 777
TEXT_LINE_BOX = 469 776 1609 819
LLLLL
DOCUMENT_ID   = D03C
ZONE_ID       = 008
TEXT_LINE_BOX = 626 1581 1841 1623
LLLLL
DOCUMENT_ID   = D03C
ZONE_ID       = 009
TEXT_LINE_BOX = 542 1665 2007 1709
TEXT_LINE_BOX = 477 1705 2008 1751
TEXT_LINE_BOX = 477 1751 1996 1794
LLLLL
DOCUMENT_ID   = D03C
ZONE_ID       = 00A
TEXT_LINE_BOX = 545 1837 2008 1883
TEXT_LINE_BOX = 479 1879 2008 1925
TEXT_LINE_BOX = 480 1921 2006 1968
TEXT_LINE_BOX = 479 1962 2008 2011
TEXT_LINE_BOX = 479 2011 2007 2053
TEXT_LINE_BOX = 478 2044 2