In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import json
import webdataset as wds
import dbm
import dbm.ndbm
import shelve
import os
import os.path
import glob
import random

In [3]:
def load_meta(fname="train.json"):
    """Load an annotation JSON file and index its image records by file name.

    The file is expected to hold the top-level keys "categories", "images"
    and "annotations". Every image record gains an "annotations" list that
    collects the annotation records whose "image_id" matches the image's
    "id"; every annotation record gains a "category_name" entry resolved
    from its "category_id".

    Args:
        fname: Path of the JSON file to read.

    Returns:
        A dict mapping each image's "file_name" to its (augmented) record.

    Raises:
        KeyError: If an annotation refers to an unknown category or image id.
    """
    with open(fname) as stream:
        coco = json.load(stream)

    # category id -> human-readable category name
    name_of_category = {}
    for category in coco["categories"]:
        name_of_category[category["id"]] = category["name"]

    # file name -> image record (a later duplicate file name replaces an earlier one)
    by_file_name = {}
    for record in coco["images"]:
        by_file_name[record["file_name"]] = record

    # image id -> the same record objects, each starting with no annotations
    by_image_id = {}
    for record in by_file_name.values():
        record["annotations"] = []
        by_image_id[record["id"]] = record

    # Attach every annotation to its image, in file order.
    for annotation in coco["annotations"]:
        annotation["category_name"] = name_of_category[annotation["category_id"]]
        by_image_id[annotation["image_id"]]["annotations"].append(annotation)

    return by_file_name

# Index the training annotations by image file name ("train.json" is load_meta's default).
meta = load_meta("train.json")

In [5]:
# Show one record (the first in insertion order) to check the structure of the metadata.
print([*meta.values()][0])

{'file_name': 'PMC3866684_00003.jpg', 'height': 811, 'id': 0, 'width': 613, 'annotations': [{'segmentation': [[52.38, 444.87, 291.97, 444.87, 291.97, 456.42, 291.97, 456.42, 291.97, 465.8, 291.97, 465.8, 291.97, 476.27, 291.97, 476.27, 291.97, 487.8, 291.97, 487.8, 291.97, 498.26, 86.41, 498.26, 86.41, 508.73, 40.42, 508.73, 40.42, 497.17, 40.42, 497.17, 40.42, 486.71, 40.42, 486.71, 40.42, 476.27, 40.42, 476.27, 40.42, 466.88, 40.42, 466.88, 40.42, 455.33, 52.38, 455.33, 52.38, 444.87]], 'area': 13787.433082525036, 'iscrowd': 0, 'image_id': 0, 'bbox': [40.42, 444.87, 251.55, 63.86], 'category_id': 1, 'id': 0, 'category_name': 'text'}, {'segmentation': [[309.91, 444.91, 561.46, 444.91, 561.46, 456.42, 345.43, 456.42, 345.43, 466.88, 309.91, 466.88, 309.91, 456.42, 309.91, 456.42, 309.91, 444.91]], 'area': 3266.252633666969, 'iscrowd': 0, 'image_id': 0, 'bbox': [309.91, 444.91, 251.55, 21.97], 'category_id': 1, 'id': 1, 'category_name': 'text'}, {'segmentation': [[52.38, 507.66, 291.97,

In [6]:
# Pack the training images together with their annotation records into
# WebDataset tar shards named publaynet-train-NNNNNN.tar.
sink = wds.ShardWriter("publaynet-train-%06d.tar", maxsize=3e8)
try:
    images = glob.glob("train/*")
    # Shuffle the file list before writing. No seed is set, so the assignment
    # of images to shards differs from run to run.
    random.shuffle(images)
    for path in images:
        base = os.path.basename(path)
        root = os.path.splitext(base)[0]
        # The raw file bytes are passed under the "png" key.
        # NOTE(review): the metadata file names end in ".jpg" -- confirm that
        # storing them under a "png" key is intended.
        with open(path, "rb") as stream:
            png = stream.read()
        # Deliberately bound to `info` rather than `json`: assigning to `json`
        # here would rebind the module name in the notebook namespace and make
        # every later load_meta() call fail on json.load.
        info = meta[base]
        sample = dict(__key__=root, png=png, json=info)
        sink.write(sample)
finally:
    # Close the writer even if a file or metadata lookup fails part-way
    # through, so the shard currently being written is not left open.
    sink.close()

# writing publaynet-train-000000.tar 0 0.0 GB 0
# writing publaynet-train-000001.tar 985 0.3 GB 985
# writing publaynet-train-000002.tar 988 0.3 GB 1973
# writing publaynet-train-000003.tar 979 0.3 GB 2952
# writing publaynet-train-000004.tar 995 0.3 GB 3947
# writing publaynet-train-000005.tar 976 0.3 GB 4923
# writing publaynet-train-000006.tar 1001 0.3 GB 5924
# writing publaynet-train-000007.tar 989 0.3 GB 6913
# writing publaynet-train-000008.tar 977 0.3 GB 7890
# writing publaynet-train-000009.tar 967 0.3 GB 8857
# writing publaynet-train-000010.tar 988 0.3 GB 9845
# writing publaynet-train-000011.tar 1003 0.3 GB 10848
# writing publaynet-train-000012.tar 985 0.3 GB 11833
# writing publaynet-train-000013.tar 973 0.3 GB 12806
# writing publaynet-train-000014.tar 981 0.3 GB 13787
# writing publaynet-train-000015.tar 981 0.3 GB 14768
# writing publaynet-train-000016.tar 989 0.3 GB 15757
# writing publaynet-train-000017.tar 989 0.3 GB 16746
# writing publaynet-train-000018.tar 1000 0

In [9]:
# Defensive re-import: guarantees that `json` refers to the standard-library
# module (which load_meta needs) even if the name was rebound to some other
# object earlier in the session.
import json

# Replace the global metadata index with the validation annotations.
meta = load_meta("val.json")

# Pack the validation images together with their annotation records into
# WebDataset tar shards named publaynet-val-NNNNNN.tar.
sink = wds.ShardWriter("publaynet-val-%06d.tar", maxsize=3e8)
try:
    images = glob.glob("val/*")
    # Unseeded shuffle: shard contents differ from run to run.
    random.shuffle(images)
    for path in images:
        base = os.path.basename(path)
        root = os.path.splitext(base)[0]
        # The raw file bytes are passed under the "png" key.
        with open(path, "rb") as stream:
            png = stream.read()
        info = meta[base]
        sample = dict(__key__=root, png=png, json=info)
        sink.write(sample)
finally:
    # Close the writer even if a file or metadata lookup fails part-way
    # through, so the shard currently being written is not left open.
    sink.close()

# writing publaynet-val-000000.tar 0 0.0 GB 0
# writing publaynet-val-000001.tar 967 0.3 GB 967
# writing publaynet-val-000002.tar 966 0.3 GB 1933
# writing publaynet-val-000003.tar 973 0.3 GB 2906
# writing publaynet-val-000004.tar 974 0.3 GB 3880
# writing publaynet-val-000005.tar 966 0.3 GB 4846
# writing publaynet-val-000006.tar 971 0.3 GB 5817
# writing publaynet-val-000007.tar 970 0.3 GB 6787
# writing publaynet-val-000008.tar 960 0.3 GB 7747
# writing publaynet-val-000009.tar 964 0.3 GB 8711
# writing publaynet-val-000010.tar 972 0.3 GB 9683
# writing publaynet-val-000011.tar 953 0.3 GB 10636


In [10]:
# Pack the test images into WebDataset tar shards named
# publaynet-test-NNNNNN.tar. No annotation record is attached to these
# samples; each one carries only its key and the image bytes.
sink = wds.ShardWriter("publaynet-test-%06d.tar", maxsize=3e8)
try:
    images = glob.glob("test/*")
    # Unseeded shuffle: shard contents differ from run to run.
    random.shuffle(images)
    for path in images:
        base = os.path.basename(path)
        root = os.path.splitext(base)[0]
        # The raw file bytes are passed under the "png" key.
        with open(path, "rb") as stream:
            png = stream.read()
        sample = dict(__key__=root, png=png)
        sink.write(sample)
finally:
    # Close the writer even if reading a file fails part-way through, so the
    # shard currently being written is not left open.
    sink.close()

# writing publaynet-test-000000.tar 0 0.0 GB 0
# writing publaynet-test-000001.tar 1014 0.3 GB 1014
# writing publaynet-test-000002.tar 1013 0.3 GB 2027
# writing publaynet-test-000003.tar 1009 0.3 GB 3036
# writing publaynet-test-000004.tar 1008 0.3 GB 4044
# writing publaynet-test-000005.tar 1023 0.3 GB 5067
# writing publaynet-test-000006.tar 1006 0.3 GB 6073
# writing publaynet-test-000007.tar 1010 0.3 GB 7083
# writing publaynet-test-000008.tar 1008 0.3 GB 8091
# writing publaynet-test-000009.tar 992 0.3 GB 9083
# writing publaynet-test-000010.tar 1000 0.3 GB 10083
# writing publaynet-test-000011.tar 1006 0.3 GB 11089
