In [71]:
from sklearn.model_selection import train_test_split
import pandas as pd
from pycocotools.coco import COCO
from glob import glob
import shutil
import os
from tqdm import tqdm
import json

In [72]:
src_path = "/Users/martellweeks/Project/dataset/National Highways/2D Pavement Labels/A14 EB-WB J47A (Woolpit) to Haugley Bridge Lane 1/"
image_files = glob(os.path.join(src_path, "img/*.jpg"))
print(len(image_files))

image_metadata = pd.read_csv(os.path.join(src_path, "reference.csv"))
image_names = image_metadata[image_metadata.file_name.str.contains("Lane1")].file_name.tolist()
print(len(image_names))
print(image_names[0])

assert(len(image_files) == len(image_names))


2342
2342
A14EBWBJ47AWoolpittoHaugleyBridge_sideview_000008_000544_Lane1.jpg


In [73]:
# Train : Val : Test = 7 : 2 : 1
train_names, test_names = train_test_split(image_names, test_size=0.3, random_state=777, shuffle=True)
val_names, test_names = train_test_split(test_names, test_size=0.3, random_state=777, shuffle=True)

print(len(train_names), len(val_names), len(test_names), sum([len(train_names), len(val_names), len(test_names)]))

1639 492 211 2342


In [74]:
def batch_copy_files(file_name: list, src_path: str, dest_path: str):
    for file in tqdm(file_name):
        shutil.copy(os.path.join(src_path, file), dest_path)

In [75]:
coco_file_path = os.path.join(src_path, "annotation/instances_default.json")
coco_annotation = COCO(annotation_file=coco_file_path)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


In [76]:
cat_ids = coco_annotation.getCatIds()
cats = coco_annotation.loadCats(cat_ids)
print(cats)

[{'id': 1, 'name': 'bleeding', 'supercategory': ''}, {'id': 2, 'name': 'raveling', 'supercategory': ''}, {'id': 3, 'name': 'crack_transverse', 'supercategory': ''}, {'id': 4, 'name': 'crack_longitudinal', 'supercategory': ''}, {'id': 5, 'name': 'crack_edge', 'supercategory': ''}, {'id': 6, 'name': 'crack_alligator', 'supercategory': ''}, {'id': 7, 'name': 'crack_block', 'supercategory': ''}, {'id': 8, 'name': 'shoving', 'supercategory': ''}, {'id': 9, 'name': 'rutting', 'supercategory': ''}, {'id': 10, 'name': 'potholes', 'supercategory': ''}, {'id': 11, 'name': 'patch', 'supercategory': ''}, {'id': 12, 'name': 'unknown', 'supercategory': ''}, {'id': 13, 'name': 'crack_corner', 'supercategory': ''}, {'id': 14, 'name': 'spalling', 'supercategory': ''}]


In [77]:
coco_json_file = open(coco_file_path)
coco_json = json.load(coco_json_file)

print(coco_json.keys())

dict_keys(['licenses', 'info', 'categories', 'images', 'annotations'])


In [78]:
print(type(coco_json["images"]), coco_json["images"][0])
print(type(coco_json["annotations"]), coco_json["annotations"][0])

<class 'list'> {'id': 1, 'width': 1232, 'height': 1028, 'file_name': 'A14EBWBJ47AWoolpittoHaugleyBridge_sideview_000008_000544_Lane1.jpg', 'license': 0, 'flickr_url': '', 'coco_url': '', 'date_captured': 0}
<class 'list'> {'id': 1, 'image_id': 5, 'category_id': 4, 'segmentation': [[663.0, 9.31, 732.46, 957.05, 771.05, 963.22, 692.33, 12.4]], 'area': 31932.0, 'bbox': [663.0, 9.31, 108.05, 953.91], 'iscrowd': 0, 'attributes': {'occluded': False}}


In [79]:
def coco_split(file_name: list, coco_json: dict):
    coco_output = dict()
    coco_output["licenses"] = coco_json["licenses"]
    coco_output["info"] = coco_json["info"]
    coco_output["categories"] = coco_json["categories"]
    coco_output["images"] = list()
    coco_output["annotations"] = list()
    ids = list()
    for image in coco_json["images"]:
        if image["file_name"] in file_name:
            coco_output["images"].append(image)
            ids.append(image["id"])
    for ann in coco_json["annotations"]:
        if ann["image_id"] in ids:
            coco_output["annotations"].append(ann)
    return coco_output

In [80]:
coco_train = coco_split(file_name=train_names, coco_json=coco_json)
coco_val = coco_split(file_name=val_names, coco_json=coco_json)
coco_test = coco_split(file_name=test_names, coco_json=coco_json)

print(coco_train["licenses"], coco_train["info"], coco_train["categories"])
assert len(train_names) == len(coco_train["images"])

[{'name': '', 'id': 0, 'url': ''}] {'contributor': '', 'date_created': '', 'description': '', 'url': '', 'version': '', 'year': ''} [{'id': 1, 'name': 'bleeding', 'supercategory': ''}, {'id': 2, 'name': 'raveling', 'supercategory': ''}, {'id': 3, 'name': 'crack_transverse', 'supercategory': ''}, {'id': 4, 'name': 'crack_longitudinal', 'supercategory': ''}, {'id': 5, 'name': 'crack_edge', 'supercategory': ''}, {'id': 6, 'name': 'crack_alligator', 'supercategory': ''}, {'id': 7, 'name': 'crack_block', 'supercategory': ''}, {'id': 8, 'name': 'shoving', 'supercategory': ''}, {'id': 9, 'name': 'rutting', 'supercategory': ''}, {'id': 10, 'name': 'potholes', 'supercategory': ''}, {'id': 11, 'name': 'patch', 'supercategory': ''}, {'id': 12, 'name': 'unknown', 'supercategory': ''}, {'id': 13, 'name': 'crack_corner', 'supercategory': ''}, {'id': 14, 'name': 'spalling', 'supercategory': ''}]


In [81]:
with open("../data/train/train.json", "w") as outfile:
    json.dump(coco_train, outfile)

with open("../data/val/val.json", "w") as outfile:
    json.dump(coco_val, outfile)

with open("../data/test/test.json", "w") as outfile:
    json.dump(coco_test, outfile)

In [83]:
batch_copy_files(file_name=train_names, src_path=os.path.join(src_path, "img/"), dest_path="../data/train/")
batch_copy_files(file_name=val_names, src_path=os.path.join(src_path, "img/"), dest_path="../data/val/")
batch_copy_files(file_name=test_names, src_path=os.path.join(src_path, "img/"), dest_path="../data/test/")

100%|██████████| 1639/1639 [00:02<00:00, 665.58it/s]
100%|██████████| 492/492 [00:00<00:00, 644.01it/s]
100%|██████████| 211/211 [00:00<00:00, 779.42it/s]
