In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from pycocotools.coco import COCO
from glob import glob
import shutil
import os
from tqdm import tqdm
import json

In [2]:
src_path = "../data/A12"
image_files = glob(os.path.join(src_path, "*.jpg"))
print(len(image_files))

image_metadata = pd.read_csv(os.path.join(src_path, "reference.csv"))
image_names = image_metadata[:].file_name.tolist()
print(len(image_names))
print(image_names[0])

assert(len(image_files) == len(image_names))


11681
11681
A12Area6SwallowsCrossMountnessingHuttonBrentwoodLondon_sideview_000000_000126_Lane1


In [3]:
# Train : Val : Test = 7 : 2 : 1
train_names, test_names = train_test_split(image_names, test_size=0.3, random_state=777, shuffle=True)
val_names, test_names = train_test_split(test_names, test_size=0.3, random_state=777, shuffle=True)

print(len(train_names), len(val_names), len(test_names), sum([len(train_names), len(val_names), len(test_names)]))

8176 2453 1052 11681


In [4]:
def batch_copy_files(file_name: list, src_path: str, dest_path: str):
    for file in tqdm(file_name):
        shutil.copy(os.path.join(src_path, file), dest_path)

In [5]:
coco_file_path = "../data/annotations/A12/raw.json"
coco_annotation = COCO(annotation_file=coco_file_path)

loading annotations into memory...
Done (t=0.09s)
creating index...
index created!


In [6]:
cat_ids = coco_annotation.getCatIds()
cats = coco_annotation.loadCats(cat_ids)
print(cats)

[{'id': 1, 'name': 'bleeding', 'supercategory': ''}, {'id': 2, 'name': 'raveling', 'supercategory': ''}, {'id': 3, 'name': 'crack_transverse', 'supercategory': ''}, {'id': 4, 'name': 'crack_longitudinal', 'supercategory': ''}, {'id': 5, 'name': 'crack_edge', 'supercategory': ''}, {'id': 6, 'name': 'crack_alligator', 'supercategory': ''}, {'id': 7, 'name': 'crack_block', 'supercategory': ''}, {'id': 8, 'name': 'shoving', 'supercategory': ''}, {'id': 9, 'name': 'rutting', 'supercategory': ''}, {'id': 10, 'name': 'potholes', 'supercategory': ''}, {'id': 11, 'name': 'patch', 'supercategory': ''}, {'id': 12, 'name': 'unknown', 'supercategory': ''}, {'id': 13, 'name': 'crack_corner', 'supercategory': ''}, {'id': 14, 'name': 'spalling', 'supercategory': ''}]


In [7]:
coco_json_file = open(coco_file_path)
coco_json = json.load(coco_json_file)

print(coco_json.keys())

dict_keys(['licenses', 'info', 'categories', 'images', 'annotations'])


In [8]:
print(type(coco_json["images"]), coco_json["images"][0])
print(type(coco_json["annotations"]), coco_json["annotations"][0])

<class 'list'> {'id': 1, 'width': 2464, 'height': 2056, 'file_name': 'A12Area6SwallowsCrossMountnessingHuttonBrentwoodLondon_sideview_000025_000416_Lane2.jpg', 'license': 0, 'flickr_url': '', 'coco_url': '', 'date_captured': 0}
<class 'list'> {'id': 1, 'image_id': 16, 'category_id': 4, 'segmentation': [[811.5, 2056.0, 816.5, 1828.4, 811.5, 1637.9, 799.1, 1447.4, 786.76, 1318.7, 863.46, 1274.17, 937.68, 1185.1, 920.37, 1118.3, 940.16, 1046.55, 1011.91, 1053.97, 984.69, 1123.24, 984.69, 1202.42, 880.78, 1316.23, 848.6, 1380.6, 870.9, 1603.2, 880.8, 1786.3, 861.0, 2056.0]], 'area': 61222.0, 'bbox': [786.76, 1046.55, 225.15, 1009.45], 'iscrowd': 0, 'attributes': {'occluded': False}}


In [9]:
def coco_split(file_name: list, coco_json: dict):
    coco_output = dict()
    coco_output["licenses"] = coco_json["licenses"]
    coco_output["info"] = coco_json["info"]
    coco_output["categories"] = coco_json["categories"]
    coco_output["images"] = list()
    coco_output["annotations"] = list()
    ids = list()
    for image in coco_json["images"]:
        if image["file_name"].split('.', -1)[0] in file_name:
            coco_output["images"].append(image)
            ids.append(image["id"])
    for ann in coco_json["annotations"]:
        if ann["image_id"] in ids:
            coco_output["annotations"].append(ann)
    return coco_output

In [10]:
coco_train = coco_split(file_name=train_names, coco_json=coco_json)
coco_val = coco_split(file_name=val_names, coco_json=coco_json)
coco_test = coco_split(file_name=test_names, coco_json=coco_json)

print(coco_train["licenses"], coco_train["info"], coco_train["categories"])
print(len(coco_train["images"]), len(coco_val["images"]), len(coco_test["images"]))
assert len(train_names) == len(coco_train["images"])

[{'name': '', 'id': 0, 'url': ''}] {'contributor': '', 'date_created': '', 'description': '', 'url': '', 'version': '', 'year': ''} [{'id': 1, 'name': 'bleeding', 'supercategory': ''}, {'id': 2, 'name': 'raveling', 'supercategory': ''}, {'id': 3, 'name': 'crack_transverse', 'supercategory': ''}, {'id': 4, 'name': 'crack_longitudinal', 'supercategory': ''}, {'id': 5, 'name': 'crack_edge', 'supercategory': ''}, {'id': 6, 'name': 'crack_alligator', 'supercategory': ''}, {'id': 7, 'name': 'crack_block', 'supercategory': ''}, {'id': 8, 'name': 'shoving', 'supercategory': ''}, {'id': 9, 'name': 'rutting', 'supercategory': ''}, {'id': 10, 'name': 'potholes', 'supercategory': ''}, {'id': 11, 'name': 'patch', 'supercategory': ''}, {'id': 12, 'name': 'unknown', 'supercategory': ''}, {'id': 13, 'name': 'crack_corner', 'supercategory': ''}, {'id': 14, 'name': 'spalling', 'supercategory': ''}]
8175 2453 1052


AssertionError: 

In [12]:
with open("/home/cituser/Desktop/sp973/pavement-defect-detection/data/annotations/A12/vanilla/train.json", "w") as outfile:
    json.dump(coco_train, outfile)

with open("/home/cituser/Desktop/sp973/pavement-defect-detection/data/annotations/A12/vanilla/val.json", "w") as outfile:
    json.dump(coco_val, outfile)

with open("/home/cituser/Desktop/sp973/pavement-defect-detection/data/annotations/A12/vanilla/test.json", "w") as outfile:
    json.dump(coco_test, outfile)

In [83]:
batch_copy_files(file_name=train_names, src_path=os.path.join(src_path, "img/"), dest_path="../data/train/")
batch_copy_files(file_name=val_names, src_path=os.path.join(src_path, "img/"), dest_path="../data/val/")
batch_copy_files(file_name=test_names, src_path=os.path.join(src_path, "img/"), dest_path="../data/test/")

100%|██████████| 1639/1639 [00:02<00:00, 665.58it/s]
100%|██████████| 492/492 [00:00<00:00, 644.01it/s]
100%|██████████| 211/211 [00:00<00:00, 779.42it/s]
