In [None]:
import os
import json

import cv2 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

%pylab inline
%config Completer.use_jedi = False

In [None]:
# Seed for the stochastic steps of the notebook (the train/val split).
SEED = 42

# Padding added on every side of an object's bounding box, as a fraction of
# the box width / height.
BBOX_MARGIN = .2
# Name of the sub-folder that receives the train/val mapping files.
TIME_STAMP = '2021_05_02'

# Dataset root. The default is the original location; set the ITASCA_PATH
# environment variable to run the notebook against a copy stored elsewhere.
ITASCA_PATH = os.environ.get('ITASCA_PATH', '/home/ubuntu/percepto/data/ItascaClassification')

# Folder with the per-image annotation exports
# ('<image>___pixel.json' and '<image>___fuse.png').
IN_PROGRESS_PATH = os.path.join(ITASCA_PATH, 'in_progress')

IMAGE_PATH = os.path.join(ITASCA_PATH, 'dataset', 'images')
RAW_IMAGES_PATH = os.path.join(IMAGE_PATH, 'raw')
CROPPED_PATH = os.path.join(IMAGE_PATH, 'cropped')

ANNOTATIONS_PATH = os.path.join(ITASCA_PATH, 'dataset', 'annotations')
MAPPING_PATH = os.path.join(ITASCA_PATH, 'dataset', 'mapping', TIME_STAMP)

# Class name -> integer class id written to the annotation files.
MAPPING_DICT = {'ignore': 0,
                None: 2000,
                'semantic_change': 2001,
                'air-leak': 2002,
                'ground-leak': 2003,
                'fire': 2004,
                'gnome': 2005,
                'leak': 2006}

In [None]:
# ! mkdir {os.path.join(ITASCA_PATH, 'dataset', 'appendix')}

In [None]:

# Save the class-name -> id mapping as JSON under dataset/appendix.
# The folder is created on demand so the cell also works on a fresh dataset tree.
appendix_path = os.path.join(ITASCA_PATH, 'dataset', 'appendix')
os.makedirs(appendix_path, exist_ok=True)

with open(os.path.join(appendix_path, 'class_mapping.json'), 'w') as fp:
    # NOTE: JSON keys are strings, so the None key is stored as "null".
    json.dump(MAPPING_DICT, fp)



In [None]:
# Sanity check: first five entries of the raw-images folder, in sorted order.
sorted(os.listdir(RAW_IMAGES_PATH))[:5]

In [None]:
# Sanity check: first ten entries of the in-progress annotation folder, in sorted order.
sorted(os.listdir(IN_PROGRESS_PATH))[:10]

In [None]:
# Number of entries in the raw-images folder whose name ends with 'jpg'.
sum(name.endswith('jpg') for name in os.listdir(RAW_IMAGES_PATH))

In [None]:
# Count the annotated images whose first instance is not 'semantic_change'
# and display one (image | mask) example for every class encountered.
i = 0

class_set = set()

for file in os.listdir(RAW_IMAGES_PATH):

    image_file = os.path.join(RAW_IMAGES_PATH, file)
    json_file = os.path.join(IN_PROGRESS_PATH, f'{file}___pixel.json')
    mask_file = os.path.join(IN_PROGRESS_PATH, f'{file}___fuse.png')

    with open(json_file) as jf:
        data = json.load(jf)

    instances = data['instances']
    # Only the first instance is consulted for the class of an image.
    if not len(instances) or instances[0]['className'] == 'semantic_change':
        continue

    i += 1
    class_name = instances[0]['className']

    # Classes that already have an example on screen are only counted.
    if class_name in class_set:
        continue
    class_set.add(class_name)

    image = cv2.imread(image_file)
    mask = cv2.imread(mask_file)

    plt.figure(figsize=(20, 10))
    # cv2 loads images as BGR; reverse the channel axis for matplotlib.
    side_by_side = np.hstack([image[..., ::-1], mask])
    plt.imshow(side_by_side)
    plt.title([i, instances[0]['className']])

print(i)

In [None]:
# Display every image whose first annotated instance is 'ground-leak'
# (image | mask side by side) and print how many there are.
i = 0
for file in os.listdir(RAW_IMAGES_PATH):

    image_file = os.path.join(RAW_IMAGES_PATH, file)
    json_file = os.path.join(IN_PROGRESS_PATH, f'{file}___pixel.json')
    mask_file = os.path.join(IN_PROGRESS_PATH, f'{file}___fuse.png')

    with open(json_file) as jf:
        data = json.load(jf)

    instances = data['instances']
    if not len(instances) or instances[0]['className'] != 'ground-leak':
        continue

    i += 1
    class_name = 'ground-leak'

    image = cv2.imread(image_file)
    mask = cv2.imread(mask_file)

    plt.figure(figsize=(20, 10))
    # cv2 loads images as BGR; reverse the channel axis for matplotlib.
    side_by_side = np.hstack([image[..., ::-1], mask])
    plt.imshow(side_by_side)
    # file[:-4] drops the last four characters of the file name (the extension).
    plt.title([i, instances[0]['className'], file[:-4]])

print(i)

In [None]:
# Display every image whose first annotated instance is 'air-leak'
# (image | mask side by side) and print how many there are.
i = 0
for file in os.listdir(RAW_IMAGES_PATH):

    image_path = os.path.join(RAW_IMAGES_PATH, file)
    json_file = os.path.join(IN_PROGRESS_PATH, f'{file}___pixel.json')
    mask_path = os.path.join(IN_PROGRESS_PATH, f'{file}___fuse.png')

    with open(json_file) as jf:
        data = json.load(jf)

    instances = data['instances']
    if not len(instances) or instances[0]['className'] != 'air-leak':
        continue

    i += 1
    class_name = 'air-leak'

    image = cv2.imread(image_path)
    mask = cv2.imread(mask_path)

    plt.figure(figsize=(20, 10))
    # cv2 loads images as BGR; reverse the channel axis for matplotlib.
    side_by_side = np.hstack([image[..., ::-1], mask])
    plt.imshow(side_by_side)
    # file[:-4] drops the last four characters of the file name (the extension).
    plt.title([i, instances[0]['className'], file[:-4]])

print(i)

In [None]:
# Preview of the crop geometry on the first 10 usable images: the tight
# bounding box of the largest mask contour (green) and the same box grown by
# BBOX_MARGIN on every side (red), shown next to the mask.
i = 0

for file in os.listdir(RAW_IMAGES_PATH):

    image_path = os.path.join(RAW_IMAGES_PATH, file)
    json_file = os.path.join(IN_PROGRESS_PATH, f'{file}___pixel.json')
    mask_file = os.path.join(IN_PROGRESS_PATH, f'{file}___fuse.png')

    with open(json_file) as jf:
        data = json.load(jf)

    if len(data['instances']) and (data['instances'][0]['className'] != 'semantic_change'):

        # Keep only the part after the last '-' ('air-leak' -> 'leak').
        class_name = data['instances'][0]['className'].split('-')[-1]

        image = cv2.imread(image_path)
        mask = cv2.imread(mask_file)
        maskgray = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
        # THRESH_TRUNC (the literal 2 in earlier revisions) clips values above
        # 127 and leaves the rest untouched, so every non-zero mask pixel stays
        # non-zero for findContours.
        _, thresh = cv2.threshold(maskgray, 127, 255, cv2.THRESH_TRUNC)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Without this guard an empty mask would reuse the contour of the
        # previously processed image (or fail with NameError on the first one).
        if not len(contours):
            print(f'skipping {file}: no contour found in {mask_file}')
            continue

        i += 1

        # Largest external contour; the first one wins on ties.
        cnt = max(contours, key=cv2.contourArea)

        x, y, w, h = cv2.boundingRect(cnt)
        dw = int(BBOX_MARGIN*w)
        dh = int(BBOX_MARGIN*h)
        # Colours are BGR: green = tight box, red = box with margin.
        cv2.rectangle(image,(x,y),(x+w,y+h),(0,255,0),5)
        cv2.rectangle(image,(x-dw,y-dh),(x+w+dw,y+h+dh),(0,0,255),5)

        plt.figure(figsize=(20, 15))
        # Reverse the channel axis (BGR -> RGB) for matplotlib.
        plt.imshow(np.hstack([image[...,::-1], mask]))

        plt.title([i, class_name, len(contours)])

        if i == 10:
            break


In [None]:
def crop_out_of_bounds(img, bbox):
    """Crop ``bbox`` out of ``img``, zero-filling whatever falls outside the image.

    Args:
        img: image array of shape ``(H, W)`` or ``(H, W, C)``.
        bbox: ``(x, y, dx, dy)`` - left/top corner of the crop (may be negative
            or beyond the image) followed by its width and height in pixels.

    Returns:
        A new array of shape ``(dy, dx)`` plus the trailing (channel) axes of
        ``img``, with the dtype of ``img``. Pixels covered by the image are
        copied; everything else is 0. A box that does not touch the image at
        all yields an all-zero array.
    """
    h, w = img.shape[:2]
    x, y, dx, dy = bbox

    # Canvas of the requested size; follows the channel layout and dtype of the input.
    padded_crop = np.zeros((dy, dx) + tuple(img.shape[2:]), dtype=img.dtype)

    # Intersection of the box with the image, in image coordinates.
    top, bottom = max(0, y), min(h, y + dy)
    left, right = max(0, x), min(w, x + dx)

    # Paste only when the intersection is non-empty; shifting by (x, y)
    # converts image coordinates into canvas coordinates.
    if top < bottom and left < right:
        padded_crop[top - y:bottom - y, left - x:right - x] = img[top:bottom, left:right]

    return padded_crop

In [None]:
# Build the cropped dataset: for every annotated image (first instance not
# 'semantic_change') crop the largest mask contour plus a BBOX_MARGIN border,
# save the crop and write a one-line annotation file describing the object box
# inside the crop.
files_list = []
mapping_list = []
classes_list = []
bbox_list = []

i = 0

crop_ext = f'_cropped_{BBOX_MARGIN}.jpg'

# Make sure the output folders exist before anything is written into them.
os.makedirs(CROPPED_PATH, exist_ok=True)
os.makedirs(ANNOTATIONS_PATH, exist_ok=True)

# Sorted so that the order of the collected rows (and therefore the seeded
# train/val split built from them) does not depend on the directory listing order.
for file in sorted(os.listdir(RAW_IMAGES_PATH)):

    image_path = os.path.join(RAW_IMAGES_PATH, file)
    json_file = os.path.join(IN_PROGRESS_PATH, f'{file}___pixel.json')
    mask_file = os.path.join(IN_PROGRESS_PATH, f'{file}___fuse.png')

    with open(json_file) as jf:
        data = json.load(jf)

    if len(data['instances']) and (data['instances'][0]['className'] != 'semantic_change'):

        # Keep only the part after the last '-' ('air-leak' -> 'leak').
        class_name = data['instances'][0]['className'].split('-')[-1]

        image = cv2.imread(image_path)
        mask = cv2.imread(mask_file)
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY)
        # THRESH_TRUNC (the literal 2 in earlier revisions) clips values above
        # 127 and leaves the rest untouched, so every non-zero mask pixel stays
        # non-zero for findContours.
        _, thresh = cv2.threshold(mask, 127, 255, cv2.THRESH_TRUNC)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        # Without this guard an empty mask would silently reuse the contour of
        # the previously processed image (or fail with NameError on the first one).
        if not len(contours):
            print(f'skipping {file}: no contour found in {mask_file}')
            continue

        i += 1

        # Largest external contour; the first one wins on ties.
        cnt = max(contours, key=cv2.contourArea)

        x, y, w, h = cv2.boundingRect(cnt)

        dw = int(BBOX_MARGIN*w)
        dh = int(BBOX_MARGIN*h)

        cropped_img = crop_out_of_bounds(image, (x-dw, y-dh, w+2*dw, h+2*dh))

        # Derive the output names from the stem so that the result does not
        # depend on where (or whether) the text 'jpg' occurs in the file name.
        stem = os.path.splitext(file)[0]
        cropped_file = stem + crop_ext
        annotation_file = os.path.splitext(cropped_file)[0] + '.txt'

        cropped_path = os.path.join(CROPPED_PATH, cropped_file)
        # cv2.imwrite signals failure through its return value only.
        if not cv2.imwrite(cropped_path, cropped_img):
            raise IOError(f'could not write {cropped_path}')

        # Object box in crop coordinates: the crop starts (dw, dh) before the object.
        top, left, bottom, right = dh, dw, dh+h, dw+w

        with open(os.path.join(ANNOTATIONS_PATH, annotation_file), 'w') as f:
            f.write(f'{MAPPING_DICT[class_name]},1,0,0,{left},{top},{right},{top},{right},{bottom},{left},{bottom}')

        files_list.append(f'images/cropped/{cropped_file}')
        mapping_list.append(f'annotations/{annotation_file}')
        classes_list.append(MAPPING_DICT[class_name])
        

In [None]:

# One row per generated crop: image path, annotation path and class id.
column_names = ['image_name', 'mapping_file', 'class']
rows = list(zip(files_list, mapping_list, classes_list))

df = pd.DataFrame(rows, columns=column_names)

print(df.head())
                


In [None]:
# First ten image paths recorded in the table.
df['image_name'].head(10).tolist()

In [None]:
# First ten annotation paths recorded in the table.
df['mapping_file'].head(10).tolist()

In [None]:
# Number of crops per class id in the full table.
df['class'].value_counts()

In [None]:
# Seeded split stratified on the class id; no test_size is given, so
# scikit-learn's default split ratio applies.
train, val = train_test_split(df, random_state=SEED, stratify=df['class'])

In [None]:
# Number of crops per class id in the training part.
train['class'].value_counts()

In [None]:
# Number of crops per class id in the validation part.
val['class'].value_counts()

In [None]:
# ! mkdir -p {MAPPING_PATH}

In [None]:
# Write the train/val mapping files: one 'image_name,mapping_file' pair per
# line, without header or index. The target folder is created on demand so the
# cell also works when it has not been created by hand.
os.makedirs(MAPPING_PATH, exist_ok=True)

train.to_csv(os.path.join(MAPPING_PATH, 'train.txt'), columns=['image_name', 'mapping_file'], index=False, header=False)
val.to_csv(os.path.join(MAPPING_PATH, 'val.txt'), columns=['image_name', 'mapping_file'], index=False, header=False)

In [None]:
def make_animation(path, relative_path=''):
    """Write preview frames for the first 15 samples listed in a mapping file.

    For every sample two JPEGs are saved into
    ``<ITASCA_PATH>/animation/<mapping file name without extension>/``:
    ``NNNN_image.jpg`` (the image as read) and ``NNNN_w_bbox.jpg`` (the image
    with the annotated box blended in as a translucent filled rectangle).

    Args:
        path: mapping file with one ``image_path,annotation_path`` pair per line.
        relative_path: folder that contains the ``dataset`` directory the
            listed paths are relative to.

    Raises:
        FileNotFoundError: if a listed image cannot be read.
        IOError: if a frame cannot be written.
    """

    animation_path = os.path.join(ITASCA_PATH, 'animation', os.path.splitext(os.path.basename(path))[0])
    # cv2.imwrite reports a missing folder only through its return value, so
    # create the output folder up front.
    os.makedirs(animation_path, exist_ok=True)

    with open(path, 'r') as mapping_fp:
        files_list = mapping_fp.readlines()

    # Overlay colour (BGR) per class id; every other class id is drawn in red.
    class_colors = {2004: (255, 0, 0), 2005: (0, 255, 0)}
    alpha = 0.3  # weight of the filled rectangle in the blend

    for i, line in enumerate(files_list[:15]):
        image_path, annotation_path = line.strip().split(',')
        image_path = os.path.join(relative_path, 'dataset', image_path)
        annotation_path = os.path.join(relative_path, 'dataset', annotation_path)

        image = cv2.imread(image_path)
        # cv2.imread returns None instead of raising when the file is unreadable.
        if image is None:
            raise FileNotFoundError(f'could not read image: {image_path}')

        # Annotation layout: class id, three fields ignored here, then the four
        # box corners as x,y pairs (left-top, right-top, right-bottom, left-bottom).
        with open(annotation_path, 'r') as annotation_fp:
            class_label, _, _, _, left, top, right, _, _, bottom, _, _ = [int(x) for x in annotation_fp.read().split(',')]

        c = class_colors.get(class_label, (0, 0, 255))

        image_w_bbox = image.copy()

        # Thickness -1 fills the rectangle; blending it with the untouched
        # image turns it into a translucent overlay.
        cv2.rectangle(image_w_bbox, (left, top), (right, bottom), c, -1)
        cv2.addWeighted(image_w_bbox, alpha, image, 1 - alpha, 0, image_w_bbox)

        for suffix, frame in (('image', image), ('w_bbox', image_w_bbox)):
            frame_path = os.path.join(animation_path, f'{i:04d}_{suffix}.jpg')
            if not cv2.imwrite(frame_path, frame):
                raise IOError(f'could not write {frame_path}')
                                            

    


In [None]:
# Generate the preview frames for both splits.
for split_file in ('train.txt', 'val.txt'):
    make_animation(os.path.join(MAPPING_PATH, split_file), ITASCA_PATH)