## Creating the labelled data for training and evaluation

In [2]:
# Import libraries
import os
import cv2
import math
import numpy as np
import pandas as pd

## Create the label folder and extract the ground truth information

In [21]:
# Load the ground truth csv file
via_export = pd.read_csv("../MarkupFromVIA/marks_markup/via_export_csv.csv")

# Create the directory for the labelled data
label_path = "prize_papers/all_labels/"

try:
    os.mkdir(label_path)
except OSError:
    print ("Creation of the directory %s failed" % label_path)
else:
    print ("Successfully created the directory %s " % label_path)


# Create the empty arrays
gtruth = []
page = []
regions = []
curname = ""
# Cycle through each instance of a mark
for x in range(via_export.shape[0]):
    
    # Extract the file name and region properties
    data = via_export.iloc[x]
    
    if data.filename[:-4] == curname:
        
        # reg_properties = [x,y,width,height] where x,y are the bottom left co-ordinates
        reg_properties = []
        temp_shape_data = data.region_shape_attributes[1:-1].split(',')

        # Check that the image actually has a reg
        if len(temp_shape_data) < 5:
            continue
        else:
            for y in range(1,5):
                value = temp_shape_data[y].split(':')

                # Take into account the images have been resized and floor the values
                reg_properties.append(math.floor(int(value[1])*factor))

            # Append new region to regions list
            regions.append(reg_properties)
        
        if x == via_export.shape[0]-1:
            page.append(regions)
            gtruth.append(page)
        
    else:
        page.append(regions)
        gtruth.append(page)
        page = []
        regions = []
        temp_name = data.filename.split('.')
        curname = temp_name[0]
        page.append(curname)
        page.append(data.region_count)

        # reg_properties = [x,y,width,height] where x,y are the bottom left co-ordinates
        reg_properties = []
        temp_shape_data = data.region_shape_attributes[1:-1].split(',')

        # Check that the image actually has a reg
        if len(temp_shape_data) < 5:
            continue
        else:
            for y in range(1,5):
                value = temp_shape_data[y].split(':')

                # Take into account the images have been resized and floor the values
                reg_properties.append(math.floor(int(value[1])*factor))

            # Append new region to regions list
            regions.append(reg_properties)
            if x == via_export.shape[0]-1:
                page.append(regions)
                gtruth.append(page)
            
gtruth.pop(0)

Creation of the directory prize_papers/all_labels/ failed


[[]]

In [22]:
# Check Groundtruth collected properly

c = 0
ok = True
for page in gtruth:
    print("\nThe filename: " + page[0])
    print("The file should have " + str(page[1]) + " regions")
    print("It has " + str(len(page[2])) + " regions\n")
    
    print(page)
    
    c = c + len(page[2])
    if page[1] != len(page[2]):
        ok = False
    else:
        continue
    
print("The total number of regions is " + str(c))
print("The CSV was processed properly: " + str(ok))


The filename: 94_20_0026
The file should have 4 regions
It has 4 regions

['94_20_0026', 4, [[616, 6103, 336, 182], [611, 6329, 402, 154], [726, 6476, 198, 70], [719, 6259, 180, 70]]]

The filename: 94_20_0027
The file should have 19 regions
It has 19 regions

['94_20_0027', 19, [[644, 704, 187, 171], [881, 875, 237, 171], [649, 1013, 231, 187], [936, 1183, 149, 187], [671, 1381, 220, 176], [969, 1530, 264, 176], [660, 1701, 275, 165], [870, 1888, 187, 154], [622, 2053, 264, 171], [897, 2207, 198, 143], [682, 2361, 187, 171], [655, 3071, 358, 182], [660, 3610, 341, 165], [671, 3803, 352, 165], [710, 4139, 286, 198], [671, 4353, 187, 198], [660, 4827, 198, 215], [682, 5355, 319, 209], [671, 5861, 314, 242]]]

The filename: 94_20_0028
The file should have 10 regions
It has 10 regions

['94_20_0028', 10, [[638, 996, 264, 171], [611, 1216, 286, 149], [578, 1381, 308, 154], [583, 1866, 237, 209], [633, 2680, 171, 242], [594, 3049, 286, 209], [578, 3946, 281, 220], [605, 4452, 275, 242], [5

In [7]:
#Create the GT of the PPs that have no marks

for dirName, subdirList, fileList in os.walk(image_path):
    print('Found directory: %s' % dirName)
    for fname in fileList:
        file_name = fname.split(".")
        #print('\t%s' % file_name[0])
        # Open image
        img = cv2.imread(image_path + file_name[0] + '.jpg', 1)
        img = cv2.rectangle(img, (0,0), (img.shape[1], img.shape[0]), (0,0,0), -1)

        # Save figure
        cv2.imwrite(label_path + file_name[0] + '.png', img)
        

Found directory: ../SampleSets/


## Create the labels

In [26]:
for page in gtruth:
    filename = page[0] 
    regions = page[2]
    file_dir = "../SampleSets/" + filename + ".jpg"
    
    img = cv2.imread(file_dir)
    img = cv2.rectangle(img, (0,0), (img.shape[1], img.shape[0]), (0,0,0), -1)
    
    for region in regions:
        
        bl = (region[0],region[1])
        tr = (region[0]+region[2], region[1]+region[3])
        
        # Top left, Bottom right corners
        img = cv2.rectangle(img, bl, tr, (0,0,255), -1)
        
    print("Saving " + filename)
    cv2.imwrite(label_path + filename + ".png", img)

Saving 94_20_0026
Saving 94_20_0027
Saving 94_20_0028
Saving 94_20_0029
Saving 94_20_0034
Saving 94_20_0035
Saving 94_20_0036
Saving 94_20_0037
Saving 94_20_0038
Saving 94_20_0060
Saving 94_20_0066
Saving 94_20_0080
Saving 94_20_0081
Saving 94_20_0105
Saving 95_32_0017
Saving 95_32_0018
Saving 95_32_0019
Saving 95_32_0045
Saving 95_32_0051
Saving 95_32_0061
Saving 103_7_0023
Saving 103_7_0024
Saving 103_7_0025
Saving 103_7_0026
Saving 103_7_0027
Saving 103_7_0032
Saving 103_7_0033
Saving 103_7_0035
Saving 103_7_0038
Saving 103_7_0043
Saving 103_7_0045
Saving 103_7_0046
Saving 103_7_0047
Saving 103_7_0048
Saving 103_7_0056
Saving 105_6_0036
Saving 105_6_0037
Saving 105_6_0038
Saving 105_6_0052
Saving 105_6_0053
Saving 105_6_0054
Saving 105_6_0055
Saving 105_6_0056
Saving 105_6_0057
Saving 105_6_0064
Saving 105_6_0065
Saving 113_21_0057
Saving 113_21_0058
Saving 113_21_0059
Saving 113_21_0060
Saving 113_21_0061
Saving 113_21_0086
Saving 113_21_0087
Saving 113_21_0088
Saving 113_21_0089
S