In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

DIMENSION = (256, 1600) # image dimension

---

### Preparations

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
df.head()

Unnamed: 0,ImageId,ClassId,EncodedPixels
0,0002cc93b.jpg,1,29102 12 29346 24 29602 24 29858 24 30114 24 3...
1,0007a71bf.jpg,3,18661 28 18863 82 19091 110 19347 110 19603 11...
2,000a4bcdd.jpg,1,37607 3 37858 8 38108 14 38359 20 38610 25 388...
3,000f6bf48.jpg,4,131973 1 132228 4 132483 6 132738 8 132993 11 ...
4,0014fce06.jpg,3,229501 11 229741 33 229981 55 230221 77 230468...


In [4]:
def count_defect_pixels(EncodedPixels):
    sum = 0
    for idx, pixel in enumerate(EncodedPixels.split()):
        if idx%2 != 0:
            sum += int(pixel)

    return sum

In [5]:
df['DefectSizePixel'] = df.EncodedPixels.apply(lambda x: count_defect_pixels(x))

---

### Convert Encoded Pixels to a Mask with ClassIds

In [92]:
def get_column(pixel, rows=256): 
    """
    Input variables:
    pixel - String
    """
    return (int(pixel) - 1) // rows

def get_row(pixel, rows=256):
    """
    Input variables:
    pixel - String
    """
    return (int(pixel) - 1) % rows

def get_column_row(pixel, rows=256):
    """
    Input variables:
    pixel - String
    """
    column = get_column(pixel, rows)
    row = get_row(pixel, rows)
    return column, row

In [7]:
# split up encoded pixels in pairs of [start_pixel, pixel_length]
def get_pixel_pairs(encoded_pixels):
    """Returns a list of pixel pairs from the `encoded_pixels`.
    
    Input variables:
    encoded_pixels - String with encoded pixel pairs of shape (start_pixel, pixel_length)
    """
    i = 0 # running variable
    temp = [0,0]
    pairs = []
    for pixel in encoded_pixels.split():
        temp[i] = pixel
        if i%2 != 0: # re-initialise after a pair is complete
            pairs.append(temp)
            i = 0
            temp = [0,0]
            continue # skip increment of i
        i += 1
    return pairs

In [97]:
def set_classid(mask, class_id, encoded_pixels=df.EncodedPixels[1], image=True, rows=256):
    """set specific values in a null-matrix to `class_id` and returns the filled mask.
    
    Input variables:
    mask           - np.zeros in the dimension of the image
    class_id       - the class that will be set at `encoded_pixels`
    encoded_pixels - String with encoded pixel pairs of shape (start_pixel, pixel_length)
    image          - for testing purposes
    rows           - for testing purposes
    """
    length_sum = 0
    for pair in get_pixel_pairs(encoded_pixels):
        if image:
            column, row = get_column_row(pair[0])
        else: # for testing purposes
            column, row = get_column_row(pair[0],rows)
        # testing
        length_sum += int(pair[1])
        print(length_sum)
        
        # set all pixels of the respective pair to `class_id`
        for length in range(int(pair[1])):
            if (row + length) < 256:
                mask[row + length][column] = class_id   
            else:
                # for column changes
                correction_factor = (row + length) // 255
                mask[row + length - (correction_factor * 255) - 1][column + correction_factor] = class_id
    return mask

In [9]:
classes = np.zeros(DIMENSION)
np.sum(set_classid(classes, 1, encoded_pixels=df.EncodedPixels[2]))

8319.0

---

### Apply to df (TO-DO)

In [11]:
df.shape

(7095, 4)

In [93]:
sums = []
for i in range(df.shape[0]):
    classes = np.zeros(DIMENSION)
    sums.append(int(np.sum(set_classid(classes, 1, encoded_pixels=df.EncodedPixels[i]))))
    

In [94]:
df['MaskSums'] = sums
df['CheckSum'] = df.DefectSizePixel - df.MaskSums
df.describe()

Unnamed: 0,ClassId,DefectSizePixel,MaskSums,CheckSum
count,7095.0,7095.0,7095.0,7095.0
mean,2.825229,23056.669626,23052.581113,4.088513
std,0.789279,34957.878108,34952.412797,17.92003
min,1.0,115.0,115.0,0.0
25%,3.0,4251.0,4251.0,0.0
50%,3.0,10341.0,10341.0,0.0
75%,3.0,26791.0,26791.0,0.0
max,4.0,368240.0,368089.0,378.0


In [95]:
df.query('CheckSum > 0')

Unnamed: 0,ImageId,ClassId,EncodedPixels,DefectSizePixel,MaskSums,CheckSum
12,005d86c25.jpg,3,331 18 587 53 843 89 1099 124 1355 159 1611 17...,163033,162996,37
14,005f19695.jpg,3,123137 7 123393 19 123649 32 123905 44 124161 ...,27106,27105,1
16,0088260da.jpg,3,26044 13 26283 40 26522 67 26762 92 27001 119 ...,86663,86656,7
30,00e0398ad.jpg,3,8140 7 8385 21 8630 35 8875 48 9120 62 9364 77...,96228,96182,46
32,00f1665e6.jpg,3,238918 90 239174 90 239430 89 239686 89 239942...,18359,18358,1
...,...,...,...,...,...,...
7077,ff5c7f868.jpg,3,109602 1 109858 1 110113 2 110369 3 110624 4 1...,58768,58699,69
7080,ff6cc90ca.jpg,3,270550 43 270722 127 270894 211 271107 254 271...,13533,13531,2
7081,ff6e35e0a.jpg,1,366059 22 366273 64 366487 106 366700 149 3669...,6446,6439,7
7083,ff933e271.jpg,3,149505 32 149761 96 150017 160 150273 224 1505...,20413,20412,1


In [49]:
test = df.query('ImageId =="005f19695.jpg"')

In [60]:
pairs = get_pixel_pairs(test['EncodedPixels'].tolist()[0])

In [114]:
test['EncodedPixels'].values[0]

'123137 7 123393 19 123649 32 123905 44 124161 57 124417 69 124673 82 124929 94 125185 107 125441 119 125697 126 125953 125 126209 124 126465 123 126721 122 126977 121 127233 120 127489 119 127745 118 128021 97 128315 58 128609 19 133889 7 134145 21 134401 35 134657 49 134913 62 135169 76 135425 90 135681 104 135937 118 136193 131 136449 145 136705 159 136961 173 137217 186 137473 200 137740 203 138017 196 138295 181 138572 160 138849 139 139127 117 139404 96 139681 75 139958 54 140236 32 140513 11 146945 6 147201 16 147457 27 147713 38 147969 48 148225 59 148481 70 148737 80 148993 91 149249 101 149505 112 149761 123 150017 133 150273 144 150529 155 150785 165 151041 176 151297 186 151553 197 151809 208 152072 211 152342 208 152611 206 152880 203 153150 195 153419 182 153689 168 153958 155 154228 141 154497 128 154766 115 155036 101 155305 88 155575 74 155844 61 156114 47 156383 34 156652 21 156922 7 173571 10 173827 30 174083 49 174339 69 174595 88 174851 108 175107 127 175363 146 17

In [107]:
test

Unnamed: 0,ImageId,ClassId,EncodedPixels,DefectSizePixel,MaskSums,CheckSum
14,005f19695.jpg,3,123137 7 123393 19 123649 32 123905 44 124161 ...,27106,27105,1


In [65]:
length = []
for pair in pairs:
    length.append(pair[1])


In [73]:
sum = 0
for i in length:
    sum += int(i)

In [88]:
rows = []
for pair in pairs:
    rows.append([get_row(pair[0]),pair[1]])

In [117]:
classes = np.zeros(DIMENSION)
mask = set_classid(classes, 1, encoded_pixels=test['EncodedPixels'].values[0])
np.sum(mask)

7
26
58
102
159
228
310
404
511
630
756
881
1005
1128
1250
1371
1491
1610
1728
1825
1883
1902
1909
1930
1965
2014
2076
2152
2242
2346
2464
2595
2740
2899
3072
3258
3458
3661
3857
4038
4198
4337
4454
4550
4625
4679
4711
4722
4728
4744
4771
4809
4857
4916
4986
5066
5157
5258
5370
5493
5626
5770
5925
6090
6266
6452
6649
6857
7068
7276
7482
7685
7880
8062
8230
8385
8526
8654
8769
8870
8958
9032
9093
9140
9174
9195
9202
9212
9242
9291
9360
9448
9556
9683
9829
9995
10180
10385
10609
10853
11107
11361
11615
11869
12123
12377
12631
12885
13124
13335
13518
13673
13800
13898
13968
14010
14024
14030
14048
14078
14119
14172
14237
14314
14402
14502
14614
14737
14872
15019
15177
15346
15527
15720
15924
16140
16368
16602
16835
17066
17296
17524
17751
17976
18200
18405
18572
18702
18795
18851
18869
18891
18956
19064
19194
19324
19454
19584
19714
19844
19974
20104
20234
20364
20494
20624
20754
20884
21014
21144
21274
21404
21539
21686
21844
22014
22195
22388
22592
22807
23034
23272
23522
26594
26818
26

27105.0

In [89]:
rows

[[0, '7'],
 [0, '19'],
 [0, '32'],
 [0, '44'],
 [0, '57'],
 [0, '69'],
 [0, '82'],
 [0, '94'],
 [0, '107'],
 [0, '119'],
 [0, '126'],
 [0, '125'],
 [0, '124'],
 [0, '123'],
 [0, '122'],
 [0, '121'],
 [0, '120'],
 [0, '119'],
 [0, '118'],
 [20, '97'],
 [58, '58'],
 [96, '19'],
 [0, '7'],
 [0, '21'],
 [0, '35'],
 [0, '49'],
 [0, '62'],
 [0, '76'],
 [0, '90'],
 [0, '104'],
 [0, '118'],
 [0, '131'],
 [0, '145'],
 [0, '159'],
 [0, '173'],
 [0, '186'],
 [0, '200'],
 [11, '203'],
 [32, '196'],
 [54, '181'],
 [75, '160'],
 [96, '139'],
 [118, '117'],
 [139, '96'],
 [160, '75'],
 [181, '54'],
 [203, '32'],
 [224, '11'],
 [0, '6'],
 [0, '16'],
 [0, '27'],
 [0, '38'],
 [0, '48'],
 [0, '59'],
 [0, '70'],
 [0, '80'],
 [0, '91'],
 [0, '101'],
 [0, '112'],
 [0, '123'],
 [0, '133'],
 [0, '144'],
 [0, '155'],
 [0, '165'],
 [0, '176'],
 [0, '186'],
 [0, '197'],
 [0, '208'],
 [7, '211'],
 [21, '208'],
 [34, '206'],
 [47, '203'],
 [61, '195'],
 [74, '182'],
 [88, '168'],
 [101, '155'],
 [115, '141'],
 [12

In [75]:
pairs

[['123137', '7'],
 ['123393', '19'],
 ['123649', '32'],
 ['123905', '44'],
 ['124161', '57'],
 ['124417', '69'],
 ['124673', '82'],
 ['124929', '94'],
 ['125185', '107'],
 ['125441', '119'],
 ['125697', '126'],
 ['125953', '125'],
 ['126209', '124'],
 ['126465', '123'],
 ['126721', '122'],
 ['126977', '121'],
 ['127233', '120'],
 ['127489', '119'],
 ['127745', '118'],
 ['128021', '97'],
 ['128315', '58'],
 ['128609', '19'],
 ['133889', '7'],
 ['134145', '21'],
 ['134401', '35'],
 ['134657', '49'],
 ['134913', '62'],
 ['135169', '76'],
 ['135425', '90'],
 ['135681', '104'],
 ['135937', '118'],
 ['136193', '131'],
 ['136449', '145'],
 ['136705', '159'],
 ['136961', '173'],
 ['137217', '186'],
 ['137473', '200'],
 ['137740', '203'],
 ['138017', '196'],
 ['138295', '181'],
 ['138572', '160'],
 ['138849', '139'],
 ['139127', '117'],
 ['139404', '96'],
 ['139681', '75'],
 ['139958', '54'],
 ['140236', '32'],
 ['140513', '11'],
 ['146945', '6'],
 ['147201', '16'],
 ['147457', '27'],
 ['147713