In [4]:
import io
from google.cloud import vision
from google.cloud.vision import types
import pandas as pd

In [5]:
def get_text(image_file, symbol_search=None):
    """Run Google Cloud Vision document OCR on one image and tabulate its words.

    Parameters
    ----------
    image_file : str
        Path of the image to read; its raw bytes are sent to
        ``document_text_detection``.
    symbol_search : str, optional
        Symbol text to look for inside each word (e.g. ``'$'``). With the
        default ``None`` no word is flagged.

    Returns
    -------
    pandas.DataFrame
        One row per detected word, in page / block / paragraph / word order,
        indexed 0..n-1, with the columns:

        * ``n_page``, ``n_block``, ``n_paragraph``, ``n_word`` - position of
          the item inside its parent (each counter restarts per parent).
        * ``page_height``, ``page_width`` - size reported for the page.
        * ``block_vert``, ``paragraph_vert``, ``word_vert`` - bounding box as a
          list of ``(x, y)`` tuples.
        * ``block_confidence``, ``paragraph_confidence``, ``word_confidence``.
        * ``text`` - list with the text of each symbol of the word.
        * ``symbol_search_pos`` - True when some symbol equals ``symbol_search``.
        * ``n_symbol_search_find`` - index of the last matching symbol inside
          the word; missing when there is no match.

        A document without words yields an empty frame that still carries
        these columns.
    """
    client = vision.ImageAnnotatorClient()

    # Use a separate name for the file handle so the path argument is not shadowed.
    with io.open(image_file, 'rb') as image_handle:
        content = image_handle.read()

    image = types.Image(content=content)

    response = client.document_text_detection(image=image)
    document = response.full_text_annotation

    columns = [
        'n_page',
        'page_height',
        'page_width',
        'n_block',
        'block_vert',
        'block_confidence',
        'n_paragraph',
        'paragraph_vert',
        'paragraph_confidence',
        'n_word',
        'word_vert',
        'word_confidence',
        'text',
        'symbol_search_pos',
        'n_symbol_search_find',
    ]

    # Collect one plain dict per word and build the frame once at the end.
    # Appending to a DataFrame per word copies the whole frame every time, and
    # DataFrame.append no longer exists in pandas 2.0.
    records = []

    for n_page, page in enumerate(document.pages):
        for n_block, block in enumerate(page.blocks):
            for n_paragraph, paragraph in enumerate(block.paragraphs):
                for n_word, word in enumerate(paragraph.words):

                    symbol_search_find = False
                    n_symbol_search_find = None

                    # No early exit: when several symbols match, the last
                    # matching position is the one that is kept.
                    for n_symbol, symbol in enumerate(word.symbols):
                        if symbol.text == symbol_search:
                            print('find simbol')
                            symbol_search_find = True
                            n_symbol_search_find = n_symbol

                    records.append({
                        'n_page': n_page,
                        'page_height': page.height,
                        'page_width': page.width,
                        'n_block': n_block,
                        'block_vert': [(vertex.x, vertex.y) for vertex in block.bounding_box.vertices],
                        'block_confidence': block.confidence,
                        'n_paragraph': n_paragraph,
                        'paragraph_vert': [(vertex.x, vertex.y) for vertex in paragraph.bounding_box.vertices],
                        'paragraph_confidence': paragraph.confidence,
                        'n_word': n_word,
                        'word_vert': [(vertex.x, vertex.y) for vertex in word.bounding_box.vertices],
                        'word_confidence': word.confidence,
                        'text': [symbol.text for symbol in word.symbols],
                        'symbol_search_pos': symbol_search_find,
                        'n_symbol_search_find': n_symbol_search_find,
                    })

    # Passing the column list keeps the schema stable even when no word was found.
    return pd.DataFrame(records, columns=columns)

# Get Text From Images

In [6]:
import os

# Point the Google Cloud client libraries at a service-account key file.
# setdefault() lets an already exported GOOGLE_APPLICATION_CREDENTIALS win, so the
# notebook is not tied to one machine; the path below is only the original
# author's local fallback.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "C:/Users/scors/Credentials/Raxar-12e4ceccdc19.json",
)

In [14]:
# Directory with the raw input images, relative to the notebook. The trailing
# slash is required because file names are joined by plain string concatenation.
folder = "../data/raw/"

In [15]:
# Input images are named 1.jpg through 20.jpg.
files = [f"{number}.jpg" for number in range(1, 21)]

In [19]:
# Symbol that get_text() looks for inside every OCR word.
symbol_search = "$"

In [20]:
# OCR every image and keep the per-file word tables in a list. They are stacked
# once below: DataFrame.append re-copied the accumulated frame on every
# iteration and was removed in pandas 2.0.
frames = []

for file in files:
    print(file)
    df_temp = get_text(folder + file, symbol_search)
    df_temp['file'] = file
    frames.append(df_temp)

df = pd.concat(frames)

# Index rows by (file, word number within the file): reset_index() exposes each
# file's own 0..n-1 row labels as the 'index' column, which is paired with 'file'.
df = df.reset_index()
df = df.set_index(['file', 'index'])

1.jpg
find simbol
find simbol
2.jpg
find simbol
3.jpg
find simbol
find simbol
4.jpg
find simbol
find simbol
5.jpg
6.jpg
find simbol
find simbol
7.jpg
find simbol
find simbol
8.jpg
find simbol
find simbol
9.jpg
find simbol
find simbol
10.jpg
find simbol
find simbol
11.jpg
find simbol
12.jpg
find simbol
13.jpg
find simbol
find simbol
14.jpg
find simbol
15.jpg
16.jpg
find simbol
17.jpg
find simbol
find simbol
18.jpg
19.jpg
find simbol
20.jpg
find simbol


In [25]:
# (rows, columns) of the combined frame; get_text() contributes one row per OCR word.
df.shape

(540, 16)

# Create Target

In [28]:
# Target column: every word starts as 0; the hand-labelling cells below set the
# chosen row of each file to 1.
df['y'] = 0

In [29]:
# Preview the OCR words of 1.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('1.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,719,1280,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,0,"[(307, 251), (354, 253), (353, 270), (306, 268)]",0.98,"[W, W, W]",False,,0
1,0,719,1280,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,1,"[(312, 272), (469, 274), (469, 296), (312, 294)]",0.98,"[p, r, e, c, i, o, s, c, u, i, d, a, d, o, s]",False,,0
2,0,719,1280,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,2,"[(472, 275), (474, 275), (474, 296), (472, 296)]",0.9,[.],False,,0
3,0,719,1280,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,0,"[(307, 251), (474, 255), (473, 320), (306, 316)]",0.97,3,"[(315, 294), (374, 295), (374, 317), (315, 316)]",0.95,"[g, o, b, ., a, r]",False,,0
4,0,719,1280,1,"[(573, 285), (608, 286), (608, 297), (573, 296)]",0.44,0,"[(573, 285), (608, 286), (608, 297), (573, 296)]",0.44,0,"[(573, 285), (608, 286), (608, 297), (573, 296)]",0.44,"[A, n, t]",False,,0
5,0,719,1280,2,"[(645, 290), (767, 291), (767, 310), (645, 309)]",0.99,0,"[(645, 290), (767, 291), (767, 310), (645, 309)]",0.99,0,"[(645, 290), (720, 291), (720, 310), (645, 309)]",0.99,"[A, r, g, e, n, t, i, n, a]",False,,0
6,0,719,1280,2,"[(645, 290), (767, 291), (767, 310), (645, 309)]",0.99,0,"[(645, 290), (767, 291), (767, 310), (645, 309)]",0.99,1,"[(725, 291), (767, 291), (767, 310), (725, 310)]",0.99,"[u, n, i, d, a]",False,,0
7,0,719,1280,3,"[(77, 231), (946, 246), (943, 396), (74, 381)]",0.95,0,"[(77, 233), (259, 234), (259, 312), (77, 311)]",0.98,0,"[(77, 233), (219, 234), (219, 280), (77, 279)]",0.98,"[p, r, e, c, i, o, s]",False,,0
8,0,719,1280,3,"[(77, 231), (946, 246), (943, 396), (74, 381)]",0.95,0,"[(77, 233), (259, 234), (259, 312), (77, 311)]",0.98,1,"[(82, 272), (259, 272), (259, 311), (82, 311)]",0.98,"[c, u, i, d, a, d, o, s]",False,,0
9,0,719,1280,3,"[(77, 231), (946, 246), (943, 396), (74, 381)]",0.95,1,"[(174, 329), (944, 352), (942, 396), (173, 373)]",0.94,0,"[(174, 329), (341, 334), (340, 378), (173, 373)]",0.99,"[g, a, s, e, o, s, a]",False,,0


In [30]:
# Hand label: mark row 22 of 1.jpg as the positive target.
# NOTE(review): row 22 is beyond the rows shown in the preview above - confirm it is the intended word.
df.loc[('1.jpg', 22), 'y'] = 1

In [31]:
# Preview the OCR words of 2.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('2.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,0,"[(144, 124), (287, 125), (287, 189), (144, 188)]",0.96,0,"[(144, 124), (258, 125), (258, 164), (144, 163)]",0.99,"[p, r, e, c, i, o, s]",False,,0
1,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,0,"[(144, 124), (287, 125), (287, 189), (144, 188)]",0.96,1,"[(147, 158), (287, 158), (287, 188), (147, 188)]",0.94,"[c, u, i, d, a, d, o, s]",False,,0
2,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,1,"[(215, 206), (772, 198), (772, 233), (215, 241)]",0.88,0,"[(215, 207), (345, 205), (345, 239), (215, 241)]",0.99,"[g, a, s, e, o, s, a]",False,,0
3,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,1,"[(215, 206), (772, 198), (772, 233), (215, 241)]",0.88,1,"[(359, 205), (443, 204), (443, 238), (359, 239)]",0.99,"[C, O, C, A]",False,,0
4,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,1,"[(215, 206), (772, 198), (772, 233), (215, 241)]",0.88,2,"[(456, 204), (531, 203), (531, 236), (456, 237)]",1.0,"[C, O, L, A]",False,,0
5,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,1,"[(215, 206), (772, 198), (772, 233), (215, 241)]",0.88,3,"[(545, 202), (628, 201), (628, 235), (545, 236)]",0.76,"[1, ., 2, 5, -, 1, 7]",False,,0
6,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,1,"[(215, 206), (772, 198), (772, 233), (215, 241)]",0.88,4,"[(640, 201), (695, 200), (695, 234), (640, 235)]",0.73,"[B, O, T]",False,,0
7,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,1,"[(215, 206), (772, 198), (772, 233), (215, 241)]",0.88,5,"[(706, 200), (753, 199), (753, 233), (706, 234)]",0.97,"[1, ., 2, 5]",False,,0
8,0,581,1032,0,"[(143, 124), (772, 121), (772, 238), (143, 241)]",0.91,1,"[(215, 206), (772, 198), (772, 233), (215, 241)]",0.88,6,"[(766, 200), (772, 200), (772, 233), (766, 233)]",0.15,[/],False,,0
9,0,581,1032,1,"[(325, 137), (450, 141), (448, 194), (323, 190)]",0.94,0,"[(325, 137), (450, 141), (448, 194), (323, 190)]",0.94,0,"[(325, 137), (362, 139), (361, 154), (324, 152)]",0.97,"[w, w, w]",False,,0


In [32]:
# Hand label: mark row 19 of 2.jpg as the positive target.
# NOTE(review): row 19 is beyond the rows shown in the preview above - confirm it is the intended word.
df.loc[('2.jpg', 19), 'y'] = 1

In [33]:
# Preview the OCR words of 3.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('3.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (376, 183), (375, 212), (257, 209)]",0.97,"[G, a, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,1,"[(389, 184), (462, 186), (461, 215), (388, 213)]",0.98,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,2,"[(477, 186), (546, 188), (545, 217), (476, 215)]",0.94,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,3,"[(571, 189), (622, 190), (621, 219), (570, 218)]",0.99,"[B, O, T]",False,,0
4,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,4,"[(634, 191), (665, 192), (664, 220), (633, 219)]",0.98,"[2, ., 2]",False,,0
5,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,5,"[(678, 192), (691, 192), (690, 221), (677, 221)]",0.44,"[1, 4]",False,,0
6,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,6,"[(694, 193), (696, 193), (695, 221), (693, 221)]",0.84,[.],False,,0
7,0,581,1032,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,0,"[(258, 180), (696, 192), (694, 252), (256, 240)]",0.91,7,"[(257, 213), (422, 217), (421, 244), (256, 240)]",0.88,"[D, e, s, c, a, r, t, a, b, l, e]",False,,0
8,0,581,1032,1,"[(259, 273), (534, 271), (535, 353), (260, 355)]",0.49,0,"[(259, 273), (534, 271), (535, 353), (260, 355)]",0.49,0,"[(259, 273), (318, 273), (319, 354), (260, 354)]",0.38,"[M, E]",False,,0
9,0,581,1032,1,"[(259, 273), (534, 271), (535, 353), (260, 355)]",0.49,0,"[(259, 273), (534, 271), (535, 353), (260, 355)]",0.49,1,"[(370, 272), (391, 272), (392, 353), (371, 353)]",0.24,[$],True,0.0,0


In [34]:
# Hand label: mark row 10 of 3.jpg as the positive target.
# NOTE(review): the preview above ends at row 9 (the '$' word); row 10 itself is not shown - confirm.
df.loc[('3.jpg', 10), 'y'] = 1

In [35]:
# Preview the OCR words of 4.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('4.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,0,"[(315, 182), (413, 177), (414, 207), (317, 212)]",0.99,"[g, a, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,1,"[(423, 176), (489, 172), (491, 202), (425, 206)]",0.99,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,2,"[(501, 172), (557, 169), (559, 199), (503, 202)]",0.98,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,3,"[(575, 168), (583, 168), (585, 198), (577, 198)]",0.92,[X],False,,0
4,0,581,1032,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,4,"[(597, 166), (637, 164), (639, 195), (599, 197)]",0.82,"[1, ., 7, 5]",False,,0
5,0,581,1032,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,0,"[(315, 181), (709, 160), (711, 191), (317, 212)]",0.86,5,"[(671, 163), (710, 161), (712, 191), (673, 193)]",0.27,"[p, e, r]",False,,0
6,0,581,1032,1,"[(605, 266), (630, 266), (630, 285), (605, 285)]",0.95,0,"[(605, 266), (630, 266), (630, 285), (605, 285)]",0.95,0,"[(605, 266), (630, 266), (630, 285), (605, 285)]",0.95,"[., 7, 2]",False,,0
7,0,581,1032,2,"[(449, 248), (587, 244), (589, 319), (451, 323)]",0.99,0,"[(449, 248), (587, 244), (589, 319), (451, 323)]",0.99,0,"[(449, 248), (587, 244), (589, 319), (451, 323)]",0.99,"[1, 2, 2]",False,,0
8,0,581,1032,3,"[(414, 287), (426, 286), (427, 313), (415, 314)]",0.99,0,"[(414, 287), (426, 286), (427, 313), (415, 314)]",0.99,0,"[(414, 287), (426, 286), (427, 313), (415, 314)]",0.99,[$],True,0.0,0
9,0,581,1032,4,"[(362, 337), (510, 331), (511, 364), (363, 371)]",0.8,0,"[(362, 338), (499, 331), (500, 351), (363, 358)]",0.78,0,"[(362, 339), (418, 336), (419, 355), (363, 358)]",0.99,"[P, R, E, C, I, O]",False,,0


In [36]:
# Hand label: mark row 7 of 4.jpg (text [1, 2, 2] in the preview above) as the positive target.
df.loc[('4.jpg', 7), 'y'] = 1

In [37]:
# Preview the OCR words of 5.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('5.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,719,1280,0,"[(790, 707), (855, 681), (863, 700), (798, 726)]",0.78,0,"[(790, 707), (855, 681), (863, 700), (798, 726)]",0.78,0,"[(792, 706), (823, 698), (827, 715), (796, 722)]",0.91,"[O, R]",False,,0
1,0,719,1280,0,"[(790, 707), (855, 681), (863, 700), (798, 726)]",0.78,0,"[(790, 707), (855, 681), (863, 700), (798, 726)]",0.78,1,"[(826, 699), (854, 683), (861, 696), (834, 712)]",0.69,"[O, R, I]",False,,0
2,0,719,1280,1,"[(654, 289), (770, 266), (774, 290), (659, 313)]",0.8,0,"[(654, 289), (770, 266), (774, 290), (659, 313)]",0.8,0,"[(654, 289), (721, 276), (725, 299), (658, 312)]",0.74,"[g, o, s, t, o, s, a]",False,,0
3,0,719,1280,1,"[(654, 289), (770, 266), (774, 290), (659, 313)]",0.8,0,"[(654, 289), (770, 266), (774, 290), (659, 313)]",0.8,1,"[(729, 275), (770, 267), (775, 290), (733, 298)]",0.89,"[C, O, C, A]",False,,0
4,0,719,1280,2,"[(743, 318), (858, 296), (868, 348), (753, 370)]",0.79,0,"[(743, 318), (858, 296), (868, 348), (753, 370)]",0.79,0,"[(743, 318), (832, 301), (842, 353), (753, 370)]",0.99,"[1, 2, 2]",False,,0
5,0,719,1280,2,"[(743, 318), (858, 296), (868, 348), (753, 370)]",0.79,0,"[(743, 318), (858, 296), (868, 348), (753, 370)]",0.79,1,"[(851, 298), (858, 297), (868, 349), (861, 350)]",0.21,[-],False,,0
6,0,719,1280,3,"[(939, 622), (1035, 642), (1031, 660), (935, 6...",0.97,0,"[(939, 622), (1035, 642), (1031, 660), (935, 6...",0.97,0,"[(939, 622), (1035, 642), (1031, 660), (935, 6...",0.97,"[O, R, I, G, I, N, A, L]",False,,0


In [38]:
# Hand label: mark row 4 of 5.jpg (text [1, 2, 2] in the preview above) as the positive target.
df.loc[('5.jpg', 4), 'y'] = 1

In [39]:
# Preview the OCR words of 6.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('6.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (490, 101), (490, 115), (446, 116)]",0.65,"[L, U, L, A]",False,,0
1,0,581,1032,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,1,"[(500, 101), (539, 100), (539, 113), (500, 114)]",0.73,"[C, U, L, A]",False,,0
2,0,581,1032,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,2,"[(551, 100), (577, 99), (577, 112), (551, 113)]",0.73,"[1, ., 2, 5]",False,,0
3,0,581,1032,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,3,"[(583, 99), (585, 99), (585, 112), (583, 112)]",0.49,[-],False,,0
4,0,581,1032,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,4,"[(590, 99), (597, 99), (597, 112), (590, 112)]",0.78,"[I, T]",False,,0
5,0,581,1032,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,5,"[(604, 99), (633, 98), (633, 112), (604, 113)]",0.65,"[B, U, R]",False,,0
6,0,581,1032,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,0,"[(446, 102), (674, 97), (674, 111), (446, 116)]",0.68,6,"[(645, 98), (674, 97), (674, 110), (645, 111)]",0.61,"[1, ., 2, 3]",False,,0
7,0,581,1032,1,"[(451, 172), (460, 172), (460, 190), (451, 190)]",0.96,0,"[(451, 172), (460, 172), (460, 190), (451, 190)]",0.96,0,"[(451, 172), (460, 172), (460, 190), (451, 190)]",0.96,[$],True,0.0,0
8,0,581,1032,2,"[(479, 145), (704, 143), (705, 198), (480, 200)]",0.52,0,"[(479, 145), (704, 143), (705, 198), (480, 200)]",0.52,0,"[(479, 145), (548, 144), (549, 199), (480, 200)]",0.99,"[8, 7]",False,,0
9,0,581,1032,2,"[(479, 145), (704, 143), (705, 198), (480, 200)]",0.52,0,"[(479, 145), (704, 143), (705, 198), (480, 200)]",0.52,1,"[(565, 144), (586, 144), (587, 199), (566, 199)]",0.4,"[3, 0]",False,,0


In [40]:
# Hand label: mark row 8 of 6.jpg (text [8, 7] in the preview above) as the positive target.
df.loc[('6.jpg', 8), 'y'] = 1

In [41]:
# Preview the OCR words of 7.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('7.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (422, 77), (420, 102), (329, 96)]",0.97,"[g, a, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,1,"[(435, 78), (501, 82), (499, 106), (433, 102)]",0.99,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,2,"[(514, 83), (566, 86), (564, 110), (512, 107)]",0.99,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,3,"[(581, 87), (618, 89), (616, 114), (579, 112)]",0.84,"[1, ., 2, 5]",False,,0
4,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,4,"[(626, 90), (632, 90), (630, 114), (624, 114)]",0.95,[-],False,,0
5,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,5,"[(637, 91), (645, 92), (643, 115), (635, 115)]",0.81,"[I, f]",False,,0
6,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,6,"[(653, 92), (701, 95), (699, 119), (651, 116)]",0.93,"[B, O, T]",False,,0
7,0,581,1032,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,0,"[(331, 71), (765, 99), (763, 124), (329, 96)]",0.9,7,"[(707, 95), (765, 99), (763, 124), (705, 120)]",0.74,"[1, ., 2, 5, T]",False,,0
8,0,581,1032,1,"[(423, 152), (569, 144), (573, 224), (427, 232)]",0.94,0,"[(423, 152), (569, 144), (573, 224), (427, 232)]",0.94,0,"[(423, 152), (444, 151), (448, 230), (427, 231)]",0.91,[$],True,0.0,0
9,0,581,1032,1,"[(423, 152), (569, 144), (573, 224), (427, 232)]",0.94,0,"[(423, 152), (569, 144), (573, 224), (427, 232)]",0.94,1,"[(470, 150), (569, 145), (573, 223), (474, 229)]",0.95,"[8, 7]",False,,0


In [42]:
# Hand label: mark row 9 of 7.jpg (text [8, 7] in the preview above) as the positive target.
df.loc[('7.jpg', 9), 'y'] = 1

In [43]:
# Preview the OCR words of 8.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('8.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (441, 361), (443, 403), (296, 410)]",0.99,"[g, a, s, e, o, s, a]",False,,0
1,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,1,"[(466, 360), (566, 355), (568, 397), (468, 402)]",0.99,"[C, O, C, A]",False,,0
2,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,2,"[(588, 355), (674, 351), (676, 393), (590, 397)]",0.96,"[C, O, L, A]",False,,0
3,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,3,"[(701, 350), (797, 346), (799, 387), (703, 391)]",0.97,"[1, ., 2, 5, -, 1]",False,,0
4,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,4,"[(837, 344), (910, 341), (912, 382), (839, 385)]",0.87,"[B, O, T]",False,,0
5,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,5,"[(929, 339), (992, 336), (994, 378), (931, 381)]",0.97,"[1, ., 2, 5]",False,,0
6,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,6,"[(1011, 336), (1036, 335), (1038, 376), (1013,...",0.87,"[I, t]",False,,0
7,0,960,1280,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,0,"[(294, 368), (1042, 334), (1044, 376), (296, 4...",0.93,7,"[(1034, 335), (1042, 335), (1044, 376), (1036,...",0.21,[.],False,,0
8,0,960,1280,1,"[(509, 472), (654, 474), (653, 581), (508, 579)]",0.99,0,"[(509, 472), (654, 474), (653, 581), (508, 579)]",0.99,0,"[(509, 472), (539, 472), (538, 579), (508, 579)]",0.99,[$],True,0.0,0
9,0,960,1280,1,"[(509, 472), (654, 474), (653, 581), (508, 579)]",0.99,0,"[(509, 472), (654, 474), (653, 581), (508, 579)]",0.99,1,"[(543, 472), (654, 473), (653, 580), (542, 579)]",0.99,"[8, 7]",False,,0


In [44]:
# Hand label: mark row 9 of 8.jpg (text [8, 7] in the preview above) as the positive target.
df.loc[('8.jpg', 9), 'y'] = 1

In [45]:
# Preview the OCR words of 9.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('9.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(259, 151), (383, 151), (383, 179), (259, 179)]",0.92,"[G, a, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,1,"[(399, 151), (475, 151), (475, 179), (399, 179)]",0.97,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,2,"[(488, 151), (557, 151), (557, 179), (488, 179)]",0.99,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,3,"[(580, 151), (632, 151), (632, 179), (580, 179)]",0.99,"[B, O, T]",False,,0
4,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,4,"[(642, 151), (679, 151), (679, 179), (642, 179)]",0.97,"[2, ., 2]",False,,0
5,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,5,"[(690, 151), (705, 151), (705, 179), (690, 179)]",0.54,"[1, 4]",False,,0
6,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,6,"[(708, 151), (710, 151), (710, 179), (708, 179)]",0.44,[.],False,,0
7,0,581,1032,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,0,"[(258, 151), (710, 151), (710, 210), (258, 210)]",0.91,7,"[(258, 183), (443, 183), (443, 210), (258, 210)]",0.93,"[D, e, s, c, a, r, t, a, b, l, e]",False,,0
8,0,581,1032,1,"[(388, 216), (535, 236), (523, 323), (376, 303)]",0.94,0,"[(388, 216), (535, 236), (523, 323), (376, 303)]",0.94,0,"[(388, 217), (413, 220), (401, 307), (376, 303)]",0.86,[$],True,0.0,0
9,0,581,1032,1,"[(388, 216), (535, 236), (523, 323), (376, 303)]",0.94,0,"[(388, 216), (535, 236), (523, 323), (376, 303)]",0.94,1,"[(424, 221), (535, 236), (523, 323), (412, 308)]",0.96,"[1, 5, 2]",False,,0


In [46]:
# Hand label: mark row 9 of 9.jpg (text [1, 5, 2] in the preview above) as the positive target.
df.loc[('9.jpg', 9), 'y'] = 1

In [47]:
# Preview the OCR words of 10.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('10.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 285), (674, 280), (675, 302), (582, 307)]",0.99,"[G, a, s, e, o, s, a]",False,,0
1,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,1,"[(688, 279), (745, 276), (746, 298), (689, 301)]",0.99,"[C, O, C, A]",False,,0
2,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,2,"[(755, 276), (812, 273), (813, 294), (756, 297)]",0.8,"[C, O, L, A]",False,,0
3,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,3,"[(830, 272), (873, 270), (874, 291), (831, 293)]",0.95,"[B, O, T]",False,,0
4,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,4,"[(881, 269), (885, 269), (886, 290), (882, 290)]",0.9,[:],False,,0
5,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,5,"[(887, 269), (910, 268), (911, 289), (888, 290)]",0.92,"[2, ., 2]",False,,0
6,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,6,"[(920, 267), (932, 266), (933, 287), (921, 288)]",0.85,"[I, t]",False,,0
7,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,7,"[(934, 267), (936, 267), (937, 288), (935, 288)]",0.42,[.],False,,0
8,0,719,1280,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,0,"[(581, 284), (936, 266), (938, 315), (584, 333)]",0.93,8,"[(584, 308), (715, 303), (716, 327), (585, 332)]",0.99,"[D, e, s, c, a, r, t, a, b, l, e]",False,,0
9,0,719,1280,1,"[(677, 332), (814, 343), (807, 419), (671, 408)]",0.91,0,"[(677, 332), (814, 343), (807, 419), (671, 408)]",0.91,0,"[(677, 332), (697, 334), (691, 408), (671, 407)]",0.73,[$],True,0.0,0


In [48]:
# Hand label: mark row 10 of 10.jpg as the positive target.
# NOTE(review): the preview above ends at row 9 (the '$' word); row 10 itself is not shown - confirm.
df.loc[('10.jpg', 10), 'y'] = 1

In [49]:
# Preview the OCR words of 11.jpg (the 'file' index level is dropped) to pick the row to label.
df.xs('11.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,0,"[(85, 108), (242, 104), (244, 178), (87, 182)]",0.99,0,"[(85, 109), (210, 104), (211, 141), (86, 146)]",0.99,"[p, r, e, c, i, o, s]",False,,0
1,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,0,"[(85, 108), (242, 104), (244, 178), (87, 182)]",0.99,1,"[(88, 141), (243, 138), (244, 177), (89, 180)]",0.99,"[c, u, i, d, a, d, o, s]",False,,0
2,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,0,"[(166, 193), (309, 189), (310, 227), (167, 231)]",0.99,"[g, a, s, e, o, s, a]",False,,0
3,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,1,"[(325, 189), (416, 187), (417, 224), (326, 226)]",0.99,"[C, O, C, A]",False,,0
4,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,2,"[(432, 186), (508, 184), (509, 222), (433, 224)]",1.0,"[C, O, L, A]",False,,0
5,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,3,"[(533, 183), (604, 181), (605, 219), (534, 221)]",0.87,"[1, ., 2, 5, -, 1]",False,,0
6,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,4,"[(607, 182), (617, 182), (618, 219), (608, 219)]",0.07,[+],False,,0
7,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,5,"[(625, 181), (681, 180), (682, 217), (626, 218)]",0.75,"[B, O, T]",False,,0
8,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,6,"[(695, 179), (740, 178), (741, 216), (696, 217)]",0.97,"[1, ., 2, 5]",False,,0
9,0,581,1032,0,"[(85, 108), (759, 90), (762, 216), (88, 234)]",0.93,1,"[(166, 192), (761, 177), (762, 216), (167, 231)]",0.9,7,"[(756, 178), (761, 178), (762, 215), (757, 215)]",0.91,[/],False,,0


In [50]:
# Hand-label: flag word row 20 of 11.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('11.jpg', 20)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [51]:
# Show the OCR word rows of 12.jpg so the target row can be picked by hand.
df.xs(key='12.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,0,"[(152, 101), (289, 105), (287, 171), (150, 167)]",0.99,0,"[(152, 101), (263, 104), (262, 142), (151, 139)]",0.99,"[p, r, e, c, i, o, s]",False,,0
1,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,0,"[(152, 101), (289, 105), (287, 171), (150, 167)]",0.99,1,"[(152, 133), (288, 137), (287, 171), (151, 167)]",0.99,"[c, u, i, d, a, d, o, s]",False,,0
2,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,0,"[(217, 183), (344, 187), (343, 221), (216, 217)]",0.99,"[g, a, s, e, o, s, a]",False,,0
3,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,1,"[(358, 188), (439, 190), (438, 223), (357, 221)]",0.99,"[C, O, C, A]",False,,0
4,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,2,"[(451, 191), (523, 193), (522, 226), (450, 224)]",1.0,"[C, O, L, A]",False,,0
5,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,3,"[(534, 193), (579, 194), (578, 228), (533, 227)]",0.98,"[1, ., 2, 5]",False,,0
6,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,4,"[(586, 195), (594, 195), (593, 228), (585, 228)]",0.99,[-],False,,0
7,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,5,"[(601, 196), (612, 196), (611, 229), (600, 229)]",0.9,[I],False,,0
8,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,6,"[(625, 196), (676, 198), (675, 232), (624, 230)]",0.69,"[B, O, T]",False,,0
9,0,581,1032,0,"[(152, 101), (749, 119), (745, 234), (149, 216)]",0.94,1,"[(217, 183), (747, 199), (746, 234), (216, 218)]",0.92,7,"[(688, 198), (730, 199), (729, 233), (687, 232)]",0.92,"[1, ., 2, 5]",False,,0


In [52]:
# Hand-label: flag word row 19 of 12.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('12.jpg', 19)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [53]:
# Show the OCR word rows of 13.jpg so the target row can be picked by hand.
df.xs(key='13.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(262, 165), (373, 164), (373, 193), (262, 194)]",0.97,"[G, a, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,1,"[(385, 164), (455, 163), (455, 192), (385, 193)]",0.99,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,2,"[(467, 163), (532, 162), (532, 191), (467, 192)]",0.83,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,3,"[(553, 162), (601, 162), (601, 191), (553, 191)]",0.97,"[B, O, T]",False,,0
4,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,4,"[(612, 162), (645, 162), (645, 190), (612, 190)]",0.97,"[2, ., 2]",False,,0
5,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,5,"[(658, 162), (671, 162), (671, 190), (658, 190)]",0.91,"[I, t]",False,,0
6,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,6,"[(674, 162), (677, 162), (677, 190), (674, 190)]",0.81,[.],False,,0
7,0,581,1032,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,0,"[(259, 165), (677, 161), (678, 220), (260, 224)]",0.95,7,"[(260, 194), (430, 194), (430, 222), (260, 222)]",0.97,"[D, e, s, c, a, r, t, a, b, l, e]",False,,0
8,0,581,1032,1,"[(376, 243), (759, 248), (758, 320), (375, 315)]",0.46,0,"[(376, 243), (759, 248), (758, 320), (375, 315)]",0.46,0,"[(376, 244), (396, 244), (395, 315), (375, 315)]",0.99,[$],True,0.0,0
9,0,581,1032,1,"[(376, 243), (759, 248), (758, 320), (375, 315)]",0.46,0,"[(376, 243), (759, 248), (758, 320), (375, 315)]",0.46,1,"[(403, 244), (500, 245), (499, 316), (402, 315)]",0.97,"[1, 5, 2]",False,,0


In [54]:
# Hand-label: flag word row 9 of 13.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('13.jpg', 9)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [55]:
# Show the OCR word rows of 14.jpg so the target row can be picked by hand.
df.xs(key='14.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (402, 137), (402, 159), (308, 158)]",0.9,"[g, u, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,1,"[(411, 138), (472, 139), (472, 160), (411, 159)]",0.97,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,2,"[(485, 138), (538, 138), (538, 160), (485, 160)]",0.97,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,3,"[(553, 139), (591, 139), (591, 160), (553, 160)]",0.79,"[1, ., 2, 5]",False,,0
4,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,4,"[(598, 139), (603, 139), (603, 160), (598, 160)]",0.92,[-],False,,0
5,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,5,"[(607, 139), (612, 139), (612, 160), (607, 160)]",0.23,[i],False,,0
6,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,6,"[(626, 139), (671, 139), (671, 161), (626, 161)]",0.79,"[B, O, T]",False,,0
7,0,581,1032,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,0,"[(308, 136), (737, 140), (737, 162), (308, 158)]",0.85,7,"[(679, 140), (737, 141), (737, 162), (679, 161)]",0.79,"[7, ., 2, 5, 7]",False,,0
8,0,581,1032,1,"[(454, 197), (547, 198), (546, 273), (453, 272)]",0.99,0,"[(454, 197), (547, 198), (546, 273), (453, 272)]",0.99,0,"[(454, 197), (547, 198), (546, 273), (453, 272)]",0.99,"[8, 7]",False,,0
9,0,581,1032,2,"[(569, 221), (598, 220), (599, 240), (570, 241)]",0.94,0,"[(569, 221), (598, 220), (599, 240), (570, 241)]",0.94,0,"[(569, 221), (598, 220), (599, 240), (570, 241)]",0.94,"[., 5, 0]",False,,0


In [56]:
# Hand-label: flag word row 8 of 14.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('14.jpg', 8)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [57]:
# Show the OCR word rows of 15.jpg so the target row can be picked by hand.
df.xs(key='15.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,0,"[(363, 200), (451, 197), (452, 220), (364, 223)]",0.99,"[g, a, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,1,"[(459, 197), (516, 195), (517, 218), (460, 220)]",0.99,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,2,"[(525, 194), (576, 192), (577, 215), (526, 217)]",0.99,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,3,"[(609, 192), (642, 191), (643, 213), (610, 214)]",0.82,"[1, ., 7, 5]",False,,0
4,0,581,1032,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,0,"[(363, 200), (671, 189), (672, 212), (364, 223)]",0.89,4,"[(649, 191), (671, 190), (672, 212), (650, 213)]",0.33,"[T, T]",False,,0
5,0,581,1032,1,"[(450, 250), (635, 247), (636, 309), (451, 312)]",0.69,0,"[(450, 250), (635, 247), (636, 309), (451, 312)]",0.69,0,"[(450, 250), (467, 250), (468, 311), (451, 311)]",0.44,[s],False,,0
6,0,581,1032,1,"[(450, 250), (635, 247), (636, 309), (451, 312)]",0.69,0,"[(450, 250), (635, 247), (636, 309), (451, 312)]",0.69,1,"[(489, 249), (594, 247), (595, 309), (490, 311)]",0.91,"[1, 2, 2]",False,,0
7,0,581,1032,1,"[(450, 250), (635, 247), (636, 309), (451, 312)]",0.69,0,"[(450, 250), (635, 247), (636, 309), (451, 312)]",0.69,2,"[(618, 247), (635, 247), (636, 308), (619, 308)]",0.28,[n],False,,0
8,0,581,1032,2,"[(409, 320), (521, 317), (522, 344), (410, 347)]",0.69,0,"[(409, 320), (519, 318), (519, 333), (409, 335)]",0.79,0,"[(409, 320), (478, 319), (478, 334), (409, 335)]",0.94,"[P, R, E, C, I, O, X, I]",False,,0
9,0,581,1032,2,"[(409, 320), (521, 317), (522, 344), (410, 347)]",0.69,0,"[(409, 320), (519, 318), (519, 333), (409, 335)]",0.79,1,"[(488, 319), (519, 318), (519, 332), (488, 333)]",0.48,"[5, 7, 0, 1]",False,,0


In [58]:
# Hand-label: flag word row 6 of 15.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('15.jpg', 6)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [59]:
# Show the OCR word rows of 16.jpg so the target row can be picked by hand.
df.xs(key='16.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,581,1032,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (351, 140), (352, 165), (259, 168)]",0.98,"[g, a, s, e, o, s, a]",False,,0
1,0,581,1032,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,1,"[(363, 140), (426, 138), (427, 162), (364, 164)]",0.99,"[C, O, C, A]",False,,0
2,0,581,1032,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,2,"[(438, 137), (497, 135), (498, 160), (439, 162)]",1.0,"[C, O, L, A]",False,,0
3,0,581,1032,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,3,"[(510, 135), (516, 135), (517, 159), (511, 159)]",0.99,[X],False,,0
4,0,581,1032,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,4,"[(531, 134), (570, 133), (571, 157), (532, 158)]",0.99,"[1, ., 7, 5]",False,,0
5,0,581,1032,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,5,"[(582, 132), (594, 132), (595, 157), (583, 157)]",0.96,"[T, I]",False,,0
6,0,581,1032,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,0,"[(258, 143), (639, 131), (640, 156), (259, 168)]",0.98,6,"[(602, 132), (639, 131), (640, 156), (603, 157)]",0.95,"[P, E, T]",False,,0
7,0,581,1032,1,"[(272, 203), (568, 200), (569, 276), (273, 279)]",0.56,0,"[(272, 203), (568, 200), (569, 276), (273, 279)]",0.56,0,"[(272, 203), (380, 202), (381, 278), (273, 279)]",0.45,"[2, ., s]",False,,0
8,0,581,1032,1,"[(272, 203), (568, 200), (569, 276), (273, 279)]",0.56,0,"[(272, 203), (568, 200), (569, 276), (273, 279)]",0.56,1,"[(408, 202), (526, 201), (527, 277), (409, 278)]",0.83,"[1, 2, 2]",False,,0
9,0,581,1032,1,"[(272, 203), (568, 200), (569, 276), (273, 279)]",0.56,0,"[(272, 203), (568, 200), (569, 276), (273, 279)]",0.56,2,"[(547, 201), (568, 201), (569, 276), (548, 276)]",0.07,[»],False,,0


In [60]:
# Hand-label: flag word row 8 of 16.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('16.jpg', 8)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [61]:
# Show the OCR word rows of 17.jpg so the target row can be picked by hand.
df.xs(key='17.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (829, 1988), (828, 2044), (605, ...",0.99,"[G, A, S, E, O, S, A]",False,,0
1,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,1,"[(857, 1989), (969, 1991), (968, 2047), (856, ...",0.99,"[C, O, L, A]",False,,0
2,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,2,"[(990, 1991), (1061, 1992), (1060, 2048), (989...",0.99,"[S, I, N]",False,,0
3,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,3,"[(1084, 1993), (1261, 1996), (1260, 2051), (10...",0.99,"[A, Z, U, C, A, R]",False,,0
4,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,4,"[(1279, 1995), (1403, 1997), (1402, 2053), (12...",0.99,"[C, O, C, A]",False,,0
5,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,5,"[(1422, 1997), (1535, 1999), (1534, 2055), (14...",1.0,"[C, O, L, A]",False,,0
6,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,6,"[(1557, 2000), (1643, 2001), (1642, 2056), (15...",0.99,"[P, E, T]",False,,0
7,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,7,"[(1658, 2001), (1679, 2001), (1678, 2056), (16...",0.94,[X],False,,0
8,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,8,"[(1698, 2002), (1755, 2003), (1754, 2058), (16...",0.73,"[2, ., 5]",False,,0
9,0,4032,3024,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,0,"[(606, 1985), (1821, 2003), (1820, 2059), (605...",0.96,9,"[(1774, 2003), (1821, 2004), (1820, 2060), (17...",0.82,"[L, T]",False,,0


In [62]:
# Hand-label: flag word row 12 of 17.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('17.jpg', 12)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [63]:
# Show the OCR word rows of 18.jpg so the target row can be picked by hand.
df.xs(key='18.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (1053, 1816), (1053, 1878), (794...",0.99,"[G, A, S, E, O, S, A]",False,,0
1,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,1,"[(1073, 1817), (1205, 1818), (1205, 1879), (10...",0.97,"[C, O, L, A]",False,,0
2,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,2,"[(1232, 1817), (1383, 1818), (1383, 1880), (12...",0.99,"[L, I, G, H, T]",False,,0
3,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,3,"[(1400, 1818), (1552, 1819), (1552, 1880), (14...",0.99,"[C, O, C, A]",False,,0
4,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,4,"[(1565, 1819), (1716, 1820), (1716, 1881), (15...",1.0,"[C, O, L, A]",False,,0
5,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,5,"[(1723, 1819), (1838, 1819), (1838, 1880), (17...",0.99,"[P, E, T]",False,,0
6,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,6,"[(1859, 1820), (1876, 1820), (1876, 1881), (18...",0.98,[X],False,,0
7,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,7,"[(1916, 1820), (2002, 1820), (2002, 1881), (19...",0.95,"[1, ., 7, 5]",False,,0
8,0,4032,3024,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,0,"[(794, 1815), (2096, 1820), (2096, 1882), (794...",0.97,8,"[(2027, 1820), (2096, 1820), (2096, 1881), (20...",0.8,"[L, T]",False,,0
9,0,4032,3024,1,"[(2016, 1935), (2222, 1934), (2223, 2078), (20...",0.92,0,"[(2016, 1935), (2222, 1934), (2223, 2078), (20...",0.92,0,"[(2016, 1935), (2222, 1934), (2223, 2078), (20...",0.92,"[0, 0]",False,,0


In [64]:
# Hand-label: flag word row 10 of 18.jpg as the target (y = 1).
# Guard first: a mistyped key would not fail on assignment -- .loc setting
# with enlargement may append a new, mostly-NaN row instead.
target_key = ('18.jpg', 10)
assert target_key in df.index, target_key
df.loc[target_key, 'y'] = 1

In [65]:
# Show the OCR word rows of 19.jpg before deciding how to handle this image.
df.xs(key='19.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2033), (1546, 2023), (1548, 2086), (12...",0.99,"[G, A, S, E, O, S, A]",False,,0
1,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,1,"[(1569, 2023), (1748, 2017), (1750, 2080), (15...",0.99,"[C, O, L, A]",False,,0
2,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,2,"[(1773, 2016), (2076, 2007), (2078, 2071), (17...",0.99,"[R, E, G, U, L, A, R]",False,,0
3,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,3,"[(2101, 2006), (2284, 2000), (2286, 2063), (21...",0.99,"[C, O, C, A]",False,,0
4,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,4,"[(2309, 2000), (2486, 1995), (2488, 2058), (23...",1.0,"[C, O, L, A]",False,,0
5,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,5,"[(2513, 1993), (2634, 1989), (2636, 2053), (25...",0.99,"[P, E, T]",False,,0
6,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,6,"[(2651, 1989), (2692, 1988), (2694, 2051), (26...",0.98,[X],False,,0
7,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,7,"[(2715, 1987), (2844, 1983), (2846, 2047), (27...",0.99,"[2, ., 2, 5]",False,,0
8,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,8,"[(2871, 1982), (2933, 1980), (2935, 2044), (28...",0.99,"[L, T]",False,,0
9,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,9,"[(2930, 1981), (2942, 1981), (2944, 2044), (29...",0.99,[.],False,,0


In [69]:
# Remove every row belonging to 19.jpg from the data set.
# NOTE(review): the reason for excluding this image is not recorded here --
# presumably no usable target word was found; confirm.
# NOTE(review): this cell's execution count is later than the next cell's;
# re-run the notebook top to bottom to confirm the result.
df = df.drop(labels='19.jpg', level='file')

In [68]:
# Show the OCR word rows of 20.jpg before deciding how to handle this image.
df.xs(key='20.jpg', level='file')

Unnamed: 0_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2033), (1546, 2023), (1548, 2086), (12...",0.99,"[G, A, S, E, O, S, A]",False,,0
1,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,1,"[(1569, 2023), (1748, 2017), (1750, 2080), (15...",0.99,"[C, O, L, A]",False,,0
2,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,2,"[(1773, 2016), (2076, 2007), (2078, 2071), (17...",0.99,"[R, E, G, U, L, A, R]",False,,0
3,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,3,"[(2101, 2006), (2284, 2000), (2286, 2063), (21...",0.99,"[C, O, C, A]",False,,0
4,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,4,"[(2309, 2000), (2486, 1995), (2488, 2058), (23...",1.0,"[C, O, L, A]",False,,0
5,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,5,"[(2513, 1993), (2634, 1989), (2636, 2053), (25...",0.99,"[P, E, T]",False,,0
6,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,6,"[(2651, 1989), (2692, 1988), (2694, 2051), (26...",0.98,[X],False,,0
7,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,7,"[(2715, 1987), (2844, 1983), (2846, 2047), (27...",0.99,"[2, ., 2, 5]",False,,0
8,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,8,"[(2871, 1982), (2933, 1980), (2935, 2044), (28...",0.99,"[L, T]",False,,0
9,0,4032,3024,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,0,"[(1237, 2032), (2942, 1979), (2944, 2044), (12...",0.99,9,"[(2930, 1981), (2942, 1981), (2944, 2044), (29...",0.99,[.],False,,0


In [70]:
# Remove every row belonging to 20.jpg from the data set.
# NOTE(review): the rows displayed for 20.jpg match those displayed for
# 19.jpg, so this looks like a duplicate image -- confirm.
df = df.drop(labels='20.jpg', level='file')

In [71]:
# Sanity check: (rows, columns) remaining after the per-file drops above.
df.shape

(432, 16)

**Target**

In [73]:
# Review all hand-labelled rows: those whose target flag y equals 1.
df.loc[df['y'].eq(1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,n_page,page_height,page_width,n_block,block_vert,block_confidence,n_paragraph,paragraph_vert,paragraph_confidence,n_word,word_vert,word_confidence,text,symbol_search_pos,n_symbol_search_find,y
file,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1.jpg,22,0,719,1280,5,"[(433, 437), (752, 423), (756, 526), (437, 540)]",0.98,0,"[(433, 437), (752, 423), (756, 526), (437, 540)]",0.98,1,"[(505, 434), (752, 424), (756, 526), (509, 537)]",0.98,"[8, 7, ., 5, 0]",False,,1
2.jpg,19,0,581,1032,4,"[(411, 286), (638, 260), (646, 338), (420, 363)]",0.98,0,"[(411, 286), (638, 260), (646, 338), (420, 363)]",0.98,1,"[(463, 280), (638, 260), (647, 338), (472, 357)]",0.98,"[8, 7, ., 5, 0]",False,,1
3.jpg,10,0,581,1032,1,"[(259, 273), (534, 271), (535, 353), (260, 355)]",0.49,0,"[(259, 273), (534, 271), (535, 353), (260, 355)]",0.49,2,"[(399, 272), (510, 271), (511, 353), (400, 354)]",0.7,"[1, 5, 2]",False,,1
4.jpg,7,0,581,1032,2,"[(449, 248), (587, 244), (589, 319), (451, 323)]",0.99,0,"[(449, 248), (587, 244), (589, 319), (451, 323)]",0.99,0,"[(449, 248), (587, 244), (589, 319), (451, 323)]",0.99,"[1, 2, 2]",False,,1
5.jpg,4,0,719,1280,2,"[(743, 318), (858, 296), (868, 348), (753, 370)]",0.79,0,"[(743, 318), (858, 296), (868, 348), (753, 370)]",0.79,0,"[(743, 318), (832, 301), (842, 353), (753, 370)]",0.99,"[1, 2, 2]",False,,1
6.jpg,8,0,581,1032,2,"[(479, 145), (704, 143), (705, 198), (480, 200)]",0.52,0,"[(479, 145), (704, 143), (705, 198), (480, 200)]",0.52,0,"[(479, 145), (548, 144), (549, 199), (480, 200)]",0.99,"[8, 7]",False,,1
7.jpg,9,0,581,1032,1,"[(423, 152), (569, 144), (573, 224), (427, 232)]",0.94,0,"[(423, 152), (569, 144), (573, 224), (427, 232)]",0.94,1,"[(470, 150), (569, 145), (573, 223), (474, 229)]",0.95,"[8, 7]",False,,1
8.jpg,9,0,960,1280,1,"[(509, 472), (654, 474), (653, 581), (508, 579)]",0.99,0,"[(509, 472), (654, 474), (653, 581), (508, 579)]",0.99,1,"[(543, 472), (654, 473), (653, 580), (542, 579)]",0.99,"[8, 7]",False,,1
9.jpg,9,0,581,1032,1,"[(388, 216), (535, 236), (523, 323), (376, 303)]",0.94,0,"[(388, 216), (535, 236), (523, 323), (376, 303)]",0.94,1,"[(424, 221), (535, 236), (523, 323), (412, 308)]",0.96,"[1, 5, 2]",False,,1
10.jpg,10,0,719,1280,1,"[(677, 332), (814, 343), (807, 419), (671, 408)]",0.91,0,"[(677, 332), (814, 343), (807, 419), (671, 408)]",0.91,1,"[(707, 334), (814, 343), (807, 418), (701, 410)]",0.98,"[1, 5, 2]",False,,1


In [74]:
# Persist the labelled word table (index included) for later steps.
# NOTE(review): list-valued columns such as block_vert and text are
# presumably written as their string form; confirm how the reader parses them.
df.to_csv(path_or_buf='../data/preprocessed/data.csv')