In [1]:
import glob
import json
import pandas as pd
from pathlib import Path

In [2]:
def create_dataset(dataset_path):
    data = []
    for filename in glob.glob(dataset_path):
        with open(filename) as f:
            annotate = json.load(f)
            for match in annotate['form']:
                match['file'] = Path(filename).stem
            data.extend(annotate['form'])
    return pd.DataFrame.from_records(data)

In [3]:
train_dataset = create_dataset("training_data/annotations/*.json")
test_dataset = create_dataset("testing_data/annotations/*.json")

In [4]:
train_dataset.to_csv("train_data.csv", index=False)
test_dataset.to_csv("test_data.csv", index=False)

In [30]:
data

Unnamed: 0,box,text,label,words,linking,id,file
0,"[394, 145, 433, 162]",ITEM:,question,"[{'box': [394, 145, 433, 162], 'text': 'ITEM:'}]","[[0, 15]]",0,71206427
1,"[109, 112, 151, 129]",DATE:,question,"[{'box': [109, 112, 151, 129], 'text': 'DATE:'}]","[[1, 13]]",1,71206427
2,"[110, 140, 159, 155]",BRAND:,question,"[{'box': [110, 140, 159, 155], 'text': 'BRAND:'}]","[[2, 14]]",2,71206427
3,"[109, 183, 168, 198]",SUMMARY,question,"[{'box': [109, 183, 168, 198], 'text': 'SUMMAR...",[],3,71206427
4,"[175, 184, 193, 195]",OF,question,"[{'box': [175, 184, 193, 195], 'text': 'OF'}]",[],4,71206427
...,...,...,...,...,...,...,...
7406,"[397, 881, 528, 914]",Topline w/o 12 /18 /95 Final Report 12/ 31/ 95,answer,"[{'text': 'Topline', 'box': [398, 883, 443, 89...","[[59, 60]]",60,80310840a
7407,"[577, 742, 635, 759]",$100 600,other,"[{'text': '$100', 'box': [577, 742, 606, 757]}...",[],61,80310840a
7408,"[398, 761, 560, 779]",Revised Costs (if any),question,"[{'text': '', 'box': [398, 761, 409, 774]}, {'...",[],62,80310840a
7409,"[397, 788, 532, 806]",Fieldwork Schedule:,question,"[{'text': 'Fieldwork', 'box': [397, 788, 464, ...","[[63, 58]]",63,80310840a


In [31]:
set(data['label'])

{'answer', 'header', 'other', 'question'}

In [32]:
data['label'].value_counts()

question    3266
answer      2802
other        902
header       441
Name: label, dtype: int64

In [7]:
data.isna().sum()

box        0
text       0
label      0
words      0
linking    0
id         0
dtype: int64

In [16]:
data

Unnamed: 0,box,text,label,words,linking,id
0,"[394, 145, 433, 162]",ITEM:,question,"[{'box': [394, 145, 433, 162], 'text': 'ITEM:'}]","[[0, 15]]",0
1,"[109, 112, 151, 129]",DATE:,question,"[{'box': [109, 112, 151, 129], 'text': 'DATE:'}]","[[1, 13]]",1
2,"[110, 140, 159, 155]",BRAND:,question,"[{'box': [110, 140, 159, 155], 'text': 'BRAND:'}]","[[2, 14]]",2
3,"[109, 183, 168, 198]",SUMMARY,question,"[{'box': [109, 183, 168, 198], 'text': 'SUMMAR...",[],3
4,"[175, 184, 193, 195]",OF,question,"[{'box': [175, 184, 193, 195], 'text': 'OF'}]",[],4
...,...,...,...,...,...,...
7406,"[397, 881, 528, 914]",Topline w/o 12 /18 /95 Final Report 12/ 31/ 95,answer,"[{'text': 'Topline', 'box': [398, 883, 443, 89...","[[59, 60]]",60
7407,"[577, 742, 635, 759]",$100 600,other,"[{'text': '$100', 'box': [577, 742, 606, 757]}...",[],61
7408,"[398, 761, 560, 779]",Revised Costs (if any),question,"[{'text': '', 'box': [398, 761, 409, 774]}, {'...",[],62
7409,"[397, 788, 532, 806]",Fieldwork Schedule:,question,"[{'text': 'Fieldwork', 'box': [397, 788, 464, ...","[[63, 58]]",63


Are there natural language texts, which requires bigger model to understand the whold sentence, or more like a short text string?

In [34]:
data.to_csv('train_data.csv', index=False)

In [9]:
data.loc[247]

box                               [496, 204, 502, 210]
text                                                  
label                                         question
words      [{'box': [496, 204, 502, 210], 'text': ''}]
linking                                             []
id                                                   5
Name: 247, dtype: object

In [26]:
data['text'].str.len().describe()

count    7411.000000
mean       16.993118
std        29.437967
min         0.000000
25%         5.000000
50%        10.000000
75%        18.000000
max       706.000000
Name: text, dtype: float64

In [30]:
data['text_len'] = data['text'].str.len()

In [33]:
index = data['text_len'].sort_values().index

In [39]:
data.loc[index].tail(1000)['text'].values

array(['GENERAL PROJECT DESCRIPTION', 'Cigarette Design, Permanent',
       'NEWPORT RACING EVENT SURVEY', 'Quantities and Description:',
       'Smoke Analysis PMO Analysis', 'IF SO, GIVE DATE AND TITLE:',
       'H. J. Minnemeyer L. B. Gray', 'BMC/RMI/EPB JEM/DEC PRC/DRB',
       '1. Productivity Improvement', 'AMES ASSAY FOR MUTAGENICITY',
       '300 mm. tar, 100 mm acetone', 'Check Required ASAP/ PROMPT',
       'Description of change order:', 'Own Brand - Dunhill Smokers:',
       'O ORGANIZER (if applicable):', 'LD50 (95% CONFIDENCE LIMITS)',
       'The report of the results of', '☐ Decrease Existing Revenues',
       'Mr. Johnny Pedersen - Gallup', 'Consideration deferred until',
       'SCIENTIFIC METING OF CHOICE:', 'TH 46 TCRC REGISTRATION FORM',
       '1974- Jan. 1982 to Dec. 1981', '* Director - (G. L. Littell)',
       'THE AMERICAN TOBACCO COMPANY', 'DOES WORK MERIT PUBLICATION?',
       '* Advertising Creative Title', 'FAX TO THE FOLLOWING PERSON:',
       '20. 285 1 

In [47]:
data.loc[index].tail(30)['label'].value_counts()

answer      19
other        9
question     2
Name: label, dtype: int64

In [53]:
data.loc[index].tail(530)[data.loc[index].tail(530)['label']=='question'].text.values

array(['Registry of the Toxic Effects of Chemicals',
       'Decision of Committee on Present Request -',
       'Total number of pages including this page:',
       'Increase Costs - May Be Possible to Absorb',
       'TOTAL REPORTABLE EXPENDITURES FOR VARIETYI:',
       'FOLLOW DEPARTMENT AND COMPANY SAFETY MANUALS',
       'FOLLOW DEPARTMENT AND COMPANY SAFETY MANUALS',
       'NG DOCUMENTS WERE FOUND WITHIN THE ORIGINAL:',
       'PAGE NUMBER(S) WERE MISSING IN THE ORIGINAL.',
       'Date of Final Report (Review Completed Date)',
       '% OF DISTRIBUTION ACHIEVED IN RETAIL OUTLETS:',
       'Reimbursements for expenses (please itemize):',
       'who being duly sworn, says that (he) (she) is',
       'Source of Business - Local Premium KS Smokers',
       '% OF DISTRIBUTION ACHIEVED IN RETAIL OUTLETS:',
       'CONTROL REVERTANTS PER PLATE TOON MI SOLVENTI',
       'Application: tipping, carton end flaps, etc.)',
       'NAMES OF OTHER PERSONS COLLABORATING IN WORK:',
       'CON

We may have to use proper NLP model