In [1]:
from ocrpy import TextOcrPipeline
from ocrpy import DocumentReader, StorageWriter, TextParser, TableParser

In [5]:
# unzip the data
!unzip sample_data/data.zip -d sample_data/data
!mkdir sample_data/output

Archive:  sample_data/data.zip
  inflating: sample_data/data/research paper 2.jpg  
  inflating: sample_data/data/__MACOSX/._research paper 2.jpg  
  inflating: sample_data/data/10.1.1.839.3147_removed.pdf  
  inflating: sample_data/data/__MACOSX/._10.1.1.839.3147_removed.pdf  
  inflating: sample_data/data/103-103-1-PB_removed.pdf  
  inflating: sample_data/data/__MACOSX/._103-103-1-PB_removed.pdf  
  inflating: sample_data/data/budget.jpg  
  inflating: sample_data/data/__MACOSX/._budget.jpg  
  inflating: sample_data/data/image.jpg  
  inflating: sample_data/data/__MACOSX/._image.jpg  
  inflating: sample_data/data/invoice.jpg  
  inflating: sample_data/data/__MACOSX/._invoice.jpg  
  inflating: sample_data/data/news 2.jpg  
  inflating: sample_data/data/__MACOSX/._news 2.jpg  
  inflating: sample_data/data/news.jpg  
  inflating: sample_data/data/__MACOSX/._news.jpg  
  inflating: sample_data/data/Odors_released_by_stressed_rats_produce.pdf  
  inflating: sample_data/data/__MACOSX/

In [6]:
# OCR backend used by the single-file text example.
PARSER_BACKEND = "pytesseract"

# Sample inputs extracted into sample_data/data: one PDF and one scanned image.
PDF_PATH = "sample_data/data/Odors_released_by_stressed_rats_produce.pdf"
DOC_PATH = "sample_data/data/invoice.jpg"

### Text Data Parser for Single File


In [7]:
# Placeholder credential locations for the cloud backends. They are not used
# in this cell: the parser below is created with empty credentials.
CREDENTIALS = {
    "AWS": "path/to/aws-credentials.env/file",
    "GCP": "path/to/gcp-credentials.json/file",
}

# Read the input document (image or pdf file).
reader = DocumentReader(file=DOC_PATH)

# Supported backends: pytesseract, google-cloud-vision, aws-textract.
# Credentials can also be passed for each backend if required.
text_parser = TextParser(
    backend=PARSER_BACKEND,
    credentials={},
)

# Parse the document using the selected parser backend.
parsed_text = text_parser.parse(reader)

# Write the parsed text to storage.
writer = StorageWriter()
writer.write(
    parsed_text,
    "sample_data/output/sample_image_output.json",
)

In [4]:
# Show the full recognised text of the first entry in the parse result,
# preceded by a heading line.
print("Full Text: ", parsed_text[0]['text'], sep="\n")

Full Text: 
 

custa

a division of P. H. GLATFELTER COMPANY
[ISGAN FOREST, NORTH CAROLINA 26768

 

Page Lo
C5008
SAMPLE ORDER

No.
"THLEPHONE: 705 877-211

 

 

04-503

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

reouesteo [ “1 [ star 10 7
ev Mr. Glenn E. Creamer Mr. Kenneth Wayne Smith
R. J. Reynolds Tobacco Company R, J. Reynolds Tobacco Company
Bowman Gray Technical Center Bowman Gray Technical Center
P. 0. Box 1487 950 Reynolds Boulevard
Winston-Salem, North Carolina Winston-Salem, North Carolina
L 27102 _| L_ 27105_|
Gum one io | ae | a | curreo [ROUTH
fuer | entero | soveomeo | snpenc | suipreo | pro | cox
3022-9461] 4/13/95] 4/20/95] 4/17/95 | 4/17/95| x United Parcel Service
eu ‘ounnrry . s2e roovct yuan
1 | 2 Bobbin 27 mm x 6000 | TOD 07550, Low Sidestream Cigarette 1
Paper, 25 g/M? Basis Weight, Wood
Pulp, 28% Calcium Carbonate, 3%
Magnesium Hydroxide, 0.4% Citrates
‘SAMPLES - NO COMMERCIAL VALUE - VALUE FOR CUSTOMS PURPOSES ONLY: LESS THAN U.S. $1000 2
8
MARKS: 

In [5]:
# Preview the OCR lines of the first parsed entry; the [:3] slice limits the
# displayed output to the first three lines.
print("Lines")
parsed_text[0]['lines'][:3]

Lines


[{'text': ' ',
  'region': {'x1': 0, 'y1': 0, 'x2': 725, 'y2': 6},
  'idx': 0,
  'tokens': [{'text': ' ',
    'region': {'x1': 0, 'y1': 0, 'x2': 725, 'y2': 6},
    'idx': 0,
    'meta_data': {'text_length': 1}}],
  'meta_data': {'token_count': 1, 'text_length': 1}},
 {'text': 'custa',
  'region': {'x1': 288, 'y1': 27, 'x2': 429, 'y2': 52},
  'idx': 1,
  'tokens': [{'text': 'custa',
    'region': {'x1': 288, 'y1': 27, 'x2': 429, 'y2': 52},
    'idx': 1,
    'meta_data': {'text_length': 5}}],
  'meta_data': {'token_count': 1, 'text_length': 5}},
 {'text': 'a division of P. H. GLATFELTER COMPANY',
  'region': {'x1': 246, 'y1': 69, 'x2': 475, 'y2': 78},
  'idx': 2,
  'tokens': [{'text': 'a',
    'region': {'x1': 246, 'y1': 71, 'x2': 250, 'y2': 78},
    'idx': 2,
    'meta_data': {'text_length': 1}},
   {'text': 'division',
    'region': {'x1': 254, 'y1': 69, 'x2': 295, 'y2': 78},
    'idx': 3,
    'meta_data': {'text_length': 8}},
   {'text': 'of',
    'region': {'x1': 299, 'y1': 69, 'x2':

### Table Data Parser

In [6]:
# Credentials file handed to the table parser.
# NOTE(review): this points at a local file outside sample_data - adjust it
# for your own environment.
aws_config = "../notebooks/local/aws_keys.env"

# Read the PDF document.
reader = DocumentReader(file=PDF_PATH)

# Build the table parser and extract the tables from the document.
table_parser = TableParser(credentials=aws_config)
parsed_table = table_parser.parse(
    reader,
    attempt_csv_conversion=True,
)

DocumentReader(file='sample_data/Odors_released_by_stressed_rats_produce.pdf', credentials=None, storage_type='LOCAL')


In [13]:
# parsed_table maps each page key to the collection of tables found on that
# page; report how many tables every page produced.
for page_no in parsed_table:
    table_count = len(parsed_table[page_no])
    print(f"Page {page_no} has {table_count} tables")

Page 0 has 0 tables
Page 1 has 2 tables
Page 2 has 0 tables


In [14]:
# Display the first table stored under page key 1.
parsed_table[1][0]

Unnamed: 0,0,1,2,3
0,,,Latency (in,min)
1,Group,n,M,SE
2,Experiment 1,,,
3,Control,12,6.6,1.3
4,Stress,12,10.9,1.2
5,Experiment 2,,,
6,Control,4,4.4,1.7
7,Stress,5,10.5,0.4
8,Experiment 3,,,
9,Control,8,5.9,1.6


In [15]:
# Display the second table stored under page key 1.
parsed_table[1][1]

Unnamed: 0,0,1,2
0,Behavior,Control,Stress
1,Formalin induced,5,1
2,Rearing,41,28
3,Freezing,1,2
4,Grooming,6,8
5,General activity,47,61


### Text Pipeline

In [17]:
# Input location: s3 bucket, gcs bucket or local directory with your documents.
SOURCE = 'sample_data/data'
# Output location: s3 bucket, gcs bucket or local directory to write the processed documents.
DESTINATION = 'sample_data/output/'
# OCR backend: 'pytesseract', 'google-cloud-vision' or 'aws-textract'.
PARSER = 'pytesseract'
# Optional - only needed if you are using any cloud service.
CREDENTIALS = {
    "AWS": "path/to/aws-credentials.env/file",
    "GCP": "path/to/gcp-credentials.json/file",
}

# Configure the pipeline and process every document found in SOURCE.
pipeline = TextOcrPipeline(
    source_dir=SOURCE,
    destination_dir=DESTINATION,
    parser_backend=PARSER,
    credentials_config=CREDENTIALS,
)
pipeline.process()

Running Pipeline with the following configuration:

1. DOCUMENT_SOURCE: sample_data
2. DOCUMENT_DESTINATION: output
3. SOURCE_STORAGE_TYPE: LOCAL
4. DESTINATION_STORAGE_TYPE: LOCAL
5. PARSER_BACKEND_TYPE: pytesseract
6. TOTAL_DOCUMENT_COUNT: 13
7. IMAGE_FILE_COUNT: 7
8. PDF_FILE_COUNT: 4
9. CREDENTIALS: {'AWS': 'path/to/aws-credentials.env/file', 'GCP': 'path/to/gcp-credentials.json/file'}


2it [00:11,  5.59s/it]

FILE: .DS_Store - ERROR: 'FileTypeNotSupported' object is not iterable


6it [00:43, 10.09s/it]

FILE: output - ERROR: 'FileTypeNotSupported' object is not iterable


13it [01:57,  9.02s/it]
