In [5]:
import os

from haystack.document_stores import OpenSearchDocumentStore
from haystack.nodes import BM25Retriever, TfidfRetriever
from ocrpy import DocumentReader, TextOcrIndexPipeline

In [None]:
# unzip the data
!unzip sample_data/data.zip -d sample_data/data
!mkdir sample_data/output

### Let's create a new pipeline and index the documents

In [2]:
# Where the input documents are read from and where the processed documents
# are written. Each may be a local directory, an S3 bucket, or a GCS bucket.
SOURCE = 'sample_data/data'
DESTINATION = 'sample_data/output/'

# OCR backend: 'pytesseract', 'aws-textract' or 'google-cloud-vision'.
PARSER = 'pytesseract'

# Optional - only needed if you are using a cloud service.
CREDENTIALS = {"AWS": "path/to/aws-credentials.env/file",
               "GCP": "path/to/gcp-credentials.json/file"}

DATABASE_BACKEND = "opensearch"

# Read the OpenSearch login from the environment so real credentials never
# have to be committed with the notebook; the fallbacks are the previous
# hard-coded "admin"/"admin" values, so behavior is unchanged when unset.
DATABASE_CONFIG = {
    "opensearch": {
        "port": 9200,
        "username": os.environ.get("OPENSEARCH_USERNAME", "admin"),
        "password": os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    },
    "batch_size": 100,
}

In [8]:
# Wire the configuration constants into a single OCR + indexing pipeline.
pipeline = TextOcrIndexPipeline(
    source_dir=SOURCE,
    destination_dir=DESTINATION,
    parser_backend=PARSER,
    credentials_config=CREDENTIALS,
    database_backend=DATABASE_BACKEND,
    database_config=DATABASE_CONFIG,
)

In [9]:
# Run the pipeline over SOURCE. This is slow: the recorded run below took
# about two minutes for 13 items. In that run, entries the parser could not
# handle (e.g. `.DS_Store`) were reported as "FILE: ... - ERROR: ..." lines
# and processing continued with the remaining items.
pipeline.process()

Running Pipeline with the following configuration:

1. DOCUMENT_SOURCE: sample_data
2. DOCUMENT_DESTINATION: output
3. SOURCE_STORAGE_TYPE: LOCAL
4. DESTINATION_STORAGE_TYPE: LOCAL
5. PARSER_BACKEND_TYPE: pytesseract
6. TOTAL_DOCUMENT_COUNT: 13
7. IMAGE_FILE_COUNT: 7
8. PDF_FILE_COUNT: 4
9. CREDENTIALS: {'AWS': 'path/to/aws-credentials.env/file', 'GCP': 'path/to/gcp-credentials.json/file'}
10. DATABASE_BACKEND: opensearch
11. DATABASE_CONFIG: {'opensearch': {'port': 9200, 'username': 'admin', 'password': 'admin'}, 'batch_size': 100}


2it [00:11,  5.74s/it]

FILE: .DS_Store - ERROR: 'FileTypeNotSupported' object is not iterable


6it [00:44, 10.32s/it]

FILE: output - ERROR: 'FileTypeNotSupported' object is not iterable


13it [02:01,  9.38s/it]


### Let's create a document retriever and search the documents.

In [10]:
# Open a document store using the same OpenSearch settings that were passed
# to the pipeline, then wrap it in a BM25 retriever for keyword search.
opensearch_settings = DATABASE_CONFIG['opensearch']
doc_store = OpenSearchDocumentStore(**opensearch_settings)
retriver = BM25Retriever(doc_store)


In [17]:
# Print the file name and the first 1000 characters of each of the top two
# matches for the query, followed by a separator line.
matches = retriver.retrieve(query="metabolism", top_k=2)
for match in matches:
    print(match.meta['file_name'], match.content[:1000], "-" * 10, sep="\n")

10.1.1.839.3147_removed_pytesseract.json
The metabolic and psychological effects of
exercise training in hemodialysis patients" ?

Andrew P. Goldberg, M.D., James Hagberg, Ph.D., James A. Delmez, M.D.,
Robert M. Carney, Ph.D., Patricia M. McKevitt, ACSW, Ali A. Ehsani, M.D., and
Herschel R. Harter, M.D.

ABSTRACT The effect of exercise training on metabolic abnormalities and psychological
function was assessed in seven hemodialysis patients. Their initial work capacity was low and
improved after 8 months of training. Exercise was associated with a reduction in the dose of
antihypertensive medications in four patients and a decrease in phosphate binder therapy in three
patients. There was also a rise in hematocrit levels (%A = 34 + 20%, P < 0.03) and the hemoglobin
concentration (%A = 37 + 23%, P < 0.05) of five males. Plasma glucose levels fell (—5 + 2%, P<
0.05, n = 5) and the glucose disappearance rate improved (20 + 7%, P < 0.02), while hyperinsulinism
decreased (—36 + 20%, P < 0.02