## Extract Meta Information from Documents

### Import packages and class

In [1]:
from os import listdir
from os.path import isfile, join
from tabulate import tabulate
from tika import parser 
import pprint

import meta_information_class

### Read the PDF filenames from the directory and choose a file

In [2]:
# os.listdir() returns entries in arbitrary, platform-dependent order, so the
# names are sorted to make the hard-coded index below select the same file on
# every machine and on every run.
filenames = sorted(f for f in listdir('data') if isfile(join('data', f)))

# Position of the document to analyse within the *sorted* listing.
file_idx = 3
file = 'data/' + filenames[file_idx]
file

'data/accounts-payable-policy-procedures.pdf'

### Apply file parser

In [3]:
# Parse the selected file with the tika parser. The log output of this cell
# shows the Tika server jar being retrieved on first use.
raw = parser.from_file(file)
# Keep the "content" entry of the parse result — presumably the extracted
# plain text of the document (TODO confirm; may be empty if nothing is extracted).
text = raw["content"]

2021-06-20 19:06:27,915 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar to /var/folders/f1/ddxh7b9x3s1bhw035wjcbxwc0000gn/T/tika-server.jar.
2021-06-20 19:06:31,310 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar.md5 to /var/folders/f1/ddxh7b9x3s1bhw035wjcbxwc0000gn/T/tika-server.jar.md5.
2021-06-20 19:06:32,226 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


### Define keyword lists

In [4]:
# Keyword lists handed to the meta-information extractor further down.
# Presumably: terms hinting at the document/process name.
keywords_names_list = [
    "procedure",
    "process",
    "sop",
    "policy",
    "manual",
    "step",
]

# Presumably: terms hinting at documents referenced by the process.
keywords_documents_list = [
    "purchase",
    "order",
    "form",
    "request",
    "invoice",
    "documents",
    "document",
    "documentation",
]

# Presumably: labels that precede a date in the text. Order is kept exactly
# as originally written in case the extractor is order-sensitive.
keywords_date_list = [
    "issued date",
    "issue-date",
    "effective-date",
    "implementation-date",
    "updated",
    "adopted",
    "revised",
    "review date",
    "revision date",
    "version",
    "last revision",
    "issued",
    "effective date",
    "date",
]

### Extract meta information from document

In [8]:
# n-gram sizes requested for the document keywords (the outputs below label
# the results as bigrams and trigrams).
ngrams_process_docs = [2, 3]

# Build the extractor for the selected file; arguments are positional and in
# the same order as before: names, dates, documents, n-gram sizes.
information = meta_information_class.extract_meta_information(
    file,
    keywords_names_list,
    keywords_date_list,
    keywords_documents_list,
    ngrams_process_docs,
)

# Materialise the result in both output shapes used by the cells below.
information_dict = information.create_dict()
information_df = information.create_df()

### Output as dictionary

In [9]:
# Pretty-print the extracted meta information with the default printer settings.
pprint.PrettyPrinter().pprint(information_dict)

{'date': [('date', '2003-12-15'),
          ('adopted', '2008-03-25'),
          ('revised', '2016-09-26'),
          ('revised', '2016-11-22')],
 'documents (bigrams)': [('purchase order', 10),
                         ('invoice processing', 3),
                         ('documentation disbursements', 2)],
 'documents (trigrams)': [('purchase order receiving', 4),
                          ('documentation disbursements must', 2),
                          ('source documentation such', 2)],
 'linked processes': ['None'],
 'name': [('POLICY AND PROCEDURES', 1), ('ACCOUNTS PAYABLE', 1)]}


### Output as dataframe

In [7]:
# Render the dataframe as a psql-style text table, using the column names as
# headers and hiding the index column.
print(
    tabulate(
        information_df,
        headers='keys',
        tablefmt='psql',
        showindex=False,
    )
)

+--------------------+---------------------------------------+---------+
| meta information   | value                                 |   count |
|--------------------+---------------------------------------+---------|
| name               | STANDARD OPERATING PROCEDURE          |       4 |
| date               | 2014-02-28                            |         |
| documents (2-gram) | projection form                       |      12 |
| documents (2-gram) | form attachment                       |       5 |
| documents (2-gram) | plan form                             |       3 |
| documents (3-gram) | flow projection form                  |       4 |
| documents (3-gram) | business plan form                    |       3 |
| documents (3-gram) | cost projection form                  |       3 |
| linked processes   | 8. RELATED DOCUMENTS                  |         |
| linked processes   | 8.1. Sanitation Business Plan Form    |         |
| linked processes   | 8.2. Market Analysis Form   