In [1]:
from tika import parser 
import re
from collections import Counter

In [2]:
class extract_process_name():
    """Guess the name of a process document with three regex heuristics.

    Every ``process_name_*`` method searches ``text`` for one candidate name,
    appends the raw (unstripped) candidate to ``self.output`` and returns it.
    When nothing is found the method prints a message and returns ``None``.
    ``most_common`` runs all three heuristics and tallies the stripped results.
    """

    def __init__(self, text, keywords_list):
        """
        Params:
        text = process document as string (anything that is not a string,
               e.g. None, is treated as a document without matches)
        keywords_list = most common keywords to identify process name (based on keywords matrix)
        """

        self.text = text
        self.keywords_list = keywords_list
        self.output = []
        # Latest successful result of each heuristic; None until it matches.
        self.name_keywords = None
        self.name_title = None
        self.name_first = None

    def _search_group(self, pattern, flags=0):
        """Return group 1 of the first match of ``pattern`` in the text, else None."""
        # re.search raises TypeError on a non-string subject, so guard here
        # instead of hiding the problem behind a bare ``except``.
        if not isinstance(self.text, str):
            return None
        match = re.search(pattern, self.text, flags)
        return match.group(1) if match else None

    def process_name_keywords(self):
        """Search the process name based on the keywords in ``keywords_list``.

        The match is case-insensitive and runs from the start of a line up to
        and including the last keyword on that line plus one following
        character. The keyword therefore needs at least one character before
        and after it on the same line.

        Returns:
            The matched text, or None (after printing a message) when no
            keyword is found or the keyword list is empty.
        """
        # Escape the keywords so characters such as '+' or '(' are matched
        # literally. Empty entries are dropped because an empty alternative
        # would match any line.
        alternatives = [re.escape(str(keyword))
                        for keyword in (self.keywords_list or []) if keyword]
        found = None
        if alternatives:
            pattern = r'(.+({marker})[^\n])'.format(marker='|'.join(alternatives))
            found = self._search_group(pattern, re.IGNORECASE)
        if found is None:
            print("Keywords not found in document")
            return None
        self.name_keywords = found
        self.output.append(found)
        return found

    def process_name_title(self):
        """Search the process name based on the keyword 'Title'.

        regex: 'Title' (case-insensitive), one non-word character, then the
        rest of that line.

        Returns:
            The text following 'Title', or None (after printing a message)
            when the keyword is not found.
        """
        found = self._search_group(r'((?<=Title(\W)).+[^\n])', re.IGNORECASE)
        if found is None:
            print("Keyword {} not found in document".format("'Title'"))
            return None
        self.name_title = found
        self.output.append(found)
        return found

    def process_name_first_line(self):
        """Extract the first line of the document that has at least three characters.

        Returns:
            That line, or None (after printing a message) when the document
            has no such line. This keeps the method consistent with the other
            heuristics instead of raising on an empty document.
        """
        found = self._search_group(r'([^\n].+[^\n])')
        if found is None:
            print("First line not found in document")
            return None
        self.name_first = found
        self.output.append(found)
        return found

    def most_common(self):
        """Run all three heuristics and count how often each name appears.

        Returns:
            A list of ``(name, count)`` tuples, most frequent first, built
            from the candidates with leading/trailing whitespace removed.
        """
        # Start from a clean slate: without this, repeated calls (or earlier
        # calls to the individual methods) would inflate the counts.
        self.output = []
        self.process_name_keywords()
        self.process_name_title()
        self.process_name_first_line()
        cleaned = [candidate.strip() for candidate in self.output]
        return Counter(cleaned).most_common()

In [22]:
# PDF documents available for parsing, one file name per entry.
file_vec = [
    "3 Marcus Institute Purchasing Policy _ Procedure 1.31.19 For FY19(1).pdf",
    "nut_scm_sam_sop_manual_2017.pdf",
    "TRW.PRO.POL.1063.2-Supply-Chain-Inventory-Management-Policy.pdf",
]

In [32]:
# Pick one entry of file_vec, parse it from the data/ directory and keep the
# value stored under the "content" key of the parser result.
# NOTE(review): the outputs recorded further down show the TRW policy (third
# entry of file_vec) although index 0 is selected here -- presumably stale
# output; re-run the notebook top to bottom to confirm.
file_index = 0
raw = parser.from_file(f"data/{file_vec[file_index]}")
text = raw["content"]

In [33]:
# Display the extracted document text that is later passed to extract_process_name.
text

"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nTRW.PRO.POL.1063.2 Supply Chain Inventory Management Policy  1 \n\n \n\nTrust Policy \n\n \n\nSupply Chain Inventory Management Policy (SC010) \n\nIssue Date Review Date  Version \n\nSeptember 2020 September 2025 2 \n\nPurpose \n\nThe purpose of this Supply Chain Inventory Management Policy is to identify the relevant \nStandard Operating Procedures (SOP’s), required to ensure robust and compliant \ninventory control across the Trust for areas that are controlled by Procurement & Supply \nChain Management (PSCM). \n\nWho should read this document? \n\nEmployees within the PSCM and any other departments or teams which act as an interface with or \ncustomer of these functions.  \n\nKey Messages \n\nGood Inventory Management is critical to a high performing NHS Trust. Having the right stock level, \nof the right product, means the Trust can perform the necessary procedures, without holding \nunn

In [34]:
# Keywords handed to the keyword-based name search.
keywords_names_list = [
    'Procedure',
    'Process',
    'SOP',
    'Policy',
    'Manual',
    'Step',
]
names = extract_process_name(text=text, keywords_list=keywords_names_list)

# Run all three heuristics and show the (name, count) tally.
most_common_names = names.most_common()
most_common_names

[('TRW.PRO.POL.1063.2 Supply Chain Inventory Management Policy', 1),
 ('Supply Chain Inventory Management Policy (SC010)', 1),
 ('TRW.PRO.POL.1063.2 Supply Chain Inventory Management Policy  1', 1)]

In [35]:
# Show the raw (unstripped) result of the keyword-based search on its own.
# Note: each call appends the result to names.output again.
names.process_name_keywords()

'TRW.PRO.POL.1063.2 Supply Chain Inventory Management Policy '

In [36]:
# Show the raw (unstripped) result of the 'Title'-based search on its own.
# Note: each call appends the result to names.output again.
names.process_name_title()

'Supply Chain Inventory Management Policy (SC010) '

In [37]:
# Show the raw (unstripped) result of the first-line search on its own.
# Note: each call appends the result to names.output again.
names.process_name_first_line()

'TRW.PRO.POL.1063.2 Supply Chain Inventory Management Policy  1 '