In [28]:
import io 
from pdfminer.converter import TextConverter 
from pdfminer.pdfinterp import PDFPageInterpreter 
from pdfminer.pdfinterp import PDFResourceManager 
from pdfminer.pdfpage import PDFPage
from keybert import KeyBERT
import requests

In [29]:
def extract_text_by_page(pdf_path): 

    with open(pdf_path, 'rb') as fh: 
        
        for page in PDFPage.get_pages(fh, 
                                    caching=True, 
                                    check_extractable=True): 
            
            resource_manager = PDFResourceManager() 
            fake_file_handle = io.StringIO() 
            
            converter = TextConverter(resource_manager, 
                                    fake_file_handle) 
            
            page_interpreter = PDFPageInterpreter(resource_manager, 
                                                converter) 
            
            page_interpreter.process_page(page) 
            text = fake_file_handle.getvalue() 
            
            yield text 
            
            # close open handles 
            converter.close() 
            fake_file_handle.close()

In [30]:
def extract_text(pdf_path):
    text = ""
    for page in extract_text_by_page(pdf_path): 
        text = page + " "
    return text

In [31]:
def segment_str(chars, exclude=None):

    words = []

    if not chars.isalpha():  # don't check punctuation etc.; needs more work
        return [chars]

    if not exclude:
        exclude = set()

    working_chars = chars
    while working_chars:
        # iterate through segments of the chars starting with the longest segment possible
        for i in range(len(working_chars), 1, -1):
            segment = working_chars[:i]
            if eng_dict.check(segment) and segment not in exclude:
                words.append(segment)
                working_chars = working_chars[i:]
                break
        else:  # no matching segments were found
            if words:
                exclude.add(words[-1])
                return segment_str(chars, exclude=exclude)
            # let the user know a word was missing from the dictionary,
            # but keep the word
            print('"{chars}" not in dictionary (so just keeping as one segment)!'
                  .format(chars=chars))
            return [chars]
    # return a list of words based on the segmentation
    return words

In [32]:
doc = extract_text(r"Understanding Supply-and-Demand-C2J-2022.pdf")
print(doc)

FOUR COUNTY LABOUR MARKET PLANNING BOARD111 Jackson Street South, Suite 1Walkerton, ON  N0G 1L0www.planningboard.ca • 519-881-2725 


In [33]:
kw_model = KeyBERT()

In [34]:

# doc = """
#      What's been added so far ? Grey County Trails Grey County Official Plan GIS Data Grey County Roads Data Grey County Rural Business Listing Grey County Children s Services Overview Census 2016 Results For more information about BGDISC , contact Brad Noble at 519 376 2205 . Grey County Early Development Instrument Results Grey County Children s Services Overview Child Care Wait list Alcohol Vendors Grey Bruce Food Security Assets Employer One Summary Report Backpack Program United Way of Bruce Grey Utility Assistance Program United Way of Bruce Grey Population by age and sex Dwellings and Housing Families , Households , Marital Status Income , Lab our , Commute to Work Mobility , Migration Immigration , Eth no cultural Diversity , Aboriginal Identity Grey County 211 Statistics for 2017A Profile of Child , Youth and Family Health in Grey and Bruce Counties Census Municipal Profiles Substance Abuse Trends Grey Bruce Canadian Community Health Survey Reports 1 . Business Economy 2 . Government Finance 3 . Demographics 4 . Education 5 . Environment 6 . Health 7 . Infrastructure 8 . Land Use Development 9 . Recreation , Culture Tourism 10 . Community Social Services 11 . TransportationBGDISC Open Data Topics If your organization would like an introduction to BGDISC , contact Brad Noble at 519 376 2205 . https bgdisc . ca open data toolkit data contributorsBGDISC Introduction ReadinessBGDISC Open Data Toolkit Have a data driven spring ! The BGDISC Open Data Toolkit has been updated and now contains two sections . The first section is for data users those who are coming to BGDISC to use the available data . The second section is for data contributors those who are coming to BGDISC to add their own data to the BGDISC available data . The data user section has the BGDISC Terms of Reference and the BGDISC Open Data License . Both documents help data users understand how they are permitted to use the available data , as well as understanding the principles and goals of the BGDISC . The data contributor section has several documents , including the BGDISC Terms of Reference , Open Data License and the BGDISC Open Data Readiness Survey . Within the data contributor section there is also another page devoted to help organizations create their own Open Data Licenses , Policies and Assessment Structures . If your organization would like to become a member of BGDISC , complete the readiness survey . Both the Google Form and the PDF document of the survey area vail able at this URL
#      """

""""
url = 'https://bgdisc.ca/sites/default/files/Understanding%20Supply-and-Demand-C2J-2022.pdf'
r = requests.get(url, allow_redirects=True)
open('temp.pdf', 'wb').write(r.content)
doc = extract_text('temp.pdf')
print(doc)
""""

keywords = kw_model.extract_keywords(doc)

SyntaxError: EOL while scanning string literal (3164884919.py, line 11)

In [35]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1))

[('ca', 0.3369),
 ('planningboard', 0.3345),
 ('county', 0.3324),
 ('jackson', 0.3083),
 ('street', 0.2974)]

In [36]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 3))

[('planning board111 jackson', 0.5956),
 ('board111 jackson street', 0.5877),
 ('jackson street south', 0.5357),
 ('jackson street', 0.5232),
 ('county labour market', 0.4972)]

In [37]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 3), stop_words=None)

[('planning board111 jackson', 0.5956),
 ('board111 jackson street', 0.5877),
 ('jackson street south', 0.5357),
 ('jackson street', 0.5232),
 ('county labour market', 0.4972)]

In [38]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 3), 
                          use_maxsum=True, nr_candidates=20, top_n=5)

[('1l0www planningboard', 0.3371),
 ('labour market', 0.3393),
 ('south suite 1walkerton', 0.357),
 ('ca 519 881', 0.4292),
 ('jackson street', 0.5232)]

In [39]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 3),
                          use_mmr=True, diversity=0.7)

[('planning board111 jackson', 0.5956),
 ('labour market', 0.3393),
 ('519 881 2725', 0.1377),
 ('ca', 0.3369),
 ('south suite', 0.2876)]

In [40]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 3),
                              use_mmr=True, diversity=0.2)

[('planning board111 jackson', 0.5956),
 ('county labour market', 0.4972),
 ('board111 jackson street', 0.5877),
 ('jackson street south', 0.5357),
 ('planningboard ca', 0.4893)]

In [41]:
kw_model = KeyBERT(model="xlm-r-bert-base-nli-stsb-mean-tokens")
kw_model.extract_keywords(doc, top_n=15)

[('jackson', 0.4455),
 ('county', 0.3876),
 ('planningboard', 0.3728),
 ('1walkerton', 0.3387),
 ('519', 0.3244),
 ('planning', 0.294),
 ('suite', 0.2937),
 ('2725', 0.2803),
 ('1l0www', 0.2791),
 ('board111', 0.2608),
 ('n0g', 0.2475),
 ('market', 0.2466),
 ('south', 0.2352),
 ('ca', 0.2303),
 ('street', 0.1913)]

Or we can select a SentenceTransformer model with our own parameters:

In [42]:
from sentence_transformers import SentenceTransformer
#sentence_model = SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens", device="cuda")
sentence_model = SentenceTransformer("xlm-r-bert-base-nli-stsb-mean-tokens")

In [43]:
kw_model = KeyBERT(model=sentence_model)
kw_model.extract_keywords(doc)

[('jackson', 0.4455),
 ('county', 0.3876),
 ('planningboard', 0.3728),
 ('1walkerton', 0.3387),
 ('519', 0.3244)]