#### Install Dependencies!

In [7]:
# !pip install pdfminer
# !pip install tqdm
# !pip install python-time
# !pip install nltk
# !pip install spacy
# !python -m spacy download en_core_web_sm

Collecting nltk
  Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2023.6.3-cp310-cp310-macosx_11_0_arm64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.0/289.0 kB[0m [31m564.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.6.3


In [6]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from tqdm import tqdm
from io import StringIO

import json
from collections import Counter
import concurrent.futures
from multiprocessing import Pool
from functools import partial

import nltk
import spacy
import numpy as np
import glob

ModuleNotFoundError: No module named 'nltk'

#### Boilerplate Function!

In [4]:
def get_pdf_file_content(path_to_pdf, max_pages=5):
    """Extract the text of the leading pages of a PDF file.

    Args:
        path_to_pdf: Path of the PDF file; it is opened in binary mode.
        max_pages: Stop once this many pages have been processed
            (default 5). Pass ``None`` to process every page.

    Returns:
        str: Everything the pdfminer ``TextConverter`` wrote to its output
        buffer for the processed pages.

    Raises:
        OSError: If ``path_to_pdf`` cannot be opened.
        Exception: Any error raised by pdfminer while reading or
            interpreting a page propagates unchanged; the file, the
            converter and the buffer are closed before it does.
    """
    # A single resource manager is handed to both the converter and the
    # interpreter instead of building two unrelated instances.
    resource_manager = PDFResourceManager(caching=True)

    # `with` guarantees the file and the buffer are released even when a
    # page fails to parse.
    with open(path_to_pdf, 'rb') as fp, StringIO() as out_text:
        text_converter = TextConverter(resource_manager, out_text, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, text_converter)

            for index, page in enumerate(PDFPage.get_pages(fp, pagenos=set())):
                interpreter.process_page(page)

                # Stop as soon as the page budget is used up, before the
                # next page is pulled from the generator.
                if max_pages is not None and index + 1 >= max_pages:
                    break

            # Read the buffer while it is still open.
            return out_text.getvalue()
        finally:
            text_converter.close()


#### NLP Function

In [5]:
def five_most_recurrent_locations(text: str, top_n: int = 5, nlp=None) -> dict:
    """Count the location entities in ``text`` and keep the most frequent ones.

    Args:
        text: The text to analyse.
        top_n: How many distinct locations to keep (default 5).
        nlp: Optional callable that turns a string into a document exposing
            ``.ents``, where each entity has ``.text`` and ``.label_``.
            When omitted, the spaCy ``en_core_web_sm`` pipeline is loaded
            for this call. Loading happens on every such call, so callers
            that process many texts can load the pipeline once and pass it
            in here.

    Returns:
        dict: Maps each kept location string to its number of occurrences,
        ordered from most to least frequent. Locations with equal counts
        keep the order in which they first appear in the text.
    """
    if nlp is None:
        # Load the pre-trained model
        nlp = spacy.load("en_core_web_sm")

    doc = nlp(text)

    # Keep only entities labelled "GPE" or "LOC".
    location_labels = {"GPE", "LOC"}
    locations = [entity.text for entity in doc.ents if entity.label_ in location_labels]

    # most_common() sorts by descending count and, for ties, preserves
    # first-seen order.
    return dict(Counter(locations).most_common(top_n))

#### Processing Files

In [6]:

def get_pdf_file_names(directory="data"):
    """List the PDF files directly inside ``directory``.

    Args:
        directory: Folder to search (default ``"data"``). Only names
            matching ``*.pdf`` at the top level of that folder are returned.

    Returns:
        list[str]: Matching paths in sorted order, so repeated runs see the
        files in the same sequence. Empty if nothing matches.
    """
    # glob makes no ordering promise; sorting keeps runs reproducible.
    return sorted(glob.glob(f"{directory}/*.pdf"))

# Define a function to process a single file
def process_pdf_file(file):
    """Return the location names found in one PDF.

    The text is extracted with ``get_pdf_file_content`` and passed to
    ``five_most_recurrent_locations``; only the keys of the resulting
    mapping are returned, as a list in the mapping's order.
    """
    location_counts = five_most_recurrent_locations(get_pdf_file_content(file))
    # Iterating a dict yields its keys.
    return [location for location in location_counts]

# Paths of the PDF files to process, as returned by get_pdf_file_names().
pdf_files = get_pdf_file_names()
# Results container, populated by the thread-pool cell further down with one
# list per processed PDF (the return value of process_pdf_file).
list_of_lists = []

#### Multithreading (makes the code run at least 4 times faster!)

In [7]:
# Process every PDF in a thread pool and store the results in `list_of_lists`.
# The list is rebuilt from scratch inside this cell, so re-running the cell
# does not append a second copy of the results, and entry i always belongs to
# pdf_files[i] regardless of which task finishes first.
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Remember which input position each submitted task belongs to.
    future_to_index = {
        executor.submit(process_pdf_file, file): index
        for index, file in enumerate(pdf_files)
    }

    results_in_input_order = [None] * len(future_to_index)

    # as_completed() drives the progress bar as tasks finish; each result is
    # written to its own slot rather than appended in completion order.
    for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(future_to_index)):
        results_in_input_order[future_to_index[future]] = future.result()

list_of_lists = results_in_input_order

100%|██████████| 5/5 [00:04<00:00,  1.10it/s]


#### 'list_of_lists' now contains the results from processing each PDF file

In [10]:
list_of_lists

[[],
 ['al.', 'Bhattachan et al.', 'Manda', 'Kummu', 'Klein'],
 ['○', 'California', 'UK', 'Australia', 'the Global South'],
 ['al.', 'McCabe', 'Durham', 'Cranford', 'NC'],
 ['Prague', 'the United States', 'Iran', 'the Czech Republic', 'Europe']]