#### Install Dependencies!

In [8]:
pip install pdfminer tqdm python-time nltk scidownl spacy

Collecting spacy
  Downloading spacy-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.3.0-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.6/181.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting pathy>=0.10.0
  Downloading pathy-0.10.2-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.9/48.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.7-cp39-cp39-macosx_10_9_x86_64.whl (32 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.9-cp

In [9]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from scidownl import scihub_download
from tqdm import tqdm
from io import StringIO
from scidownl import scihub_download #new addition for scihub unofficial API

import json
from collections import Counter
import concurrent.futures
from multiprocessing import Pool
from functools import partial

import nltk
import spacy
import numpy as np
import glob
import pandas as pd

#### Boilerplat Function!

In [16]:
def get_pdf_file_content(path_to_pdf):
    # Set parameters 
    out_text = StringIO()
    text_converter = TextConverter(PDFResourceManager(caching=True), out_text, laparams=LAParams())
    interpreter = PDFPageInterpreter(PDFResourceManager(caching=True), text_converter)

    fp = open(path_to_pdf, 'rb')

    # Set the maximum number of pages to read
    max_pages = 5

    # Use tqdm to create a progress bar
# with tqdm(total=max_pages, desc="Extracting") as pbar:
    for index, page in enumerate(PDFPage.get_pages(fp, pagenos=set())):
        interpreter.process_page(page)
        # pbar.update(1)

        # Check if the maximum number of pages has been reached
        if index + 1 >= max_pages:
            break

    text = out_text.getvalue()

    fp.close()
    text_converter.close()
    out_text.close()

    return text


#### NLP Function

In [17]:
def five_most_recurrent_locations(text: str) -> dict:
    # Load the pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Sample text
    sample_text = text

    # Process the text
    doc = nlp(sample_text)

    # Find location words and their locations
    locations = [entity.text for entity in doc.ents if entity.label_ == "GPE" or entity.label_ == "LOC"]

    # Sorting locations by frequency
    my_dict = dict(Counter(locations))
    sorted_dict = dict(sorted(my_dict.items(), key=lambda x: x[1], reverse=True))
    first_five_elements = dict(list(sorted_dict.items())[:5])

    return first_five_elements

#### Processing Files

In [18]:

def get_pdf_file_names():
    pdf_files = glob.glob("PDF Papers (20)/*.pdf")
    return pdf_files

# Define a function to process a single file
def process_pdf_file(file):
    content = get_pdf_file_content(file)
    output_dict = five_most_recurrent_locations(content)
    return file, list(output_dict.keys())

pdf_files = get_pdf_file_names()
list_of_lists = []

#### Multithreading (makes the code run at least 4 times faster!)

In [19]:
filename = []
# Create a ThreadPoolExecutor with the maximum number of worker threads
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit the file processing tasks to the executor
    future_results = [executor.submit(process_pdf_file, file) for file in pdf_files]

    # Use tqdm to track the progress of the tasks
    for future in tqdm(concurrent.futures.as_completed(future_results), total=len(future_results)):
        # Retrieve the result from the completed task and append it to the list
        list_of_lists.append(future.result()[1])
        filename.append(future.result()[0])

100%|██████████| 18/18 [00:20<00:00,  1.16s/it]


#### 'list_of_lists' now contains the results from processing each PDF file

In [20]:
list_of_lists

[['North Carolina',
  'USA',
  'North America',
  'Bird',
  'United States of America'],
 ['North Carolina', 'USA', 'NC', 'Nitrous', 'Megonigal'],
 ['the West Coast', 'Florida', 'USA', 'San Bernardino'],
 ['USA', 'al.', 'Biogeochemistry', 'the Albemarle Sound', 'L-1'],
 ['South Carolina', 'Louisiana', 'Georgia', 'USA', 'Waccamaw'],
 ['USA', 'Weston', 'North Carolina', 'N2O', 'Mason'],
 ['al.', 'Georgia', 'the Altamaha River', 'N', 'Weston'],
 ['al.', 'New Jersey', 'Smith', 'al. /', 'the Delaware Bay'],
 ['al.', 'USA', 'Bridgham', 'South Carolina', 'Richmond'],
 ['USA', 'North Carolina', 'North  Carolina', 'North Carolina’s', 'NC'],
 ['al.', 'USA', 'New England', 'Niering', 'Connecticut'],
 ['North Carolina', 'Bhattachan', 'USA', 'Netherlands', 'Bhattachan et\xa0al'],
 ['al.', 'Florida', 'Louisiana', 'Gulf of Mexico', 'LA'],
 ['al.', 'Sorghum', 'Tester', 'Kielen', 'Maryland'],
 ['USA', 'L-1', 'Biogeochemistry', 'al.', 'North Carolina'],
 ['al.', 'Maryland', 'Florida', 'Virginia', 'New B

In [21]:
pdf_files

['PDF Papers (20)/mcz039.pdf',
 'PDF Papers (20)/Journal of Geophysical Research  Biogeosciences - 2006 - Weston - Ramifications of increased salinity in tidal freshwater.pdf',
 'PDF Papers (20)/s10533-016-0189-5.pdf',
 'PDF Papers (20)/s10021-018-0325-2.pdf',
 'PDF Papers (20)/file.pdf',
 'PDF Papers (20)/08-77.1.pdf',
 'PDF Papers (20)/Journal of Applied Ecology - 2018 - Borchert - Coastal wetland adaptation to sea level rise  Quantifying potential for.pdf',
 'PDF Papers (20)/bg-10-8171-2013.pdf',
 'PDF Papers (20)/1-s2.0-S0006320716303007-main.pdf',
 'PDF Papers (20)/s10533-021-00797-5.pdf',
 'PDF Papers (20)/1-s2.0-S0964569117307676-main.pdf',
 'PDF Papers (20)/1-s2.0-S009884722030280X-main.pdf',
 'PDF Papers (20)/Sea-Level_Rise_and_Coastal_Forest_Retreat_on_the_W.pdf',
 'PDF Papers (20)/s10533-014-9986-x.pdf',
 'PDF Papers (20)/s11069-019-03706-0.pdf',
 'PDF Papers (20)/04-0211.1.pdf',
 'PDF Papers (20)/355.Short-Term Response of Carbon Cycling to.pdf',
 'PDF Papers (20)/s10021-02

In [23]:
filename

['PDF Papers (20)/file.pdf',
 'PDF Papers (20)/s10533-021-00797-5.pdf',
 'PDF Papers (20)/Sea-Level_Rise_and_Coastal_Forest_Retreat_on_the_W.pdf',
 'PDF Papers (20)/s10533-016-0189-5.pdf',
 'PDF Papers (20)/08-77.1.pdf',
 'PDF Papers (20)/s10021-018-0325-2.pdf',
 'PDF Papers (20)/Journal of Geophysical Research  Biogeosciences - 2006 - Weston - Ramifications of increased salinity in tidal freshwater.pdf',
 'PDF Papers (20)/1-s2.0-S0964569117307676-main.pdf',
 'PDF Papers (20)/bg-10-8171-2013.pdf',
 'PDF Papers (20)/mcz039.pdf',
 'PDF Papers (20)/1-s2.0-S0006320716303007-main.pdf',
 'PDF Papers (20)/s11069-019-03706-0.pdf',
 'PDF Papers (20)/Journal of Applied Ecology - 2018 - Borchert - Coastal wetland adaptation to sea level rise  Quantifying potential for.pdf',
 'PDF Papers (20)/1-s2.0-S009884722030280X-main.pdf',
 'PDF Papers (20)/s10533-014-9986-x.pdf',
 'PDF Papers (20)/04-0211.1.pdf',
 'PDF Papers (20)/s10021-021-00686-w.pdf',
 'PDF Papers (20)/355.Short-Term Response of Carbon C

In [24]:
# Remove "PDF Papers (20)" from the strings in pdf_files
filename = [file.replace("PDF Papers (20)/", "") for file in filename]

# Extract the columns from list_of_lists
col1 = [item[0] for item in list_of_lists]
col2 = [item[1] for item in list_of_lists]
col3 = [item[2] for item in list_of_lists]
col4 = [item[3] for item in list_of_lists]
# col5 = [item[4] for item in list_of_lists]


# Create the dataframe
data = {
    'PDF File': filename,
    'Col1': col1,
    'Col2': col2,
    'Col3': col3,
    'Col4': col4,
    # 'Col5': col5,
}
df = pd.DataFrame(data)

In [25]:
df.to_csv('text_analysis_test.csv')

In [26]:
list_of_lists

[['North Carolina',
  'USA',
  'North America',
  'Bird',
  'United States of America'],
 ['North Carolina', 'USA', 'NC', 'Nitrous', 'Megonigal'],
 ['the West Coast', 'Florida', 'USA', 'San Bernardino'],
 ['USA', 'al.', 'Biogeochemistry', 'the Albemarle Sound', 'L-1'],
 ['South Carolina', 'Louisiana', 'Georgia', 'USA', 'Waccamaw'],
 ['USA', 'Weston', 'North Carolina', 'N2O', 'Mason'],
 ['al.', 'Georgia', 'the Altamaha River', 'N', 'Weston'],
 ['al.', 'New Jersey', 'Smith', 'al. /', 'the Delaware Bay'],
 ['al.', 'USA', 'Bridgham', 'South Carolina', 'Richmond'],
 ['USA', 'North Carolina', 'North  Carolina', 'North Carolina’s', 'NC'],
 ['al.', 'USA', 'New England', 'Niering', 'Connecticut'],
 ['North Carolina', 'Bhattachan', 'USA', 'Netherlands', 'Bhattachan et\xa0al'],
 ['al.', 'Florida', 'Louisiana', 'Gulf of Mexico', 'LA'],
 ['al.', 'Sorghum', 'Tester', 'Kielen', 'Maryland'],
 ['USA', 'L-1', 'Biogeochemistry', 'al.', 'North Carolina'],
 ['al.', 'Maryland', 'Florida', 'Virginia', 'New B