#### Install Dependencies!

In [14]:
# !pip install pdfminer
# !pip install tqdm
# !pip install python-time
# !pip install nltk

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from tqdm import tqdm
from io import StringIO

import json
from collections import Counter
import concurrent.futures
from multiprocessing import Pool
from functools import partial

import nltk
import spacy
import numpy as np
import glob
import pandas as pd

#### Boilerplat Function!

In [2]:
def get_pdf_file_content(path_to_pdf):
    # Set parameters 
    out_text = StringIO()
    text_converter = TextConverter(PDFResourceManager(caching=True), out_text, laparams=LAParams())
    interpreter = PDFPageInterpreter(PDFResourceManager(caching=True), text_converter)

    fp = open(path_to_pdf, 'rb')

    # Set the maximum number of pages to read
    max_pages = 7

    # Use tqdm to create a progress bar
    # with tqdm(total=max_pages, desc="Extracting") as pbar:
    for index, page in enumerate(PDFPage.get_pages(fp, pagenos=set())):
        interpreter.process_page(page)
        # pbar.update(1)

        # Check if the maximum number of pages has been reached
        if index + 1 >= max_pages:
            break

    text = out_text.getvalue()

    fp.close()
    text_converter.close()
    out_text.close()

    return text


#### NLP Function

In [3]:
import re

def replace_newlines(text):
    # Replace newlines in the middle of a sentence with spaces
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Remove newlines at the end of a sentence
    text = re.sub(r'\n$', '', text)

    return text

from collections import Counter

def most_recurrent_locations(text: str) -> dict:
    # Load the pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Replace newlines in the text
    sample_text = replace_newlines(text)

    # Process the text
    doc = nlp(sample_text)

    # Find location words and their locations
    locations = [entity.text for entity in doc.ents if entity.label_ == "GPE" or entity.label_ == "LOC"]

    # Disregard strings that contain specific words
    disregarded_words = ['USA', 'United States', 'United States of America', 'North America', 'UNITED STATES', 'al.', 'US', "U.S.", 'the United States']
    locations = [location for location in locations if not any(word in location for word in disregarded_words)]

    # Count the frequency of each location
    location_counts = Counter(locations)

    # Sorting locations by frequency
    sorted_locations = sorted(location_counts.items(), key=lambda x: x[1], reverse=True)

    # Create a dictionary with 7 item (frequency) format
    first_elements = {f"{item} ({count})": count for item, count in sorted_locations[:7]}

    return first_elements


#### Processing Files

In [4]:

def get_pdf_file_names():
    pdf_files = glob.glob("data/*.pdf")
    return pdf_files

# Define a function to process a single file
def process_pdf_file(file):
    content = get_pdf_file_content(file)
    output_dict = most_recurrent_locations(content)
    return file, list(output_dict.keys())

pdf_files = get_pdf_file_names()
list_of_lists = []

#### Multithreading

In [5]:
filename = []
# Create a ThreadPoolExecutor with the maximum number of worker threads
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit the file processing tasks to the executor
    future_results = [executor.submit(process_pdf_file, file) for file in pdf_files]

    # Use tqdm to track the progress of the tasks
    for future in tqdm(concurrent.futures.as_completed(future_results), total=len(future_results)):
        # Retrieve the result from the completed task and append it to the list
        list_of_lists.append(future.result()[1])
        filename.append(future.result()[0])

# Find the maximum number of elements in the lists
max_list_length = max(len(lst) for lst in list_of_lists)

# Add empty strings to lists lacking elements
for lst in list_of_lists:
    while len(lst) < max_list_length:
        lst.append('')

# Now all the lists inside list_of_lists have the same number of elements

100%|██████████| 18/18 [00:36<00:00,  2.04s/it]


In [6]:
# filename = []
# list_of_lists = []

# # Process each file sequentially
# for file in pdf_files:
#     result = process_pdf_file(file)
#     list_of_lists.append(result[1])
#     filename.append(result[0])

# # Find the maximum number of elements in the lists
# max_list_length = max(len(lst) for lst in list_of_lists)

# # Add empty strings to lists lacking elements
# for lst in list_of_lists:
#     while len(lst) < max_list_length:
#         lst.append('')


# pdf_files = glob.glob("PDF Papers (20)/*.pdf")
# file = pdf_files[0]
# text_file = open("sample.txt", "w")
# n = text_file.write(get_pdf_file_content(file))
# text_file.close()


#### 'list_of_lists' now contains the results from processing each PDF file

In [7]:
list_of_lists

[['the West Coast (1)', 'Florida (1)', 'San Bernardino (1)', '', '', '', ''],
 ['L-1 (10)',
  'Biogeochemistry (4)',
  'North Carolina (3)',
  'NC (2)',
  'Columbia (2)',
  'MD (2)',
  'N2O (1)'],
 ['North Carolina (4)',
  'Gull Rock (3)',
  'Swanquarter (3)',
  'Swanquar- (2)',
  'Palmetto Peartree Preserve (2)',
  '0.01b (2)',
  'Raleigh (1)'],
 ['North Carolina (4)',
  'Biogeochemistry (4)',
  'the Albemarle Sound (4)',
  'NC (3)',
  'L-1 (3)',
  'Ardo´n (2)',
  'Ohio (2)'],
 ['North Carolina (8)',
  'Bhattachan (3)',
  'North  Carolina (3)',
  'Haywood (2)',
  'Netherlands (2)',
  'the Albemarle Sound (2)',
  'ArcGIS (2)'],
 ['Maryland (9)',
  'New York (7)',
  'Virginia (7)',
  'Florida (6)',
  'New Brunswick (4)',
  'KEARNEY (4)',
  'Copenheaver (3)'],
 ['the Altamaha River (7)',
  'HS(cid:1 (4)',
  'N (4)',
  'Georgia (3)',
  'Canfield (3)',
  'Weston (2)',
  'Altamaha River (2)'],
 ['Smith (7)',
  'New Jersey (6)',
  'the Delaware Bay (4)',
  'Sebold (4)',
  'Delaware (3)',
  '


#### Making Pandas DataFrame

In [8]:
import pandas as pd

# Remove "PDF Papers (20)" from the strings in pdf_files
filename = [file.replace("PDF Papers (20)/", "") for file in filename]

# Extract the columns from list_of_lists
col1 = [item[0] if len(item) > 0 else '' for item in list_of_lists]
col2 = [item[1] if len(item) > 1 else '' for item in list_of_lists]
col3 = [item[2] if len(item) > 2 else '' for item in list_of_lists]
col4 = [item[3] if len(item) > 3 else '' for item in list_of_lists]
col5 = [item[4] if len(item) > 4 else '' for item in list_of_lists]
col6 = [item[5] if len(item) > 5 else '' for item in list_of_lists]
col7 = [item[6] if len(item) > 6 else '' for item in list_of_lists]

# Create the dataframe
data = {
    'PDF File': filename,
    'Col1': col1,
    'Col2': col2,
    'Col3': col3,
    'Col4': col4,
    'Col5': col5,
    'Col6': col6,
    'Col7': col7
}
df = pd.DataFrame(data)

In [9]:
df

Unnamed: 0,PDF File,Col1,Col2,Col3,Col4,Col5,Col6,Col7
0,SEA-LEVEL RISE AND COASTAL FOREST RETREAT ON T...,the West Coast (1),Florida (1),San Bernardino (1),,,,
1,10.1007:s10533-014-9986-x.pdf,L-1 (10),Biogeochemistry (4),North Carolina (3),NC (2),Columbia (2),MD (2),N2O (1)
2,10.1007:s10533-021-00797-5.pdf,North Carolina (4),Gull Rock (3),Swanquarter (3),Swanquar- (2),Palmetto Peartree Preserve (2),0.01b (2),Raleigh (1)
3,10.1007:s10533-016-0189-5.pdf,North Carolina (4),Biogeochemistry (4),the Albemarle Sound (4),NC (3),L-1 (3),Ardo´n (2),Ohio (2)
4,10.1007:s11069-019-03706-0.pdf,North Carolina (8),Bhattachan (3),North Carolina (3),Haywood (2),Netherlands (2),the Albemarle Sound (2),ArcGIS (2)
5,10.2112:04-0211.1.pdf,Maryland (9),New York (7),Virginia (7),Florida (6),New Brunswick (4),KEARNEY (4),Copenheaver (3)
6,10.1029:2005JG000071.pdf,the Altamaha River (7),HS(cid:1 (4),N (4),Georgia (3),Canfield (3),Weston (2),Altamaha River (2)
7,10.1016:j.ocecoaman.2017.09.010.pdf,Smith (7),New Jersey (6),the Delaware Bay (4),Sebold (4),Delaware (3),Weishar (3),Warren (3)
8,10.1672:08-77.1.pdf,Louisiana (13),South Carolina (12),Georgia (10),Waccamaw (5),Waccamaw River (2),Winyah Bay (2),Savannah River (2)
9,10.2136:sssaj2011.0026.pdf,NS (14),NaCl (6),Specifi (2),MD (2),NS\n\n (2),NS NS (2),Purvaja (2)


In [10]:
df.to_csv('text_analysis_test.csv')