#### Install Dependencies!

In [14]:
# !pip install pdfminer
# !pip install tqdm
# !pip install python-time
# !pip install nltk

In [4]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from tqdm import tqdm
from io import StringIO

import json
from collections import Counter
import concurrent.futures
from multiprocessing import Pool
from functools import partial

import nltk
import spacy
import numpy as np
import glob
import pandas as pd

#### Boilerplat Function!

In [5]:
def get_pdf_file_content(path_to_pdf):
    # Set parameters 
    out_text = StringIO()
    text_converter = TextConverter(PDFResourceManager(caching=True), out_text, laparams=LAParams())
    interpreter = PDFPageInterpreter(PDFResourceManager(caching=True), text_converter)

    fp = open(path_to_pdf, 'rb')

    # Set the maximum number of pages to read
    max_pages = 7

    # Use tqdm to create a progress bar
    # with tqdm(total=max_pages, desc="Extracting") as pbar:
    for index, page in enumerate(PDFPage.get_pages(fp, pagenos=set())):
        interpreter.process_page(page)
        # pbar.update(1)

        # Check if the maximum number of pages has been reached
        if index + 1 >= max_pages:
            break

    text = out_text.getvalue()

    fp.close()
    text_converter.close()
    out_text.close()

    return text


#### NLP Function

In [6]:
import re

def replace_newlines(text):
    # Replace newlines in the middle of a sentence with spaces
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Remove newlines at the end of a sentence
    text = re.sub(r'\n$', '', text)

    return text

from collections import Counter

def most_recurrent_locations(text: str) -> dict:
    # Load the pre-trained model
    nlp = spacy.load("en_core_web_sm")

    # Replace newlines in the text
    sample_text = replace_newlines(text)

    # Process the text
    doc = nlp(sample_text)

    # Find location words and their locations
    locations = [entity.text for entity in doc.ents if entity.label_ == "GPE" or entity.label_ == "LOC"]

    # Disregard strings that contain specific words
    disregarded_words = ['USA', 'United States', 'United States of America', 'North America', 'UNITED STATES', 'al.', 'US', "U.S.", 'the United States']
    locations = [location for location in locations if not any(word in location for word in disregarded_words)]

    # Count the frequency of each location
    location_counts = Counter(locations)

    # Sorting locations by frequency
    sorted_locations = sorted(location_counts.items(), key=lambda x: x[1], reverse=True)

    # Create a dictionary with 7 item (frequency) format
    first_elements = {f"{item} ({count})": count for item, count in sorted_locations[:7]}

    return first_elements


#### Processing Files

In [14]:

def get_pdf_file_names():
    pdf_files = glob.glob("/Users/othmaneechchabi/Desktop/Research/Data+/Climate+/SWISLTR/script/combined_data/*.pdf")
    return pdf_files

# Define a function to process a single file
def process_pdf_file(file):
    content = get_pdf_file_content(file)
    output_dict = most_recurrent_locations(content)
    return file, list(output_dict.keys())

pdf_files = get_pdf_file_names()
list_of_lists = []

#### Multithreading

In [15]:
filename = []
# Create a ThreadPoolExecutor with the maximum number of worker threads
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit the file processing tasks to the executor
    future_results = [executor.submit(process_pdf_file, file) for file in pdf_files]

    # Use tqdm to track the progress of the tasks
    for future in tqdm(concurrent.futures.as_completed(future_results), total=len(future_results)):
        # Retrieve the result from the completed task and append it to the list
        list_of_lists.append(future.result()[1])
        filename.append(future.result()[0])

# Find the maximum number of elements in the lists
max_list_length = max(len(lst) for lst in list_of_lists)

# Add empty strings to lists lacking elements
for lst in list_of_lists:
    while len(lst) < max_list_length:
        lst.append('')

# Now all the lists inside list_of_lists have the same number of elements

 36%|███▌      | 77/213 [01:39<02:10,  1.04it/s] The PDF <_io.BufferedReader name='/Users/othmaneechchabi/Desktop/Research/Data+/Climate+/SWISLTR/script/combined_data/10.2118:176630-ms.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case
100%|██████████| 213/213 [04:08<00:00,  1.17s/it]


In [16]:
# filename = []
# list_of_lists = []

# # Process each file sequentially
# for file in pdf_files:
#     result = process_pdf_file(file)
#     list_of_lists.append(result[1])
#     filename.append(result[0])

# # Find the maximum number of elements in the lists
# max_list_length = max(len(lst) for lst in list_of_lists)

# # Add empty strings to lists lacking elements
# for lst in list_of_lists:
#     while len(lst) < max_list_length:
#         lst.append('')


# pdf_files = glob.glob("PDF Papers (20)/*.pdf")
# file = pdf_files[0]
# text_file = open("sample.txt", "w")
# n = text_file.write(get_pdf_file_content(file))
# text_file.close()


#### 'list_of_lists' now contains the results from processing each PDF file

In [17]:
list_of_lists

[['Mogao (12)',
  'Japan (5)',
  'China (3)',
  'Osaka (2)',
  'Ito Misae (1)',
  'Toyonaka (1)',
  'Dunhuang city (1)'],
 ['Africa (3)',
  'London (2)',
  'Brazil (1)',
  'South Africa (1)',
  'West Africa (1)',
  'Uganda (1)',
  'China (1)'],
 ['River (6)',
  'Australia (5)',
  'I.D. (5)',
  'Eldridge (1)',
  'Talsma (1)',
  'South (1)',
  'New  South  Wales (1)'],
 ['Oman (8)',
  'El-Kaliouby (4)',
  'Muscat (2)',
  'Northern Oman (2)',
  'J.D. (2)',
  'Young (2)',
  'Egypt (1)'],
 ['the Colorado River (13)',
  'Mexico (12)',
  'Colorado (7)',
  'the Gulf of California (7)',
  'Nile (6)',
  'Egypt (6)',
  'Indus (5)'],
 ['Guadalhorce River (12)',
  'the Guadalhorce River (12)',
  'Spain (7)',
  'the Mediterranean Sea (6)',
  'Europe (4)',
  'Southern Spain (2)',
  'Mediterranean (2)'],
 ['Argentina (8)',
  'Deregibus (6)',
  'VA (4)',
  'Sala (3)',
  'Sanchez (3)',
  'Danthonia (2)',
  'Midland (2)'],
 ['Portugal (3)',
  'Oulu (1)',
  'Finland (1)',
  'Porto (1)',
  'Khodaverdiloo (


#### Making Pandas DataFrame

In [18]:
import pandas as pd

# Remove "PDF Papers (20)" from the strings in pdf_files
filename = [file.replace("PDF Papers (20)/", "") for file in filename]

# Extract the columns from list_of_lists
col1 = [item[0] if len(item) > 0 else '' for item in list_of_lists]
col2 = [item[1] if len(item) > 1 else '' for item in list_of_lists]
col3 = [item[2] if len(item) > 2 else '' for item in list_of_lists]
col4 = [item[3] if len(item) > 3 else '' for item in list_of_lists]
col5 = [item[4] if len(item) > 4 else '' for item in list_of_lists]
col6 = [item[5] if len(item) > 5 else '' for item in list_of_lists]
col7 = [item[6] if len(item) > 6 else '' for item in list_of_lists]

# Create the dataframe
data = {
    'PDF File': filename,
    'Col1': col1,
    'Col2': col2,
    'Col3': col3,
    'Col4': col4,
    'Col5': col5,
    'Col6': col6,
    'Col7': col7
}
df = pd.DataFrame(data)

In [19]:
df

Unnamed: 0,PDF File,Col1,Col2,Col3,Col4,Col5,Col6,Col7
0,/Users/othmaneechchabi/Desktop/Research/Data+/...,Mogao (12),Japan (5),China (3),Osaka (2),Ito Misae (1),Toyonaka (1),Dunhuang city (1)
1,/Users/othmaneechchabi/Desktop/Research/Data+/...,Africa (3),London (2),Brazil (1),South Africa (1),West Africa (1),Uganda (1),China (1)
2,/Users/othmaneechchabi/Desktop/Research/Data+/...,River (6),Australia (5),I.D. (5),Eldridge (1),Talsma (1),South (1),New South Wales (1)
3,/Users/othmaneechchabi/Desktop/Research/Data+/...,Oman (8),El-Kaliouby (4),Muscat (2),Northern Oman (2),J.D. (2),Young (2),Egypt (1)
4,/Users/othmaneechchabi/Desktop/Research/Data+/...,the Colorado River (13),Mexico (12),Colorado (7),the Gulf of California (7),Nile (6),Egypt (6),Indus (5)
...,...,...,...,...,...,...,...,...
208,/Users/othmaneechchabi/Desktop/Research/Data+/...,South Africa (18),Africa (7),South (5),South Africa (4),Sarcocornia (4),Turpie (2),Estuarine (2)
209,/Users/othmaneechchabi/Desktop/Research/Data+/...,Italy (5),San Vitale (3),Adriatic Sea (2),Romagna (1),Nazarnia (1),Northern Italy (1),Carminati (1)
210,/Users/othmaneechchabi/Desktop/Research/Data+/...,Tashkent (5),Karakalpakstan (3),pp. (3),Uzbekistan (2),the Republic (1),The Republic (1),Aral Sea (1)
211,/Users/othmaneechchabi/Desktop/Research/Data+/...,pBCORc115 (6),Poly(A (2),Belgium (1),Departments (1),Gainesville (1),Ottawa (1),Ontario (1)


In [13]:
df.to_csv('text_analysis_test.csv')