#### Install Dependencies!

In [6]:
# !pip install chatgpt 
# !pip install pdfminer
# !pip install tqdm
# !pip install python-time
# !pip install nltk

#updated by Josh Jul 12, 2023. 

Collecting chatgpt
  Downloading chatgpt-2.2212.0-py3-none-any.whl (24 kB)
Collecting rich
  Downloading rich-13.4.2-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.4/239.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting tls-client
  Downloading tls_client-0.2.1-py3-none-any.whl (35.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting markdown-it-py>=2.2.0
  Downloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.5/87.5 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting pygments<3.0.0,>=2.13.0
  Downloading Pygments-2.15.1-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting mdurl~=0.1
  Downloading mdurl-

In [None]:
import concurrent.futures
import glob
import json
import os
import threading
from collections import Counter
from functools import partial
from io import StringIO
from multiprocessing import Pool

import nltk
import numpy as np
import pandas as pd
import spacy
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from tqdm import tqdm

#### Boilerplate Function!

In [None]:
def get_pdf_file_content(path_to_pdf, max_pages=7):
    """Extract the text of the leading pages of a PDF file.

    Parameters
    ----------
    path_to_pdf : str
        Path of the PDF file to read.
    max_pages : int or None, optional
        Number of pages to process before stopping. Defaults to 7, the limit
        that used to be hard-coded. Pass ``None`` to process every page.
        Expected to be a positive integer when given.

    Returns
    -------
    str
        The text written by the ``TextConverter`` for the processed pages.
        This is the text that is later handed to the keyword / NLP steps.
    """
    # A single resource manager is shared by the converter and the
    # interpreter instead of building two independent instances.
    resource_manager = PDFResourceManager(caching=True)
    out_text = StringIO()
    text_converter = TextConverter(resource_manager, out_text, laparams=LAParams())

    try:
        interpreter = PDFPageInterpreter(resource_manager, text_converter)

        # `with` guarantees the file handle is released even when a page
        # fails to parse.
        with open(path_to_pdf, 'rb') as fp:
            # An empty `pagenos` set applies no page filter.
            for index, page in enumerate(PDFPage.get_pages(fp, pagenos=set())):
                interpreter.process_page(page)

                # Stop once the requested number of pages has been processed.
                if max_pages is not None and index + 1 >= max_pages:
                    break

        # Read the buffer before the `finally` clause closes it.
        return out_text.getvalue()
    finally:
        text_converter.close()
        out_text.close()


#### ChatGPT API

In [None]:
from chatgpt import ChatGPT

# Keywords to look for in the extracted text -- edit this list as needed.
keywords1 = [
    "salt water intrusion",
    "sea level rise",
    "NACP",
]

def search_for_keywords(keywords, text):
    """Search ``text`` for ``keywords`` through a ChatGPT client.

    Parameters
    ----------
    keywords : list of str
        Keywords forwarded to the client's ``search`` call.
    text : str
        Text to search, passed as the ``text`` keyword argument.

    Returns
    -------
    Whatever ``client.search`` returns.
    """
    # The class is imported with `from chatgpt import ChatGPT`, so the module
    # name `chatgpt` is never bound; `chatgpt.ChatGPT()` raised NameError.
    client = ChatGPT()
    # NOTE(review): this assumes the installed `chatgpt` package exposes a
    # `search(keywords, text=...)` method -- confirm against its documentation.
    results = client.search(keywords, text=text)
    return results

# Run the keyword search and print whatever the client returns.
# NOTE(review): `text` is not assigned by any earlier cell in this notebook,
# so on a fresh kernel this raises NameError -- it presumably should hold the
# output of get_pdf_file_content(...) for the PDF of interest; confirm.
if __name__ == "__main__":
    results = search_for_keywords(keywords=keywords1, text=text)
    print(results)

#### NLP Function

In [None]:
import re

def replace_newlines(text):
    """Undo hard line wraps while keeping blank-line paragraph breaks.

    A newline that has no other newline directly before or after it is
    turned into a single space. Afterwards a newline located at the very end
    of the string is stripped.
    """
    # Lone newlines (not part of a run of newlines) become spaces.
    unwrapped = re.compile(r'(?<!\n)\n(?!\n)').sub(' ', text)

    # Drop a newline sitting at the end of the string.
    return re.compile(r'\n$').sub('', unwrapped)

from collections import Counter

# Entities containing any of these substrings are dropped from the results.
_DISREGARDED_LOCATION_WORDS = (
    'USA', 'United States', 'United States of America', 'North America',
    'UNITED STATES', 'al.', 'US', "U.S.", 'the United States',
)

# Per-thread storage for the loaded spaCy pipeline. This function is run from
# a ThreadPoolExecutor further down; caching per thread avoids reloading the
# model for every PDF while still never sharing one pipeline object between
# threads (the original code did not share one either).
_SPACY_PIPELINES = threading.local()


def _get_spacy_pipeline():
    """Return this thread's "en_core_web_sm" pipeline, loading it on first use."""
    pipeline = getattr(_SPACY_PIPELINES, "pipeline", None)
    if pipeline is None:
        pipeline = spacy.load("en_core_web_sm")
        _SPACY_PIPELINES.pipeline = pipeline
    return pipeline


def most_recurrent_locations(text: str, top_n: int = 7) -> dict:
    """Return the most frequently mentioned location entities in ``text``.

    Parameters
    ----------
    text : str
        Raw text; single line breaks are normalised with ``replace_newlines``
        before the text is handed to spaCy.
    top_n : int, optional
        How many of the most frequent locations to keep. Defaults to 7, the
        value that used to be hard-coded. ``None`` keeps all of them.

    Returns
    -------
    dict
        Maps ``"<location> (<count>)"`` to ``<count>``, ordered from most to
        least frequent. Locations with equal counts keep first-seen order.
    """
    nlp = _get_spacy_pipeline()
    doc = nlp(replace_newlines(text))

    # Keep entities labelled "GPE" or "LOC".
    # NOTE: the exclusion is a substring test, so any entity whose text merely
    # contains one of the disregarded words (e.g. 'US') is dropped as well.
    locations = [
        entity.text
        for entity in doc.ents
        if entity.label_ in ("GPE", "LOC")
        and not any(word in entity.text for word in _DISREGARDED_LOCATION_WORDS)
    ]

    # most_common(n) yields the same ordering as a stable descending sort by
    # count followed by a slice of the first n items.
    return {
        f"{item} ({count})": count
        for item, count in Counter(locations).most_common(top_n)
    }


#### Processing Files

In [None]:

# Folder scanned when the caller does not pass one. Kept as the default for
# backward compatibility; pass `pdf_dir` explicitly on any other machine.
DEFAULT_PDF_DIR = "/Users/othmaneechchabi/Desktop/Research/Data+/Climate+/SWISLTR/script/combined_data"


def get_pdf_file_names(pdf_dir=DEFAULT_PDF_DIR):
    """List the ``*.pdf`` files directly inside ``pdf_dir``.

    Parameters
    ----------
    pdf_dir : str, optional
        Directory to scan. Defaults to ``DEFAULT_PDF_DIR``.

    Returns
    -------
    list of str
        Matching paths, sorted so the order is reproducible between runs
        (``glob`` itself returns matches in arbitrary order). Empty when the
        directory has no PDF files or does not exist.
    """
    # Escape the directory so characters such as "[" in a folder name are
    # matched literally instead of being read as glob syntax.
    pattern = os.path.join(glob.escape(pdf_dir), "*.pdf")
    return sorted(glob.glob(pattern))

def process_pdf_file(file):
    """Run the full extraction pipeline on one PDF.

    Reads the PDF text with ``get_pdf_file_content``, summarises it with
    ``most_recurrent_locations`` and returns ``(file, labels)`` where
    ``labels`` is the list of keys of that summary dictionary.
    """
    location_summary = most_recurrent_locations(get_pdf_file_content(file))
    return file, [*location_summary.keys()]

# Accumulator for the per-file location labels, and the PDFs to analyse.
list_of_lists = list()
pdf_files = get_pdf_file_names()

#### Multithreading

In [None]:
# Reset BOTH accumulators in this cell. Previously only `filename` was reset
# here, so running the cell a second time kept appending to `list_of_lists`
# and left the two lists with different lengths.
filename = []
list_of_lists = []

# Create a ThreadPoolExecutor with its default number of worker threads
with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit one processing task per PDF file
    future_results = [executor.submit(process_pdf_file, file) for file in pdf_files]

    # Use tqdm to track progress as tasks finish (completion order, not
    # submission order)
    for future in tqdm(concurrent.futures.as_completed(future_results), total=len(future_results)):
        # Fetch the result once; result() re-raises any exception raised
        # inside the worker.
        processed_file, location_labels = future.result()
        filename.append(processed_file)
        list_of_lists.append(location_labels)

# Longest label list; `default=0` keeps this cell from raising ValueError
# when no PDF files were found.
max_list_length = max((len(lst) for lst in list_of_lists), default=0)

# Pad the shorter lists with empty strings so every list has the same length
for lst in list_of_lists:
    lst.extend([''] * (max_list_length - len(lst)))

# Now all the lists inside list_of_lists have the same number of elements

In [None]:
# filename = []
# list_of_lists = []

# # Process each file sequentially
# for file in pdf_files:
#     result = process_pdf_file(file)
#     list_of_lists.append(result[1])
#     filename.append(result[0])

# # Find the maximum number of elements in the lists
# max_list_length = max(len(lst) for lst in list_of_lists)

# # Add empty strings to lists lacking elements
# for lst in list_of_lists:
#     while len(lst) < max_list_length:
#         lst.append('')


# pdf_files = glob.glob("PDF Papers (20)/*.pdf")
# file = pdf_files[0]
# text_file = open("sample.txt", "w")
# n = text_file.write(get_pdf_file_content(file))
# text_file.close()


#### 'list_of_lists' now contains the results from processing each PDF file

In [None]:
# Display the per-file location label lists built by the processing cell.
list_of_lists


#### Making Pandas DataFrame

In [None]:
import pandas as pd

# Keep only the file name of each PDF. The paths come from an absolute glob
# pattern, so stripping the literal prefix "PDF Papers (20)/" never matched
# and the full path ended up in the table; basename works for any folder.
filename = [os.path.basename(file) for file in filename]

# Number of location columns in the table (Col1 .. Col7).
column_count = 7

# Build the table column by column: 'ColN' holds the N-th label of every
# file, or '' when that file produced fewer than N labels.
data = {'PDF File': filename}
for position in range(column_count):
    data[f'Col{position + 1}'] = [
        item[position] if len(item) > position else ''
        for item in list_of_lists
    ]

# Create the dataframe
df = pd.DataFrame(data)

In [None]:
# Display the assembled table (one row per PDF file).
df

In [None]:
# Write the table to 'text_analysis_test.csv', a path relative to the current
# working directory. `index` is left at its default, so the DataFrame index
# is written as the first column.
df.to_csv('text_analysis_test.csv')