In [9]:
import os
import json
import PyPDF2
import openai
from tqdm import tqdm
import gspread
import pandas as pd
import json
import re
import ast

In [10]:
# Read and parse the JSON settings file
with open('../settings.json') as settings_file:
    settings = json.load(settings_file)

In [19]:
# Set your OpenAI API key
openai.api_key = settings['OPENAI_API_KEY']

def list_pdf_files(folder_path):
    pdf_files = [file for file in os.listdir(folder_path) if file.endswith('.pdf')]
    return pdf_files

def read_pdf(file_path):
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ''
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
            # assume the abstract is on the first page
            break
    return text

def extract_abstract(text):
    messages = [
    {"role": "system", "content" : "You’re a kind helpful assistant for extracting the title and the abstract from the first page of scientific article"}
    ]
    messages.append(
        {"role": "user", "content": f"extract the title and the abstract from the following content. The content is the first page of read pdf. It contains the title at the top and after the abstract, it contains most likely keywords and introduction chapter return the result in json format with properties title and abstract. Content: {text}"}
    )
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )
    chat_response = completion.choices[0].message.content
    return ast.literal_eval(json.dumps(chat_response))

folder_path = "../data/articles"

pdf_files = list_pdf_files(folder_path)


In [22]:
processed_files = {}
errors = {}

In [23]:


with tqdm(total=len(pdf_files)) as pbar:
    for pdf_file in pdf_files:
        if pdf_file in processed_files:
            pbar.update(1)
            continue
        try:
            file_path = os.path.join(folder_path, pdf_file)
            text = read_pdf(file_path)
            parsed_content = extract_abstract(text)
            processed_files[pdf_file] = parsed_content
        except Exception as e:
            print(f"Error processing {pdf_file}: {e}")
            errors[pdf_file] = e
            
        pbar.update(1)

 35%|███▌      | 61/172 [16:11<15:47,  8.54s/it]  

Error processing 2002 Maly et al - Seasonal variability in soil N mineralization and nitrification as influenced by N fertilization.pdf: This model's maximum context length is 4097 tokens. However, your messages resulted in 11501 tokens. Please reduce the length of the messages.


 51%|█████     | 88/172 [24:48<22:16, 15.91s/it]

Error processing 2008 Nakhone and Tabatabai - Nitrogen mineralization of leguminous crops in soils.pdf: This model's maximum context length is 4097 tokens. However, your messages resulted in 8997 tokens. Please reduce the length of the messages.


 63%|██████▎   | 109/172 [30:42<10:48, 10.29s/it]

Error processing 2003 Paul et al - Defining the relation between soil water content and net nitrogen mineralization.pdf: This model's maximum context length is 4097 tokens. However, your messages resulted in 7688 tokens. Please reduce the length of the messages.


100%|██████████| 172/172 [50:38<00:00, 17.67s/it]


In [24]:
json.loads(processed_files['2011 Miller et al - Pea green manure management affects organic winter wheat yield and quality in semiarid Montana.pdf'])

{'title': 'Pea green manure management affects organic winter wheat yield and quality in semiarid Montana',
 'abstract': 'Organic farmers in semiarid Montana desire green manures that supply sufficient soil nitrate-N (NO 3-N) to subsequent crops with minimal soil water depletion. Spring and winter pea (Pisum sativum L.) green manures were compared at the bloom and pod stages for soil NO 3-N contribution and water use, and subsequent winter wheat (Triticum aestivum L.) grain yield and quality in a long-term organic farm in northern Montana. Winter wheat was managed with three additional variables (cultivar, row spacing, and seeding rate). Winter pea had 15/C1 33 kg ha/C281 greater shoot N content (at pod stage only), contributed 14 /C120 kg ha/C281 greater soil NO 3-N, used 26 /C131 mm less soil water, and increased winter wheat grain yield by 13/C139%and protein by 1.5 percentage units(2007 only),compared with spring pea.'}

In [29]:
errors

{'2002 Maly et al - Seasonal variability in soil N mineralization and nitrification as influenced by N fertilization.pdf': InvalidRequestError(message="This model's maximum context length is 4097 tokens. However, your messages resulted in 11501 tokens. Please reduce the length of the messages.", param='messages', code='context_length_exceeded', http_status=400, request_id=None),
 '2008 Nakhone and Tabatabai - Nitrogen mineralization of leguminous crops in soils.pdf': InvalidRequestError(message="This model's maximum context length is 4097 tokens. However, your messages resulted in 8997 tokens. Please reduce the length of the messages.", param='messages', code='context_length_exceeded', http_status=400, request_id=None),
 '2003 Paul et al - Defining the relation between soil water content and net nitrogen mineralization.pdf': InvalidRequestError(message="This model's maximum context length is 4097 tokens. However, your messages resulted in 7688 tokens. Please reduce the length of the me

In [60]:
file_path = os.path.join(folder_path, list(errors.keys())[1])
text = read_pdf(file_path)

In [59]:
print(text)

/G52/G4F/G53/G54/G4C/G49/G4E/G4E/GC1/G20/G56/GDD/G52/G4F/G42/G41/G2C/G20 /G34/G38/G2C/G20/G32/G30/G30/G32/G20/G28/G39/G29/G3A/G20/G33/G38/G39/G96/G33/G39/G36 /G33/G38/G39/G53/G65/G61/G73/G6F/G6E/G61/G6C/G20/G76/G61/G72/G69/G61/G62/G69/G6C/G69/G74/G79/G20/G69/G6E/G20/G73/G6F/G69/G6C/G20/G4E/G20/G6D/G69/G6E/G65/G72/G61/G6C/G69/G7A/G61/G74/G69/G6F/G6E/G20/G61/G6E/G64/G20/G6E/G69/G74/G72/G69/G66/G69/G63/G61/G74/G69/G6F/G6E
/G61/G73/G20/G69/G6E/G66/G6C/G75/G65/G6E/G63/G65/G64/G20/G62/G79/G20/G4E/G20/G66/G65/G72/G74/G69/G6C/G69/G7A/G61/G74/G69/G6F/G6E
/G53/G2E/G20/G4D/G61/G6C/GFD/G31/G2C/G20/G42/G2E/G20/G8A/G61/G72/G61/G70/G61/G74/G6B/G61/G32/G2C/G20/G4D/G2E/G20/G4B/G72/G9A/G6B/G6F/G76/GE1/G32
/G31/G43/G65/G6E/G74/G72/G61/G6C/G20/G49/G6E/G73/G74/G69/G74/G75/G74/G65/G20/G66/G6F/G72/G20/G53/G75/G70/G65/G72/G76/G69/G73/G69/G6E/G67/G20/G61/G6E/G64/G20/G54/G65/G73/G74/G69/G6E/G67/G20/G69/G6E/G20/G41/G67/G72/G69/G63/G75/G6C/G74/G75/G72/G65/G2C/G20/G42/G72/G6E/G6F/G2C/G20/G43/G7A/G65/G63/G68/G20/G5

In [63]:
decoded_text_by_key = {}
for idx, filename in enumerate(errors.keys()):
    file_path = os.path.join(folder_path, filename)
    text = read_pdf(file_path)
    # Remove the leading '/' and split the input string by '/'
    

    if idx == 0:
        custom_codes = text[1:].split('/')
        # Convert each hexadecimal code to the corresponding ASCII character
        ascii_chars = [chr(int(hex_code[1:], 16)) for hex_code in custom_codes]
        # Join the ASCII characters to form the decoded text
        decoded_text = ''.join(ascii_chars)
    else:
        decoded_words = []
        custom_code_words = text[1:].split(' ')
        for word in custom_code_words:
            custom_codes = word[1:].split('/')
            ascii_chars = [chr(int(custom_code[1:])) for custom_code in custom_codes]
            # Join the ASCII characters to form the decoded text
            decoded_word = ''.join(ascii_chars)
            decoded_words.append(decoded_word)
        decoded_text = ' '.join(decoded_words)

    
    print(decoded_text)
    decoded_text_by_key[filename] = decoded_text

ROSTLINNÁ VÝROBA, 48, 2002 (9): 389396389Seasonal variability in soil N mineralization and nitrificationas influenced by N fertilizationS. Malý1, B. arapatka2, M. Krková21Central Institute for Supervising and Testing in Agriculture, Brno, Czech Republic2Faculty of Science, Palacký University, Olomouc, Czech RepublicABSTRACTParameters characterizing N mineralization and nitrification were measured in soils of ten monitoring areas of the basalsoil monitoring carried out by the Central Institute for Supervising and Testing in Agriculture. A remarkable seasonalcycle was found only for nitrate concentrations that reached their maxima in the spring (AprilJune), and late summerand/or autumn, starting in August. Ammonium ions were nitrified immediately after fertilizer application. AnaerobicN mineralization represented a variable parameter, which was not directly affected by mineral N fertilizers. Nitrificationmeasured by means of one-week incubation was significantly stimulated by N ferti

In [64]:
for file, text in decoded_text_by_key.items():
    try:
        processed_files[file] = extract_abstract(text)
    except Exception as e:
        print(f"Error processing {file}: {e}")
        errors[file] = e

In [83]:
# processed_files
items_list = []
failed_items = []
for key, value in processed_files.items():
    try:
        # Remove newlines
        value = re.sub('\n', '', value)

        # Replace multiple spaces with single space
        value = re.sub(' +', ' ', value)

        # Remove trailing comma (if present)
        value = re.sub(',\s*}', '}', value)

        # Remove single quotes and double quotes within double quotes
        value = re.sub('(".*?)"', lambda x: x.group(1).replace("'", "").replace('"', ''), value)

        # print(value)
        if '"title"' not in value:
            title, abstract = value.split('abstract:')
            title = title.replace('title:', '')
            # remove curly braces
            title = title.replace('{', '').replace('}', '')
            abstract = abstract.replace('{', '').replace('}', '')
            # remove leading and trailing spaces
            title = title.strip()
            abstract = abstract.strip()
            #remove leading and trailing commas
            title = title.strip(',')
            abstract = abstract.strip(',')
            # json_object = {"title": title, "abstract": abstract}
        else:
            json_object = json.loads(value)
            title = json_object['title']
            abstract = json_object['abstract']
        # json_object['file_name'] = key
        items_list.append([key, title, abstract])
    except Exception as e:
        print(e)
        print(value)
        failed_items.append(value)
        break

In [84]:
len(items_list)

172

In [86]:
# create dataframe from dictionary
df = pd.DataFrame(items_list, columns=['filename', 'title', 'abstract'])

In [87]:
df['abstract'] = df['abstract'].apply(lambda x: x.lower())
df['abstract_modified'] = df['abstract']

In [89]:
# save dataframe to CSV file
df.to_csv('../data/exports/article_abstracts.csv', index=False)