In [None]:
%pip install openai
# %pip install pdfminer
# %pip install pdfminer.six
%pip install azure-ai-formrecognizer
%pip install langchain
%pip install pandas

In [None]:
import openai
import requests
import json
import io
import pandas as pd
import math
#import pdfminer.high_level
#import pdfminer.layout
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import FormRecognizerClient
from azure.ai.formrecognizer import DocumentAnalysisClient
import os
from openai import AzureOpenAI

# Document Intelligence
Document Intelligence is a service available through Azure. It features many models that can extract information from files, including PDF, HTML, JPEG/JPG, PNG, BMP, TIFF, HEIF, and Microsoft Office documents.

We will use the **Layout** model, which allows us to extract text paragraph by paragraph, and extract data from tables

In [None]:
# Connection settings for Azure Document Intelligence (Form Recognizer).
# These values are passed to analyze_pdf_with_form_recognizer, which builds a
# DocumentAnalysisClient from them -- so, despite the placeholder text, they are
# the Document Intelligence endpoint and key, not the Azure OpenAI ones.
# NOTE(review): do not commit a real key here; consider reading it from an environment variable.
endpoint = "https://<your open ai endpoint>.cognitiveservices.azure.com/"  # endpoint of your Document Intelligence resource
api_key = "<your key here>"  # API key of that Document Intelligence resource
pdf_file_path = "pdf_no_table/30.pdf"  # path to a local PDF to study; not referenced elsewhere in the visible notebook


This function uses the Document Intelligence API to extract information from the given file using the layout model. The result is an object of type AnalyzeResult, which is a hierarchical data structure containing the extracted information.

In [None]:
# extracts text from pdf
def analyze_pdf_with_form_recognizer(endpoint, api_key, file_path):
    """Analyze a local file with the Document Intelligence ``prebuilt-layout`` model.

    Args:
        endpoint: Endpoint URL of the Document Intelligence (Form Recognizer) resource.
        api_key: API key for that resource.
        file_path: Path of the local file to analyze; it is opened in binary mode.

    Returns:
        The object returned by ``poller.result()`` for the layout analysis
        (the rest of the notebook treats it as an AnalyzeResult).
    """
    # Build the credential once and reuse it (it used to be constructed twice,
    # with the first instance never used).
    credential = AzureKeyCredential(api_key)
    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=credential
    )

    with open(file_path, "rb") as file:
        poller = document_analysis_client.begin_analyze_document("prebuilt-layout", file)
        # Wait for the long-running operation while the file is still open.
        result = poller.result()
    return result

By filtering out papers that aren't from Wiley, Elsevier, or RSC, we get 213 papers

Paper 23 is missing from the folder and has no DOI information

When manually checking the results from paper 84, I found that the information in Saeki's dataset does not match the content of the paper. Since we do not have the correct paper I removed it from the analysis

That means the total for this test is 211 papers

Papers are stored in ./data/pdfs and should have file name X.pdf, where X is the paper id

In [None]:
# Load the IDs of the papers to process from the 'paper_id' column of tested_papers.csv.
available_papers = pd.read_csv("tested_papers.csv")
# Alternative selection that keeps only the paper IDs greater than 64:
# papers = available_papers[available_papers['paper_id'] > 64]['paper_id'].to_list()
papers = available_papers['paper_id'].to_list()

Still missing 23, 84 is wrong paper

In [None]:
# Run the layout model on every paper and cache each result as JSON.
for paper in papers:
    json_path = f'./data/jsons/{paper}.json'
    # The analysis is a slow, billed service call, so papers that already have a
    # saved result are skipped. Delete the JSON file to force a re-analysis.
    if os.path.exists(json_path):
        continue
    result = analyze_pdf_with_form_recognizer(endpoint, api_key, f"./data/pdfs/{paper}.pdf")
    output = result.to_dict()
    # Write to a temporary file first so that an interrupted run never leaves a
    # truncated JSON behind that would later be mistaken for a finished one.
    tmp_path = json_path + '.tmp'
    with open(tmp_path, 'w') as json_file:
        json.dump(output, json_file, indent=4)
    os.replace(tmp_path, json_path)

Once we have an analyzeResult object, we can use the following function to generate a string containing the relevant text from the paper

In [None]:
# Generate complete text from AnalyzeResult
def complete_text_from_analyze_result(result):
    """Return the body text of an analysis result, one paragraph per line.

    Paragraphs whose ``role`` is ``'pageHeader'`` or ``'pageFooter'`` are left
    out. Every kept paragraph is followed by a newline, which matches the
    output of ``complete_text_from_JSON`` (previously the paragraphs were glued
    together with no separator, merging the last word of one paragraph with the
    first word of the next).

    Args:
        result: Object exposing ``paragraphs``, an iterable of items with
            ``role`` and ``content`` attributes. ``paragraphs`` may be ``None``
            or empty, in which case an empty string is returned.

    Returns:
        The concatenated paragraph text as a single string.
    """
    pieces = []
    for paragraph in (result.paragraphs or []):
        if paragraph.role in ('pageHeader', 'pageFooter'):
            continue
        pieces.append(paragraph.content + '\n')
    return "".join(pieces)

Alternatively, we save the analyze result as a JSON, and generate the text from that. This allows us to avoid needing to analyze the same document more than once while working with the data

In [None]:
# Generate complete text from JSON
def complete_text_from_JSON(number):
    """Build the paper text from a saved analysis JSON.

    Reads ``./data/jsons/{number}.json`` and returns the ``content`` of every
    entry in its ``paragraphs`` list, each followed by a newline. Entries whose
    ``role`` is ``pageHeader`` or ``pageFooter`` are left out; entries without
    a ``role`` key are kept.
    """
    with open(f"./data/jsons/{number}.json") as file:
        data = json.load(file)

    skipped_roles = ('pageHeader', 'pageFooter')
    kept = [
        paragraph['content']
        for paragraph in data['paragraphs']
        # .get() yields None for a missing key, which is never a skipped role
        if paragraph.get('role') not in skipped_roles
    ]
    return "".join(text + '\n' for text in kept)

In [None]:
# Preview the text rebuilt from the saved analysis of paper 95 (reads ./data/jsons/95.json).
print(complete_text_from_JSON(95))

# Azure OpenAI
We can use the Azure OpenAI API to obtain completions. We will use the most recent version of GPT-4, which offers the best performance and new features like parallel function calling

In [None]:
# Azure OpenAI client used for every chat-completion call below.
# Credentials are read from the environment so that no secret lives in the notebook.
# The key that used to be hard-coded in this cell must be treated as leaked and rotated.
client = AzureOpenAI(
    # Endpoint of your Azure OpenAI resource; defaults to the value used previously.
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT", "https://data-mining-gpt.openai.azure.com/"),
    # Fails fast with KeyError when the variable is not set.
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    # NOTE(review): extract_data passes `tools`; confirm that this API version
    # accepts tool calls for your resource, otherwise override it here.
    api_version=os.environ.get("AZURE_OPENAI_API_VERSION", "2023-05-15"),
)

### Tools
Tools are like functions that the language model can call. They can be used to obtain reliable output or allow the model to do things like search the internet, interact with databases, or perform calculations. When a tool is called, the model will not provide a normal message. Instead, it will provide a list of tool calls, containing the output for each time the tool was called. Tools can be called multiple times in parallel and the response will contain the output from all calls.

Our tool will make it so the language model returns a dictionary containing polymer name, values for the desired properties, and their units. The *metrics* list defines the desired properties.

In [None]:
# Properties the model should extract from each paper.
# Edit this list to change what gets extracted.
metrics = [
    'power conversion efficiency (PCE)',
    'open circuit voltage (VOC)',
    'short circuit current density (JSC)',
    'fill factor (FF)',
]

# OpenAI tool definition used to get structured output: one call per polymer.
# Only the polymer name is declared up front; the metric fields are added below.
extract_info_function = [
    {
        "type": "function",
        "function": {
            "name": "extract_information",
            "description": "extracts information about a polymer",
            "parameters": {
                "type": "object",
                "properties": {
                    "polymer_name": {
                        "type": "string",
                        "description": "Name of the polymer",
                    },
                },
                "required": ["polymer_name"],
            },
        },
    },
]

# For every metric, register a numeric value field and a string unit field.
# `properties` is the very dict stored inside the tool definition, so updating
# it updates the tool.
properties = extract_info_function[0]['function']['parameters']['properties']
for metric in metrics:
    properties.update({
        metric: {
            "type": "number",
            "description": f"Highest value of {metric} reported for the polymer",
        },
        f"{metric} unit": {
            "type": "string",
            "description": f"units that {metric} is reported in",
        },
    })
    

### Obtaining a completion
The following method generates a completion from the language model. We can choose the model, define what messages to send the model, and control the temperature. We can also provide any tools we want the model to be able to call. 

In our prompt, we ask the model to extract the desired metrics for each polymer. It is important to be clear and specific in the prompt to avoid unwanted behavior. We also make sure to provide the tool we defined earlier.

In [None]:
def extract_data(metrics, text, model="data-mining-4", temperature=0):
    """Ask the language model to extract the given metrics from a paper.

    Uses the module-level ``client`` and offers the model the
    ``extract_info_function`` tool.

    Args:
        metrics: List of metric names. The whole list is interpolated into the
            system prompt and ``metrics[0]`` is the one whose highest value is
            requested, so the list must not be empty.
        text: Full text of the paper to analyze.
        model: Name of the Azure OpenAI deployment to call. Defaults to the
            deployment that was previously hard-coded.
        temperature: Sampling temperature passed to the API (default 0).

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    response = client.chat.completions.create(
        model=model, # model = "deployment_name".
        messages=[
            # System prompt: tells the model which role to play and what to
            # extract. It probably needs tuning to get the right answers.
            {"role": "system", "content": f"""
            You are a polymer scientist analyzing scientific papers. For the polymers studied in the paper, extract the following values: {metrics}. Report the highest value of {metrics[0]} and the corresponding values of the other metrics for each polymer. Do not make up any information. 
            """},
            {"role": "user", "content": f"Here is the paper to analyze: {text}"},
        ],
        temperature = temperature,
        tools = extract_info_function
    )
    return response


It is important to understand how much it costs to obtain responses from the language models. We can calculate this from the number of tokens used for the prompt and completion, which are stored in the response object

In [None]:
# calculates the cost to obtain a response from language model
def calculate_cost(response):
    """Return the dollar cost of one completion, based on its token usage.

    A response whose ``model`` is exactly ``'gpt-4'`` is priced at 0.01 per
    1000 prompt tokens and 0.03 per 1000 completion tokens; any other model is
    priced at 0.0005 and 0.0015 per 1000 tokens respectively.
    """
    is_gpt4 = response.model == 'gpt-4'
    input_price = (0.01 if is_gpt4 else 0.0005) / 1000
    output_price = (0.03 if is_gpt4 else 0.0015) / 1000

    prompt_cost = response.usage.prompt_tokens * input_price
    completion_cost = response.usage.completion_tokens * output_price
    return prompt_cost + completion_cost

# prints tokens used and total cost
def print_usage(response):
    """Print the prompt and completion token counts of ``response`` and its total cost."""
    print(f'prompt tokens = {response.usage.prompt_tokens}')
    print(f'completion tokens = {response.usage.completion_tokens}')
    print(f'total cost = ${calculate_cost(response)}')

In [None]:
# Tiny hand-written inputs, presumably meant for quick manual checks of extract_data;
# they are not referenced anywhere else in the visible notebook.
test_text = "The VOC of PMMA is 25 V. It's PCE is 15%. The same values for PVC are 40 V and 10%, respectively"
test_text2 = "The VOC of PMMA is 25 V. It's PCE is 15%. The VOC of PVC is 14V"

# Obtaining results
Using the functions we have defined, we can now analyze multiple papers to create a dataset

In [None]:
# Numbers of the papers being used; the analyze result of paper X should be saved as ./data/jsons/X.json
# If running from the start this should already be the case
# papers = [1]
# papers = [15,30,34,77,92,154,200,204,205,241,243,262,271,273,289,303,309,330,333,334]

In [None]:

# Running total of the LLM spend, persisted between runs. Start from 0 the
# first time, when the cost file does not exist yet.
cost_path = './data/llm_cost.json'
if os.path.exists(cost_path):
    with open(cost_path) as file:
        cost = json.load(file)
else:
    cost = 0

# Column layout of every per-paper table: paper number, polymer name, then a
# value column and a unit column for each metric.
result_columns = ['paper_#', 'polymer_name']
for metric in metrics:
    result_columns += [metric, f"{metric} unit"]

for paper in papers:
    text = complete_text_from_JSON(paper)

    response = extract_data(metrics, text)
    cost += calculate_cost(response)
    # Save the total right after each call so the spend is not lost if a later
    # step fails.
    with open(cost_path, 'w') as json_file:
        json.dump(cost, json_file, indent=4)

    # tool_calls is None when the model answers with a plain message instead of
    # calling the tool; treat that as "nothing extracted" rather than crashing.
    tool_calls = response.choices[0].message.tool_calls or []
    if not tool_calls:
        print(f"paper {paper}: the model returned no tool calls")

    records = []
    for call in tool_calls:
        current = json.loads(call.function.arguments)
        current['paper_#'] = paper
        records.append(current)

    # Build the table once from all rows. Fields the model omitted become NaN,
    # and keys outside result_columns are dropped.
    results_table = pd.DataFrame(records, columns=result_columns)
    results_table.to_csv(f'./data/csv_from_text/{paper}.csv', sep='\t', header=True, encoding='utf-8')



Then we can save these results to a csv

In [None]:
# NOTE(review): the extraction loop re-creates results_table for every paper, so when
# this cell runs right after it, only the rows of the last paper processed are saved here.
results_table.to_csv('results2.csv', sep='\t', header=True, encoding='utf-8')

In [None]:
def build_results_table(papers):
    """Merge the per-paper extraction CSV files into a single table.

    For each paper, ``./data/csv_from_text/{paper}.csv`` (tab-separated) is
    read and one output row is produced per input row. The value columns follow
    the module-level ``metrics`` list, so the function keeps working when that
    list is changed; unit columns are not carried over.

    Args:
        papers: Iterable of paper IDs whose CSV files should be merged.

    Returns:
        A DataFrame with the columns ``paper_#``, ``polymer_name`` and one
        column per entry of ``metrics``.
    """
    ff_column = 'fill factor (FF)'
    columns = ['paper_#', 'polymer_name', *metrics]

    records = []
    for paper in papers:
        current_paper = pd.read_csv(f"./data/csv_from_text/{paper}.csv", sep='\t', encoding='utf-8')
        for _, row in current_paper.iterrows():
            record = {'paper_#': paper, 'polymer_name': row['polymer_name']}
            for metric in metrics:
                record[metric] = row[metric]
            # A fill factor above 1 is taken to be a percentage and converted
            # to a fraction. NaN compares False, so missing values stay NaN.
            if ff_column in record and record[ff_column] > 1:
                record[ff_column] = record[ff_column] / 100
            records.append(record)
    # Build the frame once instead of growing it row by row.
    return pd.DataFrame(records, columns=columns)

In [None]:
# Rebuild the combined table from the per-paper CSV files and save it as a tab-separated file.
# This rebinds results_table, replacing whatever an earlier cell stored under that name.
results_table = build_results_table(papers)
results_table.to_csv('./data/results_table_5.csv', sep='\t', header=True, encoding='utf-8')

To check the results, we can also generate a subset of Saeki's dataset containing information from the papers of interest

In [None]:
# Builds the reference table from Saeki's dataset for a list of integer paper
# IDs: only the polymers of those papers are kept, and only the properties
# that are being extracted.
def build_saeki_table(papers):
    """Collect the rows of Saeki's dataset that belong to the given papers.

    Reads ``saeki's_dataset.csv`` and selects, for each paper ID, the rows
    whose ``Ref. No`` equals ``S{paper}``. Each selected row contributes its
    nickname, PCE, Voc, Jsc and FF values plus the paper ID.

    Returns:
        A DataFrame with the columns ``polymer_name``, one column per entry of
        the module-level ``metrics`` list, and ``paper_#``.
    """
    table = pd.DataFrame()
    for column in ['polymer_name', *metrics, 'paper_#']:
        table[column] = []

    saeki = pd.read_csv("saeki's_dataset.csv")
    # Dataset columns copied into the table, in the order of the table's columns
    source_columns = ['Nickname', 'PCE_max(%)', 'Voc (V)', 'Jsc (mA cm^2)', 'FF']
    for paper in papers:
        matches = saeki[saeki['Ref. No'] == f'S{paper}']
        for _, entry in matches.iterrows():
            table.loc[len(table)] = [entry[name] for name in source_columns] + [paper]
    return table

In [None]:
# Build the reference table for the same papers; the bare name on the last line displays it.
saeki_table = build_saeki_table(papers)
saeki_table

In [None]:
# Save the reference table as a tab-separated file.
saeki_table.to_csv('saeki_table_5.csv', sep='\t', header=True, encoding='utf-8')

The following code automatically compares the two generated tables. However, sometimes there are slight differences between the names of the polymers, so importing the data to Excel and manually matching the names is more reliable

In [None]:
# Compares the extracted table (results_table) with the reference table
# (saeki_table) and counts agreements and disagreements.
# Might need to fix the case where there are duplicates with the same polymer
# name inside one paper.
accuracy = {
    'polymers correct': 0, # Polymer was identified from paper
    'polymers missed': 0, # Polymer was not identified from paper
    'polymers extra': 0, # Polymer was identified from paper and not found in saeki's dataset
    'values correct': 0, # Value extracted agrees with saeki's dataset
    'values missed': 0, # Value was not extracted from text
    'values incorrect': 0, # Value extracted does not agree with saeki's dataset
    'values extra': 0 # Value for polymer not found in saeki's dataset 
}
for _, results_row in results_table.iterrows():
    # Match on the paper as well as the name, so that a polymer studied in
    # several papers is compared with the entry of the right paper.
    saeki_matches = saeki_table[
        (saeki_table['polymer_name'] == results_row['polymer_name'])
        & (saeki_table['paper_#'] == results_row['paper_#'])
    ]
    if len(saeki_matches) == 0:
        accuracy['polymers extra'] += 1
        for metric in metrics:
            if not pd.isna(results_row[metric]):
                accuracy['values extra'] += 1
    else:
        accuracy['polymers correct'] += 1
        saeki_row = saeki_matches.iloc[0]
        # Values are looked up by column name. The two tables order their
        # columns differently (results_table starts with paper_#), so the
        # positional lookup used before read the polymer name as a number.
        for metric in metrics:
            extracted = results_row[metric]
            if pd.isna(extracted):
                accuracy['values missed'] += 1
            # isclose instead of ==, so that float rounding (e.g. after the
            # fill-factor division by 100) does not count as a mismatch.
            elif math.isclose(saeki_row[metric], extracted):
                accuracy['values correct'] += 1
            else:
                accuracy['values incorrect'] += 1
accuracy['polymers missed'] = len(saeki_table) - accuracy['polymers correct']
accuracy