In [1]:
!pip install --upgrade langchain langchain_groq
!pip install --upgrade langchain-community
!pip install --upgrade langchain-openai
!pip install --upgrade langchain-core
!pip install --upgrade langsmith
!pip install ctransformers[cuda]
!pip install huggingface-hub
!pip install --upgrade sqlalchemy
!pip install rdflib
!pip install llama-cpp-python
!pip install typing-extensions==4.7.1 --upgrade
!pip install pypdf2
!pip install sentence-transformers
!pip install faiss-gpu
!pip install bitsandbytes accelerate sentence-transformers

[0mCollecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting langchain_groq
  Downloading langchain_groq-0.1.4-py3-none-any.whl (11 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.9.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting langchain-core<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_core-0.2.3-py3-none-any.whl (310 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.2/310.2 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl (23 kB)
Collecting langsm

In [197]:
import re
import os
import transformers
import torch
from torch import cuda, bfloat16
import json
from langchain.chat_models import ChatOpenAI
from langchain_openai import OpenAI
from langchain_groq import ChatGroq
from langchain_community.chat_models import ChatZhipuAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer, util

In [3]:
os.environ["GROQ_API_KEY"] = "XXXX"

In [126]:
os.environ["ZHIPUAI_API_KEY"] = "XXXX"

In [147]:
os.environ["OPENAI_API_KEY"] = "XXXX"

In [127]:
llm = ChatGroq(temperature=0, model_name="llama3-8b-8192")

In [137]:
llm = ChatZhipuAI(
    model="glm-4",
    temperature=0,
)

In [199]:
llm = ChatOpenAI(model_name='gpt-4-turbo-2024-04-09')

In [5]:
sim_model_id = "Salesforce/SFR-Embedding-Mistral"
model_kwargs = {"device": "cuda"}
sim_model = HuggingFaceEmbeddings(model_name = sim_model_id, model_kwargs=model_kwargs)



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [110]:
SYS_PROMPT = """You are an assistant for extract information from context and selection the possible answer from the selection provided.
You are given the extracted parts of a paper about solar chemistry and a question. Provide the extracted information and nothing else.
Context: {context}
Question: {question}
"""

In [111]:
rag_prompt = PromptTemplate(template=SYS_PROMPT, input_variables=['Context', 'Question'])

In [112]:
def get_rag(text, embed_model):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=128,
        length_function=len
    )
    chunks = text_splitter.split_text(text=text)
    return FAISS.from_texts(chunks, embedding=embed_model)

In [113]:
item_list = ['Light_source', 'Lamp', 'Reactor_type', 'Reaction_medium', 'Operation_mode']

item_group = {
    "group_1": {"items": ["Light_source", "Lamp"], "restriction": "If Light_source is Solar or Solar Simulator, Lamp is always Solar Simulator"},
    "group_2": {"items": ["Reaction_medium"], "restriction": "If Ph Value is mentioned in the experiment, Reaction_medium is always Liquid"}
}

title_list = ["Abstract", "Experimental", "Results and discussion"]

In [114]:
with open("./data/item_choice.json", "rb") as f:
    item_data = json.load(f)

In [115]:
with open("section_ground.json", "rb") as f:
    ground_truth = json.load(f)

In [116]:
def clean_line(e):
    return ''.join(e for e in string if e.isalnum())

In [153]:
def clean_gen(gen):
    res = {}
    for line in gen.split("\n"):
        if ":" in line:
            try:
                s, e = line.split(":")
                res[s.strip()] = e
            except:
                pass
    return res

In [118]:
def cal_distance(res, truth):
    a, b = sim_model.embed_query(res), sim_model.embed_query(truth)
    return round(float(util.cos_sim(a, b).detach().numpy()[0]), 5)

In [119]:
def eval_res(pred, ground):
    pred, ground = pred.lower(), ground.lower()
    if ground in pred:
        return 1
    elif pred in ground:
        return 1
    elif pred == "optical fiber" and ground == "fixed-bed":
        return 1
    elif pred == "membrane" and ground == "fixed-bed":
        return 1
    elif cal_distance(ground, pred) >= 0.85:
        return 1
    else:
        return 0

In [120]:
def get_context(context):
    res = []
    for item in context:
        res.append(item.page_content)
    return res

In [200]:
all_list = ['catalyst', 'co_catalyst', 'Light_source', 'Lamp', 'Reactor_type', 'Reaction_medium', 'Operation_mode']

In [201]:
for all_key in all_list:
    locals()[all_key] = []

In [202]:
result = {}

In [203]:
context_data = {}

In [204]:
for i in range(30):
    idx = i + 1
    if str(idx) in list(ground_truth.keys()):
        fp = "paper_json/paper" + str(idx) + "_extraction.json"
        with open(fp, "rb") as f:
            data = json.load(f)
        context = ""
        for section in data:
            if section["title"] in title_list:
                context += section["title"]
                context += "\n"
                context += section["content"]
                context += "\n"
        vector_store = get_rag(context, sim_model)
        retriever = vector_store.as_retriever(search_kwargs={'k': 5})
        temp_context = {}
        catalyst_prompt = """
        Please find the name of the catalyst and co-catalyst name from the provided context which describe an solar chemical experiment.
        Please only generate the name of the catalyst and nothing else. Such as TiO2. ZrO2 or Ag, etc.
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        If there are multiple results, please indicate them as XXX-XXX.
        catalyst: XXX
        co_catalyst: XXX
        """
        qa = RetrievalQA.from_chain_type(llm=llm,
                               chain_type="stuff",
                               retriever=retriever,
                               return_source_documents=True,
                               chain_type_kwargs={"prompt": rag_prompt})
        response = qa.invoke({"query": catalyst_prompt})
        # print(type(response["source_documents"][0].page_content))
        temp_context["catalyst/co_catalyst"] = get_context(response["source_documents"])
        res = response['result']
        light_prompt = f"""
        Please find the category of Light_source and Lamp from the provided context which describe an solar chemical experiment.
        The generation condition of the extraction is given: If Light_source is Solar or Solar Simulator, Lamp is always Solar Simulator.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Light_source: {item_data["Light_source"]}
        Lamp: {item_data["Lamp"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Light_source: XXX
        Lamp: XXX
        """
        response = qa.invoke({"query": light_prompt})
        temp_context["Light_source/Lamp"] = get_context(response["source_documents"])
        res += "\n"
        res += response['result']
        
        medium_prompt = f"""
        Please find the category of Reaction_medium from the provided context which describe an solar chemical experiment.
        The generation condition of the extraction is given: If Ph Value is mentioned in the experiment, Reaction_medium is always Liquid.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Reaction_medium: {item_data["Reaction_medium"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Reaction_medium: XXX
        """
        response = qa.invoke({"query": medium_prompt})
        temp_context["Reaction_medium"] = get_context(response["source_documents"])
        res += "\n"
        res += response['result']
        reactor_prompot = f"""
        Please find the category of Reactor_type from the provided context which describe an solar chemical experiment.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Reactor_type: {item_data["Reactor_type"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Reactor_type: XXX
        """
        response = qa.invoke({"query": reactor_prompot})
        temp_context["Reactor_type"] = get_context(response["source_documents"])
        res += "\n"
        res += response['result']
        operation_prompot = f"""
        Please find the category of Operation_mode from the provided context which describe an solar chemical experiment.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Operation_mode: {item_data["Operation_mode"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Operation_mode: XXX
        """
        response = qa.invoke({"query": operation_prompot})
        temp_context["Operation_mode"] = get_context(response["source_documents"])
        res += "\n"
        res += response['result']
        print(res)
        result[str(idx)] = clean_gen(res)
        context_data[str(idx)] = temp_context
        print(result[str(idx)])
        print(ground_truth[str(idx)])
        temp_flag = []
        for all_item in all_list:
            try:
                flag = eval_res(result[str(idx)][all_item], ground_truth[str(idx)][all_item])
            except:
                flag = 0
            # result[str(idx)][all_item+"_ground"] = ground_truth[str(idx)][all_item]
            locals()[all_item].append(flag)
            temp_flag.append(flag)
        # if sum(temp_flag) >= 4:
        #     result[str(idx)]["flag"] = "Success"
        # else:
        #     result[str(idx)]["flag"] = "Failed"
        print(temp_flag)
    else:
        pass

catalyst: TiO2
co_catalyst: Ag
Light_source: UV
Lamp: Mercury
Reaction_medium: Liquid
Reactor_type: Slurry
Operation_mode: Batch
{'catalyst': ' TiO2', 'co_catalyst': ' Ag', 'Light_source': ' UV', 'Lamp': ' Mercury', 'Reaction_medium': ' Liquid', 'Reactor_type': ' Slurry', 'Operation_mode': ' Batch'}
{'catalyst': 'TiO2', 'co_catalyst': 'Ag', 'Light_source': 'UV', 'Lamp': 'Mercury', 'Reactor_type': 'Slurry', 'Reaction_medium': 'Liquid', 'Operation_mode': 'Batch'}
[1, 1, 1, 1, 1, 1, 1]
catalyst: TiO2
co_catalyst: O2
Light_source: UV
Lamp: Fluorescent
Reaction_medium: Gas
Reactor_type: Membrane
Operation_mode: Batch/Continuous
{'catalyst': ' TiO2', 'co_catalyst': ' O2', 'Light_source': ' UV', 'Lamp': ' Fluorescent', 'Reaction_medium': ' Gas', 'Reactor_type': ' Membrane', 'Operation_mode': ' Batch/Continuous'}
{'catalyst': 'TiO2', 'co_catalyst': 'None', 'Light_source': 'UV', 'Lamp': 'Fluorescent', 'Reactor_type': 'Fixed-bed', 'Reaction_medium': 'Gas', 'Operation_mode': 'Batch/Continous'}
[1

In [205]:
import pandas as pd

In [206]:
refer = pd.read_csv("./data/paper_references.csv")

In [207]:
for key, item in result.items():
    if key.isnumeric():
        a = refer[refer["No_de_Ref"] == int(key)]
        item["DOI"] = str(a["DOI"].values[0])

In [208]:
for key, value in ground_truth["1"].items():
    print(sum(locals()[key]) / len(locals()[key]))

0.8620689655172413
0.4827586206896552
0.6551724137931034
0.6551724137931034
0.4482758620689655
0.6206896551724138
0.8620689655172413


In [209]:
prompt_template = f"""
        Please find the category of Light_source and Lamp from the provided context which describe an solar chemical experiment.
        The generation condition of the extraction is given: If Light_source is Solar or Solar Simulator, Lamp is always Solar Simulator.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Light_source: {item_data["Light_source"]}
        Lamp: {item_data["Lamp"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Light_source: XXX
        Lamp: XXX
        """

In [210]:
gen_file = {}

In [211]:
gen_result = []
gen_file["model_id"] = "GPT4"
gen_file["prompt_template"] = prompt_template

In [212]:
for key, item in result.items():
    temp = {}
    temp["reference_index"] = key
    temp["DOI"] = item["DOI"]
    result = {}
    for item_key, item_item in item.items():
        if item_key != "DOI":
            result[item_key.lower()] = item_item
    temp["generation"] = result
    gen_result.append(temp)

In [213]:
gen_file["result"] = gen_result

In [214]:
with open('GPT4/Generation.json', "w") as f:
    json.dump(gen_file, f)

In [215]:
temp = {}
for key, value in ground_truth["1"].items():
    temp[key] = sum(locals()[key]) / len(locals()[key])

In [216]:
eval_result = {}

In [217]:
eval_result["generation_model_id"] = "GPT4"
eval_result["similarity_model_id"] = "Salesforce/SFR-Embedding-Mistral"
eval_result["source_ground_truth"] = "Ground_Truth.json"
eval_result["source_generation"] = "/LLama_3_70B/Generation.json"
eval_result["evaluation_strategy"] = "group"
eval_result["metric"] = "accuracy"
eval_result["result"] = []

In [218]:
for key, item in temp.items():
    a = key.lower()
    temp_sub = {}
    temp_sub["item"] = a
    temp_sub["acc"] = item
    print(temp_sub)
    eval_result["result"].append(temp_sub)

{'item': 'catalyst', 'acc': 0.8620689655172413}
{'item': 'co_catalyst', 'acc': 0.4827586206896552}
{'item': 'light_source', 'acc': 0.6551724137931034}
{'item': 'lamp', 'acc': 0.6551724137931034}
{'item': 'reactor_type', 'acc': 0.4482758620689655}
{'item': 'reaction_medium', 'acc': 0.6206896551724138}
{'item': 'operation_mode', 'acc': 0.8620689655172413}


In [219]:
with open('GPT4/Evaluation.json', "w") as f:
    json.dump(eval_result, f)

In [220]:
context_result = {}
context_result["similarity_model_id"] = "Salesforce/SFR-Embedding-Mistral"
context_result["similarity_metric"] = "Cosine_Similarity"
a = []

In [221]:
for key, item in context_data.items():
    temp = {}
    temp["reference_index"] = key
    temp["context"] = {}
    for context_item, context in item.items():
        temp['context']["item"] = context_item.split('/')
        temp['context']['chunks'] = context
    a.append(temp)

In [222]:
context_result["contexts"] = a

In [223]:
with open("GPT4/Context.json", "w") as f:
    json.dump(context_result, f)

In [224]:
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7f5593b0a210>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7f5593a5b3d0>, model_name='gpt-4-turbo-2024-04-09', openai_api_key='sk-proj-aREI5CtcwJat4pmFFw9LT3BlbkFJvTYsLBhPO4Nn5m2PUO8R', openai_proxy='')