In [1]:
!pip install --upgrade langchain langchain_groq
!pip install --upgrade langchain-community
!pip install --upgrade langchain-openai
!pip install --upgrade langchain-core
!pip install --upgrade langsmith
!pip install ctransformers[cuda]
!pip install huggingface-hub
!pip install --upgrade sqlalchemy
!pip install rdflib
!pip install llama-cpp-python
!pip install typing-extensions==4.7.1 --upgrade
!pip install pypdf2
!pip install sentence-transformers
!pip install faiss-gpu
!pip install bitsandbytes accelerate sentence-transformers

[0mCollecting langchain
  Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting langchain_groq
  Downloading langchain_groq-0.1.4-py3-none-any.whl (11 kB)
Collecting aiohttp<4.0.0,>=3.8.3
  Downloading aiohttp-3.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting dataclasses-json<0.7,>=0.5.7
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting tenacity<9.0.0,>=8.1.0
  Downloading tenacity-8.3.0-py3-none-any.whl (25 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0
  Downloading langchain_text_splitters-0.2.0-py3-none-any.whl (23 kB)
Collecting pydantic<3,>=1
  Downloading pydantic-2.7.1-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
import re
import os
import transformers
import torch
from torch import cuda, bfloat16
import json

from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer, util

In [3]:
os.environ["GROQ_API_KEY"] = "gsk_XXXXXX"

In [125]:
llm = ChatGroq(temperature=0, model_name="llama3-70b-8192")

In [5]:
sim_model_id = "Salesforce/SFR-Embedding-Mistral"
model_kwargs = {"device": "cuda"}
sim_model = HuggingFaceEmbeddings(model_name = sim_model_id, model_kwargs=model_kwargs)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
SYS_PROMPT = """You are an assistant for extract information from context and selection the possible answer from the selection provided.
You are given the extracted parts of a paper about solar chemistry and a question. Provide the extracted information and nothing else.
Context: {context}
Question: {question}
"""

In [7]:
rag_prompt = PromptTemplate(template=SYS_PROMPT, input_variables=['Context', 'Question'])

In [8]:
def get_rag(text, embed_model):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=128,
        length_function=len
    )
    chunks = text_splitter.split_text(text=text)
    return FAISS.from_texts(chunks, embedding=embed_model)

In [9]:
item_list = ['Light_source', 'Lamp', 'Reactor_type', 'Reaction_medium', 'Operation_mode']

item_group = {
    "group_1": {"items": ["Light_source", "Lamp"], "restriction": "If Light_source is Solar or Solar Simulator, Lamp is always Solar Simulator"},
    "group_2": {"items": ["Reaction_medium"], "restriction": "If Ph Value is mentioned in the experiment, Reaction_medium is always Liquid"}
}

title_list = ["Abstract", "Experimental", "Results and discussion"]

In [10]:
with open("./data/item_choice.json", "rb") as f:
    item_data = json.load(f)

In [11]:
with open("section_ground.json", "rb") as f:
    ground_truth = json.load(f)

In [117]:
def clean_line(e):
    return ''.join(e for e in string if e.isalnum())

In [121]:
def clean_gen(gen):
    res = {}
    for line in gen.split("\n"):
        if ":" in line:
            try:
                s, e = line.split(":")
                res[s] = e
            except:
                pass
    return res

In [13]:
def cal_distance(res, truth):
    a, b = sim_model.embed_query(res), sim_model.embed_query(truth)
    return round(float(util.cos_sim(a, b).detach().numpy()[0]), 5)

In [14]:
def eval_res(pred, ground):
    pred, ground = pred.lower(), ground.lower()
    if ground in pred:
        return 1
    elif pred in ground:
        return 1
    elif pred == "optical fiber" and ground == "fixed-bed":
        return 1
    elif pred == "membrane" and ground == "fixed-bed":
        return 1
    elif cal_distance(ground, pred) >= 0.85:
        return 1
    else:
        return 0

In [130]:
all_list = ['catalyst', 'co_catalyst', 'Light_source', 'Lamp', 'Reactor_type', 'Reaction_medium', 'Operation_mode']

In [134]:
for all_key in all_list:
    locals()[all_key] = []

In [135]:
result = {}

In [136]:
for i in range(30):
    idx = i + 1
    if str(idx) in list(ground_truth.keys()):
        fp = "paper_json/paper" + str(idx) + "_extraction.json"
        with open(fp, "rb") as f:
            data = json.load(f)
        context = ""
        for section in data:
            if section["title"] in title_list:
                context += section["title"]
                context += "\n"
                context += section["content"]
                context += "\n"
        vector_store = get_rag(context, sim_model)
        retriever = vector_store.as_retriever(search_kwargs={'k': 5})
        catalyst_prompt = """
        Please find the name of the catalyst and co-catalyst name from the provided context which describe an solar chemical experiment.
        Please only generate the name of the catalyst and nothing else. Such as TiO2. ZrO2 or Ag, etc.
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        If there are multiple results, please indicate them as XXX-XXX.
        catalyst: XXX
        co_catalyst: XXX
        """
        qa = RetrievalQA.from_chain_type(llm=llm,
                               chain_type="stuff",
                               retriever=retriever,
                               return_source_documents=True,
                               chain_type_kwargs={"prompt": rag_prompt})
        response = qa.invoke({"query": catalyst_prompt})
        res = response['result']
        light_prompt = f"""
        Please find the category of Light_source and Lamp from the provided context which describe an solar chemical experiment.
        The generation condition of the extraction is given: If Light_source is Solar or Solar Simulator, Lamp is always Solar Simulator.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Light_source: {item_data["Light_source"]}
        Lamp: {item_data["Lamp"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Light_source: XXX
        Lamp: XXX
        """
        response = qa.invoke({"query": light_prompt})
            # print(response['result'])
        res += "\n"
        res += response['result']
        
        medium_prompt = f"""
        Please find the category of Reaction_medium from the provided context which describe an solar chemical experiment.
        The generation condition of the extraction is given: If Ph Value is mentioned in the experiment, Reaction_medium is always Liquid.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Reaction_medium: {item_data["Reaction_medium"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Reaction_medium: XXX
        """
        response = qa.invoke({"query": medium_prompt})
            # print(response['result'])
        res += "\n"
        res += response['result']
        reactor_prompot = f"""
        Please find the category of Reactor_type from the provided context which describe an solar chemical experiment.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Reactor_type: {item_data["Reactor_type"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Reactor_type: XXX
        """
        response = qa.invoke({"query": reactor_prompot})
            # print(response['result'])
        res += "\n"
        res += response['result']
        operation_prompot = f"""
        Please find the category of Operation_mode from the provided context which describe an solar chemical experiment.
        Please only select the generation from the provided possible choices.
        Possible Choices:
        Operation_mode: {item_data["Operation_mode"]}
        Please generating restrictively follow the format, and must start the generation as the format. Do not generate anything else.
        Operation_mode: XXX
        """
        response = qa.invoke({"query": operation_prompot})
            # print(response['result'])
        res += "\n"
        res += response['result']
        print(res)
        result[str(idx)] = clean_gen(res)
        print(result[str(idx)])
        print(ground_truth[str(idx)])
        temp_flag = []
        for all_item in all_list:
            try:
                flag = eval_res(result[str(idx)][all_item], ground_truth[str(idx)][all_item])
            except:
                flag = 0
            result[str(idx)][all_item+"_ground"] = ground_truth[str(idx)][all_item]
            locals()[all_item].append(flag)
            temp_flag.append(flag)
        print(temp_flag)
    else:
        pass

catalyst: TiO2
co_catalyst: Ag
Light_source: UV
Lamp: Hg
Reaction_medium: Liquid
Reactor_type: Slurry
Operation_mode: Batch
{'catalyst': ' TiO2', 'co_catalyst': ' Ag', 'Light_source': ' UV', 'Lamp': ' Hg', 'Reaction_medium': ' Liquid', 'Reactor_type': ' Slurry', 'Operation_mode': ' Batch'}
{'catalyst': 'TiO2', 'co_catalyst': 'Ag', 'Light_source': 'UV', 'Lamp': 'Mercury', 'Reactor_type': 'Slurry', 'Reaction_medium': 'Liquid', 'Operation_mode': 'Batch'}
[1, 1, 1, 0, 1, 1, 1]
catalyst: TiO2
co_catalyst: None
Light_source: UV
Lamp: None
Reaction_medium: Gas
Reactor_type: Membrane
Operation_mode: Batch
{'catalyst': ' TiO2', 'co_catalyst': ' None', 'Light_source': ' UV', 'Lamp': ' None', 'Reaction_medium': ' Gas', 'Reactor_type': ' Membrane', 'Operation_mode': ' Batch'}
{'catalyst': 'TiO2', 'co_catalyst': 'None', 'Light_source': 'UV', 'Lamp': 'Fluorescent', 'Reactor_type': 'Fixed-bed', 'Reaction_medium': 'Gas', 'Operation_mode': 'Batch/Continous'}
[1, 1, 1, 0, 0, 1, 1]
catalyst: TiO2
co_cata

In [139]:
for key, value in ground_truth["1"].items():
    print(sum(locals()[key]) / len(locals()[key]))

0.8275862068965517
0.6551724137931034
0.7931034482758621
0.5862068965517241
0.3448275862068966
0.6206896551724138
0.7931034482758621


In [140]:
result["evaluation"] = {}
for key, value in ground_truth["1"].items():
    result["evaluation"][key] = sum(locals()[key]) / len(locals()[key])

In [141]:
with open("Group_Result_llama_3_70B_Groq.json", "w") as f:
    json.dump(result, f)