In [1]:
import os
import warnings

# GigaChat authorization key. It is read from the GIGACHAT_CREDENTIALS
# environment variable so the secret is never stored in the notebook file.
# NOTE(review): the key that used to be hard-coded here was committed in
# clear text and should be treated as compromised and rotated.
CREDENTIALS = os.environ.get("GIGACHAT_CREDENTIALS", "")
if not CREDENTIALS:
    warnings.warn(
        "GIGACHAT_CREDENTIALS is not set; GigaChat calls are expected to fail "
        "until it is exported in the kernel's environment."
    )

# Path to a single sample paper (not referenced by the other visible cells).
TESTPDF = "../data/papers/10.1002@solr.201900061.pdf"

In [2]:
from langchain.chat_models.gigachat import GigaChat
from langchain.chains.question_answering import load_qa_chain
from PyPDF2 import PdfReader

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import GigaChatEmbeddings
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter
)

from langchain.chains import SimpleSequentialChain
from langchain.prompts.prompt import PromptTemplate

import mdpd
import glob
import pandas as pd

ModuleNotFoundError: No module named 'PyPDF2'

In [None]:
import json

In [None]:
# json.load() expects an open file object, not a path string, so open the
# file first (and close it deterministically via the context manager).
with open("prompts.json", "r", encoding="utf-8") as prompts_file:
    prompts = json.load(prompts_file)

In [3]:
# Authorization in the GigaChat service.
# NOTE: TLS certificate verification is switched off for this client.
giga = GigaChat(
    model="GigaChat-Pro",
    scope='GIGACHAT_API_CORP',
    credentials=CREDENTIALS,
    verify_ssl_certs=False,
)

# Question-answering chain built on the client above, using the "stuff" chain type.
chain = load_qa_chain(llm=giga, chain_type="stuff")

In [4]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

# The chat model is the GigaChat client created earlier.
model = giga

# Message layout: a system instruction parameterised by {ability}, then the
# messages supplied under "history", then the new human message from {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You're an assistant who's good at {ability}. Respond in 20 words or fewer"),
    MessagesPlaceholder(variable_name="history"),
    ("human", "{input}"),
])

# Compose prompt and model into a single runnable pipeline.
runnable = prompt | model

In [5]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry: session id -> ChatMessageHistory for that session.
store = dict()


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history for ``session_id``, creating it on first use.

    Histories live in the module-level ``store`` dict, so they exist only for
    as long as the kernel does.
    """
    try:
        return store[session_id]
    except KeyError:
        # First time this session is seen: register a fresh, empty history.
        history = store[session_id] = ChatMessageHistory()
        return history


# Wrap the prompt | model pipeline together with the session-history lookup.
# The user's message is read from the "input" key and the stored messages are
# supplied under "history" (the MessagesPlaceholder name used by the prompt).
with_message_history = RunnableWithMessageHistory(
    runnable,
    get_session_history,
    history_messages_key="history",
    input_messages_key="input",
)

In [8]:
def extract_raw_text_from_pdf(path) -> str:
    """Return the text of every page of the PDF at ``path`` as one string.

    Pages whose ``extract_text()`` result is falsy (``None`` or an empty
    string) are skipped. Page texts are concatenated with no separator.
    """
    page_texts = (page.extract_text() for page in PdfReader(stream=path).pages)
    return ''.join(chunk for chunk in page_texts if chunk)

In [9]:
from typing import Any

def get_index_from_pdf(pdf_path, chunk_size: int = 1000, chunk_overlap: int = 200) -> Any:
    """Build a FAISS vector index over the text of a single PDF.

    The PDF text is extracted with ``extract_raw_text_from_pdf``, split into
    overlapping chunks, embedded with ``GigaChatEmbeddings`` and stored in a
    FAISS index.

    Args:
        pdf_path: Path of the PDF to index.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The index object returned by ``FAISS.from_texts``.

    Raises:
        ValueError: If no text chunks could be produced from the PDF (for
            example an image-only scan). Failing here gives a clear message
            instead of an opaque error from the embedding / FAISS layer.
    """
    raw_text = extract_raw_text_from_pdf(path=pdf_path)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    texts = text_splitter.split_text(text=raw_text)
    if not texts:
        raise ValueError(f"No extractable text found in PDF: {pdf_path!r}")

    embeddings = GigaChatEmbeddings(credentials=CREDENTIALS, verify_ssl_certs=False, scope='GIGACHAT_API_CORP')
    return FAISS.from_texts(texts=texts, embedding=embeddings)

def invoke_chain_with_index(chain, index, query) -> dict:
    """Answer ``query`` with ``chain`` using documents retrieved from ``index``.

    Args:
        chain: Object exposing ``invoke``; it receives a dict with the keys
            ``"input_documents"`` and ``"question"``.
        index: Object exposing ``similarity_search(query)``.
        query: The question to ask. It is used both for retrieval and as the
            question handed to the chain.

    Returns:
        Whatever ``chain.invoke`` returns.
    """
    # The caller's query is used as-is; it was previously overwritten by a
    # hard-coded debug question, which made the parameter meaningless.
    docs = index.similarity_search(query)
    return chain.invoke({"input_documents": docs, "question": query})

In [11]:
# Prompt that asks the model to list objects of a given type.
list_objects_prompt = PromptTemplate(
    input_variables=["object_type"],
    template=LIST_OBJECTS_PROMPT
)

# Prompt that asks the model to build the markdown table. It previously reused
# LIST_OBJECTS_PROMPT (copy-paste slip) with input variables that did not match
# that template. The variables are now inferred from the template text itself.
# NOTE(review): MAKE_MD_TABLE_PROMPT is not defined in the visible cells --
# make sure it is defined before this cell runs.
make_md_table_prompt = PromptTemplate.from_template(MAKE_MD_TABLE_PROMPT)

In [12]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Prompt template for context-grounded question answering: {context} receives
# the retrieved documents and {input} the user's question.
# (The original opened with four quote characters, which put a stray '"' at
# the very start of the prompt text.)
CONTEXT_BASED_Q_TEMPLATE = """Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}
"""

def get_doc_chain(llm, template, prompt=None):
    """Create a stuff-documents chain whose prompt is built from ``template``.

    Args:
        llm: Language model passed to ``create_stuff_documents_chain``.
        template: Template string turned into a ``ChatPromptTemplate``.
        prompt: Ignored. The chain prompt is always built from ``template``;
            the parameter is kept (now optional) only so existing calls that
            pass it keep working.

    Returns:
        The chain returned by ``create_stuff_documents_chain``.
    """
    # Use a distinct local name so the (unused) ``prompt`` argument is not
    # silently shadowed.
    chat_prompt = ChatPromptTemplate.from_template(template)
    return create_stuff_documents_chain(llm, chat_prompt)

def get_retrieval_chain(index, doc_chain):
    """Combine ``index`` (exposed as a retriever) with ``doc_chain``.

    Returns the result of ``create_retrieval_chain`` called with
    ``index.as_retriever()`` and ``doc_chain``.
    """
    return create_retrieval_chain(index.as_retriever(), doc_chain)

In [14]:
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
# Read the output-spec JSON as raw text and parse it. The context manager
# closes the file handle, which the bare open(...).read() never did.
with open("../data/json/htm_output_spec_light.json", "r", encoding="utf-8") as spec_file:
    json_text = spec_file.read()
dic = json.loads(json_text)

# Initial conversation: the system prompt followed by a canned assistant reply.
chat_history = [SystemMessage(content=SYSTEM_PROMPT), AIMessage(content="Ok! Let's get to work!")]

In [16]:

# Smoke run: push only the first matching paper through the pipeline
# (the [:1] slice limits the loop to at most one file).
for path in glob.glob("../data/papers/2014/*.pdf")[:1]:
    print(path)
    index = get_index_from_pdf(path)
    doc_chain = get_doc_chain(
        llm=giga,
        template=CONTEXT_BASED_Q_TEMPLATE,
        prompt=list_objects_prompt,
    )
    retrieve_chain = get_retrieval_chain(index=index, doc_chain=doc_chain)
    response = retrieve_chain.invoke(
        {"chat_history": chat_history, "input": LIST_OBJECTS_PROMPT.format(**dic)}
    )
    print(response["answer"])

../data/papers/2014/2014_36.pdf


[Spiro-OMeTAD]


### Single prompt debug

In [17]:
LIST_OBJECTS_PROMPT = 'Please list all the {object_type}s for which device efficiency results are given in this paper. Use python list format and names of HTM that were originally given by authors. For example, ["HTM1", "HTM2", "HTM3"].'

# The document chain and the formatted question do not depend on the paper,
# so they are built once instead of once per file.
doc_chain = get_doc_chain(llm=giga, template=CONTEXT_BASED_Q_TEMPLATE, prompt=list_objects_prompt)
question = LIST_OBJECTS_PROMPT.format(**dic)

# Process the papers in sorted path order; a fresh index (and therefore a fresh
# retrieval chain) is needed for every PDF.
for path in sorted(glob.glob("../data/papers/2014/*.pdf")):
    print(path)
    index = get_index_from_pdf(path)
    retrieve_chain = get_retrieval_chain(index=index, doc_chain=doc_chain)
    response = retrieve_chain.invoke({"chat_history": chat_history, "input": question})
    print(response["answer"])

../data/papers/2014/2014_1.pdf
["CuI", "spiro-OMeTAD"]
../data/papers/2014/2014_10.pdf
["spiro-OMeTAD", "pp-spiro-OMeTAD", "pm-spiro-OMeTAD", "po-spiro-OMeTAD"]
../data/papers/2014/2014_11.pdf
["HTM1", "HTM2"]
../data/papers/2014/2014_12.pdf
["SGT-404", "SGT-405", "SGT-407", "spiro(OMeTAD"]
../data/papers/2014/2014_13.pdf
["X19", "X51", "Spiro-OMeTAD"]
../data/papers/2014/2014_14.pdf
["spiro-OMeTAD", "H111", "H112"]
../data/papers/2014/2014_15.pdf
["spiro-OMeTAD", "T103", "T102", "T101"]
../data/papers/2014/2014_16.pdf
["TPA-MeOPh", "FA-MeOPh"]
../data/papers/2014/2014_17.pdf
["OMeTPA-FA", "OMeTPA-TPA"]
../data/papers/2014/2014_18.pdf
["spiro-MeOTAD", "Fused-F"]
../data/papers/2014/2014_19.pdf
["Triazine-Th-OMeTPA", "Triazine-Ph-OMeTPA", "spiro-OMeTAD"]
../data/papers/2014/2014_2.pdf
["CuSCN"]
../data/papers/2014/2014_20.pdf
["spiro-OMeTAD"]
../data/papers/2014/2014_21.pdf
["spiro-MeOTAD"]
../data/papers/2014/2014_22.pdf
["spiro-OMeTAD"]
../data/papers/2014/2014_23.pdf
["spiro-OMeTAD"]

ResponseError: (URL('https://gigachat.devices.sberbank.ru/api/v1/embeddings'), 500, b'{"status":500,"message":"Internal Server Error"}\n', Headers([('server', 'nginx'), ('date', 'Thu, 25 Apr 2024 08:13:53 GMT'), ('content-type', 'application/json; charset=utf-8'), ('content-length', '49'), ('connection', 'keep-alive'), ('access-control-allow-credentials', 'true'), ('access-control-allow-headers', 'Origin, X-Requested-With, Content-Type, Accept, Authorization'), ('access-control-allow-methods', 'GET, POST, DELETE, OPTIONS'), ('access-control-allow-origin', 'https://beta.saluteai.sberdevices.ru'), ('x-request-id', '924bb75a-2b43-4328-a04e-fd502912d6cf'), ('x-session-id', 'ef0b8cf0-87a4-453a-8780-25270ff65248'), ('allow', 'GET, POST'), ('strict-transport-security', 'max-age=31536000; includeSubDomains'), ('allow', 'GET, POST'), ('strict-transport-security', 'max-age=31536000; includeSubDomains')]))

### Continued

In [62]:
# Render the table prompt with the parsed spec fields and the raw JSON text
# so the final prompt can be inspected as the cell output.
MAKE_MD_TABLE_PROMPT.format(**dic, json=json_text)

'Construct a table. Rows correspond to each of hole-transport materials, columns correspond to fields specified in order given in the json below. The table should be in markdown format table. The json: ```\n{\n    "object_type": "hole-transport material",\n    "object_properties": {\n        "HTL_name": "hole-transport layer material (aliases: HTM) (e.g. Spiro-OMeTAD, CuI, Li-TFSI and TBP doped spiro-MeOTAD etc.)",\n        "HTL_type": "type of HTL material (aliases: HTM type) (e.g. small molecule, inorganic, polymer, hybrid, metal complex etc.) [text]",\n        "PCE": "best efficiency achieved for given HTM measure in % (aliases: PCE, performance, ƞ, ƞbest) [number]",\n        "V": "open circuit voltage  - value in a range from 0 to ~10 V, if in mV should be converted to V (aliases: Voc) [number]",\n        "J": "short circuit current - value in range from 0 to 50 A/cm2 (aliases: Jsc) [number]",\n        "FF": "fill factor - value in % or in a range from 0 to 1 (aliases: FF) [number]

In [None]:
# Load the 2014 CSV stored next to the papers (presumably the ground-truth
# table, judging by the name -- confirm) and display it.
gt_df = pd.read_csv(filepath_or_buffer="../data/papers/2014/2014.csv")
gt_df