## 0. Settings

In [1]:
import os
from pprint import pprint
from langchain.document_loaders import WebBaseLoader, PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.schema import Document
from langchain.callbacks import wandb_tracing_enabled
from langchain_pipeline.model_hub import llm_resolver

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# An API key is required only when the OpenAI models are used.
# setdefault keeps a key that is already exported in the environment and falls
# back to the placeholder only when none is set, so re-running this cell never
# overwrites a real key with "YOUR_API_KEY".
os.environ.setdefault('OPENAI_API_KEY', "YOUR_API_KEY")

## 1. Data Loading

In [3]:
# Extract the text from the PDF and load it.
loader = PyPDFLoader(
    "example_paper.pdf",
)
data = loader.load()

## 2. Text data Splitting

In [4]:
# Cut the loaded text into chunks of the configured size and collect them
# into a list.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

## 3. Store (PDF text to vector data)

In [5]:
# Load the embedding model used to vectorize the text extracted from the PDF.
#
# Option A - OpenAI embeddings:
# embedding_model = OpenAIEmbeddings()
#
# Option B - an open-source HuggingFace embedding (the one used here):
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Create the vectorstore that vectorizes the extracted text and stores it.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding_model,
)

## 4. Retrieve (Relevant text query from vectorstore) (이 과정은 실제 chatPDF 할 때 쓰지는 않음)

In [6]:
# Look up the 4 stored chunks most similar to the question; the bare call is
# the last expression so the notebook displays the returned documents.
question = "What is the full name of CGCNN?"
vectorstore.similarity_search(
    question,
    k=4,
)

[Document(page_content='Illustrative example for diﬀerentiating NaCl and KCl\nWe provide a simple illustrative example to explain how CGCNN works by completing the task of diﬀerentiating\nthe structures of NaCl and KCl. Concretely, the task is to predict +1 if the crystal structure is NaCl and -1 if the\nstructure is KCl. We can accomplish it with a CGCNN with only one convolutional layer and one pooling layer.', metadata={'page': 6, 'source': 'example_paper.pdf'}),
 Document(page_content='ˆyNaCl = 3wc1+ 3wc2+ 0.5ws1+ 0.5ws2 (S9)\nˆyKCl= 4wc1+ 4wc3+ 0.5ws1+ 0.5ws3 (S10)\nWe can easily ﬁnd WcandWsthat make ˆ yNaCl = 1 and ˆyKCl=−1, showing that CGCNN are capable of\ndiﬀerentiating NaCl and KCl. In this example, the existence of certain weights wciorwsiin Eq. S9 and Eq. S10\nindicates the occurrence of elements as centers or neighbors respectively, while the factor represents the frequency of\noccurrence, both of which can help diﬀerentiating the two crystals.', metadata={'page': 7, 'sou

## 5. Chat with PDF

In [7]:
# Load the LLM.
#
# Option A - GPT:
#llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
#
# Option B - an open-source model. Only models that exist in
# langchain_pipeline/model_hub can be used; pass the function name as a string.
llm = llm_resolver(
    "llama_v2_13b_chat_gptq",
    device=0,
)

The safetensors archive passed at /home/rnwnsgud1234/.cache/huggingface/hub/models--TheBloke--Llama-2-13B-chat-GPTQ/snapshots/01bfd1c28783056bf8817b6d487f0efbbabe1804/gptq_model-4bit-128g.safetensors does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHe

In [8]:
# Build the chain used to chat with the vectorstore.
#   search_kwargs={'k': 4} : find the 4 documents most similar to the question
#                            and use them as the chain's input
#   chain_type="stuff"     : set the chain type to stuff
#                            (map_rerank, stuff, map_reduce, refine)
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={'k': 4}),
    return_source_documents=True,
    chain_type_kwargs={"verbose": True},
)

In [9]:
# Ask the chain for a summary, then pretty-print a separator followed by the
# question and the answer taken from the result.
question = "Please summary this paper."
result = qa_chain({"query": question})
report_lines = (
    "===========================================================",
    f"Question: {result['query']}",
    f"Answer: {result['result']}",
)
for report_line in report_lines:
    pprint(report_line)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

S. Dacek, S. Cholia, D. Gunter, D. Skinner, G. Ceder,
et al. , Apl Materials 1, 011002 (2013).
[12] See Supplemental Material for further details, including
Refs. [4, 24, 32–39].
[13] A. Krizhevsky, I. Sutskever, and G. E. Hinton, in Ad-
vances in neural information processing systems (2012)
pp. 1097–1105.
[14] R. Collobert and J. Weston, in Proceedings of the 25th in-
ternational conference on Machine learning (ACM, 2008)
pp. 160–167.
[15] D. K. Duvenaud, D. Maclaurin, J. Iparraguirre, R. Bom-

tions. The shaded area denotes the MAE of DFT calculation
compared with experiments[18]. (c) 2D histogram represent-
ing the predicted formation per atom against DFT calculated
value. (d) Receiver ope


[1m> Finished chain.[0m

[1m> Finished chain.[0m
'Question: Please summary this paper.'
('Answer:  This paper describes a deep learning model for predicting material '
 'properties. The model uses a combination of convolutional and fully '
 "connected layers to learn the relationship between the material's chemical "
 'composition and its properties. The model is trained on a large dataset of '
 'materials and is able to accurately predict the properties of new materials. '
 'The paper also explores the use of diﬀerent hyperparameters and their eﬀect '
 "on the model's performance.")


In [10]:
# Ask the chain what CGCNN stands for, then pretty-print a separator followed
# by the question and the answer taken from the result.
question = "What is the full name of CGCNN?"
result = qa_chain({"query": question})
report_lines = (
    "===========================================================",
    f"Question: {result['query']}",
    f"Answer: {result['result']}",
)
for report_line in report_lines:
    pprint(report_line)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Illustrative example for diﬀerentiating NaCl and KCl
We provide a simple illustrative example to explain how CGCNN works by completing the task of diﬀerentiating
the structures of NaCl and KCl. Concretely, the task is to predict +1 if the crystal structure is NaCl and -1 if the
structure is KCl. We can accomplish it with a CGCNN with only one convolutional layer and one pooling layer.

ˆyNaCl = 3wc1+ 3wc2+ 0.5ws1+ 0.5ws2 (S9)
ˆyKCl= 4wc1+ 4wc3+ 0.5ws1+ 0.5ws3 (S10)
We can easily ﬁnd WcandWsthat make ˆ yNaCl = 1 and ˆyKCl=−1, showing that CGCNN are capable of
diﬀerentiating NaCl and KCl. In this example, the existence of certain weights wciorwsiin Eq. S9 and Eq. S10
indicates the occurrence of

In [11]:
# Ask the chain which database the paper used, then pretty-print a separator
# followed by the question and the answer taken from the result.
question = "What is the database name of used in this paper?"
result = qa_chain({"query": question})
report_lines = (
    "===========================================================",
    f"Question: {result['query']}",
    f"Answer: {result['result']}",
)
for report_line in report_lines:
    pprint(report_line)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mUse the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

generate the crystal graph, architecture parameters that are used to deﬁne the convolutional neural network on top of
the crystal graph, and training parameters that are used for the training process. Unlike the weights that are trained
via SGD, the hyperparameters are chosen through a train-validation process.
We ﬁrst randomly divide our database into three parts: training set (60%), validation set (20%), and test set

2
(a), a crystal graph Gis an undirected multigraph which
is deﬁned by nodes representing atoms and edges rep-
resenting connections between atoms in a crystal (the
method for determining atom connectivity is explained
in Supplemental Material[12]). The crystal graph is un-
li


[1m> Finished chain.[0m

[1m> Finished chain.[0m
'Question: What is the database name of used in this paper?'
('Answer:  The database name used in this paper is not explicitly mentioned, '
 'but based on the context, it is likely that the database contains '
 'information about crystal structures and their properties.')
