In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

In [2]:
# 벡터 저장소 생성
from langchain.vectorstores import Chroma

In [39]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [3]:
from langchain_community.chat_models import ChatOllama

In [4]:
from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import RecursiveJsonSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

import json
from pathlib import Path
from pprint import pprint

In [5]:
# 벡터 저장소 경로 설정
# ## 현재 경로에 'vectorstore' 경로 생성
vectorstore_path = './vectorstore2'
os.makedirs(vectorstore_path, exist_ok=True)

In [6]:
directory_path = Path('./files')

# 모든 JSON 파일 경로 가져오기
json_files = directory_path.glob('*.json')

In [7]:
splitter = RecursiveJsonSplitter(max_chunk_size=2000)

In [9]:
embedding_function = HuggingFaceEmbeddings(
    model_name="peoplecombine-schoolai/9-chemistry-atoms_and_molecules-finetuned_sentence_embedding",
    model_kwargs = {'device': 'cpu'},
    encode_kwargs = {'normalize_embeddings': True})

In [10]:
data = []
for json_file in json_files:
    with json_file.open('r', encoding='utf-8') as f:
        data.append(json.load(f))

In [22]:
print(len(data))

298


In [28]:
print(data[297][0])

{'name': 'CrNiHgAs', 'composition': 'As1 Cr1 Hg1 Ni1', 'prototype': None, 'spacegroup': 'F-43m', 'volume': 64.3956, 'natoms': 4, 'band_gap': 0.0, 'delta_e': 0.297176905008432, 'stability': 0.4708748129251, 'fit': 'standard', 'calculation_label': 'static'}


In [12]:
# for i in range(len(data)):
#     docs = splitter.create_documents(texts=[data[i]])
    
#     # 벡터스토어 생성 및 저장
#     vectorstore = Chroma.from_documents(docs, embeddings, persist_directory=vectorstore_path)


In [13]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever, WikipediaRetriever

wiki_retriever = WikipediaRetriever()

In [34]:
vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings)

In [31]:
flattened_data = [str(item) for sublist in data for item in sublist]

In [32]:
bm25_retriever = BM25Retriever.from_texts(
    # doc_list_1의 텍스트와 메타데이터를 사용하여 BM25Retriever를 초기화합니다.
    flattened_data,
    metadatas=[{"source": 1}] * len(flattened_data),
)
bm25_retriever.k = 1 

In [35]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [36]:
# # 앙상블 retriever를 초기화합니다.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever],
    weights=[0.5, 0.5],
    search_type="mmr",
)

In [37]:
model = ChatOllama(model="chemllm-7b-chat.Q4_K_M.gguf:latest")

In [40]:
template = '''A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. ":
{context}

Question: {question}
'''

prompt = ChatPromptTemplate.from_template(template)

# RAG Chain 연결
rag_chain = (
    {'context': ensemble_retriever, 'question': RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [46]:
query = "Please tell me a chemical formula including SiO2 and Na elements"
answer = rag_chain.invoke(query)

print("Query:", query)
print("Answer:", answer)

Query: Please tell me a chemical formula including SiO2 and Na elements
Answer: Sure! One possible compound that includes both silicon dioxide (Si02)and sodium(Sn): Sodium silicate. Its molecular weight is 112 g/mol, the empirical formular of this molecule would be [Na1][[0]([Sr])[O2-]]{3}


In [None]:
Please tell me a chemical formula including SiO2 and AL elements
Please tell me a chemical formula including SiO2 and Ti elements
Please tell me a chemical formula including SiO2 and Na elements
Please tell me a chemical formula including SiO2 and Nb elements
Please tell me a chemical formula including SiO2 and Ni elements