#### データソースを準備

In [6]:
from trafilatura import fetch_url, extract

# Page to download, and the local file the extracted plain text is saved to.
url = 'https://www.shugiin.go.jp/internet/itdb_annai.nsf/html/statics/shiryo/dl-constitution.htm'
filename = 'kenpo.txt'

# Per trafilatura's documentation, fetch_url returns None when the download
# fails, so check for that before handing the result to extract().
document = fetch_url(url)
text = extract(document) if document is not None else None

if document is None:
    # Report a failed download separately from a failed extraction so the
    # cause is visible; in this case `filename` is NOT (re)written.
    print(f"Could not download the document from {url}.")
elif text is None:
    print("No text could be extracted from the document.")
else:
    # Write as UTF-8 explicitly; the chunking cell reads the file back with
    # the same encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)


#### チャンク分割

In [7]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Split on newlines only, with the chunk budget given to the tiktoken-based
# constructor. A single line that exceeds chunk_size is kept whole, which is
# what the "Created a chunk of size ..." warnings below report.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100,
    chunk_overlap=0,
    separator="\n",
)

# Load the saved text file as LangChain documents, then cut it into chunks.
loader = TextLoader(filename, encoding="utf-8")
raw_docs = loader.load()
docs = text_splitter.split_documents(raw_docs)

# Number of chunks produced (shown as the cell output).
len(docs)


Created a chunk of size 172, which is longer than the specified 100
Created a chunk of size 435, which is longer than the specified 100
Created a chunk of size 332, which is longer than the specified 100
Created a chunk of size 165, which is longer than the specified 100
Created a chunk of size 116, which is longer than the specified 100
Created a chunk of size 147, which is longer than the specified 100
Created a chunk of size 124, which is longer than the specified 100
Created a chunk of size 154, which is longer than the specified 100
Created a chunk of size 135, which is longer than the specified 100
Created a chunk of size 118, which is longer than the specified 100
Created a chunk of size 105, which is longer than the specified 100
Created a chunk of size 156, which is longer than the specified 100
Created a chunk of size 107, which is longer than the specified 100
Created a chunk of size 108, which is longer than the specified 100
Created a chunk of size 147, which is longer than the specified 100

page_content='第百三条 この憲法施行の際現に在職する国務大臣、衆議院議員及び裁判官並びにその他の公務員で、その地位に相応する地位がこの憲法で認められてゐる者は、法律で特別の定をした場合を除いては、この憲法施行のため、当然にはその地位を失ふことはない。但し、この憲法によつて、後任者が選挙又は任命されたときは、当然その地位を失ふ。' metadata={'source': 'kenpo.txt'}


#### embedding

In [13]:
import os

from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Call load_dotenv() before reading the deployment name from os.environ below.
load_dotenv()

# Load the OpenAI embedding model, using the deployment named in the environment.
embedding_deployment = os.environ["DEPLOYMENT_NAME_EMBEDDINGS"]
embeddings = OpenAIEmbeddings(deployment=embedding_deployment)

# Build the Chroma vector store from the chunked documents and the embedding model.
db = Chroma.from_documents(documents=docs, embedding=embeddings)

#### 回答

In [20]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.schema import HumanMessage

# Chat model shared by both the plain call and the retrieval chain.
chat = AzureChatOpenAI(deployment_name=os.environ["DEPLOYMENT_NAME"], temperature=0)

# Retrieval chain: the vector store's retriever is plugged into a "stuff"
# RetrievalQA chain that uses the same chat model.
retriever = db.as_retriever()
rag_qa = RetrievalQA.from_chain_type(llm=chat, chain_type="stuff", retriever=retriever)

query = "日本国憲法の第一条の内容は？"

# Ask the same question twice: once to the bare chat model, once through the
# retrieval chain, so the two answers can be compared side by side.
messages = [HumanMessage(content=query)]
result = chat(messages)
rag_result = rag_qa.run(query)

for label, answer in (
    ("RAGなしの回答", result.content),
    ("RAGを用いた回答", rag_result),
):
    print(label)
    print(answer)

RAGなしの回答
日本国民は、国民主権を基礎として、すべての権力を有する。この憲法は、国民の厳粛なる決意に基づくものであって、この憲法に定めるところを遵守し、努力してこれを守護することを誓う。
RAGを用いた回答
日本国憲法の第一条は、「天皇は、日本国の象徴であり日本国民統合の象徴であつて、この地位は、主権の存する日本国民の総意に基く」という内容です。
