# データの読み取り

In [None]:
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader

In [None]:
import json

# JSON file listing the wiki pages; each entry is read below for its
# projectId / projectKey / projectName / name fields.
backlog_wiki_json_file_path = './backlog-wikis/wiki-pages.json'

# JSON is UTF-8 by specification. Pass the encoding explicitly so the load does
# not depend on the platform's locale default (e.g. cp932 on Japanese Windows).
with open(backlog_wiki_json_file_path, 'r', encoding='utf-8') as f:
    backlog_wiki_json = json.load(f)

# Peek at one entry to sanity-check the fields used in the following cells.
i = 1
sample_page = backlog_wiki_json[i]
sample_page['projectId'], sample_page['projectKey'], sample_page['projectName'], sample_page['name']

In [None]:
import os

# One dict per wiki page that was found on disk: {'content': <text>, 'metadata': {...}}.
file_contents = []

for wiki_page in backlog_wiki_json:
    project_key = wiki_page['projectKey']
    project_name = wiki_page['projectName']
    page_name = wiki_page['name']
    # The page name is used as a path component, so '/' is swapped for the
    # fullwidth '／' when building the directory and file names.
    safe_page_name = page_name.replace('/', '／')

    content_dir_path: str = (
        f"./backlog-wikis/{project_key}_{project_name}/{wiki_page['id']}_{safe_page_name}/"
    )
    content_file_name: str = f"{project_key}_{project_name}__{safe_page_name}.md"
    content_file_path: str = f"{content_dir_path}{content_file_name}"

    # Report and skip pages whose Markdown file is missing instead of aborting the run.
    if not os.path.isfile(content_file_path):
        print(f'ERROR : ファイルは存在しません。{content_file_path}')
        continue

    # Read as UTF-8 explicitly: without it the platform's locale encoding is used,
    # which can raise or garble non-ASCII text on non-UTF-8 systems.
    with open(content_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    file_contents.append({
        'content': content,
        'metadata': {
            'project_key': project_key,
            'project_name': project_name,
            'page_name': page_name,
            'filename': os.path.basename(content_file_path),
        },
    })

In [None]:
from langchain.docstore.document import Document

# Accumulator for the Document objects built from file_contents (populated in the following cell).
documents: list[Document] = []

In [None]:
# Wrap every loaded wiki page in a Document: the page text becomes
# page_content and the metadata dict is passed through unchanged.
documents.extend(
    Document(page_content=page['content'], metadata=page['metadata'])
    for page in file_contents
)

In [None]:
# documents

# (データをチャンクに分割)

In [None]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

## (分割方法の定義（以下のどちらかを有効に）)

In [None]:
# text_splitter = CharacterTextSplitter(
#     separator = "\n\n",
#     chunk_size=200,
#     chunk_overlap=0
# )

In [None]:
# text_splitter = RecursiveCharacterTextSplitter(
#     separators=[
#         "$",
#         "\n\n",
#         "\uff0e",  # fullwidth full stop "．" (note: the ideographic full stop "。" is "\u3002")
#         "\n",
#         "\uff0c",  # 全角コンマ
#         ".",
#         ",",
#         " ",
#         "",
#     ],
#     chunk_size=200,
#     chunk_overlap=0
# )

## (チャンクに分割)

In [None]:
# documents_chunk = text_splitter.split_documents(documents)

In [None]:
# documents_chunk

In [None]:
# type(documents_chunk[0])

In [None]:
# # 文字数の確認
# for doc in documents_chunk:
#     print("文字数：",len(doc.page_content))

# ベクトルDB作成

## 埋め込みモデル定義

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# Define the embedding model.
# NOTE(review): "all-MiniLM-L6-v2" is, as far as I know, trained mainly on English text,
# while the queries in this notebook are Japanese — confirm retrieval quality or
# consider a multilingual embedding model.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# Notebook display: evaluate the object so the cell output shows its repr.
embeddings

In [None]:
# Notebook display: show the concrete class of the embeddings object.
type(embeddings)

In [None]:
# for filepath in backlog_wiki_files:
#     print(os.path.basename(filepath))

In [None]:
# file_contents = []

In [None]:
# for filepath in backlog_wiki_files:
#     with open(filepath, 'r') as f:
#         content = f.read()

#         file_contents.append({
#             'content': content,
#             'metadata': {
#                 'filename': os.path.basename(filepath),
#             },
#         })

In [None]:
# type(file_contents)

## ベクトルDB作成

In [None]:
from langchain.vectorstores import Chroma

In [None]:
# Build the vector database from the documents using the embedding model.
# Caution (original author's note): running this cell twice stores the same
# data twice in the one vector database.
vectordb = Chroma.from_documents(
    documents=documents,
    embedding=embeddings
)

In [None]:
# Notebook display: evaluate the object so the cell output shows its repr.
vectordb

## 検索確認

In [None]:
# Retrieval smoke test: fetch the top 3 hits for one sample query and print
# each result (a (document, score) pair) after a '--' separator line.
top_hits = vectordb.similarity_search_with_relevance_scores(query="時短勤務のルールは？", k=3)
for hit in top_hits:
    print('--', hit, sep='\n')

In [None]:
# Sample queries for spot-checking retrieval; each entry is a dict with a
# single 'question' key.
question_texts = (
    '時短勤務のルールは？',
    'さくらインターネットの社員にカードキーを貸し出す方法は？',
    'ロゴはどこにありますか？',
    '開発用アカウント',
)
questions = [{'question': text} for text in question_texts]

In [None]:
# For every sample question, fetch the top 3 hits and flatten each one into a
# single record: question info, document metadata, document text and score.
result = []

for question_no, entry in enumerate(questions):
    hits = vectordb.similarity_search_with_relevance_scores(query=entry['question'], k=3)
    # Each hit is a (document, score) pair; unpack it directly in the loop header.
    result.extend(
        {
            'question no.': question_no,
            'question': entry['question'],
            'document no.': document_no,
            'project key': hit_doc.metadata['project_key'],
            'project name': hit_doc.metadata['project_name'],
            'document name': hit_doc.metadata['page_name'],
            'document content': hit_doc.page_content,
            'score': relevance,
        }
        for document_no, (hit_doc, relevance) in enumerate(hits)
    )

In [None]:
import pandas as pd

In [None]:
# Tabulate the search hits: one row per dict collected in result.
result_df = pd.DataFrame(result)

In [None]:
# Notebook display: render the DataFrame as the cell output.
result_df

In [None]:
import datetime
import os

# Export the results to an Excel file whose name carries the current timestamp,
# so each run writes to a different file name.
output_dir = 'output'
# to_excel does not create missing directories; make sure the target exists
# so a fresh checkout does not fail on the export.
os.makedirs(output_dir, exist_ok=True)

timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
result_df.to_excel(f'{output_dir}/backlog-wiki-chromadb.{timestamp}.xlsx', index=False)