# 出力ファイル名に付与する日時を起動日時で設定

In [None]:
import datetime

# Timestamp taken once when this cell runs; later cells embed it in output file names.
START_DATETIME = '{:%Y%m%d-%H%M%S}'.format(datetime.datetime.now())
# Bare expression: shows the value as the cell output.
START_DATETIME

# データの読み取り

In [None]:
from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader

In [None]:
import json

# Path of the JSON index that lists the exported Backlog wiki pages.
backlog_wiki_json_file_path = './backlog-wikis/wiki-pages.json'

# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform default, which would fail or garble non-ASCII text on systems
# whose locale encoding is not UTF-8.
with open(backlog_wiki_json_file_path, 'r', encoding='utf-8') as f:
    backlog_wiki_json = json.load(f)

# Peek at a single entry (index 1) to check the fields that later cells read.
i = 1
backlog_wiki_json[i]['projectId'], backlog_wiki_json[i]['projectKey'], backlog_wiki_json[i]['projectName'], backlog_wiki_json[i]['name']

In [None]:
import os

# Read the Markdown body of every wiki page listed in the JSON index and keep it
# together with the project/page identifiers as metadata.
file_contents = []

for wiki_page in backlog_wiki_json:
    # The on-disk names use a full-width slash where the page name contains '/'
    # (presumably because '/' cannot be part of a file name - confirm against
    # the exporter). Computed once and reused for both directory and file name.
    safe_page_name: str = wiki_page['name'].replace('/', '／')

    content_dir_path: str = \
        './backlog-wikis/{project_key}_{project_name}/{wiki_id}_{name}/'.format(
            project_key = wiki_page['projectKey'],
            project_name = wiki_page['projectName'],
            wiki_id = wiki_page['id'],
            name = safe_page_name,
        )

    content_file_name: str = \
        '{project_key}_{project_name}__{name}.md'.format(
            project_key = wiki_page['projectKey'],
            project_name = wiki_page['projectName'],
            name = safe_page_name,
        )

    content_file_path: str = \
        '{dir}{file}'.format(
            dir = content_dir_path,
            file = content_file_name,
        )

    # Best effort: report pages whose file is missing and carry on with the rest.
    if not os.path.isfile(content_file_path):
        print('ERROR : ファイルは存在しません。{content_file_path}'.format(content_file_path = content_file_path,))
        continue

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding (assumes the exported Markdown is UTF-8 - confirm with the exporter).
    with open(content_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # The file handle is no longer needed here, so the append lives outside the `with`.
    file_contents.append({
        'content': content,
        'metadata': {
            'project_key': wiki_page['projectKey'],
            'project_name': wiki_page['projectName'],
            'page_name': wiki_page['name'],
            'filename': os.path.basename(content_file_path),
        },
    })

In [None]:
from langchain.docstore.document import Document

# Accumulator for the LangChain Document objects; starts empty and is filled by the next cell.
documents: list[Document] = []

In [None]:
# Wrap every loaded page (text plus its metadata dict) in a Document and collect it.
documents.extend(
    Document(page_content=entry['content'], metadata=entry['metadata'])
    for entry in file_contents
)

In [None]:
# documents

# データをチャンクに分割

## 分割方法の定義（以下のどれかを有効に）

### 一番シンプルな分割：改行2つ

In [None]:
# from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
# text_splitter = CharacterTextSplitter(
#     separator = "\n\n",
#     chunk_size=10000,
#     chunk_overlap=0
# )
# documents_chunk = text_splitter.split_documents(documents)

### 再帰的に分割：句点、読点

In [None]:
# text_splitter = RecursiveCharacterTextSplitter(
#     separators=[
#         "$",
#         "\n\n",
#         "\uff0e",  # 全角ピリオド「．」
#         "\n",
#         "\uff0c",  # 全角コンマ
#         ".",
#         ",",
#         " ",
#         "",
#     ],
#     chunk_size=1000,
#     chunk_overlap=0
# )
# documents_chunk = text_splitter.split_documents(documents)

### Markdownの見出しで分割

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Split Markdown text into sections at its '#' / '##' headings.
def split_markdown_text(md_text: str) -> list[str]:
    """
    Split Markdown text at its level-1 (``#``) and level-2 (``##``) headings.

    The splitter is created with ``strip_headers=False``, so heading lines are
    not removed from the section text.

    :param md_text: Text in Markdown format.
    :return: The ``page_content`` of each section returned by the splitter.
    """
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[('#', 'H1'), ('##', 'H2')],
        strip_headers=False,
    )
    return [section.page_content for section in splitter.split_text(md_text)]

In [None]:
# Split every page at its Markdown headings; each resulting section becomes its
# own Document carrying the page's metadata.
documents_chunk: list[Document] = []

for doc in documents:
    for chunk_text in split_markdown_text(doc.page_content):
        documents_chunk.append(Document(
            page_content = chunk_text,
            # Copy the dict: without it all chunks of a page (and the page itself)
            # share one metadata object, so changing one would change them all.
            metadata = dict(doc.metadata),
        ))

## チャンクに分割した内容を確認

In [None]:
# Display the first chunk to eyeball the result of the split.
documents_chunk[0]

In [None]:
# Display the class of a chunk object.
type(documents_chunk[0])

In [None]:
# # 文字数の確認
# for doc in documents_chunk:
#     print("文字数：",len(doc.page_content))

In [None]:
# Prepend the wiki page name to the content of every chunk.
# New Document objects are built instead of editing the chunks in place, so
# `documents_chunk` keeps its original text and re-running this cell does not
# prepend the page name a second time.
documents_chunk_add = [
    Document(
        page_content = doc.metadata['page_name'] + '\n-----\n' + doc.page_content,
        metadata = dict(doc.metadata),
    )
    for doc in documents_chunk
]

## チャンクした内容を保存

In [None]:
# One plain dict per chunk (project key/name, page name, chunk text) for tabulation.
document_chunk_df_list = [
    {
        'project_key': chunk.metadata['project_key'],
        'project_name': chunk.metadata['project_name'],
        'page_name': chunk.metadata['page_name'],
        'page_content': chunk.page_content,
    }
    for chunk in documents_chunk_add
]

In [None]:
# document_chunk_df_list

In [None]:
import pandas as pd

# Turn the list of per-chunk dicts into a DataFrame (one row per chunk).
document_chunk_df = pd.DataFrame(document_chunk_df_list)
# Bare expression: shows the table as the cell output.
document_chunk_df

In [None]:
import datetime
import os

# to_excel does not create missing directories, so make sure the target folder
# exists; exist_ok=True keeps this a no-op when it is already there.
os.makedirs('output', exist_ok=True)
# Save the chunk table; the file name embeds the notebook's start timestamp.
document_chunk_df.to_excel('output/backlog-wiki-chromadb-{datetime}-document-chunk.xlsx'.format(datetime = START_DATETIME,), index=False)

# ベクトルDB作成

## 埋め込みモデル定義

### HuggingFaceEmbeddings

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
# Alternative model name, kept commented out for quick switching:
# model = 'all-MiniLM-L6-v2'
model = 'intfloat/multilingual-e5-large'
# Define the embedding model
embeddings = HuggingFaceEmbeddings(model_name = model)

### OllamaEmbeddings

In [None]:
# from langchain_community.embeddings import OllamaEmbeddings
# embeddings = OllamaEmbeddings(
#     base_url = 'http://llm-rag-examples-ollama:11434/',
#     model = 'elyza:jp8b',
# )

### embeddings確認

In [None]:
# Display the configured embeddings object.
embeddings

In [None]:
# Display the class of the embeddings object (shows which backend is active).
type(embeddings)

In [None]:
# for filepath in backlog_wiki_files:
#     print(os.path.basename(filepath))

In [None]:
# file_contents = []

In [None]:
# for filepath in backlog_wiki_files:
#     with open(filepath, 'r') as f:
#         content = f.read()

#         file_contents.append({
#             'content': content,
#             'metadata': {
#                 'filename': os.path.basename(filepath),
#             },
#         })

In [None]:
# type(file_contents)

## ベクトルDB作成

In [None]:
from langchain.vectorstores import Chroma

In [None]:
# Build the vector database. (Careful: running this cell twice stores the same
# data twice in a single vector database.)
vectordb = Chroma.from_documents(
    documents=documents_chunk_add,
    embedding=embeddings
)

In [None]:
# Display the vector store object.
vectordb

## 検索確認

In [None]:
# Quick check of the index: fetch the top 3 hits with relevance scores for one
# query and print each hit after a '--' separator line.
sc = vectordb.similarity_search_with_relevance_scores(query="時短勤務のルールは？", k=3)
for hit in sc:
    print('--', hit, sep='\n')

In [None]:
# Sample queries for the search check; each entry is a dict with a 'question' key.
questions = [
    {'question': text}
    for text in (
        '時短勤務のルールは？',
        'さくらインターネット',
        '会社ロゴはどこにありますか？',
        '開発用アカウント',
    )
]

In [None]:
# Run every sample question against the vector store (top 3 hits each) and
# collect one flat row per (question, hit) pair for tabulation.
result = []

for i, q in enumerate(questions):
    sc = vectordb.similarity_search_with_relevance_scores(query=q['question'], k=3)
    # Each hit is a (document, score) pair.
    for j, (document, score) in enumerate(sc):
        row = {
            'question no.': i,
            'question': q['question'],
            'document no.': j,
            'project key': document.metadata['project_key'],
            'project name': document.metadata['project_name'],
            'document name': document.metadata['page_name'],
            'document content': document.page_content,
            'score': score,
        }
        result.append(row)

In [None]:
import pandas as pd

In [None]:
# Turn the collected search rows into a DataFrame (one row per question/hit pair).
result_df = pd.DataFrame(result)

In [None]:
# Bare expression: shows the search-result table as the cell output.
result_df

In [None]:
import datetime
import os

# to_excel does not create missing directories, so make sure the target folder
# exists; exist_ok=True keeps this a no-op when it is already there.
os.makedirs('output', exist_ok=True)
# Save the search results; the file name embeds the notebook's start timestamp.
result_df.to_excel('output/backlog-wiki-chromadb-{datetime}-search-result.xlsx'.format(datetime = START_DATETIME,), index=False)