# GitLoaderを使用したソースコード問い合わせチャットボット

## 環境構築

In [None]:
# Install exact, pinned versions of the packages this notebook uses.
!pip install langchain==0.0.218
!pip install openai==0.27.8
!pip install chromadb==0.3.26
!pip install tiktoken==0.4.0
!pip install GitPython==3.1.31

Collecting langchain==0.0.218
  Downloading langchain-0.0.218-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.6.0,>=0.5.7 (from langchain==0.0.218)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting langchainplus-sdk>=0.0.17 (from langchain==0.0.218)
  Downloading langchainplus_sdk-0.0.20-py3-none-any.whl (25 kB)
Collecting openapi-schema-pydantic<2.0,>=1.2 (from langchain==0.0.218)
  Downloading openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.0/90.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.6.0,>=0.5.7->langchain==0.0.218)
  Downloading marshmallow-3.20.1-py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[

In [None]:
import os
from langchain.vectorstores import Chroma
from langchain.indexes import VectorstoreIndexCreator
from langchain.document_loaders import GitLoader
from datetime import datetime
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationSummaryMemory
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain

# TODO: an OpenAI API key must be registered here before running the notebook.
# NOTE(review): as written this overwrites any OPENAI_API_KEY already present
# in the environment with an empty string.
os.environ["OPENAI_API_KEY"] = ""

## 実装方法

In [None]:
# Capture the current local time once so every value derived from it agrees.
now = datetime.now()

# Millisecond-precision timestamp used as a per-run directory name.
# "%f" yields microseconds (6 digits); dropping the last 3 leaves milliseconds.
# Date and time are separated by "_" and the time fields by "-" so the value
# is readable and contains no ":" characters, which are not allowed in
# Windows file and directory names.
formatted_now = now.strftime('%Y-%m-%d_%H-%M-%S.%f')[:-3]

print(formatted_now)

# Repository to load, the branch to check out, the local checkout directory,
# and the file extension used to select files.
clone_url = "https://github.com/hwchase17/langchain"
branch = "master"
repo_path = f"./tempo/{formatted_now}"
filter_ext = ".py"

# When the checkout directory already exists, clear the clone URL.
# NOTE(review): presumably this makes GitLoader reuse the existing checkout
# instead of cloning again — confirm against the GitLoader documentation.
if os.path.exists(repo_path):
    clone_url = None


2023-09-2915:50:30.963


In [None]:
formatted_now

'2023-09-2915:50:30.963'

In [None]:
def _has_target_extension(file_path):
    """Return True when ``file_path`` ends with the configured extension."""
    return file_path.endswith(filter_ext)


# Build the loader from the repository settings defined in the earlier cell.
loader = GitLoader(
    clone_url=clone_url,
    branch=branch,
    repo_path=repo_path,
    file_filter=_has_target_extension,
)

In [None]:
# Run the loader configured above and keep the resulting documents in `data`.
data = loader.load()

In [None]:
# Split the loaded documents into chunks (chunk_size=500, no overlap).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

# Embed every chunk with OpenAI embeddings and index the results in Chroma.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=OpenAIEmbeddings(),
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
## TODO: persist the vector store to disk, and render the results in a nicer UI
    # client_settings = chromadb.config.Settings(
    #     chroma_db_impl="duckdb+parquet",
    #     persist_directory=DB_DIR,
    #     anonymized_telemetry=False
    # )

In [None]:
# Completion model (temperature 0) handed to the conversation-summary memory.
llm = OpenAI(temperature=0)

# Memory object exposed to the chain under the "chat_history" key.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

In [None]:
# Chat model used by the retrieval chain; note that this rebinds `llm`.
llm = ChatOpenAI()

# Expose the vector store through the retriever interface.
retriever = vectorstore.as_retriever()

# Conversational question-answering chain over the indexed source code.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory,
)

In [None]:
# Ask the chain for a detailed explanation of LangChain.
qa("LangChainについて詳細に説明してください？")

{'question': 'LangChainについて詳細に説明してください？',
 'chat_history': [SystemMessage(content='', additional_kwargs={})],
 'answer': 'LangChainは、自然言語処理（NLP）タスクを行うためのPythonパッケージです。LangChainには、さまざまなタスクを実行するための複数のチェーンが含まれています。これらのチェーンは、テキストデータの前処理、特徴エンジニアリング、モデルのトレーニング、推論などのステップを組み合わせてタスクを実行します。\n\nLangChainのバージョン2.0.0は現在利用可能ですが、3.0.0で廃止予定となっています。最新バージョンのLangChainを使用することをおすすめします。'}

In [None]:
# Follow-up: ask which concrete tasks "various tasks" refers to.
qa("さまざまなタスクとは具体的にはどのようなタスクですか？")

{'question': 'さまざまなタスクとは具体的にはどのようなタスクですか？',
 'chat_history': [SystemMessage(content='\nThe human asked for a detailed explanation of LangChain. The AI explained that LangChain is a Python package for performing natural language processing tasks, containing multiple chains for performing tasks such as preprocessing text data, feature engineering, training models, and inference. Version 2.0.0 of LangChain is currently available, but version 3.0.0 is scheduled to be discontinued. The AI recommended using the latest version of LangChain.', additional_kwargs={})],
 'answer': 'さまざまなタスクには、以下のようなものが含まれます。\n\n1. 簡単な質問への回答：例えば、天気予報や時間帯の確認などの基本的な情報を提供します。\n\n2. 情報の解説や説明：特定のトピックに関する詳細な情報や概念の説明を行います。\n\n3. 会話やディスカッション：特定のトピックについての意見交換や議論に参加します。\n\n4. ヘルプやアドバイスの提供：特定の問題や課題に対して解決策やアドバイスを提案します。\n\n5. タスクの自動化：特定の作業や手続きを自動化するための手助けをします。\n\nこれらは一部の例ですが、私は多くの異なるタスクに対応することができます。どのような質問やタスクがあるか具体的にお知りになりたいですか？'}

In [None]:
# Ask how to implement LangChain so it answers using fresh Google search data.
qa("調べたいことについて最新データをgoogle検索から取得して答えてくれるようにLangChainを実装したいです。")

{'question': '調べたいことについて最新データをgoogle検索から取得して答えてくれるようにLangChainを実装したいです。',
 'chat_history': [SystemMessage(content='\nThe human asked for a detailed explanation of LangChain. The AI explained that LangChain is a Python package for performing natural language processing tasks, containing multiple chains for performing tasks such as preprocessing text data, feature engineering, training models, and inference. Version 2.0.0 of LangChain is currently available, but version 3.0.0 is scheduled to be discontinued. The AI recommended using the latest version of LangChain and mentioned that it can perform various tasks such as providing basic information, explaining detailed information and concepts, participating in conversations and discussions, providing help and advice, and automating tasks.', additional_kwargs={})],
 'answer': 'はい、LangChainを使用して最新のGoogle検索データを取得する方法があります。具体的には、以下の手順を実行することで実現できます。\n\n1. Google検索APIを使用して、指定したキーワードに関連する検索結果を取得します。\n2. 取得した検索結果をLangChainの入力として使用します。\n3. LangCh

In [None]:
# Ask for a Python implementation that uses SerpAPI.
qa("serp apiを使った実装をpythonで書いてください")

{'question': 'serp apiを使った実装をpythonで書いてください',
 'chat_history': [SystemMessage(content="\nThe human asked for a detailed explanation of LangChain. The AI explained that LangChain is a Python package for performing natural language processing tasks, containing multiple chains for performing tasks such as preprocessing text data, feature engineering, training models, and inference. Version 2.0.0 of LangChain is currently available, but version 3.0.0 is scheduled to be discontinued. The AI recommended using the latest version of LangChain and mentioned that it can perform various tasks such as providing basic information, explaining detailed information and concepts, participating in conversations and discussions, providing help and advice, and automating tasks. The AI also explained that LangChain can be used to retrieve the latest Google search data for the desired information, by using the Google search API to get search results related to the specified keyword, and then using the resul

In [None]:
# Ask which document loaders are available.
qa("document loaderにはどのようなものがありますか？")

{'question': 'document loaderにはどのようなものがありますか？',
 'chat_history': [SystemMessage(content="\nThe human asked for a detailed explanation of LangChain. The AI explained that LangChain is a Python package for performing natural language processing tasks, containing multiple chains for performing tasks such as preprocessing text data, feature engineering, training models, and inference. Version 2.0.0 of LangChain is currently available, but version 3.0.0 is scheduled to be discontinued. The AI recommended using the latest version of LangChain and mentioned that it can perform various tasks such as providing basic information, explaining detailed information and concepts, participating in conversations and discussions, providing help and advice, and automating tasks. The AI also explained that LangChain can be used to retrieve the latest Google search data for the desired information, by using the Google search API to get search results related to the specified keyword, and then using the res

In [None]:
# Ask how to save the vector store to a file to avoid repeated embedding calls.
qa("vector storeをファイルに保存して何回もEmbedding　APIに問い合わせないようにしたいので実装を教えてください")

{'question': 'vector storeをファイルに保存して何回もEmbedding\u3000APIに問い合わせないようにしたいので実装を教えてください',
 'chat_history': [SystemMessage(content="\nThe human asked for a detailed explanation of LangChain. The AI explained that LangChain is a Python package for performing natural language processing tasks, containing multiple chains for performing tasks such as preprocessing text data, feature engineering, training models, and inference. Version 2.0.0 of LangChain is currently available, but version 3.0.0 is scheduled to be discontinued. The AI recommended using the latest version of LangChain and mentioned that it can perform various tasks such as providing basic information, explaining detailed information and concepts, participating in conversations and discussions, providing help and advice, and automating tasks. The AI also explained that LangChain can be used to retrieve the latest Google search data for the desired information, by using the Google search API to get search results related to the spe

In [None]:
import os
import shutil

# Remove the local checkout directory and report what happened.
if not os.path.exists(repo_path):
    # Nothing to clean up.
    print(f"{repo_path} は存在しません。")
else:
    # Delete the directory tree, then confirm the deletion.
    shutil.rmtree(repo_path)
    print(f"{repo_path} を削除しました。")


./tempo/ を削除しました。
