In [1]:
# 0) Colab互換: requests を先に固定
!pip install -U "requests==2.32.4"

# 1) OpenAI SDK（公式）
!pip install -U "openai>=2.6.1,<3"  # 2025-10時点の最新 2.6.1 を含む範囲

# 2) LangChain 分割パッケージ
!pip install -U \
  "langchain-core==1.0.1" \
  "langchain-openai==1.0.1" \
  "langchain-community==0.4.1"

# 3) Chroma 連携（LangChain側の統合パッケージ + 本体）
!pip install -U "langchain-chroma>=0.1.2" "chromadb>=0.5.4"

!pip install html2text

Collecting requests==2.32.4
  Downloading requests-2.32.4-py3-none-any.whl.metadata (4.9 kB)
Downloading requests-2.32.4-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.8/64.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.32.5
    Uninstalling requests-2.32.5:
      Successfully uninstalled requests-2.32.5
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-community 0.4.1 requires requests<3.0.0,>=2.32.5, but you have requests 2.32.4 which is incompatible.
langchain 0.3.27 requires langchain-core<1.0.0,>=0.3.72, but you have langchain-core 1.0.1 which is incompatible.
langchain 0.3.27 requires langchain-text-splitters<1.0.0,>=0.3.9, but you have langchain-text-splitters 1.0.0 which is in

In [2]:
import os
from google.colab import userdata

# The OpenAI key is pulled from Colab's user secrets rather than hard-coded.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

# Tracing switches and the endpoint they point at.
os.environ.update(
    LANGCHAIN_TRACING_V2="true",
    LANGCHAIN_ENDPOINT="https://api.smith.langchain.com",
)

# Tracing credentials (also from Colab's user secrets) and the project name.
os.environ.update(
    LANGCHAIN_API_KEY=userdata.get("LANGCHAIN_API_KEY"),
    LANGCHAIN_PROJECT="agent-book",
)

In [3]:
from langchain_community.document_loaders import RecursiveUrlLoader

# Crawl settings gathered in one place so they are easy to tune.
DOCS_ROOT_URL = "https://docs.langchain.com/oss/python/integrations/"
CRAWL_MAX_DEPTH = 2   # widen as needed
CRAWL_TIMEOUT = 10    # adjust to the network environment

loader = RecursiveUrlLoader(
    url=DOCS_ROOT_URL,
    max_depth=CRAWL_MAX_DEPTH,
    timeout=CRAWL_TIMEOUT,
)

# Fetch every page the loader yields and report how many were collected.
raw_docs = loader.load()
print("raw_docs:", len(raw_docs))

raw_docs: 55


In [4]:
from langchain_community.document_transformers import Html2TextTransformer
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

# Run every crawled document through Html2TextTransformer. The result is
# called md_like_docs because the cells below split it on Markdown headings.
html2text = Html2TextTransformer()
md_like_docs = html2text.transform_documents(raw_docs)

import re
def preprocess_md_like(text: str) -> str:
    """Normalise blank-line runs in Markdown-like text.

    Any run of three or more consecutive newlines is collapsed to exactly
    two (i.e. a single blank line), then leading and trailing whitespace is
    stripped from the whole string.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    blank_run = re.compile(r"\n{3,}")
    collapsed = blank_run.sub("\n\n", text)
    return collapsed.strip()

# Split on level 1-3 Markdown headings; the second item of each pair is the
# metadata key under which the heading text is stored.
md_headers = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")]
)

md_docs = []
for page_doc in md_like_docs:
    cleaned = preprocess_md_like(page_doc.page_content)
    # Some pages have no headings (or broken ones), so the split is guarded.
    try:
        # Build the complete section list BEFORE touching md_docs. Appending
        # inside the loop meant a failure part-way through left some sections
        # in md_docs and then added the whole page as well, duplicating text.
        sections = list(md_headers.split_text(cleaned))
        for section in sections:
            # Carry the page-level metadata (URL etc.) over to each section.
            section.metadata.update(page_doc.metadata)
    except Exception as exc:
        # Best-effort fallback: keep the page as a single document, but say so
        # instead of swallowing the error silently.
        print(
            f"header split failed for {page_doc.metadata.get('source')!r}: "
            f"{exc!r}; keeping the page unsplit"
        )
        page_doc.page_content = cleaned
        sections = [page_doc]
    md_docs.extend(sections)

# Markdown gets slightly larger chunks, and code-fence boundaries are taken
# into account because the docs contain a lot of sample code.
# Separators are listed from coarsest to finest.
CHUNK_SEPARATORS = ["```", "\n## ", "\n### ", "\n\n", "\n", " ", ""]

md_splitter = RecursiveCharacterTextSplitter(
    separators=CHUNK_SEPARATORS,
    chunk_size=1200,
    chunk_overlap=150,
)

# Cut every header-level section down to chunk size and report the total.
docs = md_splitter.split_documents(md_docs)
print("split docs:", len(docs))

split docs: 1664


In [5]:
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma

EMBEDDING_MODEL = "text-embedding-3-large"
COLLECTION_NAME = "langchain_docs_integrations"
PERSIST_DIR = "./chroma_langchain_integrations"

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

# Open the persisted collection first. Calling Chroma.from_documents on every
# run re-embeds all chunks and, because no ids are passed, adds them again to
# the collection already stored under PERSIST_DIR, so re-running the cell
# would fill the store with duplicates.
vector = Chroma(
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
    persist_directory=PERSIST_DIR,
)

# Only embed and insert when the collection is still empty.
# NOTE: after changing the crawl or split settings, delete PERSIST_DIR so the
# collection is rebuilt from the new chunks.
if not vector.get(limit=1)["ids"]:
    vector = Chroma.from_documents(
        docs,
        embedding=embeddings,
        collection_name=COLLECTION_NAME,
        persist_directory=PERSIST_DIR,
    )

# Retriever configured for MMR search over the vector store.
mmr_search_kwargs = {"k": 10, "fetch_k": 60, "lambda_mult": 0.65}
retriever = vector.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

# Natural-language question followed by related keywords.
query = (
    "Is there a document loader for reading data from AWS S3? "
    "(Amazon S3, bucket, s3://, S3DirectoryLoader, S3FileLoader)"
)

docs_mmr = retriever.invoke(query)

# Show the hit count, then the origin and first 300 characters of the top five.
print(len(docs_mmr))
for rank, hit in enumerate(docs_mmr[:5], start=1):
    meta = hit.metadata
    # Fall back through the metadata keys that may hold the origin.
    src = meta.get("source") or meta.get("file_path") or meta.get("url")
    preview = hit.page_content[:300]
    print(f"[{rank}] {src}\n{preview}...\n")

10
[1] https://docs.langchain.com/oss/python/integrations/providers/aws
AWS S3 Directory and File  
> Amazon Simple Storage Service (Amazon S3) is an object storage service. AWS
> S3 Directory AWS S3 Buckets  
See a usage example for S3DirectoryLoader. See a usage example for
S3FileLoader.  
Copy  
Ask AI  
from langchain_community.document_loaders import S3DirectoryLoa...

[2] https://docs.langchain.com/oss/python/integrations/document_loaders
Amazon Textract| Uses AWS API to load PDFs| API
MathPix| Uses MathPix to load PDFs| Package
PDFPlumber| Load PDF files using PDFPlumber| Package
PyPDFDirectry| Load a directory with PDF files| Package
PyPDFium2| Load PDF files using PyPDFium2| Package
PyMuPDF| Load PDF files using PyMuPDF| Package
P...

[3] https://docs.langchain.com/oss/python/integrations/document_loaders
Azure Blob Storage| Load documents from Azure Blob Storage| ✅|
`AzureBlobStorageLoader`
Dropbox| Load documents from Dropbox| ❌| `DropboxLoader`
Google Cloud Storage Director

In [6]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt text kept in its own constant; {context} and {question} are filled
# in when the chain runs.
RAG_TEMPLATE = '''\
Please answer the question based solely on the context provided below.

context: """
{context}
"""

question: {question}
'''
prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

# Chat model used to generate the answer.
model = ChatOpenAI(
    model_name="gpt-4o-mini",
    temperature=0,
)

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(documents) -> str:
    """Join the text of the retrieved documents into one context string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines ("" for no documents).
    """
    return "\n\n".join(doc.page_content for doc in documents)


# Feeding the retriever straight into {context} puts the Python repr of the
# document list (metadata, escaped newlines and all) into the prompt.
# Piping through format_docs sends only the readable text.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

output = chain.invoke(query)
print(output)

Yes, there are document loaders for reading data from AWS S3. Specifically, you can use `S3DirectoryLoader` to load documents from an AWS S3 directory and `S3FileLoader` to load documents from an AWS S3 file.
