# Document Loader


In [4]:
from langchain.document_loaders import TextLoader, ArxivLoader, UnstructuredURLLoader


docs = TextLoader("./state_of_the_union.txt").load()


query = "2005.14165"
arxiv_docs = ArxivLoader(query=query, load_max_docs=5).load()
arxiv_docs[0].metadata


# urls = [
#     "https://react-lm.github.io/",
# ]

# loader = UnstructuredURLLoader(urls=urls)
# data = loader.load()
# data[0].metadata

{'Published': '2020-07-22',
 'Title': 'Language Models are Few-Shot Learners',
 'Authors': 'Tom B. Brown, Benjamin Mann, Nick Ryder, Melanie Subbiah, Jared Kaplan, Prafulla Dhariwal, Arvind Neelakantan, Pranav Shyam, Girish Sastry, Amanda Askell, Sandhini Agarwal, Ariel Herbert-Voss, Gretchen Krueger, Tom Henighan, Rewon Child, Aditya Ramesh, Daniel M. Ziegler, Jeffrey Wu, Clemens Winter, Christopher Hesse, Mark Chen, Eric Sigler, Mateusz Litwin, Scott Gray, Benjamin Chess, Jack Clark, Christopher Berner, Sam McCandlish, Alec Radford, Ilya Sutskever, Dario Amodei',
 'Summary': "Recent work has demonstrated substantial gains on many NLP tasks and\nbenchmarks by pre-training on a large corpus of text followed by fine-tuning on\na specific task. While typically task-agnostic in architecture, this method\nstill requires task-specific fine-tuning datasets of thousands or tens of\nthousands of examples. By contrast, humans can generally perform a new language\ntask from only a few examples o

# Document Transformers


### 文本分割


In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

with open("./state_of_the_union.txt", "r") as file:
    text = file.read()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    add_start_index=True,
)

docs = text_splitter.create_documents([text])
print(docs[0])

page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and' metadata={'start_index': 0}


### Code 分割


In [7]:
from langchain.text_splitter import Language

html_text = """
<!DOCTYPE html>
<html>
    <head>
        <title>🦜️🔗 LangChain</title>
        <style>
            body {
                font-family: Arial, sans-serif;
            }
            h1 {
                color: darkblue;
            }
        </style>
    </head>
    <body>
        <div>
            <h1>🦜️🔗 LangChain</h1>
            <p>⚡ Building applications with LLMs through composability ⚡</p>
        </div>
        <div>
            As an open source project in a rapidly developing field, we are extremely open to contributions.
        </div>
    </body>
</html>
"""

html_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.HTML,
    chunk_size=60,
    chunk_overlap=0,
)

html_docs = html_splitter.create_documents([html_text])
print(html_docs)

[Document(page_content='<!DOCTYPE html>\n<html>'), Document(page_content='<head>\n        <title>🦜️🔗 LangChain</title>'), Document(page_content='<style>\n            body {\n                font-family: Aria'), Document(page_content='l, sans-serif;\n            }\n            h1 {'), Document(page_content='color: darkblue;\n            }\n        </style>\n    </head'), Document(page_content='>'), Document(page_content='<body>'), Document(page_content='<div>\n            <h1>🦜️🔗 LangChain</h1>'), Document(page_content='<p>⚡ Building applications with LLMs through composability ⚡'), Document(page_content='</p>\n        </div>'), Document(page_content='<div>\n            As an open source project in a rapidly dev'), Document(page_content='eloping field, we are extremely open to contributions.'), Document(page_content='</div>\n    </body>\n</html>')]


# Embedding


In [8]:
from langchain.embeddings import OpenAIEmbeddings

embeddings_model = OpenAIEmbeddings()
embedded_query = embeddings_model.embed_query("Embedding this text")
print(embedded_query)


embeddings = embeddings_model.embed_documents(
    ["This is a document", "This is another document"]
)
print(embeddings)

[-0.030698321132572707, 0.01052702103656507, -0.011462463763140729, -0.00022027363143233128, -0.00041069709438949775, 0.024571831767458557, -0.007753631872984188, -0.010217402852959264, -0.019815569338706915, -0.01338604903442265, -0.0006332352511676604, 0.033359720621489036, -0.007700930657869754, 0.005527015196870871, -0.006673261852581711, 0.022621895655788784, 0.023992120729506176, 0.01764165387770803, 0.003909754021315636, -0.02283270051624652, -0.018050085384461907, -0.0033761564883807225, 0.008623198429912764, 0.0037549449295127334, -0.012160751720193354, -0.014571821019392717, 0.015625841596391174, -0.04065880841402443, -0.01338604903442265, -0.013768131563433782, 0.008379456415954042, -0.025164719156927428, -0.012272740463535351, -0.008774713433836547, -0.023741792402434323, 0.0017457200090468792, -0.0012631766472868387, -0.0199341453264846, 0.03388672997866571, -0.007233209760243203, 0.01333993552940518, 0.007957848557683682, 0.011706205777099452, -0.006534921803190578, -0.01

# Vector Stores


In [11]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

raw_documents = TextLoader("./state_of_the_union.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

embeddings_model = OpenAIEmbeddings()
db = Chroma.from_documents(documents, embeddings_model)

# 使用文本进行语义相似度搜索
query = "What did the president say about the economy?"
docs = db.similarity_search(query, k=3)
print(docs[0].page_content)

Created a chunk of size 215, which is longer than the specified 200
Created a chunk of size 232, which is longer than the specified 200
Created a chunk of size 242, which is longer than the specified 200
Created a chunk of size 219, which is longer than the specified 200
Created a chunk of size 304, which is longer than the specified 200
Created a chunk of size 205, which is longer than the specified 200
Created a chunk of size 332, which is longer than the specified 200
Created a chunk of size 215, which is longer than the specified 200
Created a chunk of size 203, which is longer than the specified 200
Created a chunk of size 281, which is longer than the specified 200
Created a chunk of size 201, which is longer than the specified 200
Created a chunk of size 250, which is longer than the specified 200
Created a chunk of size 325, which is longer than the specified 200
Created a chunk of size 242, which is longer than the specified 200


Our economy grew at a rate of 5.7% last year, the strongest growth in nearly 40 years, the first step in bringing fundamental change to an economy that hasn’t worked for the working people of this nation for too long.


In [12]:
# 使用嵌入向量进行语义相似度搜索
embedding_vector = embeddings_model.embed_query(query)
docs = db.similarity_search_by_vector(embedding_vector)
print(docs[0].page_content)

Our economy grew at a rate of 5.7% last year, the strongest growth in nearly 40 years, the first step in bringing fundamental change to an economy that hasn’t worked for the working people of this nation for too long.
