In [41]:
!pip install pypdf2 unstructured langchain tiktoken
!pip install -U langchain-community



In [42]:
from langchain.document_loaders import ReadTheDocsLoader
import tiktoken
import itertools
import json

In [43]:
%%time

# Folder handed to the loader (presumably the scraped API docs - confirm).
path="danny-moldovan-qa-using-vector-databases-lp/apidocs"

# Build a single loader over `path`, passing UTF-8 as the encoding, and load
# everything into `documents`, which the counting and splitting cells consume.
loader = ReadTheDocsLoader(path, encoding="utf-8")
documents = loader.load()

CPU times: user 2min 41s, sys: 220 ms, total: 2min 41s
Wall time: 2min 41s


In [44]:
# Sanity check: report how many documents the loader returned.
num_documents = len(documents)
print(f"Number of documents processed: {num_documents}")

Number of documents processed: 2688


In [45]:
# Select the encoding based on the model
# NOTE(review): "cl100k_base" is hard-coded; confirm it matches the tokenizer
# of the model used downstream.
encoding = tiktoken.get_encoding("cl100k_base")

# Chunking parameters handed to the text splitter. The splitter is configured
# with a token-counting length function, so both values are token counts.
chunk_size = 512
chunk_overlap = 100

In [46]:
def get_number_tokens_in_text(text):
    """Return how many tokens `text` occupies under the module-level `encoding`.

    Args:
        text: String to tokenize.

    Returns:
        int: Number of tokens produced by ``encoding.encode``.
    """
    # disallowed_special=() makes tiktoken encode special-token spellings such
    # as "<|endoftext|>" as ordinary text. With the default setting encode()
    # raises ValueError when one appears, which would abort the whole split on
    # scraped documentation that merely mentions such a token.
    return len(encoding.encode(text, disallowed_special=()))

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Create the text splitter
# Lengths are measured with get_number_tokens_in_text, so chunk_size and
# chunk_overlap are token counts rather than character counts.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function = get_number_tokens_in_text,
    separators=["\n\n", "\n", " ", ""] # in that order: blank line (paragraph), newline, space (word), then single characters
)

# Split the documents
# `docs` holds the resulting chunks; later cells inspect and export them.
docs = text_splitter.split_documents(documents)

In [48]:
# Eyeball the first chunk produced by the splitter.
docs[0]

Document(page_content='langchain API Reference¶\nlangchain.adapters¶\nClasses¶\nadapters.openai.ChatCompletion()\nFunctions¶\nadapters.openai.aenumerate(iterable[,\xa0start])\nAsync version of enumerate.\nadapters.openai.convert_dict_to_message(_dict)\nadapters.openai.convert_message_to_dict(message)\nadapters.openai.convert_messages_for_finetuning(...)\nConvert messages to a list of lists of dictionaries for fine-tuning.\nadapters.openai.convert_openai_messages(messages)\nConvert dictionaries representing OpenAI messages to LangChain format.\nlangchain.agents¶\nAgent is a class that uses an LLM to choose a sequence of actions to take.\nIn Chains, a sequence of actions is hardcoded. In Agents,\na language model is used as a reasoning engine to determine which actions\nto take and in which order.\nAgents select and use Tools and Toolkits for actions.\nClass hierarchy:\nBaseSingleActionAgent --> LLMSingleActionAgent\n                          OpenAIFunctionsAgent\n                       

In [49]:
# Total number of chunks produced by the split.
len(docs)

10162

In [50]:
# Flatten every split document into a plain dict for the JSONL export.
# The record id is the chunk's position in `docs`; enumerate supplies it
# without rebinding the built-in `id` in the notebook namespace.
chunks = [
    {'id': chunk_id, 'page_content': doc.page_content, 'source': doc.metadata['source']}
    for chunk_id, doc in enumerate(docs)
]

In [51]:
def save_to_jsonl(data, filename):
    """Write ``data`` to ``filename`` in JSON Lines format.

    Args:
        data: Iterable of JSON-serializable objects (e.g. dictionaries).
        filename: Path of the file to create or overwrite; written as UTF-8.

    Each element becomes exactly one line: its ``json.dumps`` encoding
    followed by a newline.
    """
    with open(filename, 'w', encoding='utf-8') as sink:
        # writelines consumes the generator lazily, one serialized record at a time.
        sink.writelines(f"{json.dumps(record)}\n" for record in data)

In [52]:
# Persist the chunk records, one JSON object per line.
save_to_jsonl(chunks, "danny-moldovan-qa-using-vector-databases-lp/chunks.jsonl")