In [2]:
!pip install BeautifulSoup4 Tiktoken OpenAI pandas SciPy lxml



In [3]:
import glob
import re
import os

from bs4 import BeautifulSoup

# Root directory that is searched recursively for the .html files to process
# (a relative path, so it is resolved against the current working directory)
subdirectory_path = 'apidocs'

In [4]:
def extract_html_content_from_files_in_directory(subdirectory_path):
    """Extract the text content of every ``.html`` file below a directory.

    The directory is searched recursively. Each matching file is read as
    UTF-8, parsed with BeautifulSoup (``lxml`` parser) and reduced to the
    result of ``soup.get_text()``.

    Args:
        subdirectory_path: Root directory to search.

    Returns:
        A dict mapping each HTML file path (as produced by ``glob``) to the
        text extracted from that file. Empty when no HTML file is found.
        Entries are inserted in sorted path order.
    """
    # With recursive=True, '**' yields the root, every nested directory and
    # every file below it.
    all_paths = glob.glob(os.path.join(subdirectory_path, '**'), recursive=True)

    # Keep regular files only: a directory whose name happens to end in
    # ".html" would otherwise make open() fail. Sorting makes the order of
    # the resulting dict reproducible between runs.
    html_files = sorted(
        path for path in all_paths
        if path.endswith('.html') and os.path.isfile(path)
    )

    html_texts = {}

    for f in html_files:
        # Load the HTML file
        with open(f, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Parse the markup and keep only its text content
        soup = BeautifulSoup(html_content, 'lxml')
        html_texts[f] = soup.get_text()

    return html_texts

In [5]:
import tiktoken
import itertools

# Load tiktoken's "cl100k_base" encoding by name; it is passed to
# chunkify_text to tokenize text and decode token chunks
encoding = tiktoken.get_encoding("cl100k_base")

# Maximum number of tokens allowed in a single text chunk
max_chunk_size = 512

In [6]:
def chunkify_text(text, max_tokens, encoding):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` tokens.

    The text is tokenized with ``encoding.encode``, the token sequence is cut
    into fixed-size slices (only the last one may be shorter) and every slice
    is converted back to a string with ``encoding.decode``.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk (integer >= 1).
        encoding: Tokenizer object providing ``encode(text)`` and
            ``decode(tokens)``, e.g. a tiktoken encoding.

    Returns:
        The decoded text chunks in their original order; an empty list when
        ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    # A non-positive limit cannot be honoured; fail loudly instead of
    # producing an empty leading chunk followed by one-token chunks.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be at least 1, got {max_tokens!r}")

    # Tokenize the text
    tokens = encoding.encode(text)

    # NOTE(review): boundaries are chosen purely by token count, so a chunk
    # may start or end in the middle of a word - confirm this is acceptable.
    return [
        encoding.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

In [7]:
# Example usage: chunk a short sample sentence and print every resulting chunk.
# NOTE: `text_chunks` is a list of strings here; a later cell rebinds the same
# name to a dict of per-file chunks.
text = "Hello, world! This is a sample text to tokenize and chunkify using tiktoken."
text_chunks = chunkify_text(text, max_chunk_size, encoding)

# Chunks are numbered starting at 1 in the printed output
for i, chunk in enumerate(text_chunks):
    print(f"Chunk {i+1}: {chunk}")

Chunk 1: Hello, world! This is a sample text to tokenize and chunkify using tiktoken.


In [8]:
# Display the raw list of chunks produced for the sample text
text_chunks

['Hello, world! This is a sample text to tokenize and chunkify using tiktoken.']

In [9]:
def extract_html_and_chunkify_files_in_directory(subdirectory_path):
    """Extract the text of each HTML file under a directory and chunk it.

    The per-file text comes from extract_html_content_from_files_in_directory;
    each text is then split by chunkify_text using the notebook-level
    ``max_chunk_size`` and ``encoding`` settings.

    Args:
        subdirectory_path: Root directory to search for HTML files.

    Returns:
        A dict mapping each HTML file path to its list of text chunks.
    """
    texts_by_file = extract_html_content_from_files_in_directory(subdirectory_path)

    # One entry per file, keyed exactly like the extraction result
    return {
        path: chunkify_text(file_text, max_chunk_size, encoding)
        for path, file_text in texts_by_file.items()
    }

In [10]:
# Build the mapping of HTML file path -> list of text chunks for every
# HTML file found under subdirectory_path
text_chunks = extract_html_and_chunkify_files_in_directory(subdirectory_path)

In [13]:
# Number of dict entries, i.e. HTML files processed (not the total number of chunks)
len(text_chunks)

2688

In [15]:
# Inspect the chunks of the first file in the mapping (dicts keep insertion order)
keys = list(text_chunks.keys())
text_chunks[keys[0]]

[' \n\n\n\nOverview: module code — 🦜🔗 LangChain 0.0.305\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nAPI\n\n\nExperimental\n\n\nPython Docs\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nToggle Menu\n\n\n\nPrev\nUp\nNext\n\n\n\nLangChain 0.0.305\n\n\n\n\n\n\n\n\nAll modules for which code is available\nlangchain.adapters.openai\nlangchain.agents.agent\nlangchain.agents.agent_iterator\nlangchain.agents.agent_toolkits.ainetwork.toolkit\nlangchain.agents.agent_toolkits.amadeus.toolkit\nlangchain.agents.agent_toolkits.azure_cognitive_services\nlangchain.agents.agent_toolkits.base\nlangchain.agents.agent_toolkits.conversational_retrieval.openai_functions\nlangchain.agents.agent_toolkits.conversational_retrieval.tool\nlangchain.agents.agent_toolkits.csv.base\nlangchain.agents.agent_toolkits.file_management.toolkit\nlangchain.agents.agent_toolkits.github.toolkit\nlangchain.agents.agent_toolkits.gitlab.toolkit\nlangchain.agents.agent_toolkits.gmail.toolkit\nlangchain.agents.agent_toolkits.ji

In [14]:
import pickle

# Cache the per-file chunks on disk. The context manager closes (and thereby
# flushes) the file even if pickling fails, so no handle is left open.
# NOTE: only unpickle this file when it comes from a trusted location.
with open('text_chunks.pkl', 'wb') as file_text_chunks:
    pickle.dump(text_chunks, file_text_chunks)

In [11]:
#test