In [2]:
import requests
from bs4 import BeautifulSoup
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load tokenizer
tokenizer_gpt = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    return len(tokenizer_gpt.encode(text))

# Step 1: Extract text from webpage
def read_webpage(url: str) -> str:
    """Extract text from a web page and return it as a single string."""
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page: {url}")

    soup = BeautifulSoup(response.text, "html.parser")
    main_content = soup.find("div", class_="h3-wrap-list")

    if main_content:
        return main_content.get_text(separator="\n", strip=True)
    else:
        return ""

# Dictionary to store processed web data
processed_web_data = {}

def process_webpage(url: str, page_name: str):
    """Process webpage text and split it into chunks using RecursiveCharacterTextSplitter."""
    
    # Extract text from webpage
    text = read_webpage(url)
    
    if not text:
        print(f"No content extracted from {url}")
        return
    
    # Initialize text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256,  # Size of chunks (in tokens)
        chunk_overlap=24,  # Tokens overlap between chunks
        length_function=count_tokens,  # Function to count tokens in each chunk
    )

    # Split text into chunks
    chunks = text_splitter.create_documents([text])
    chunks = [chunk.page_content for chunk in chunks]

    print(f"Processed {page_name}: {len(chunks)} chunks")

    # Store processed data in dictionary
    processed_web_data[page_name] = chunks


In [4]:
# Example usage
url = "https://cheatsheets.zip/pandas"
process_webpage(url, "pandas_cheatsheet")

# Now you can inspect processed_web_data
# pprint.pprint(processed_web_data)
print(processed_web_data)

Processed pandas_cheatsheet: 6 chunks
{'pandas_cheatsheet': ["#\nIntroduction\nYouâ\x80\x99ll need to import pandas to get started:\nimport\npandas\nas\npd\n#\nCreating DataFrames\n-\n-\npd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})\nFrom a dictionary\npd.DataFrame(data=[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}])\nFrom a list of dictionaries\npd.read_csv('file.csv')\nFrom a CSV file\npd.read_excel('file.xlsx')\nFrom an Excel file\n#\nInspecting Data\n-\n-\ndf.head()\nFirst 5 rows\ndf.tail()\nLast 5 rows\ndf.shape\nNumber of rows and columns\ndf.info()\nInfo on DataFrame\ndf.describe()\nSummary statistics\ndf.columns\nColumn names\ndf.index\nIndex\ndf.dtypes\nData types of columns\n#\nSelecting Data\n-\n-\ndf['col1']\nSelect column", "Data types of columns\n#\nSelecting Data\n-\n-\ndf['col1']\nSelect column\ndf[['col1', 'col2']]\nSelect multiple columns\ndf.loc[0]\nSelect row by index\ndf.loc[:, 'col1']\nSelect all rows for 'col1'\ndf.iloc[0]\nSelect row by position\ndf.iloc[0, 1]\nSele