# Download Raw Data

In [1]:
import os
import urllib.request
import zipfile
from pathlib import Path
from tqdm import tqdm

In [2]:
class DownloadProgressBar(tqdm):
    """tqdm progress bar whose ``update_to`` method can be used as a download report hook.

    The hook is called with cumulative progress (blocks so far, block size,
    total size), whereas ``tqdm.update`` expects an increment; ``update_to``
    converts between the two.
    """

    def update_to(self, b=1, bsize=1, tsize=None):
        """Move the bar to the position described by a cumulative progress report.

        Args:
            b: Number of blocks transferred so far.
            bsize: Size of each block, in bytes.
            tsize: Total size in bytes. ``None`` or a non-positive value
                (``urlretrieve`` may report ``-1`` when the size is unknown)
                leaves the current total unchanged.

        Returns:
            The value returned by ``tqdm.update`` for this step.
        """
        # Only trust a real, positive size; a sentinel such as -1 must not
        # become the bar's total.
        if tsize is not None and tsize > 0:
            self.total = tsize

        transferred = b * bsize
        # The final block is usually partial, so blocks * block_size can
        # exceed the real size; clamp so the bar finishes exactly at total.
        if self.total is not None and self.total > 0:
            transferred = min(transferred, self.total)

        return self.update(transferred - self.n)

In [3]:
def download_and_extract_godot_docs(
    data_dir="data/raw",
    docs_url="https://nightly.link/godotengine/godot-docs/workflows/build_offline_docs/master/godot-docs-html-stable.zip",
):
    """
    Download the Godot documentation zip archive and extract it.

    The archive is saved inside ``data_dir``, extracted member by member
    (so a progress bar can be shown), and then deleted. If the download or
    extraction fails, any partially written archive is removed as well.

    Args:
        data_dir: Directory that receives the archive and its extracted
            contents. Created (including parents) if it does not exist.
        docs_url: URL of the zip archive to download.

    Returns:
        bool: True when download, extraction and cleanup all succeeded,
        False when any of those steps raised an exception.
    """
    # Create the target directory if it doesn't exist
    data_dir = Path(data_dir)
    data_dir.mkdir(parents=True, exist_ok=True)

    # Name the local archive after the last path segment of the URL
    # (ignoring any query string); fall back to a generic name if empty.
    archive_name = docs_url.split("?", 1)[0].rstrip("/").rsplit("/", 1)[-1] or "godot-docs.zip"
    zip_filename = data_dir / archive_name

    print(f"Downloading Godot documentation from: {docs_url}")
    print(f"Saving to: {zip_filename}")

    try:
        # Download the zip file with progress bar
        print("📥 Starting download...")
        with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc="Downloading") as t:
            urllib.request.urlretrieve(docs_url, zip_filename, reporthook=t.update_to)
        print(f"✅ Download completed: {zip_filename}")

        # Extract the zip file with progress bar
        print(f"📦 Extracting {zip_filename}...")
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            # Extract one member at a time so the bar can advance per file
            file_list = zip_ref.namelist()
            with tqdm(total=len(file_list), desc="Extracting", unit="files") as pbar:
                for file in file_list:
                    zip_ref.extract(file, data_dir)
                    pbar.update(1)

        print(f"✅ Extraction completed to: {data_dir}")

        # Remove the zip file after extraction
        print("🧹 Cleaning up...")
        zip_filename.unlink()
        print("✅ Zip file removed after extraction")

        # List the contents of the target directory
        print(f"\nContents of {data_dir.as_posix()} directory:")
        for item in data_dir.iterdir():
            if item.is_dir():
                print(f"  📁 {item.name}/")
            else:
                print(f"  📄 {item.name}")

    except Exception as e:
        print(f"Error downloading or extracting documentation: {e}")
        # A failed run can leave a truncated or unreadable archive behind;
        # remove it so it is not mistaken for a valid download later.
        if zip_filename.exists():
            try:
                zip_filename.unlink()
            except OSError as cleanup_error:
                print(f"Could not remove incomplete archive {zip_filename}: {cleanup_error}")
        return False

    return True

In [5]:
# Run the download once and report the overall outcome.
success = download_and_extract_godot_docs()
print(
    "\n✅ Godot documentation download and extraction completed successfully!"
    if success
    else "\n❌ Failed to download and extract Godot documentation."
)

Downloading Godot documentation from: https://nightly.link/godotengine/godot-docs/workflows/build_offline_docs/master/godot-docs-html-stable.zip
Saving to: data/raw/godot-docs-html-stable.zip
📥 Starting download...


Downloading: 361MB [00:23, 15.4MB/s]                                                                                                                                                            


✅ Download completed: data/raw/godot-docs-html-stable.zip
📦 Extracting data/raw/godot-docs-html-stable.zip...


Extracting: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4808/4808 [00:03<00:00, 1378.88files/s]


✅ Extraction completed to: data/raw
🧹 Cleaning up...
✅ Zip file removed after extraction

Contents of data/raw directory:
  📄 index.html
  📁 classes/
  📁 getting_started/
  📄 objects.inv
  📄 404.html
  📁 tutorials/
  📄 searchindex.js
  📁 community/
  📁 about/
  📄 robots.txt
  📄 genindex.html
  📁 _downloads/
  📁 _images/
  📁 contributing/
  📄 search.html
  📁 _static/
  📁 _sources/

✅ Godot documentation download and extraction completed successfully!


# Load HTML with Unstructured

In [7]:
import os
from pathlib import Path
from typing import List
from langchain_community.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader
from langchain.schema import Document
from tqdm import tqdm
import glob


def find_html_files(directory: str) -> List[str]:
    """
    Find all ``*.html`` files in the given directory and its subdirectories.

    Args:
        directory (str): The directory to search for HTML files. It is
            treated as a literal path, so characters such as ``[``, ``*``
            or ``?`` in it are matched verbatim rather than as wildcards.

    Returns:
        List[str]: Sorted list of paths to HTML files (empty if none match
        or the directory does not exist).
    """
    # Escape the directory part so glob metacharacters in the path itself
    # do not change what is matched; only "**" and "*.html" are patterns.
    pattern = os.path.join(glob.escape(directory), "**", "*.html")

    # Sort so the result does not depend on filesystem listing order.
    return sorted(glob.glob(pattern, recursive=True))

In [8]:
import os
from pathlib import Path
from typing import List
from langchain_community.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader
from langchain.schema import Document
from tqdm import tqdm
import glob

def load_html_with_unstructured(file_paths: List[str]) -> List[Document]:
    """
    Parse each HTML file with UnstructuredHTMLLoader and collect the results.

    A file whose loader raises is reported on stdout and skipped; the
    remaining files are still processed.

    Args:
        file_paths (List[str]): Paths of the HTML files to load

    Returns:
        List[Document]: Documents produced by the loader for every file
        that loaded without error, in input order
    """
    print("📄 Loading HTML files with Unstructured...")

    loaded: List[Document] = []
    progress = tqdm(file_paths, desc="Processing with Unstructured")
    for file_path in progress:
        try:
            # One loader per file; append everything it yields.
            loaded += UnstructuredHTMLLoader(file_path).load()
        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"❌ Error loading {file_path} with Unstructured: {e}")

    return loaded

In [11]:
import os
from pathlib import Path
from typing import List
from langchain_community.document_loaders import UnstructuredHTMLLoader, BSHTMLLoader
from langchain.schema import Document
from tqdm import tqdm
import glob

# Directory holding the extracted documentation pages.
docs_dir = "data/raw"

# Start from an empty collection so the name exists even if loading fails.
documents = []

# Discover the pages first, then parse them into documents.
html_files = find_html_files(docs_dir)
documents.extend(load_html_with_unstructured(html_files))

print(f"✅ Successfully loaded {len(documents)} documents")

📄 Loading HTML files with Unstructured...


Processing with Unstructured:   0%|                                                                                                                                    | 0/1490 [00:00<?, ?it/s]

Processing with Unstructured:  57%|█████████████████████████████████████████████████████████████████████▊                                                    | 852/1490 [01:14<00:52, 12.19it/s]short text: "All classes". Defaulting to English.
short text: "TextServerExtension". Defaulting to English.
short text: "Optional.". Defaulting to English.
short text: "Required.". Defaulting to English.
short text: "Required.". Defaulting to English.
short text: "Optional.". Defaulting to English.
short text: "Required.". Defaulting to English.
short text: "Optional.". Defaulting to English.
short text: "Removes all kerning overrides.". Defaulting to English.
short text: "Required.". Defaulting to English.
short text: "Required.". Defaulting to English.
short text: "Required.". Defaulting to English.
short text: "Required.". Defaulting to English.
short text: "Optional.". Defaulting to English.
short text: "Returns font anti-aliasing mode.". Defaulting to English.
short text: "Required.". Default

✅ Successfully loaded 1490 documents



