In [None]:
# !pip install requests beautifulsoup4 html5lib pathvalidate

In [None]:
import requests
from bs4 import BeautifulSoup
from pathvalidate import sanitize_filename
from itertools import islice
import os
import re


In [None]:
from urllib.parse import urljoin

# URL of the EnergyPlus Input-Output Reference index page
INDEX_URL = "https://bigladdersoftware.com/epx/docs/23-1/input-output-reference/index.html"

# Fetch the index page. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() stops early on an HTTP error instead of
# silently parsing an error page.
response = requests.get(INDEX_URL, timeout=30)
response.raise_for_status()

# Decode the raw bytes as UTF-8 explicitly. Relying on `response.text` lets
# requests guess the charset, which turned the en dashes in the section titles
# into mojibake ('Group â ...') and defeated the later filename clean-up.
index_html = response.content.decode("utf-8", errors="replace")
soup = BeautifulSoup(index_html, "html5lib")

# Every anchor nested inside a list item is a candidate section link.
links = soup.select("li a")

# Maps a cleaned section title to the absolute URL of its page.
section_links = {}

# The first 20 anchors are skipped.
# NOTE(review): presumably these are navigation links rather than sections - confirm.
for link in links[20:]:
    href = link.get("href")

    # Drop characters that are awkward in file names.
    section_name = link.text.strip().replace('/', '').replace(':', '')

    # Only the 'Group' sections are of interest.
    if 'Group' not in section_name:
        continue

    # An anchor without an href has nothing to download; skipping it avoids
    # storing a placeholder that would later be handed to requests.get().
    if not href:
        continue

    # urljoin resolves relative, parent-relative and absolute hrefs correctly.
    section_links[section_name] = urljoin(INDEX_URL, href)

# Now `section_links` contains the names and URLs of 'Group' sections
# for name, url in section_links.items():
    # print(f"{name}: {url}")

print(len(section_links))


63


In [None]:

# Folder that receives the generated .txt files.
# NOTE(review): the path looks like a mounted Google Drive location - confirm
# the drive is mounted before running this cell.
target_directory = '/content/drive/MyDrive/PYTHON_PRACTICE/Data_preprocessing/sample_data/'

# Create the folder (and any missing parents); re-running the cell is harmless
# because an already existing folder is accepted.
os.makedirs(target_directory, exist_ok=True)

def sanitize_filename(filename):
    """Return *filename* reduced to a plain ASCII name without section markers.

    The clean-up steps run in a fixed order, and each step trims surrounding
    whitespace only when it actually changed something:

    1. remove every 'Group –' marker,
    2. remove every ' – ' separator (the neighbouring words are joined),
    3. remove every run of non-ASCII characters,
    4. remove every ':'.

    The result is trimmed once more before it is returned.

    Note: this definition shadows the ``sanitize_filename`` imported from
    ``pathvalidate`` at the top of the file.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    cleaned = filename

    # Literal markers, tried in this order.
    for marker in ('Group –', ' – '):
        if marker in cleaned:
            cleaned = cleaned.replace(marker, '').strip()

    # Anything outside the ASCII range is dropped.
    if non_ascii_run.search(cleaned):
        cleaned = non_ascii_run.sub('', cleaned).strip()

    # Colons are dropped as well.
    if ':' in cleaned:
        cleaned = cleaned.replace(':', '').strip()

    return cleaned.strip()


# Download every section page listed in `section_links`, reduce it to plain
# text and save it as one .txt file per section inside `target_directory`.
#
# The whole dictionary is iterated (rather than a hard-coded first 63 entries)
# so the cell stays correct if the number of sections on the index page changes.

# Class-name pattern used to find MathJax-rendered markup; compiled once.
_MATHJAX_CLASS = re.compile('MJXc-')

# Tags whose text is collected from the main container.
_TEXT_TAGS = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'pre']

for section_name, url in section_links.items():
    # A timeout keeps one stalled connection from hanging the whole run, and
    # connection-level failures are reported instead of aborting the loop.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to retrieve '{url}' ({exc})")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve '{url}' (Status code: {response.status_code})")
        continue

    # Decode the raw bytes as UTF-8 ourselves so a wrongly guessed charset
    # cannot corrupt the text; undecodable bytes become replacement characters.
    content = response.content.decode('utf-8', errors='replace')
    soup = BeautifulSoup(content, "html5lib")

    # Only the div with id 'main' is kept.
    main_content = soup.find('div', id='main')
    if main_content is None:
        print(f"Main content not found for '{section_name}'")
        continue

    # Tables are removed entirely before any text is extracted.
    for table in main_content.find_all('table'):
        table.decompose()

    # Drop the footer div (looked up by id 'footer-container') when it sits
    # inside the main container.
    footer = main_content.find('div', id='footer-container')
    if footer:
        footer.decompose()

    # Remove MathJax-rendered elements anywhere in the document.
    for mjx in soup.find_all(class_=_MATHJAX_CLASS):
        mjx.decompose()

    # NOTE(review): get_text(strip=True) joins text fragments without a
    # separator, and a list is collected in addition to any matching tags
    # nested inside it - confirm this is the intended output format.
    text_content = [
        tag.get_text(strip=True) for tag in main_content.find_all(_TEXT_TAGS)
    ]
    text = '\n'.join(text_content)

    # Remove unwanted placeholders like '[LINK]'
    text = text.replace('[LINK]', '')

    # Repair the mis-decoded dash sequence BEFORE normalising dashes: doing it
    # afterwards either can no longer match or puts an en dash back into text
    # that was just normalised.
    text = text.replace('â€“', '–')

    # Replace typographic punctuation with plain ASCII equivalents.
    for fancy, plain in (("–", "-"), ('—', "-"), ('’', "'")):
        text = text.replace(fancy, plain)
    text = text.strip()

    # Build the output path from the cleaned section name.
    filename = f"{sanitize_filename(section_name)}.txt"
    file_path = os.path.join(target_directory, filename)

    # Save the text to a .txt file
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)
    print(f"Saved content for '{section_name}' to '{file_path}'")


Saved content for 'Group â Simulation Parameters' to '/content/drive/MyDrive/PYTHON_PRACTICE/Data_preprocessing/sample_data/Group  Simulation Parameters.txt'
Saved content for 'Group â Compliance Objects' to '/content/drive/MyDrive/PYTHON_PRACTICE/Data_preprocessing/sample_data/Group  Compliance Objects.txt'
Saved content for 'Group â Location â Climate â Weather File Access' to '/content/drive/MyDrive/PYTHON_PRACTICE/Data_preprocessing/sample_data/Group  Location  Climate  Weather File Access.txt'
Saved content for 'Climate Group Outputs' to '/content/drive/MyDrive/PYTHON_PRACTICE/Data_preprocessing/sample_data/Climate Group Outputs.txt'
Saved content for 'Group â Schedules' to '/content/drive/MyDrive/PYTHON_PRACTICE/Data_preprocessing/sample_data/Group  Schedules.txt'
Saved content for 'Group â Surface Construction Elements' to '/content/drive/MyDrive/PYTHON_PRACTICE/Data_preprocessing/sample_data/Group  Surface Construction Elements.txt'
Saved content for 'WindowMateri