In [64]:
# Save in different folders

from bs4 import BeautifulSoup
import os
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
from urllib.parse import urljoin

folder_name = "triz_trisolver_eu"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)


def get_soup(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        print(f"Error: status code {response.status_code} for URL: {url}")
        print(response.text)
        return None


def get_subpages_urls(soup, base_url):
    subpages = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.startswith('mailto') and not href.endswith('.pdf') and not href.endswith('.zip'):  # Exclude mailto, PDF and ZIP links
            if href.startswith('/') or href.startswith(base_url):
                if not href.startswith('http'):
                    href = base_url.rstrip('/') + '/' + href.lstrip('/')
                if not (href.startswith('//') or '://' in href and base_url.split('://')[0] != href.split('://')[0]):
                    subpages.append((href, link.text.strip()))
    return subpages

def find_subpages(soup):
    subpages = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and not href.startswith('#') and not href.startswith('javascript'):
            print(f"Found link: {href}")
            subpages.append(href)
    return subpages




def save_subpage(url, text):
    # Replace any special characters and spaces with underscores
    filename = text.strip().replace(" ", "_").replace("/", "-").replace(":", "-").replace("?", "")
    
    # Save the content to a file inside the folder
    with open(os.path.join(folder_name, f"{filename}.html"), "w", encoding="utf-8") as file:
        file.write(content)



def scrape_text(soup):
    text = ''
    for paragraph in soup.find_all('p'):
        text += paragraph.get_text() + '\n'
    return text

def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def save_text_to_file(text, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(text)

visited_urls = set()



def process_page(url, base_url, path='', link_text=''):
    if url in visited_urls:
        return
    visited_urls.add(url)

    soup = get_soup(url)
    if soup is None:
        return

    text = scrape_text(soup)
    page_name = link_text if link_text else url.replace(base_url, '').rstrip('/').replace('/', '_')
    if page_name == '':
        page_name = 'index'
    max_length = 100  # Adjust this value according to your file system limitations
    page_name = page_name[:max_length]

    # Create a subfolder for the current page and save the content to a file inside it
    subfolder_path = os.path.join(path, page_name)
    ensure_dir(subfolder_path)
    file_name = f"{page_name}.txt"
    file_path = os.path.join(subfolder_path, file_name)
    save_text_to_file(text, file_path)

    subpages = get_subpages_urls(soup, base_url)
    for subpage_url, subpage_link_text in subpages:
        print(f"Processing subpage URL: {subpage_url}")
        if subpage_url != url:
            process_page(subpage_url, base_url, subfolder_path, subpage_link_text)







def process_subpages(base_url, subpages):
    for subpage in subpages:
        subpage_url = urljoin(base_url, subpage)
        print(f"Processing subpage: {subpage_url}")
        process_page(subpage_url, base_url)



def main():
    base_url = 'https://triz.trisolver.eu/eng/ebf/'
    target_urls = [
        'https://triz.trisolver.eu/eng/ebf/vrg02.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg03.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg04.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg05.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg06.htm',
        'https://triz.trisolver.eu/eng/ebf/ake.htm',
    ]

    main_folder = 'triz_trisolver_eu'
    if not os.path.exists(main_folder):
        os.makedirs(main_folder)

    for url in target_urls:
        soup = get_soup(url)
        if soup is None:
            print(f"Error: Failed to retrieve content from {url}")
            continue

        subpages = find_subpages(soup)
        
        for subpage in subpages:
            subpage_url = urljoin(base_url, subpage)
            print(f"Processing subpage: {subpage_url}")
            
            link_element = soup.find('a', href=subpage)
            if link_element:
                link_text = link_element.get_text().strip()
            else:
                link_text = subpage.replace(base_url, '').rstrip('/').replace('/', '_')
                
            process_page(subpage_url, base_url, main_folder, link_text=link_text)


In [62]:
main()

Found link: http://www.triz.it/eng
Found link: ap01.htm
Found link: ap02.htm
Found link: ap03.htm
Found link: ap04.htm
Found link: ap05.htm
Found link: ap06.htm
Found link: ap07.htm
Found link: ap08.htm
Found link: ap09.htm
Found link: ap10.htm
Found link: ap11.htm
Found link: ap12.htm
Found link: http://www.tris-europe.com
Processing subpage: http://www.triz.it/eng
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap01.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap02.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap03.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap04.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap05.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap06.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap07.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap08.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap09.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap10.htm
P

In [66]:
# Save in the same file


from bs4 import BeautifulSoup
import os
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)
from urllib.parse import urljoin

folder_name = "triz_trisolver_eu"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)


def get_soup(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    response = requests.get(url, headers=headers, verify=False)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        print(f"Error: status code {response.status_code} for URL: {url}")
        print(response.text)
        return None


def get_subpages_urls(soup, base_url):
    subpages = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.startswith('mailto') and not href.endswith('.pdf') and not href.endswith('.zip'):  # Exclude mailto, PDF and ZIP links
            if href.startswith('/') or href.startswith(base_url):
                if not href.startswith('http'):
                    href = base_url.rstrip('/') + '/' + href.lstrip('/')
                if not (href.startswith('//') or '://' in href and base_url.split('://')[0] != href.split('://')[0]):
                    subpages.append((href, link.text.strip()))
    return subpages

def find_subpages(soup):
    subpages = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and not href.startswith('#') and not href.startswith('javascript'):
            print(f"Found link: {href}")
            subpages.append(href)
    return subpages




def save_subpage(url, text):
    # Replace any special characters and spaces with underscores
    filename = text.strip().replace(" ", "_").replace("/", "-").replace(":", "-").replace("?", "")
    
    # Save the content to a file inside the folder
    with open(os.path.join(folder_name, f"{filename}.html"), "w", encoding="utf-8") as file:
        file.write(content)



def scrape_text(soup):
    text = ''
    for paragraph in soup.find_all('p'):
        text += paragraph.get_text() + '\n'
    return text

def ensure_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def save_text_to_file(text, file_path, page_name):
    with open(file_path, 'a', encoding='utf-8') as file:
        file.write(f"{page_name}\n")
        file.write(text)
        file.write("\n\n")

visited_urls = set()



def process_page(url, base_url, path='', link_text='', main_file_path=None):
    if url in visited_urls:
        return
    visited_urls.add(url)

    soup = get_soup(url)
    if soup is None:
        return

    text = scrape_text(soup)
    page_name = link_text if link_text else url.replace(base_url, '').rstrip('/').replace('/', '_')
    if page_name == '':
        page_name = 'index'
    max_length = 100  # Adjust this value according to your file system limitations
    page_name = page_name[:max_length]

    if main_file_path is None:
        main_file_path = os.path.join(path, "website_content.txt")

    save_text_to_file(text, main_file_path, page_name)

    subpages = get_subpages_urls(soup, base_url)
    for subpage_url, subpage_link_text in subpages:
        print(f"Processing subpage URL: {subpage_url}")
        if subpage_url != url:
            process_page(subpage_url, base_url, path, subpage_link_text, main_file_path=main_file_path)




def process_subpages(base_url, subpages):
    for subpage in subpages:
        subpage_url = urljoin(base_url, subpage)
        print(f"Processing subpage: {subpage_url}")
        process_page(subpage_url, base_url)



def main():
    base_url = 'https://triz.trisolver.eu/eng/ebf/'
    target_urls = [
        'https://triz.trisolver.eu/eng/ebf/vrg02.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg03.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg04.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg05.htm',
        'https://triz.trisolver.eu/eng/ebf/vrg06.htm',
        'https://triz.trisolver.eu/eng/ebf/ake.htm',
    ]

    main_folder = 'triz_trisolver_eu'
    if not os.path.exists(main_folder):
        os.makedirs(main_folder)

    for url in target_urls:
        soup = get_soup(url)
        if soup is None:
            print(f"Error: Failed to retrieve content from {url}")
            continue

        subpages = find_subpages(soup)
        
        for subpage in subpages:
            subpage_url = urljoin(base_url, subpage)
            print(f"Processing subpage: {subpage_url}")
            
            link_element = soup.find('a', href=subpage)
            if link_element:
                link_text = link_element.get_text().strip()
            else:
                link_text = subpage.replace(base_url, '').rstrip('/').replace('/', '_')
                
            process_page(subpage_url, base_url, main_folder, link_text=link_text)


In [67]:
main()

Found link: http://www.triz.it/eng
Found link: ap01.htm
Found link: ap02.htm
Found link: ap03.htm
Found link: ap04.htm
Found link: ap05.htm
Found link: ap06.htm
Found link: ap07.htm
Found link: ap08.htm
Found link: ap09.htm
Found link: ap10.htm
Found link: ap11.htm
Found link: ap12.htm
Found link: http://www.tris-europe.com
Processing subpage: http://www.triz.it/eng
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap01.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap02.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap03.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap04.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap05.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap06.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap07.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap08.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap09.htm
Processing subpage: https://triz.trisolver.eu/eng/ebf/ap10.htm
P