In [None]:
import requests
import csv
import os
from bs4 import BeautifulSoup

In [None]:
def download_websites(CSV_name):
    """
    Download every page listed in a CSV file into 'downloaded_pages/'.

    The CSV must start with a header line. Each following row is expected to
    hold exactly four columns: category, website, page, link. The link is
    fetched with an HTTP GET (10 second timeout); on a 200 response the HTML is
    parsed with BeautifulSoup and written to 'downloaded_pages/<website>.html'.

    Blank lines are ignored and rows with a different number of columns are
    reported and skipped, so one bad line does not abort the whole run.
    Download or write errors are printed and the loop moves on to the next row.

    Parameters:
    CSV_name (str): Path to the CSV file listing the pages to download
    """
    # exist_ok replaces the separate existence check and is safe to re-run.
    os.makedirs('downloaded_pages', exist_ok=True)

    # newline='' is what the csv module expects for files it reads.
    with open(CSV_name, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; a completely empty file is tolerated

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            if len(row) != 4:
                print(f"Skipping malformed row: {row}")
                continue

            _category, website, _page, link = row

            try:
                # Make a GET request to fetch the raw HTML content
                response = requests.get(link, timeout=10)

                # Check if the request was successful (HTTP Status Code 200)
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'html.parser')

                    # NOTE(review): the file name is built from `website` only, so
                    # several rows for the same website overwrite each other.
                    with open(f'downloaded_pages/{website}.html', 'w', encoding='utf-8') as html_file:
                        html_file.write(str(soup))
                    print(f"Downloaded {website}")
                else:
                    print(f"Failed to retrieve {website}")
            except Exception as e:
                # Deliberately broad: a single failing site must not stop the batch.
                print(f"Error retrieving {website}: {e}")

In [None]:
def get_body(HTML_files):
    """
    Extract the <body> element of downloaded pages into 'body_pages/'.

    For every name in HTML_files, 'downloaded_pages/<name>' is parsed with
    BeautifulSoup's 'html.parser' and its <body> element is written to
    'body_pages/<name>'. A page without a <body> element is reported and
    skipped instead of producing a file that contains the text "None".

    Parameters:
    HTML_files (iterable of str): File names located in 'downloaded_pages/'
    """
    # exist_ok replaces the separate existence check and is safe to re-run.
    os.makedirs('body_pages', exist_ok=True)

    for HTML_file in HTML_files:
        # Parse inside the with-block so the input is closed before writing.
        with open(f'downloaded_pages/{HTML_file}', 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'html.parser')

        body = soup.find('body')
        if body is None:
            # find() returns None when there is no match; str(None) would
            # otherwise be written out as the literal text "None".
            print(f"No <body> found in {HTML_file}; skipped")
            continue

        # Save the body markup to a file of the same name
        with open(f'body_pages/{HTML_file}', 'w', encoding='utf-8') as html_file:
            html_file.write(str(body))
        print(f"Saved body of {HTML_file}")

In [None]:
def split_csv(csv_name, n):
    """
    Split a CSV file into n files that are as equally long as possible.

    Every output file starts with the header of the input file. When the
    number of data rows is not divisible by n, the first (rows % n) files get
    one extra row, so no row is dropped. When there are fewer rows than n, the
    trailing files contain only the header.

    The files are written to 'split_csv/<stem>_<i>.csv' (i = 0 .. n-1), where
    <stem> is the input file name without directory and extension. The
    'split_csv' directory is created if it does not exist.

    Parameters:
    csv_name (str): The path to the CSV file to split
    n (int): The number of CSV files to create (must be at least 1)

    Raises:
    ValueError: If n is smaller than 1
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # newline='' is what the csv module expects for files it reads.
    with open(csv_name, mode='r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        header = next(reader, None)  # None when the input file is empty
        rows = list(reader)

    # base_size rows per file; the first `extra` files take one more each.
    base_size, extra = divmod(len(rows), n)

    # splitext/basename keep dots and directories in the path from breaking
    # the output name (e.g. './data/my.sites.csv' -> 'my.sites').
    stem = os.path.splitext(os.path.basename(csv_name))[0]

    os.makedirs('split_csv', exist_ok=True)

    start = 0
    for i in range(n):
        stop = start + base_size + (1 if i < extra else 0)
        with open(f'split_csv/{stem}_{i}.csv', mode='w', encoding='utf-8', newline='') as out_file:
            writer = csv.writer(out_file)
            if header is not None:
                writer.writerow(header)
            writer.writerows(rows[start:stop])
        start = stop

# Make sure the output directory for the split files exists; exist_ok makes
# this a single call that is safe to re-run.
os.makedirs('split_csv', exist_ok=True)

# Split the website list into three parts.
split_csv('websites.csv', 3)

In [None]:
def get_first_n_rows(csv_name, n):
    """
    Copy the header plus the leading n data rows of a CSV into a new file.

    The result is written to 'first_<n>_rows.csv' in the current working
    directory.

    Parameters:
    csv_name (str): The path to the CSV file to read
    n (int): How many data rows (header not counted) to keep; applied as the
        slice rows[:n]
    """
    # Load the header and every data row up front.
    with open(csv_name, mode='r', encoding='utf-8') as source:
        parsed = csv.reader(source)
        column_names = next(parsed)
        data_rows = list(parsed)

    # Header first, then the leading slice of the data.
    with open(f'first_{n}_rows.csv', mode='w', encoding='utf-8', newline='') as target:
        out = csv.writer(target)
        out.writerow(column_names)
        for record in data_rows[:n]:
            out.writerow(record)

# Keep a five-row sample of the full link list for quick experiments.
get_first_n_rows(csv_name='websites_full_link_final.csv', n=5)