In [1]:
import os
import codecs
from bs4 import BeautifulSoup

In [2]:
def modify_html_files_in_folder(folder):
    """Run ``modify_html_file`` on every HTML file under ``folder`` and log a summary.

    The tree is walked recursively with ``os.walk``. A file is selected when its
    name ends with ``.html`` (case-sensitive) and does not start with ``._``
    (presumably macOS sidecar files -- TODO confirm). Each selected file is
    passed to ``modify_html_file``; a truthy result counts as processed, a
    falsy one as failed.

    Progress and totals are printed to stdout, and a report with the totals and
    per-folder counts is written to ``KBFix_HTML Script Results.txt`` directly
    inside ``folder`` (overwritten on every run).

    Args:
        folder: Path of the root directory to walk. The report file is created
            inside it, so it must exist and be writable.

    Returns:
        None.

    Raises:
        OSError: If the report file cannot be created or written.
    """
    total_files = 0
    processed_files = 0
    folder_stats = []  # one (path, found, processed, failed) tuple per visited folder

    for root, _dirs, files in os.walk(folder):
        html_files = [
            name for name in files
            if name.endswith('.html') and not name.startswith('._')
        ]
        folder_files_count = len(html_files)
        total_files += folder_files_count

        print(f"Processing folder: {root}")
        print(f"Found {folder_files_count} HTML files")

        folder_processed = 0
        for name in html_files:
            if modify_html_file(os.path.join(root, name)):
                folder_processed += 1

        # Every selected file is either processed or failed, so the failure
        # count follows from the other two numbers.
        folder_failed = folder_files_count - folder_processed
        processed_files += folder_processed

        folder_stats.append((root, folder_files_count, folder_processed, folder_failed))

    failed_files = total_files - processed_files

    print(f"Total HTML files found: {total_files}")
    print(f"Total processed files: {processed_files}")
    print(f"Total failed files: {failed_files}")

    log_file_path = os.path.join(folder, "KBFix_HTML Script Results.txt")
    # Explicit UTF-8: the report contains folder paths, and the platform default
    # encoding may be unable to represent them, which would raise
    # UnicodeEncodeError after all the files have already been rewritten.
    with open(log_file_path, 'w', encoding='utf-8') as log_file:
        log_file.write(f"Total folders processed: {len(folder_stats)}\n")
        log_file.write(f"Total HTML files found: {total_files}\n")
        log_file.write(f"Total processed files: {processed_files}\n")
        log_file.write(f"Total failed files: {failed_files}\n")
        log_file.write("\nDetailed per-folder stats:\n")
        for path, found, processed, failed in folder_stats:
            log_file.write(f"Folder: {path}\n")
            log_file.write(f"  HTML files found: {found}\n")
            log_file.write(f"  Processed files: {processed}\n")
            log_file.write(f"  Failed files: {failed}\n")
            log_file.write("\n")

def modify_html_file(file_path):
    """Rewrite selected attributes of one HTML file in place.

    The file is read as UTF-8, parsed with BeautifulSoup's ``html.parser``,
    adjusted as described below, serialized with ``str(soup)`` and written back
    over the original file as UTF-8. The file is rewritten even when no tag
    needed changing.

    Adjustments:
        * every ``<img>`` gets ``alt`` set to its ``src`` value (an empty
          string when ``src`` is missing);
        * every ``<a>`` whose ``href`` starts with ``Attachments/`` gets
          ``https://`` prepended to that ``href``;
        * every ``<iframe>`` whose ``src`` starts with
          ``//almavia2.sharepoint.com/`` has that leading prefix removed.

    Args:
        file_path: Path of the HTML file to modify.

    Returns:
        True when the file was read, transformed and written back; False when
        any exception occurred (the error is printed, not raised).
    """
    sharepoint_prefix = "//almavia2.sharepoint.com/"

    try:
        # newline='' disables newline translation on both read and write, so the
        # file's line endings round-trip unchanged. Reading without translation
        # but writing with it would expand every "\r\n" to "\r\r\n" on platforms
        # where os.linesep is "\r\n".
        with open(file_path, 'r', encoding='utf-8', newline='') as f:
            html_content = f.read()

        # Parse the HTML content
        soup = BeautifulSoup(html_content, 'html.parser')

        # Mirror src into alt. Assigning unconditionally is enough: the only
        # case a guard would skip is alt already equal to src, where the
        # assignment changes nothing.
        for img in soup.find_all('img'):
            img['alt'] = img.get('src', '')

        # Prefix links that point into "Attachments/".
        # NOTE(review): the result has the form "https://Attachments/...", i.e.
        # "Attachments" ends up in the host position -- confirm that this is
        # what the consumer of these files expects.
        for a in soup.find_all('a', href=True):
            href = a['href']
            if href.startswith("Attachments/"):
                a['href'] = f"https://{href}"

        # Strip the leading SharePoint host from iframe sources; only the
        # prefix is removed, later occurrences in the URL are left alone.
        for iframe in soup.find_all('iframe', src=True):
            src = iframe['src']
            if src.startswith(sharepoint_prefix):
                iframe['src'] = src[len(sharepoint_prefix):]

        # Serialize before opening the file for writing, so a serialization
        # failure cannot leave the original truncated.
        modified_content = str(soup)

        with open(file_path, 'w', encoding='utf-8', newline='') as f:
            f.write(modified_content)

        return True

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller count it.
        print(f"Error processing {file_path}: {e}")
        return False




In [3]:
# Usage: set input_folder to the root of the tree whose HTML files should be
# fixed in place, then run this cell.
input_folder = '/Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge'  # replace with your own folder path

# Report a missing folder instead of walking it; otherwise process the tree.
if not os.path.isdir(input_folder):
    print(f"Input folder does not exist: {input_folder}")
else:
    modify_html_files_in_folder(input_folder)

Processing folder: /Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge
Found 0 HTML files
Processing folder: /Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge/ru
Found 0 HTML files
Processing folder: /Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge/ru/Data
Found 133 HTML files
Processing folder: /Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge/fr
Found 0 HTML files
Processing folder: /Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge/fr/Data
Found 4 HTML files
Processing folder: /Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge/en-us
Found 0 HTML files
Processing folder: /Users/macbook/Desktop/Import KB BBC/for PP/BBC Knownledge/en-us/Data
Found 1315 HTML files
Total HTML files found: 1452
Total processed files: 1452
Total failed files: 0
