In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Notebook-wide imports. `gap_analyzer_sauce` provides the workflow functions
# that every later cell calls; `keys` is a local module that this notebook reads
# for `botify`, `client_domain` and `botify_project_url`.
# `pip` is only referenced from the commented-out verification snippets below.
from IPython.display import display, Markdown
import gap_analyzer_sauce
from pipulate import pip
import nest_asyncio
# Patch asyncio so nested event loops are allowed inside the running Jupyter loop.
nest_asyncio.apply()
import keys

# `job` is passed as the first argument to every gap_analyzer_sauce call below.
# Presumably it namespaces the persisted pip state for this run -- confirm in pipulate.
job = "gapalyzer-03" # Give your session a unique name

# 1. Set all your Keys

In [None]:

# --- Run configuration: edit here, then re-run the notebook top to bottom ---
botify_token = keys.botify          # Botify API token (later cells read keys.botify directly)
ROW_LIMIT = 3000                    # passed to truncate_dataframe_by_volume
COMPETITOR_LIMIT = 3                # falsy value => no competitor limit message / limit
BROWSER_DOWNLOAD_PATH = None        # passed to collect_semrush_downloads
GLOBAL_WIDTH_ADJUSTMENT = 1.5       # multiplier applied to Excel column widths

# Echo the effective settings so the run log records them.
competitor_scope = (
    f'Processing will be limited to the top {COMPETITOR_LIMIT} competitors.'
    if COMPETITOR_LIMIT
    else 'Processing all competitors.'
)
print(f'✅ Configuration set: Final report will be limited to {ROW_LIMIT} rows.')
print(f'✅ Configuration set: {competitor_scope}')


## 2. List all your Foes

### 3. Save all of These

In [None]:
# `gap_analyzer_sauce` is imported once in the notebook's import cell; it is not
# re-imported here so the notebook's dependencies stay visible in one place.
#
# Per the notebook author's notes (implementation lives in gap_analyzer_sauce):
# the call performs the extraction, stores domains via pip.set, prints URLs,
# and returns the domains list in case it is needed elsewhere.
competitor_domains = gap_analyzer_sauce.extract_domains_and_print_urls(job)

# Optional: You could add a pip.get here for verification if desired
# stored_domains = pip.get(job, 'competitor_domains', [])
# print(f"\nVerification: Retrieved {len(stored_domains)} domains from pip state.")

#### 4. Process the Rows

## Verify Downloads

In [None]:
# `gap_analyzer_sauce` is imported once in the notebook's import cell; it is not
# re-imported here so the notebook's dependencies stay visible in one place.
#
# Per the notebook author's notes (implementation lives in gap_analyzer_sauce):
# the call handles moving files and storing relevant paths in pip state.
# Inputs: `job` (import cell) and BROWSER_DOWNLOAD_PATH (config cell).
# Outputs: the destination directory and the list of collected files.
semrush_dir, collected_files = gap_analyzer_sauce.collect_semrush_downloads(job, BROWSER_DOWNLOAD_PATH)

# Optional verification (can be commented out for cleaner output)
# if semrush_dir and collected_files:
#    print(f"\nVerification: Files collected in '{pip.get(job, 'semrush_download_dir')}'")
#    print(f"Files found/moved ({len(pip.get(job, 'collected_semrush_files'))}):")
#    # for f in pip.get(job, 'collected_semrush_files'): print(f" - {Path(f).name}") # Use Path for display if needed
# elif semrush_dir:
#    print(f"\nVerification: Destination directory '{pip.get(job, 'semrush_download_dir')}' confirmed, but no new files moved.")
# else:
#    print("\nVerification: File collection step encountered an error.")

# Build the Markdown summary of the SEMRush files found for this job.
# Per the notebook author's notes, the function also stores the file paths via pip.set
# (implementation lives in gap_analyzer_sauce and is not visible here).
# Inputs: `job` (import cell) and COMPETITOR_LIMIT (config cell).
markdown_summary = gap_analyzer_sauce.find_semrush_files_and_generate_summary(job, COMPETITOR_LIMIT)

# Render the returned Markdown string as rich output.
display(Markdown(markdown_summary))

# Optional Verification (can be commented out)
# stored_files = pip.get(job, 'collected_semrush_files', [])
# print(f"\nVerification: Retrieved {len(stored_files)} file paths from pip state.")

## Combine Downloads

In [None]:
# Per the notebook author's notes, load_and_combine_semrush_data
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Reads the file list from pip state.
# 2. Loads and combines all SEMRush files into a master DataFrame.
# 3. Applies the COMPETITOR_LIMIT.
# 4. Stores the master DataFrame and competitor dictionary in pip state.
# 5. Returns the master DataFrame (for the next step) and domain counts (for display).
#
# Inputs: job, keys.client_domain, COMPETITOR_LIMIT (config cell).
# Outputs: `df2` (read again by the pivot and aggregate cells) and `domain_value_counts`.
df2, domain_value_counts = gap_analyzer_sauce.load_and_combine_semrush_data(job, keys.client_domain, COMPETITOR_LIMIT)

# Display the domain value counts for verification
display(domain_value_counts)

## Make Pivot Table

In [None]:
# Per the notebook author's notes, pivot_semrush_data
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Pivots df2 by Keyword/Domain.
# 2. Calculates Competitors Positioning.
# 3. Loads or creates the competitors_df and saves it to CSV.
# 4. Prints summary statistics.
# 5. Stores pivot_df and competitors_df in pip state.
#
# Inputs: job, `df2` (from the "Combine Downloads" cell), keys.client_domain.
# Output: `pivot_df`, read again by the "Join Pivot & Aggregate Table" cell.
pivot_df = gap_analyzer_sauce.pivot_semrush_data(job, df2, keys.client_domain)

# Display the resulting pivot table
display(pivot_df)

# Optional verification
# print("\nVerification:")
# print(f"  Pivot DF stored: {'keyword_pivot_df_json' in pip.read_state(job)}")
# print(f"  Competitors DF stored: {'competitors_df_json' in pip.read_state(job)}")
# loaded_competitors = pd.read_json(pip.get(job, 'competitors_df_json', '[]'))
# print(f"  Competitors DF rows in state: {len(loaded_competitors)}")

## Filter Brand Names

In [None]:
# Per the notebook author's notes, fetch_titles_and_create_filters
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Loads competitors_df from pip state.
# 2. Checks for and fetches missing homepage titles asynchronously.
# 3. Updates competitors_df with new titles.
# 4. Saves updated competitors_df to CSV and pip state.
# 5. Generates the keyword filter list from domains and titles.
# 6. Creates or updates the filter_keywords.csv file.
# 7. Stores the filter keyword list in pip state.
#
# Input: job only. Output: a status message string, printed below.
status_message = gap_analyzer_sauce.fetch_titles_and_create_filters(job)

# Print the status message returned by the function
print(status_message)

# Optional verification
# print("\nVerification:")
# updated_competitors_df = pd.read_json(StringIO(pip.get(job, 'competitors_df_json', '[]')))
# print(f"  Competitors DF rows in state: {len(updated_competitors_df)}")
# print(f"  Example Title: {updated_competitors_df['Title'].iloc[0] if not updated_competitors_df.empty else 'N/A'}")
# filter_list = json.loads(pip.get(job, 'filter_keyword_list_json', '[]'))
# print(f"  Filter keywords stored: {len(filter_list)}")

## Make Aggregate Table

In [None]:
# Per the notebook author's notes, aggregate_semrush_metrics
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Defines aggregation rules for each metric.
# 2. Groups df2 by Keyword and applies the aggregations.
# 3. Calculates 'Number of Words'.
# 4. Drops the aggregated 'Position' column.
# 5. Stores the resulting agg_df in pip state.
# 6. Returns agg_df for display and use in the next step.
#
# Inputs: job and `df2` (from the "Combine Downloads" cell).
# Output: `agg_df`, read again by the "Join Pivot & Aggregate Table" cell.
agg_df = gap_analyzer_sauce.aggregate_semrush_metrics(job, df2)

# Display the aggregated data
display(agg_df)

# Optional verification
# print("\nVerification:")
# print(f"  Agg DF stored: {'keyword_aggregate_df_json' in pip.read_state(job)}")
# loaded_agg_df = pd.read_json(StringIO(pip.get(job, 'keyword_aggregate_df_json', '[]'))) # Use StringIO for verification
# print(f"  Agg DF rows in state: {len(loaded_agg_df)}")

## Join Pivot & Aggregate Table

In [None]:
# Per the notebook author's notes, merge_filter_arrange_data
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Merges pivot_df and agg_df.
# 2. Reads the filter keyword list from the CSV file.
# 3. Applies the brand/negative keyword filter.
# 4. Reorders columns for readability.
# 5. Drops unnecessary columns (Traffic metrics, Previous position).
# 6. Sorts the final DataFrame by Search Volume.
# 7. Stores the final arranged_df in pip state.
#
# Inputs: job, `pivot_df` and `agg_df` from the two preceding table cells.
# Output: `arranged_df`, read again by the "Join Botify Data" cell.
arranged_df = gap_analyzer_sauce.merge_filter_arrange_data(job, pivot_df, agg_df)

# Display the final, arranged DataFrame
display(arranged_df)

# Optional verification
# print("\nVerification:")
# print(f"  Final Arranged DF stored: {'filtered_gap_analysis_df_json' in pip.read_state(job)}")
# loaded_arranged_df = pd.read_json(StringIO(pip.get(job, 'filtered_gap_analysis_df_json', '[]'))) # Use StringIO
# print(f"  Final Arranged DF rows in state: {len(loaded_arranged_df)}")

## Download Botify Data

In [None]:
# Fetch the Botify export for this job. The call takes the job name, the Botify
# token and the project URL (both read from the local `keys` module) and returns
# the export DataFrame, a success flag, and two path-like locations.
botify_export_df, has_botify, report_path, csv_dir_path = gap_analyzer_sauce.fetch_botify_data_and_save(
    job, keys.botify, keys.botify_project_url
)

# --- Notebook-side summary of what came back ---
if not has_botify:
    print("\nNo Botify data loaded or available to display summary.")
else:
    print("\n--- Botify Data Summary ---")
    # Show the pagerank distribution when the export carries that column.
    if "Internal Pagerank" not in botify_export_df.columns:
        print("  ⚠️ 'Internal Pagerank' column not found for display.")
    else:
        display(botify_export_df["Internal Pagerank"].value_counts())
    print("-------------------------\n")
    # Either location may be falsy, in which case its line is omitted.
    if report_path:
        print(f"📁 Botify data saved to: {report_path.resolve()}")
    if csv_dir_path:
        print(f"📂 Containing folder: {csv_dir_path.resolve()}")

# Optional verification (using pip.get and StringIO)
# from io import StringIO
# print("\nVerification:")
# print(f"  Botify DF stored: {'botify_export_df_json' in pip.read_state(job)}")
# loaded_botify_df = pd.read_json(StringIO(pip.get(job, 'botify_export_df_json', '[]')))
# print(f"  Botify DF rows in state: {len(loaded_botify_df)}")

## Join Botify Data

In [None]:
# Per the notebook author's notes, merge_and_finalize_data
# (implementation lives in gap_analyzer_sauce, not visible here):
#   - merges arranged_df with botify_export_df when has_botify is True,
#     renaming Botify's 'url' to 'Full URL' for the merge;
#   - inserts the Botify columns after 'Competition' and drops redundant URL columns;
#   - saves the intermediate 'unformatted.csv' and stores the result in pip state
#     under 'final_working_df_json';
#   - returns the final DataFrame plus a dict of values for display.
#
# Inputs come from earlier cells: arranged_df, botify_export_df, has_botify.
df, display_data = gap_analyzer_sauce.merge_and_finalize_data(job, arranged_df, botify_export_df, has_botify)

# --- Notebook-side display, driven entirely by the returned dict ---
print(f"Rows: {display_data['rows']:,}")
print(f"Cols: {display_data['cols']:,}")

if not display_data['has_botify']:
    # Nothing from Botify was merged.
    print("ℹ️ No Botify data was merged.")
elif display_data['pagerank_counts'] is None:
    # Botify data was merged, but there are no pagerank counts to show.
    print("⚠️ Botify data was merged, but 'Internal Pagerank' column not found for display.")
else:
    display(display_data['pagerank_counts'])

# Optional verification
# print("\nVerification:")
# print(f"  Final Working DF stored: {'final_working_df_json' in pip.read_state(job)}")

## Truncate Data

In [None]:
# Per the notebook author's notes, truncate_dataframe_by_volume
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Iterates through volume cutoffs to find the best fit under ROW_LIMIT.
# 2. Handles edge cases (e.g., if filtering removes all rows).
# 3. Prints the truncation log.
# 4. Stores the truncated DataFrame in pip state ('truncated_df_for_clustering_json').
# 5. Returns the truncated DataFrame.

# Inputs: `df` from the "Join Botify Data" cell and ROW_LIMIT from the config cell.
# NOTE(review): `df` is rebound to the function's own output, so re-running this
# cell feeds it the already-truncated frame rather than the merged one. Giving
# each stage its own name would make the cell idempotent, but needs matching
# renames in the cells that follow.
df = gap_analyzer_sauce.truncate_dataframe_by_volume(job, df, ROW_LIMIT)

# Display the head of the final truncated DataFrame
display(df.head())

# Optional verification
# print("\nVerification:")
# print(f"  Truncated DF stored: {'truncated_df_for_clustering_json' in pip.read_state(job)}")
# loaded_truncated_df = pd.read_json(StringIO(pip.get(job, 'truncated_df_for_clustering_json', '[]')))
# print(f"  Truncated DF rows in state: {len(loaded_truncated_df)}")

## Cluster Keywords

In [None]:
# Per the notebook author's notes, cluster_and_finalize_dataframe
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Loads/tests clustering parameters from a JSON cache file.
# 2. Runs iterative ML clustering (TF-IDF, SVD, k-means) to find the best fit.
# 3. Names the resulting clusters using n-grams.
# 4. Performs the final column reordering.
# 5. Saves the final 'unformatted_csv'.
# 6. Prints the final cluster counts.
# 7. Stores the final DataFrame in pip state ('final_clustered_df_json').
# 8. Returns the final DataFrame for display.

# Inputs: `df` (the truncated frame) and `has_botify` from previous cells.
# NOTE(review): `df` is rebound to the function's own output, so re-running this
# cell passes in the already-clustered frame -- confirm the function tolerates that.
df = gap_analyzer_sauce.cluster_and_finalize_dataframe(job, df, has_botify)

# Display the head of the final, clustered, and arranged DataFrame
display(df.head())

# Optional verification
# print("\nVerification:")
# print(f"  Final Clustered DF stored: {'final_clustered_df_json' in pip.read_state(job)}")
# loaded_clustered_df = pd.read_json(StringIO(pip.get(job, 'final_clustered_df_json', '[]')))
# print(f"  Clustered DF rows in state: {len(loaded_clustered_df)}")

In [None]:
# Per the notebook author's notes, create_deliverables_excel_and_button
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Derives competitor info (list, lookup col) from pip state/keys.
# 2. Creates the 'deliverables' directory and Excel file path.
# 3. Normalizes and scores the data for the "Gap Analysis" tab.
# 4. Writes this first tab to the Excel file.
# 5. Creates the "Open Deliverables Folder" button.
# 6. Stores all necessary paths and lists ('final_xl_file', 'loop_list', 'competitors_list', etc.) in pip state.
# 7. Returns the button and key variables for the next step.

# Inputs: `df` and `has_botify` from previous cells, plus keys.client_domain.
# The seven returned values are unpacked into notebook-level names that the two
# following cells read; note that `has_botify` is rebound to the returned value.
(
    button, 
    xl_file, 
    loop_list, 
    competitors, 
    semrush_lookup, 
    TARGET_COMPETITOR_COL, 
    has_botify
) = gap_analyzer_sauce.create_deliverables_excel_and_button(
    job,
    df,
    keys.client_domain, # Pass the clean domain
    has_botify
)

# Display the button
display(button)

# Optional verification
# print("\nVerification:")
# print(f"  Final XL File stored: {pip.get(job, 'final_xl_file')}")
# print(f"  Loop List stored: {pip.get(job, 'loop_list')}")
# print(f"  Competitors stored: {pip.get(job, 'competitors_list')}")
# print(f"  Target Col stored: {pip.get(job, 'target_competitor_col')}")

In [None]:
# Per the notebook author's notes, add_filtered_excel_tabs
# (implementation lives in gap_analyzer_sauce, not visible here):
# 1. Defines and finds the canonical client competitor column (TARGET_COMPETITOR_COL).
# 2. Defines helper functions for reading/filtering keywords.
# 3. Loops through all filter definitions ("Important Keywords", "Best Opportunities", etc.).
# 4. For each filter:
#    - Slices the main 'df'.
#    - Normalizes and scores the slice.
#    - Sorts the slice.
#    - Appends it as a new tab to the existing Excel file.
# 5. Re-attaches the click handler to the button.
# 6. Returns the button for re-display.

# Every argument below was produced by an earlier cell: `df` by the clustering
# cell, the rest by create_deliverables_excel_and_button.
button = gap_analyzer_sauce.add_filtered_excel_tabs(
    job,
    df,
    semrush_lookup,
    has_botify,
    competitors,
    xl_file,
    TARGET_COMPETITOR_COL, # This was returned by the previous function
    button
)

# Re-display the button (per the notes above, its on_click handler is preserved)
display(button)

# Optional verification
# print("\nVerification:")
# try:
#     wb = openpyxl.load_workbook(xl_file)
#     print(f"  Excel sheets: {wb.sheetnames}")
# except Exception as e:
#     print(f"  Could not read Excel file to verify sheets: {e}")

In [None]:
import pandas as pd
from pathlib import Path
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font, Alignment, Border, Side
from openpyxl.formatting.rule import ColorScaleRule
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyxl.utils import get_column_letter
import re # Needed for is_safe_url
import validators # Need to import validators for URL check

# Announce the formatting pass; `xl_file` must expose `.name` (e.g. a pathlib.Path).
print(f"🎨 Applying Excel Formatting to all data tabs in {xl_file.name} (third pass)...")

# NOTE: This cell reads the following names defined by earlier cells:
#       'xl_file', 'competitors', 'TARGET_COMPETITOR_COL' (the verified column name),
#       'GLOBAL_WIDTH_ADJUSTMENT' (config cell) and 'button'.
#       'semrush_lookup' and 'has_botify' are not used anywhere in this cell.

# --- Support functions used by the formatting pass below ---

def create_column_mapping(sheet):
    """Build a lookup from header text to Excel column letter.

    Headers are read from row 1 of ``sheet`` and stringified, so an empty
    header cell is keyed as ``"None"``. If two headers share the same text,
    the right-most column wins.
    """
    return {
        str(header_cell.value): get_column_letter(position)
        for position, header_cell in enumerate(sheet[1], start=1)
    }

def apply_fill_to_column_labels(sheet, column_mapping, columns_list, fill):
    """Set ``fill`` on the row-1 header cell of each named column.

    Names in ``columns_list`` that are absent from ``column_mapping`` (or map
    to an empty letter) are silently skipped.
    """
    letters = (column_mapping.get(name) for name in columns_list)
    for letter in filter(None, letters):
        sheet[f"{letter}1"].fill = fill

def find_last_data_row(sheet, keyword_column_letter):
    """Return the index of the last row holding a value in the given column.

    Scans upward from ``sheet.max_row`` and stops at the first cell whose value
    is not ``None``, ``""`` or ``" "``. Row 1 (the header) is never inspected,
    so the result is never pushed below 1 by the scan. When no column letter is
    supplied, ``sheet.max_row`` is returned unchanged.
    """
    bottom = sheet.max_row
    if not keyword_column_letter:
        return bottom

    blank_values = (None, "", " ")
    for row in range(bottom, 1, -1):
        if sheet[f"{keyword_column_letter}{row}"].value not in blank_values:
            return row
    # Either every data row was blank or there were no data rows to scan.
    return min(bottom, 1)

def apply_conditional_formatting(sheet, column_mapping, last_row, conditionals_descending, conditionals_ascending, rule_desc, rule_asc):
    """Attach a colour-scale rule to the data range of each listed column.

    Columns named in ``conditionals_descending`` receive ``rule_desc``; every
    other listed column receives ``rule_asc``. The range covers rows 2 through
    ``last_row``; columns missing from ``column_mapping`` are skipped, and
    nothing is added when there are no data rows. A failure on one column is
    printed and does not stop the remaining columns.
    """
    labels = conditionals_descending + conditionals_ascending
    if not last_row > 1:
        return  # header-only sheet: nothing to format

    for label in labels:
        letter = column_mapping.get(label)
        if not letter:
            continue
        chosen_rule = rule_asc
        if label in conditionals_descending:
            chosen_rule = rule_desc
        cell_range = f'{letter}2:{letter}{last_row}'
        try:
            sheet.conditional_formatting.add(cell_range, chosen_rule)
        except Exception as e:
            print(f"⚠️ Failed to apply conditional formatting for {label}: {e}")

def is_safe_url(url):
    """Return True when ``url`` is a string that the validators library accepts as a URL.

    Non-string input returns False without calling the library.
    ``validators.url`` signals failure with a falsy result object rather than
    ``False``, so the result is coerced to a real ``bool`` before returning.
    """
    if not isinstance(url, str):
        return False
    return bool(validators.url(url))

# Colour palette for the formatting pass. Each PatternFill is a solid fill used
# for header cells (the client fill is applied to the whole client column).
green = '33FF33'
client_color = PatternFill(start_color='FFFF00', end_color='FFFF00', fill_type='solid') # Yellow
competitor_color = PatternFill(start_color='EEECE2', end_color='EEECE2', fill_type='solid') # Light Gray
semrush_color = PatternFill(start_color='FAEADB', end_color='FAEADB', fill_type='solid') # Light Orange
semrush_opportunity_color = PatternFill(start_color='F1C196', end_color='F1C196', fill_type='solid') # Darker Orange
botify_color = PatternFill(start_color='EADFF2', end_color='EADFF2', fill_type='solid') # Light Purple
botify_opportunity_color = PatternFill(start_color='AEA1C4', end_color='AEA1C4', fill_type='solid') # Darker Purple
# Two-colour scales: the column minimum gets start_color, the maximum gets end_color.
color_scale_rule_desc = ColorScaleRule(start_type='min', start_color='FFFFFF', end_type='max', end_color=green) # White to Green (Higher is Better)
color_scale_rule_asc = ColorScaleRule(start_type='min', start_color=green, end_type='max', end_color='FFFFFF') # Green to White (Lower is Better)

# Header border: hairline on left/right/top, thin line along the bottom.
thin_border = Border(left=Side(style='hair'), right=Side(style='hair'), top=Side(style='hair'), bottom=Side(style='thin'))

# Base column widths. These are multiplied by GLOBAL_WIDTH_ADJUSTMENT when they
# are applied to a sheet, so the effective widths are larger than shown here.
tiny_width = 11
small_width = 15
medium_width = 20
description_width = 50
url_width = 70 # Adjusted slightly down from 100 for better viewability

# Header text -> base width. Keys must match the sheet's header text exactly;
# headers not listed here keep whatever width they already have.
column_widths = {
    'Keyword': 40, 'Search Volume': small_width, 'Number of Words': tiny_width,
    'Keyword Group (Experimental)': small_width, 'Competitors Positioning': tiny_width,
    'CPC': tiny_width, 'Keyword Difficulty': tiny_width, 'Competition': tiny_width,
    'Depth': tiny_width, 'No. of Keywords': tiny_width,
    'No. of Impressions excluding anonymized queries': small_width,
    'No. of Clicks excluding anonymized queries': small_width,
    'No. of Missed Clicks excluding anonymized queries': small_width,
    'Avg. URL CTR excluding anonymized queries': tiny_width,
    'Avg. URL Position excluding anonymized queries': tiny_width,
    'No. of Keywords for the URL To Achieve 90% Audience': tiny_width,
    'Raw Internal Pagerank': small_width, 'Internal Pagerank': tiny_width,
    'Internal Pagerank Position': tiny_width, 'No. of Unique Inlinks': tiny_width,
    'Title': description_width, 'Meta Description': description_width,
    'Timestamp': 12, 'SERP Features by Keyword': description_width,
    'Keyword Intents': medium_width, 'Position Type': small_width,
    'Number of Results': medium_width, 'Competitor URL': url_width,
    'Client URL': url_width, # Header is renamed to "<client column> URL" during formatting
    # Normalized/Score columns
    'Normalized CPC': tiny_width, 'Normalized Keyword Difficulty': tiny_width,
    'Normalized Raw Internal Pagerank': tiny_width, 'Normalized Search Volume': tiny_width,
    'Normalized Search Position': tiny_width, 'Normalized Missed Clicks': tiny_width,
    'Combined Score': tiny_width
}

# Excel number-format codes shared by several columns.
int_fmt = '0'
comma_fmt = '#,##0'
pct_fmt = '0.00' # Two-decimal number; despite the name this is NOT a percentage format
date_fmt = 'yyyy-mm-dd' # Added for Timestamp clarity

# Header text -> number format, applied to non-empty data cells of that column.
number_formats = {
    'Search Volume': comma_fmt, 'Number of Words': int_fmt, 'CPC': pct_fmt,
    'Keyword Difficulty': int_fmt, 'Competition': pct_fmt, 'Depth': int_fmt,
    'No. of Keywords': comma_fmt, 'No. of Impressions excluding anonymized queries': comma_fmt,
    'No. of Clicks excluding anonymized queries': comma_fmt,
    'No. of Missed Clicks excluding anonymized queries': comma_fmt,
    'Avg. URL CTR excluding anonymized queries': pct_fmt,
    'Avg. URL Position excluding anonymized queries': '0.0',
    'No. of Keywords for the URL To Achieve 90% Audience': comma_fmt,
    'Raw Internal Pagerank': '0.0000000', 'Internal Pagerank': pct_fmt,
    'Internal Pagerank Position': int_fmt, 'No. of Unique Inlinks': comma_fmt,
    'Number of Results': comma_fmt, 'Timestamp': date_fmt,
    # Positioning is shown as an integer; normalized scores use two decimals
    'Competitors Positioning': int_fmt, 'Normalized CPC': pct_fmt,
    'Normalized Keyword Difficulty': pct_fmt, 'Normalized Raw Internal Pagerank': pct_fmt,
    'Normalized Search Volume': pct_fmt, 'Normalized Search Position': pct_fmt,
    'Normalized Missed Clicks': pct_fmt, 'Combined Score': '0.00'
}

# --- COLUMN GROUPS USED FOR COLOURING ---
# Each list holds header names; names absent from a given sheet are skipped
# by the helpers that consume these lists.

# Higher numbers greener: these columns get color_scale_rule_desc.
conditionals_descending = [
    'Search Volume', 'CPC', 'Competition', # Removed Traffic metrics as they were dropped
    'Avg. URL CTR excluding anonymized queries',
    'No. of Missed Clicks excluding anonymized queries', 'Combined Score',
    'No. of Unique Inlinks' # Added Inlinks (usually higher is better contextually)
]
# Lower numbers greener: these columns get color_scale_rule_asc.
# NOTE(review): both pagerank value columns are coloured lower-is-greener here --
# confirm that is the intended reading.
conditionals_ascending = [
    'Keyword Difficulty', 'Raw Internal Pagerank', 'Internal Pagerank',
    'Internal Pagerank Position', 'Avg. URL Position excluding anonymized queries', 'Depth',
    TARGET_COMPETITOR_COL # Add the client's position column dynamically
] + [col for col in competitors if col != TARGET_COMPETITOR_COL] # Add other competitor position columns

# Headers filled with semrush_color
semrush_columns = [
    'Keyword', 'Search Volume', 'CPC', 'Keyword Difficulty', 'Competition',
    'SERP Features by Keyword', 'Keyword Intents', 'Position Type',
    'Number of Results', 'Timestamp', 'Competitor URL', 'Client URL' # Includes Client URL before rename
]
# Headers filled with botify_color (must match the DataFrame's column names after the Botify merge)
botify_columns = [
    'Depth', 'No. of Keywords', 'No. of Impressions excluding anonymized queries',
    'No. of Clicks excluding anonymized queries', 'No. of Missed Clicks excluding anonymized queries',
    'Avg. URL CTR excluding anonymized queries', 'Avg. URL Position excluding anonymized queries',
    'No. of Keywords for the URL To Achieve 90% Audience', 'Raw Internal Pagerank',
    'Internal Pagerank', 'Internal Pagerank Position', 'No. of Unique Inlinks',
    'Title', 'Meta Description' # Changed from API name
]
# Headers that get the larger (size 14) bold font
bigger_font_headers = [
    "Keyword", "Search Volume", "Title", "Meta Description",
    "Competitor URL", "Client URL", "SERP Features by Keyword"
]
# Headers that get the darker Botify fill (applied after, so it overrides, botify_color)
botify_opportunity_columns = [
    'Internal Pagerank', 'No. of Unique Inlinks',
    'No. of Missed Clicks excluding anonymized queries',
    'Normalized Raw Internal Pagerank', 'Normalized Missed Clicks'
]
# Headers that get the darker SEMRush fill (applied last of all header fills)
semrush_opportunity_columns = [
    'CPC', 'Keyword Difficulty', 'Normalized CPC', 'Normalized Keyword Difficulty',
    'Normalized Search Volume', 'Normalized Search Position', 'Combined Score' # Added Combined Score here
]


# --- APPLY FORMATTING TO EXCEL FILE ---
# Loads the workbook at `xl_file`, styles every tab except "Filter Diagnostics",
# and saves it back in place. Failures are reported with a printed message
# instead of being raised, so the cell always finishes.

# Header row height in points. This is the value the formatting pass produces
# on a fresh kernel; it is now an explicit constant rather than a lookup of a
# leftover `max_length` variable that no cell in this notebook defines.
HEADER_ROW_HEIGHT = 100

# Excel limits a quoted text value inside a formula to 255 characters. URLs
# longer than that are left as plain text rather than wrapped in =HYPERLINK().
MAX_FORMULA_TEXT_LENGTH = 255

# Reset before the run so the KeyError handler only ever reports this run's mapping.
column_mapping = None

try:
    wb = load_workbook(xl_file)

    # Every sheet except the diagnostics sheet is treated as a data tab.
    sheets_to_format = [name for name in wb.sheetnames if name != "Filter Diagnostics"]

    if not sheets_to_format:
        print("⚠️ No data sheets found in the Excel file to format. Skipping formatting.")

    for sheet_name in sheets_to_format:
        print(f"- Formatting '{sheet_name}' tab...")
        sheet = wb[sheet_name]
        column_mapping = create_column_mapping(sheet)

        # The 'Keyword' column is the reference for where the data ends.
        keyword_col_letter = column_mapping.get("Keyword")
        if not keyword_col_letter:
            print(f"  Skipping sheet '{sheet_name}': Cannot find 'Keyword' column for formatting reference.")
            continue

        last_row = find_last_data_row(sheet, keyword_col_letter)

        # 1. Fill the whole client column (header included) with the client colour.
        client_column_letter = column_mapping.get(TARGET_COMPETITOR_COL)
        if client_column_letter:
            for row in range(1, last_row + 1):
                cell = sheet[f"{client_column_letter}{row}"]
                cell.fill = client_color
                if row == 1:
                    cell.font = Font(bold=True)  # Bold header

        # 2. Header backgrounds. Later calls override earlier ones for shared headers.
        apply_fill_to_column_labels(sheet, column_mapping, semrush_columns, semrush_color)
        apply_fill_to_column_labels(sheet, column_mapping, botify_columns, botify_color)
        # Competitor colour only for competitor columns present in this sheet (client excluded).
        present_competitors = [c for c in competitors if c in column_mapping and c != TARGET_COMPETITOR_COL]
        apply_fill_to_column_labels(sheet, column_mapping, present_competitors, competitor_color)
        apply_fill_to_column_labels(sheet, column_mapping, botify_opportunity_columns, botify_opportunity_color)
        apply_fill_to_column_labels(sheet, column_mapping, semrush_opportunity_columns, semrush_opportunity_color)

        # 3. Header styling (alignment, font, border)
        header_font = Font(bold=True)
        header_align = Alignment(horizontal='center', vertical='center', wrap_text=True)
        for header, col_letter in column_mapping.items():
            cell = sheet[f"{col_letter}1"]
            cell.alignment = header_align
            cell.font = header_font
            cell.border = thin_border
            if header in bigger_font_headers:
                cell.font = Font(size=14, bold=True)

        # 4. Hyperlinks (Competitor URL, Client URL)
        for col_label in ["Competitor URL", "Client URL"]:
            col_letter = column_mapping.get(col_label)
            if not col_letter:
                continue
            for row in range(2, last_row + 1):
                cell = sheet[f"{col_letter}{row}"]
                url = cell.value
                if not url or not is_safe_url(url) or str(url).startswith('=HYPERLINK'):
                    continue
                # A double quote inside a formula string must be doubled, otherwise
                # it terminates the string and the formula becomes invalid.
                formula_url = url.replace('"', '""')
                if len(formula_url) > MAX_FORMULA_TEXT_LENGTH:
                    continue  # too long for a formula string; keep the plain-text URL
                # Truncate the displayed text if very long; the link keeps the full URL.
                display_text = url if len(url) <= 80 else url[:77] + "..."
                formula_text = display_text.replace('"', '""')
                cell.value = f'=HYPERLINK("{formula_url}", "{formula_text}")'
                cell.font = Font(color="0000FF", underline="single")
                cell.alignment = Alignment(vertical='top', wrap_text=False)  # No wrap for URLs

        # 5. Rotate competitor headers and make those columns narrow
        competitor_header_align = Alignment(vertical='bottom', textRotation=90, horizontal='center')
        for competitor_col_name in competitors:
            col_letter = column_mapping.get(competitor_col_name)
            if col_letter:
                sheet[f"{col_letter}1"].alignment = competitor_header_align
                sheet.column_dimensions[col_letter].width = 4

        # 6. Column widths, scaled by the global adjustment multiplier
        for label, width in column_widths.items():
            column_letter = column_mapping.get(label)
            if column_letter:
                sheet.column_dimensions[column_letter].width = width * GLOBAL_WIDTH_ADJUSTMENT

        # 7. Number formats (only on non-empty cells)
        for label, format_code in number_formats.items():
            column_letter = column_mapping.get(label)
            if column_letter:
                for row in range(2, last_row + 1):
                    cell = sheet[f"{column_letter}{row}"]
                    if cell.value is not None:
                        cell.number_format = format_code

        # 8. Conditional formatting (colour scales)
        apply_conditional_formatting(sheet, column_mapping, last_row, conditionals_descending, conditionals_ascending, color_scale_rule_desc, color_scale_rule_asc)

        # 9. Rename the 'Client URL' header after the client column.
        #    `column_mapping` still holds the old header text, which step 10 relies on.
        client_url_column_letter = column_mapping.get("Client URL")
        if client_url_column_letter:
            sheet[f"{client_url_column_letter}1"].value = f"{TARGET_COMPETITOR_COL} URL"

        # 10. Data cell alignment (no wrapping, top aligned); URL columns were handled in step 4
        data_align = Alignment(wrap_text=False, vertical='top')
        url_column_letters = {column_mapping.get("Competitor URL"), column_mapping.get("Client URL")}
        # Resolve the column letters once instead of once per cell.
        alignable_columns = [
            col_idx for col_idx in range(1, sheet.max_column + 1)
            if get_column_letter(col_idx) not in url_column_letters
        ]
        for row_idx in range(2, last_row + 1):
            for col_idx in alignable_columns:
                sheet.cell(row=row_idx, column=col_idx).alignment = data_align

        # 11. Header row height and freeze panes (first row and first two columns stay visible)
        sheet.row_dimensions[1].height = HEADER_ROW_HEIGHT
        sheet.freeze_panes = 'C2'

        # 12. AutoFilter over the header and data rows
        max_col_letter = get_column_letter(sheet.max_column)
        if last_row > 0:
            sheet.auto_filter.ref = f"A1:{max_col_letter}{last_row}"

        # 13. Excel Table for banded rows. A table needs at least one data row
        #     below its header, so header-only sheets are skipped.
        if last_row > 1:
            table_range = f"A1:{max_col_letter}{last_row}"
            table_name = f"DataTable_{re.sub(r'[^A-Za-z0-9_]', '', sheet_name)}"  # Sanitize for table naming rules

            # Depending on the openpyxl version, the sheet's table collection is a
            # dict keyed by table name or a list of Table objects; handle both so
            # a re-run recognises the table it created earlier.
            sheet_tables = getattr(sheet, "tables", None)
            if sheet_tables is None:
                sheet_tables = getattr(sheet, "_tables", ())
            existing_table_names = {getattr(t, "name", t) for t in sheet_tables}

            if table_name not in existing_table_names:
                tab = Table(displayName=table_name, ref=table_range)
                tab.tableStyleInfo = TableStyleInfo(
                    name="TableStyleMedium9", showFirstColumn=False,
                    showLastColumn=False, showRowStripes=True, showColumnStripes=False)
                try:
                    sheet.add_table(tab)
                except ValueError as ve:
                    print(f"  Note: Could not add Excel Table '{table_name}' to sheet '{sheet_name}'. Maybe overlaps existing table? Error: {ve}")

    # Save the workbook with all formatting applied
    wb.save(xl_file)
    print(f"✅ Formatting applied to all data tabs and saved to {xl_file.name}")

except FileNotFoundError:
    print(f"❌ Error: Excel file not found at {xl_file}. Cannot apply formatting.")
except KeyError as e:
    print(f"❌ Error during formatting: A required column key was not found: {e}. Check DataFrame structure.")
    # Show the mapping of the sheet being processed, if processing got that far.
    if column_mapping is not None:
        print(f"   Column Mapping: {column_mapping}")
except Exception as e:
    print(f"❌ An unexpected error occurred during Excel formatting: {e}")
# Re-display the deliverables button now that formatting is finished.
# No click handler is registered here: neither `_open_folder` nor
# `deliverables_dir` is defined anywhere in this notebook, so the lambda that
# used to be attached at this point could only fail with a NameError on click.
# NOTE(review): the earlier cells state that gap_analyzer_sauce attaches the
# button's click handler itself -- confirm there if the button does nothing.
display(button)