# GAPalyzer 📐

In [None]:
# Enable IPython's autoreload extension in mode 2 so edited modules
# (e.g. gap_analyzer_sauce) are re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.display import display, Markdown
import gap_analyzer_sauce
from pipulate import pip
import nest_asyncio
# Patch asyncio so event loops can be nested inside the already-running Jupyter loop.
nest_asyncio.apply()
# Private configuration module; the cells below read keys.botify,
# keys.botify_project_url and keys.client_domain from it.
import _config as keys

# Job identifier: passed as the first argument to every gap_analyzer_sauce call
# and used as the key for values stored through pip.set / pip.get.
job = "gapalyzer-05" # Give your session a unique name

# 1. Set all your Keys

In [None]:

# --- Session configuration (read by the cells further down) ---
botify_token = keys.botify        # Botify API token taken from the private _config module
ROW_LIMIT = 3000                  # Maximum number of rows kept in the final report
COMPETITOR_LIMIT = 100            # Top-N competitors to process; a falsy value means "all"
BROWSER_DOWNLOAD_PATH = None      # Handed to collect_semrush_downloads
GLOBAL_WIDTH_ADJUSTMENT = 1.5     # Handed to apply_excel_formatting

# Echo the effective settings so the run log records them.
print(f'✅ Configuration set: Final report will be limited to {ROW_LIMIT} rows.')
print(
    f'✅ Configuration set: Processing will be limited to the top {COMPETITOR_LIMIT} competitors.'
    if COMPETITOR_LIMIT
    else f'✅ Configuration set: Processing all competitors.'
)


## 2. List all your Foes

### 3. Save all of These

In [None]:
import gap_analyzer_sauce # Also imported in the setup cell; presumably repeated so this cell runs standalone

# Extract the competitor domains for this job.
# Per the original notes (the implementation lives in gap_analyzer_sauce and is
# not shown here), the helper stores the domains via pip.set, prints the URLs,
# and returns the domain list. Later cells do not use the return value.
competitor_domains = gap_analyzer_sauce.extract_domains_and_print_urls(job)

# Optional: read the stored domains back from pip state to verify them.
# stored_domains = pip.get(job, 'competitor_domains', [])
# print(f"\nVerification: Retrieved {len(stored_domains)} domains from pip state.")

#### 4. Process the Rows

## Verify Downloads

In [None]:
import gap_analyzer_sauce # Also imported in the setup cell; presumably repeated so this cell runs standalone

# Step 1: collect the SEMRush downloads for this job.
# Per the original notes, the helper moves the files and stores the relevant
# paths in pip state. BROWSER_DOWNLOAD_PATH comes from the configuration cell.
semrush_dir, collected_files = gap_analyzer_sauce.collect_semrush_downloads(job, BROWSER_DOWNLOAD_PATH)

# Optional verification (kept commented out for cleaner output).
# NOTE: the per-file line below needs `from pathlib import Path`.
# if semrush_dir and collected_files:
#    print(f"\nVerification: Files collected in '{pip.get(job, 'semrush_download_dir')}'")
#    print(f"Files found/moved ({len(pip.get(job, 'collected_semrush_files'))}):")
#    # for f in pip.get(job, 'collected_semrush_files'): print(f" - {Path(f).name}") # Use Path for display if needed
# elif semrush_dir:
#    print(f"\nVerification: Destination directory '{pip.get(job, 'semrush_download_dir')}' confirmed, but no new files moved.")
# else:
#    print("\nVerification: File collection step encountered an error.")

# Step 2: locate the SEMRush files and build a Markdown summary.
# Per the original notes, the helper also stores the file paths via pip.set.
# COMPETITOR_LIMIT comes from the configuration cell.
markdown_summary = gap_analyzer_sauce.find_semrush_files_and_generate_summary(job, COMPETITOR_LIMIT)

# Render the returned summary as Markdown in the cell output.
display(Markdown(markdown_summary))

# Optional verification (can stay commented out)
# stored_files = pip.get(job, 'collected_semrush_files', [])
# print(f"\nVerification: Retrieved {len(stored_files)} file paths from pip state.")

## Combine Downloads

In [None]:
# Build the master SEMRush DataFrame for this job.
# Per the original notes (implementation not shown here), the helper:
# 1. Reads the file list from pip state.
# 2. Loads and combines all SEMRush files into a master DataFrame.
# 3. Applies the COMPETITOR_LIMIT.
# 4. Stores the master DataFrame and competitor dictionary in pip state.
# 5. Returns the master DataFrame (for the next step) and domain counts (for display).
# `df2` is consumed by the pivot and aggregate cells below.
df2, domain_value_counts = gap_analyzer_sauce.load_and_combine_semrush_data(job, keys.client_domain, COMPETITOR_LIMIT)

# Show the per-domain counts so the combined data can be checked by eye.
display(domain_value_counts)

## Make Pivot Table

In [None]:
# Pivot the combined SEMRush data.
# Per the original notes (implementation not shown here), the helper:
# 1. Pivots df2 by Keyword/Domain.
# 2. Calculates Competitors Positioning.
# 3. Loads or creates the competitors_df and saves it to CSV.
# 4. Prints summary statistics.
# 5. Stores pivot_df and competitors_df in pip state.
# `df2` is the variable produced by the "Combine Downloads" cell.
pivot_df = gap_analyzer_sauce.pivot_semrush_data(job, df2, keys.client_domain)

# Show the resulting pivot table.
display(pivot_df)

# Optional verification.
# NOTE: needs `import pandas as pd`; no earlier cell shown here imports it.
# print("\nVerification:")
# print(f"  Pivot DF stored: {'keyword_pivot_df_json' in pip.read_state(job)}")
# print(f"  Competitors DF stored: {'competitors_df_json' in pip.read_state(job)}")
# loaded_competitors = pd.read_json(pip.get(job, 'competitors_df_json', '[]'))
# print(f"  Competitors DF rows in state: {len(loaded_competitors)}")

## Filter Brand Names

In [None]:
# Fetch competitor homepage titles and build the brand-keyword filter.
# Per the original notes (implementation not shown here), the helper:
# 1. Loads competitors_df from pip state.
# 2. Checks for and fetches missing homepage titles asynchronously.
# 3. Updates competitors_df with new titles.
# 4. Saves the updated competitors_df to CSV and pip state.
# 5. Generates the keyword filter list from domains and titles.
# 6. Creates or updates the filter_keywords.csv file.
# 7. Stores the filter keyword list in pip state.
# Only `job` is passed in; the return value is a status message.
status_message = gap_analyzer_sauce.fetch_titles_and_create_filters(job)

# Print the status message returned by the helper.
print(status_message)

# Optional verification.
# NOTE: needs `import json`, `import pandas as pd` and `from io import StringIO`.
# print("\nVerification:")
# updated_competitors_df = pd.read_json(StringIO(pip.get(job, 'competitors_df_json', '[]')))
# print(f"  Competitors DF rows in state: {len(updated_competitors_df)}")
# print(f"  Example Title: {updated_competitors_df['Title'].iloc[0] if not updated_competitors_df.empty else 'N/A'}")
# filter_list = json.loads(pip.get(job, 'filter_keyword_list_json', '[]'))
# print(f"  Filter keywords stored: {len(filter_list)}")

## Make Aggregate Table

In [None]:
# Aggregate the SEMRush metrics per keyword.
# Per the original notes (implementation not shown here), the helper:
# 1. Defines aggregation rules for each metric.
# 2. Groups df2 by Keyword and applies the aggregations.
# 3. Calculates 'Number of Words'.
# 4. Drops the aggregated 'Position' column.
# 5. Stores the resulting agg_df in pip state.
# 6. Returns agg_df for display and for the join step.
# `df2` is the variable produced by the "Combine Downloads" cell.
agg_df = gap_analyzer_sauce.aggregate_semrush_metrics(job, df2)

# Show the aggregated data.
display(agg_df)

# Optional verification.
# NOTE: needs `import pandas as pd` and `from io import StringIO`.
# print("\nVerification:")
# print(f"  Agg DF stored: {'keyword_aggregate_df_json' in pip.read_state(job)}")
# loaded_agg_df = pd.read_json(StringIO(pip.get(job, 'keyword_aggregate_df_json', '[]'))) # Use StringIO for verification
# print(f"  Agg DF rows in state: {len(loaded_agg_df)}")

## Join Pivot & Aggregate Table

In [None]:
# Join the pivot and aggregate tables, then filter and arrange the result.
# Per the original notes (implementation not shown here), the helper:
# 1. Merges pivot_df and agg_df.
# 2. Reads the filter keyword list from the CSV file.
# 3. Applies the brand/negative keyword filter.
# 4. Reorders columns for readability.
# 5. Drops unnecessary columns (Traffic metrics, Previous position).
# 6. Sorts the final DataFrame by Search Volume.
# 7. Stores the final arranged_df in pip state.
# `pivot_df` and `agg_df` are the variables produced by the two cells above.
arranged_df = gap_analyzer_sauce.merge_filter_arrange_data(job, pivot_df, agg_df)

# Show the final, arranged DataFrame.
display(arranged_df)

# Optional verification.
# NOTE: needs `import pandas as pd` and `from io import StringIO`.
# print("\nVerification:")
# print(f"  Final Arranged DF stored: {'filtered_gap_analysis_df_json' in pip.read_state(job)}")
# loaded_arranged_df = pd.read_json(StringIO(pip.get(job, 'filtered_gap_analysis_df_json', '[]'))) # Use StringIO
# print(f"  Final Arranged DF rows in state: {len(loaded_arranged_df)}")

## Truncate Data

In [None]:
# Truncate Data (moved upstream for performance)
#
# ROW_LIMIT is applied *before* the Botify merge so that the merge and all
# subsequent steps (clustering, Excel) operate on a much smaller,
# pre-filtered set of keywords.
#
# Per the original notes (implementation not shown here), the helper iterates
# through search-volume cutoffs to find the best fit under ROW_LIMIT and
# returns the truncated DataFrame.
#
# Input: `arranged_df` from the previous cell and `ROW_LIMIT` from the
# configuration cell. The result is assigned back to `arranged_df` so the
# following cells pick up the truncated data without any renaming.
arranged_df = gap_analyzer_sauce.truncate_dataframe_by_volume(job, arranged_df, ROW_LIMIT)

# Show the first rows of the truncated DataFrame.
display(arranged_df.head())

## Download Botify Data

In [None]:
# --- START URGENT FIX: Bypassing stale kernel cache ---
# We are redefining the function *locally* in this cell
# to force the kernel to use the corrected version.

import pandas as pd
from pathlib import Path
import json
from pipulate import pip # Make sure pip is imported
import _config as keys # Make sure keys is imported

# (Private helper functions _fetch_analysis_slugs, _export_data, etc., are assumed to be OK)
# (If they are not, they would need to be pasted here too, but the error is in the main function)

def _read_botify_export(csv_path):
    """Load a Botify export CSV into a DataFrame, skipping the file's first line.

    Any read or parse error propagates to the caller.
    """
    # NOTE(review): skiprows=1 presumably discards a non-header preamble line
    # at the top of the export -- confirm against a real export file.
    return pd.read_csv(csv_path, skiprows=1)


def _record_botify_failure(job):
    """Record in pip state that no Botify export is available for *job*."""
    # Legacy key: still written with an empty frame in case something reads it.
    pip.set(job, 'botify_export_df_json', pd.DataFrame().to_json(orient='records'))
    # Current key: cleared so a path stored by an earlier successful run cannot
    # be mistaken for the outcome of this failed one.
    pip.set(job, 'botify_export_csv_path', None)


def fetch_botify_data_and_save(job: str, botify_token: str, botify_project_url: str):
    """Fetch the Botify export for a project and record its CSV path in pip state.

    The org and project are taken from the last two path segments of
    ``botify_project_url``. The first analysis slug returned by
    ``gap_analyzer_sauce._fetch_analysis_slugs`` is used. If
    ``data/<job>_botify/botify_export.csv`` already exists and can be read it
    is reused; otherwise an export is requested with the full (GSC) field
    list, falling back to a reduced field list when the first attempt does
    not return HTTP 200/201.

    Only the *path* of the CSV is stored in pip state (key
    ``botify_export_csv_path``), not the DataFrame itself, which avoids the
    "TooBigError: string or blob too big" failure. ``None`` is stored when no
    data could be loaded.

    Args:
        job: Job identifier used for the pip state and the output directory.
        botify_token: Botify API token, passed to the analysis-slug lookup.
        botify_project_url: Botify project URL ending in ``.../<org>/<project>``.

    Returns:
        A 4-tuple ``(botify_export_df, has_botify, report_name, csv_dir)``:
        the loaded DataFrame (empty on failure), whether it is non-empty, the
        CSV path (``None`` if setup failed) and its directory (``None`` if it
        was never determined).
    """
    print("🤖 Fetching data from Botify API...")
    report_name = None
    csv_dir = None
    botify_export_df = pd.DataFrame()

    # --- 1. Parse URL and get latest analysis slug ---
    try:
        url_parts = botify_project_url.rstrip('/').split('/')
        # Both segments must be non-empty: a bare host such as
        # "https://app.botify.com" would otherwise yield an empty org.
        if len(url_parts) < 2 or not url_parts[-2] or not url_parts[-1]:
            raise ValueError(f"Could not parse org/project from URL: {botify_project_url}")

        org, project = url_parts[-2], url_parts[-1]
        print(f"  Parsed Org: {org}, Project: {project}")

        slugs = gap_analyzer_sauce._fetch_analysis_slugs(org, project, botify_token)
        if not slugs:
            raise ValueError("Could not find any Botify analysis slugs for the provided project.")
        analysis = slugs[0]  # Use the most recent analysis
        print(f"  ✅ Found latest Analysis Slug: {analysis}")

    except Exception as e:
        # Deliberately broad: this is the notebook-facing boundary, and any
        # setup failure should degrade to "no Botify data" instead of a crash.
        print(f"  ❌ Critical Error during Botify setup: {e}")
        _record_botify_failure(job)
        return pd.DataFrame(), False, None, None

    # --- 2. Define Paths and Payloads ---
    try:
        csv_dir = Path("data") / f"{job}_botify"
        csv_dir.mkdir(parents=True, exist_ok=True)
        report_name = csv_dir / "botify_export.csv"
    except Exception as e:
        print(f"  ❌ Error defining paths/payloads: {e}")
        _record_botify_failure(job)
        return pd.DataFrame(), False, None, csv_dir

    # Full field list, including the Search Console (gsc_by_url.*) metrics.
    payload_full = {
        "fields": ["url", "depth", "gsc_by_url.count_missed_clicks", "gsc_by_url.avg_ctr", "gsc_by_url.avg_position", "inlinks_internal.nb.unique", "internal_page_rank.value", "internal_page_rank.position", "internal_page_rank.raw", "gsc_by_url.count_impressions", "gsc_by_url.count_clicks", "gsc_by_url.count_keywords", "gsc_by_url.count_keywords_on_url_to_achieve_90pc_clicks", "metadata.title.content", "metadata.description.content"],
        "sort": []
    }
    # Reduced field list without the gsc_by_url.* fields, used if the full export fails.
    payload_fallback = {
        "fields": ["url", "depth", "inlinks_internal.nb.unique", "internal_page_rank.value", "internal_page_rank.position", "internal_page_rank.raw", "metadata.title.content", "metadata.description.content"],
        "sort": []
    }

    # --- 3. Main Logic: Check existing, call API with fallback ---
    loaded_from_existing = False
    if report_name.exists():
        print(f"  ☑️ Botify export file already exists at '{report_name}'. Reading from disk.")
        try:
            botify_export_df = _read_botify_export(report_name)
            loaded_from_existing = True
        except Exception as e:
            print(f"  ⚠️ Could not read existing CSV file '{report_name}', will attempt to re-download. Error: {e}")
            botify_export_df = pd.DataFrame()

    if not loaded_from_existing:
        print("  Attempting download with Full GSC Payload...")
        status_code, _ = gap_analyzer_sauce._export_data('v1', org, project, payload_full, report_name, analysis=analysis)

        if status_code not in (200, 201):
            print("    -> Full Payload failed. Attempting Fallback Payload (no GSC data)...")
            status_code, _ = gap_analyzer_sauce._export_data('v1', org, project, payload_fallback, report_name, analysis=analysis)

        # The file on disk, not the status code, decides whether data is available.
        if report_name.exists():
            try:
                botify_export_df = _read_botify_export(report_name)
                print("  ✅ Successfully downloaded and/or loaded Botify data.")
            except Exception as e:
                print(f"  ❌ Download/decompression seemed successful, but failed to read the final CSV file '{report_name}'. Error: {e}")
                botify_export_df = pd.DataFrame()
        else:
            print("  ❌ Botify export failed critically after both attempts, and no file exists.")
            botify_export_df = pd.DataFrame()

    # --- 4. Store State and Return ---
    has_botify = not botify_export_df.empty

    # Store the *path* to the CSV instead of the serialized DataFrame; the
    # latter triggered "TooBigError: string or blob too big".
    if has_botify:
        pip.set(job, 'botify_export_csv_path', str(report_name.resolve()))
        print(f"💾 Stored Botify CSV path in pip state for job '{job}': {report_name.resolve()}")
    else:
        pip.set(job, 'botify_export_csv_path', None)
        print("🤷 No Botify data loaded. Stored 'None' for path in pip state.")

    # Return what the notebook needs for its display logic.
    return botify_export_df, has_botify, report_name, csv_dir

# --- END URGENT FIX ---


# Run the locally defined fetch and keep all four results for later cells:
# the DataFrame, the has-data flag, the CSV path and its directory.
# keys.botify is the same value the configuration cell binds to `botify_token`.
botify_export_df, has_botify, report_path, csv_dir_path = fetch_botify_data_and_save(
    job,
    keys.botify,
    keys.botify_project_url
)

## Join Botify Data

In [None]:
# Merge the truncated keyword table with the Botify export.
# Per the original notes (implementation not shown here), the helper:
# 1. Merges arranged_df with botify_export_df (if has_botify is True).
# 2. Renames Botify's 'url' to 'Full URL' for the merge.
# 3. Inserts the new Botify columns after the 'Competition' column.
# 4. Cleans up redundant URL columns used for the merge.
# 5. Saves the intermediate 'unformatted.csv' file.
# 6. Stores the final DataFrame in pip state ('final_working_df_json').
# 7. Returns the final DataFrame and a dict of data for display.
#
# Inputs come from earlier cells: arranged_df, botify_export_df, has_botify.
df, display_data = gap_analyzer_sauce.merge_and_finalize_data(job, arranged_df, botify_export_df, has_botify)

# --- Display logic (stays in the notebook, driven by the returned dict) ---
print(f"Rows: {display_data['rows']:,}")
print(f"Cols: {display_data['cols']:,}")

if not display_data['has_botify']:
    # Nothing was merged because no Botify data was available.
    print("ℹ️ No Botify data was merged.")
elif display_data['pagerank_counts'] is None:
    # Botify data was merged, but the 'Internal Pagerank' column was missing.
    print("⚠️ Botify data was merged, but 'Internal Pagerank' column not found for display.")
else:
    display(display_data['pagerank_counts'])

# Optional verification
# print("\nVerification:")
# print(f"  Final Working DF stored: {'final_working_df_json' in pip.read_state(job)}")

## Cluster Keywords

In [None]:
# Cluster the keywords and finalize the working DataFrame.
# Per the original notes (implementation not shown here), the helper:
# 1. Loads/tests clustering parameters from a JSON cache file.
# 2. Runs iterative ML clustering (TF-IDF, SVD, k-means) to find the best fit.
# 3. Names the resulting clusters using n-grams.
# 4. Performs the final column reordering.
# 5. Saves the final 'unformatted_csv'.
# 6. Prints the final cluster counts.
# 7. Stores the final DataFrame in pip state ('final_clustered_df_json').
# 8. Returns the final DataFrame for display.
#
# `df` is the merged DataFrame from the "Join Botify Data" cell and is rebound
# here to the clustered result; `has_botify` comes from the Botify download cell.
df = gap_analyzer_sauce.cluster_and_finalize_dataframe(job, df, has_botify)

# Show the first rows of the clustered, arranged DataFrame.
display(df.head())

# Optional verification.
# NOTE: needs `import pandas as pd` and `from io import StringIO`.
# print("\nVerification:")
# print(f"  Final Clustered DF stored: {'final_clustered_df_json' in pip.read_state(job)}")
# loaded_clustered_df = pd.read_json(StringIO(pip.get(job, 'final_clustered_df_json', '[]')))
# print(f"  Clustered DF rows in state: {len(loaded_clustered_df)}")

In [None]:
# Create the deliverables workbook and the "open folder" button.
# Per the original notes (implementation not shown here), the helper:
# 1. Derives competitor info (list, lookup col) from pip state/keys.
# 2. Creates the 'deliverables' directory and Excel file path.
# 3. Normalizes and scores the data for the "Gap Analysis" tab.
# 4. Writes this first tab to the Excel file.
# 5. Creates the "Open Deliverables Folder" button.
# 6. Stores all necessary paths and lists ('final_xl_file', 'loop_list', 'competitors_list', etc.) in pip state.
# 7. Returns the button and key variables for the next step.
#
# `df` and `has_botify` come from previous cells. Note that `has_botify` is
# rebound to the value the helper returns, so later cells see that value.
(
    button, 
    xl_file, 
    loop_list, 
    competitors, 
    semrush_lookup, 
    TARGET_COMPETITOR_COL, 
    has_botify
) = gap_analyzer_sauce.create_deliverables_excel_and_button(
    job,
    df,
    keys.client_domain, # Pass the clean domain
    has_botify
)

# Show the button in the cell output.
display(button)

# Optional verification
# print("\nVerification:")
# print(f"  Final XL File stored: {pip.get(job, 'final_xl_file')}")
# print(f"  Loop List stored: {pip.get(job, 'loop_list')}")
# print(f"  Competitors stored: {pip.get(job, 'competitors_list')}")
# print(f"  Target Col stored: {pip.get(job, 'target_competitor_col')}")

In [None]:
# Append the filtered tabs to the deliverables workbook.
# Per the original notes (implementation not shown here), the helper:
# 1. Defines and finds the canonical client competitor column (TARGET_COMPETITOR_COL).
# 2. Defines helper functions for reading/filtering keywords.
# 3. Loops through all filter definitions ("Important Keywords", "Best Opportunities", etc.).
# 4. For each filter:
#    - Slices the main 'df'.
#    - Normalizes and scores the slice.
#    - Sorts the slice.
#    - Appends it as a new tab to the existing Excel file.
# 5. Re-attaches the click handler to the button.
# 6. Returns the button for re-display.
#
# Every argument below was produced by an earlier cell; most come from the
# create_deliverables_excel_and_button call in the previous cell.
button = gap_analyzer_sauce.add_filtered_excel_tabs(
    job,
    df,
    semrush_lookup,
    has_botify,
    competitors,
    xl_file,
    TARGET_COMPETITOR_COL, # Returned by create_deliverables_excel_and_button
    button
)

# Re-display the button returned by the helper.
display(button)

# Optional verification.
# NOTE: needs `import openpyxl`.
# print("\nVerification:")
# try:
#     wb = openpyxl.load_workbook(xl_file)
#     print(f"  Excel sheets: {wb.sheetnames}")
# except Exception as e:
#     print(f"  Could not read Excel file to verify sheets: {e}")

In [None]:
# Final formatting pass over the deliverables workbook.
# NOTE(review): apart from gap_analyzer_sauce and display, none of the names
# imported below are referenced in this cell; they look like leftovers from
# before the formatting logic moved into gap_analyzer_sauce -- confirm before removing.
import pandas as pd
from pathlib import Path
from openpyxl import load_workbook
from openpyxl.styles import PatternFill, Font, Alignment, Border, Side
from openpyxl.formatting.rule import ColorScaleRule
from openpyxl.worksheet.table import Table, TableStyleInfo
from openpyxl.utils import get_column_letter
import re # Needed for is_safe_url
import validators # Need to import validators for URL check
import gap_analyzer_sauce # Ensure module is imported
from IPython.display import display # Ensure display is imported

# This is the final "painterly" step.
# Per the original notes (implementation not shown here), the helper:
# 1. Loads the Excel workbook from disk.
# 2. Gets ALL sheet names from the workbook.
# 3. Defines all helper functions for formatting (column mapping, cell finding, etc.).
# 4. Defines all formatting rules (colors, widths, number formats, column groups).
# 5. Iterates through each sheet and applies all 13+ formatting steps.
# 6. Saves the final, polished Excel file.
# 7. Returns the "Open Folder" button for re-display.
#
# Every argument below was produced by an earlier cell.
button = gap_analyzer_sauce.apply_excel_formatting(
    job,
    xl_file,
    competitors,
    semrush_lookup,
    TARGET_COMPETITOR_COL,
    has_botify,
    GLOBAL_WIDTH_ADJUSTMENT, # Defined in the configuration cell
    button
)

# Re-display the button returned by the helper.
display(button)

# Optional verification
# print("\nVerification: Final formatting applied.")