In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# NOTE: `pip` here is the object exported by pipulate, not the package installer.
from pipulate import pip
import secretsauce
import nest_asyncio
# `keys` supplies the `google` and `botify` attributes read in the keys cell below.
import keys
# NOTE(review): presumably patches asyncio so nested event loops work inside the
# notebook's already-running loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

# --- ⚙️ Workflow Configuration ---
# Every value a reader might want to tune lives in this one cell.

# Give your session a unique name
job = "gapalyzer-01"

# Set the maximum number of rows for the final Excel export.
# A smaller number (like 1000) is great for fast testing.
ROW_LIMIT = 3000

# Set the maximum number of *competitors* to process (in addition to your_domain).
# Use a small number (like 3) for rapid testing, or -1 to process all competitors.
COMPETITOR_LIMIT = 3

# Report the active settings once each.
print(f"✅ Configuration set: Final report will be limited to {ROW_LIMIT} rows.")
if COMPETITOR_LIMIT != -1:
    print(f"✅ Configuration set: Processing will be limited to the top {COMPETITOR_LIMIT} competitors.")
else:
    print("✅ Configuration set: Processing all competitors.")

# Here are the Keys

In [None]:

# Hand the Google key from the `keys` module to pipulate for this job.
# NOTE(review): presumably this registers the key for later API calls — confirm
# against the pipulate docs.
pip.api_key(job, key=keys.google)
# Keep the Botify token in a notebook variable; no other cell in this notebook
# reads it yet.
botify_token = keys.botify


## Here are your Foes

### Save all of These

In [None]:
import nbformat
from pathlib import Path

def get_competitors_from_notebook(notebook_filename="GAPalyzer.ipynb"):
    """Return the domains listed in the notebook cell tagged 'url-list-input'.

    Blank lines and lines beginning with '#' are ignored; on every other line
    anything from the first '#' onward is dropped and the remainder is trimmed.
    An empty list is returned (after printing a message) when no cell carries
    the tag or when the notebook cannot be read.
    """
    try:
        with Path(notebook_filename).open('r', encoding='utf-8') as handle:
            notebook = nbformat.read(handle, as_version=4)

        # First cell carrying the tag wins, exactly like an early return in a loop.
        tagged_cell = next(
            (c for c in notebook.cells if "url-list-input" in c.metadata.get("tags", [])),
            None,
        )
        if tagged_cell is None:
            print("⚠️ Warning: Could not find a cell tagged with 'url-list-input'.")
            return []

        found = []
        for raw_line in tagged_cell.source.splitlines():
            trimmed = raw_line.strip()
            if not trimmed or trimmed.startswith('#'):
                continue
            found.append(raw_line.split('#')[0].strip())
        return found
    except Exception as e:
        print(f"❌ Error reading domains from notebook: {e}")
        return []

# --- Main Logic ---
# Print one SEMrush "organic positions" link per domain found in the notebook.
print("🚀 Generating SEMrush URLs for GAP analysis...")

url_template = "https://www.semrush.com/analytics/organic/positions/?db=us&q={domain}&searchType=domain"
domains = get_competitors_from_notebook()

if domains:
    print(f"✅ Found {len(domains)} competitor domains. Click the links below to open each report:")
    print("-" * 30)
    # Number the links starting at 1 for human readers.
    for position, domain in enumerate(domains, start=1):
        full_url = url_template.format(domain=domain)
        print(f"{position}. {domain}:\n   {full_url}\n")
else:
    print("🛑 No domains found. Please add competitor domains to the 'url-list-input' cell and re-run.")

## Here's how it goes:
### 1. Stack 'Em (All downloads to 1 file)

In [None]:
import pandas as pd
from pathlib import Path
import os
import nbformat # Make sure this import is here

def get_all_domains_from_notebook(notebook_filename="GAPalyzer.ipynb"):
    """Read the full domain list from the notebook cell tagged 'url-list-input'.

    Each non-blank, non-comment line of that cell contributes one entry: the
    text before the first '#', trimmed. Returns [] and prints a message if the
    tagged cell is missing or the notebook cannot be read.

    NOTE(review): this has the same body as get_competitors_from_notebook
    defined in an earlier cell; consider keeping a single helper.
    """
    def _is_entry(text):
        # Keep lines that have content and are not whole-line comments.
        trimmed = text.strip()
        return bool(trimmed) and not trimmed.startswith('#')

    try:
        source_path = Path(notebook_filename)
        with open(source_path, 'r', encoding='utf-8') as notebook_file:
            parsed = nbformat.read(notebook_file, as_version=4)

        for nb_cell in parsed.cells:
            cell_tags = nb_cell.metadata.get("tags", [])
            if "url-list-input" not in cell_tags:
                continue
            candidate_lines = filter(_is_entry, nb_cell.source.splitlines())
            return [entry.split('#')[0].strip() for entry in candidate_lines]

        print("⚠️ Warning: Could not find a cell tagged with 'url-list-input'.")
        return []
    except Exception as e:
        print(f"❌ Error reading domains from notebook: {e}")
        return []

# --- Main Logic ---
# Loads every SEMrush export in ~/Downloads whose filename prefix (the text
# before '-organic.Positions') matches a domain from the 'url-list-input' cell,
# tags each row with its domain, and stacks everything into `combined_df`.
print("🥞 Stacking 'Em: Consolidating all SEMrush Excel files...")

all_domains = get_all_domains_from_notebook()

if not all_domains:
    print("🛑 No domains found. Please add your client and competitor domains to the 'url-list-input' cell.")
else:
    # The first listed domain is treated as the client; the rest are competitors.
    your_domain = all_domains[0]
    print(f"✅ Client domain identified as: {your_domain}")

    # Apply the COMPETITOR_LIMIT (-1 means "process all competitors").
    competitor_domains = all_domains[1:]
    if COMPETITOR_LIMIT != -1:
        competitor_domains = competitor_domains[:COMPETITOR_LIMIT]
        print(f"🔪 Limiting to Top {len(competitor_domains)} competitors based on 'url-list-input' order.")

    # This is the final list of domains we will actually load files for
    domains_to_process = [your_domain] + competitor_domains

    downloads_dir = Path.home() / "Downloads"
    # Sorted so the stacking order — and therefore every downstream 'first'
    # aggregation — does not depend on the filesystem's listing order.
    all_excel_files = sorted(downloads_dir.glob("*.xlsx"))

    list_of_dataframes = []
    for file_path in all_excel_files:
        try:
            domain_name = file_path.name.split('-organic.Positions')[0]
            if domain_name not in domains_to_process:
                continue

            df = pd.read_excel(file_path)
            df['domain'] = domain_name

            # Every row in one file belongs to the same domain, so the
            # client/competitor split is a per-file decision; assign the whole
            # column at once instead of evaluating a lambda per row.
            is_client_file = domain_name == your_domain
            df["Client URL"] = df["URL"] if is_client_file else None
            df["Competitor URL"] = None if is_client_file else df["URL"]

            list_of_dataframes.append(df)
            print(f"  -> ✅ Loaded and tagged {file_path.name}")
        except Exception as e:
            # Best-effort: one unreadable file should not stop the others.
            print(f"  -> ⚠️ Could not process file {file_path.name}: {e}")

    if list_of_dataframes:
        combined_df = pd.concat(list_of_dataframes, ignore_index=True)
        print("\n🎉 Success! All relevant Excel files have been consolidated.")
        display(combined_df.head())
    else:
        print("🤷 No matching Excel files were found in the Downloads folder.")

### Aggregate All Metrics 🧮

In [None]:
import pandas as pd

# Collapse `combined_df` (one row per keyword per domain) into `agg_df`
# (one row per keyword) using a per-column aggregation rule.
# NOTE(review): the cell under "### 3. Aggregate All Metrics" repeats this
# aggregation step; consider keeping only one of the two.
print("⚙️ Aggregating all keyword metrics from the master list...")

if 'combined_df' not in locals():
    print("❌ 'combined_df' not found. Please run the 'Stack \'Em' cell first.")
else:
    try:
        # Define the aggregation functions for each metric
        agg_funcs = {
            'Position': 'min',  # Get the *best* (lowest) position
            'Search Volume': 'max', # Get the *highest* (most accurate) volume
            'CPC': 'mean',
            'Keyword Difficulty': 'mean',
            'Competition': 'mean',
            'Number of Results': 'max',
            'Timestamp': 'max',
            'SERP Features by Keyword': 'first',
            'Keyword Intents': 'first',
            'Position Type': 'first',
            'URL': 'first',
            'Client URL': 'first',
            'Competitor URL': 'first'
        }

        # Select only columns that exist in the DataFrame to avoid errors
        cols_to_agg = {k: v for k, v in agg_funcs.items() if k in combined_df.columns}

        print(f"  -> Aggregating columns: {list(cols_to_agg.keys())}")

        # Create the aggregate DataFrame
        agg_df = combined_df.groupby('Keyword').agg(cols_to_agg).reset_index()

        # Add 'Number of Words' column. Cast to str first so a keyword that the
        # spreadsheet reader returned as a number (e.g. 2024) is counted as one
        # word instead of raising AttributeError on `.split()`.
        agg_df['Number of Words'] = agg_df["Keyword"].astype(str).str.split().str.len()

        print("\n🎉 Success! Aggregate metrics table created.")
        display(agg_df.head())

    except Exception as e:
        print(f"\n❌ An error occurred during aggregation: {e}")

### 2. Join 'em

In [None]:
import pandas as pd

# Build a Keyword x domain matrix of Search Volume, with the client's column first.
print("🧩 Joining 'Em: Pivoting data to map keywords against competitors...")

if 'combined_df' in locals() and 'your_domain' in locals():
    try:
        pivot_data = combined_df[['Keyword', 'domain', 'Search Volume']]
        volume_matrix = pivot_data.pivot_table(
            index='Keyword', columns='domain', values='Search Volume', aggfunc='first'
        )
        # Missing keyword/domain pairs become 0.
        pivoted_df = volume_matrix.fillna(0)

        if your_domain not in pivoted_df.columns:
            print(f"⚠️ Warning: Your specified domain '{your_domain}' was not found in the data.")
        else:
            # Move the client's column to the front, keep the others in place.
            other_domains = [name for name in pivoted_df.columns if name != your_domain]
            pivoted_df = pivoted_df[[your_domain] + other_domains]

        print("\n🎉 Success! Data has been pivoted into a competitive matrix.")
        display(pivoted_df.head())

    except Exception as e:
        print(f"\n❌ An error occurred during the pivot process: {e}")
else:
    print("❌ 'combined_df' or 'your_domain' not found. Please run the 'Stack \'Em' cell first.")

### 3. Aggregate All Metrics 🧮

In [None]:
import pandas as pd

# Collapse `combined_df` (one row per keyword per domain) into `agg_df`
# (one row per keyword) using a per-column aggregation rule.
print("⚙️ Aggregating all keyword metrics from the master list...")

if 'combined_df' not in locals():
    print("❌ 'combined_df' not found. Please run the 'Stack \'Em' cell first.")
else:
    try:
        # Define the aggregation functions for each metric
        agg_funcs = {
            'Position': 'min',           # best (lowest) rank across domains
            'Search Volume': 'max',
            'CPC': 'mean',
            'Keyword Difficulty': 'mean',
            'Competition': 'mean',
            'Number of Results': 'max',
            'Timestamp': 'max',
            'SERP Features by Keyword': 'first',
            'Keyword Intents': 'first',
            'Position Type': 'first',
            'URL': 'first',              # Keep a general URL for reference
            'Client URL': 'first',       # This will find the client's URL (if any)
            'Competitor URL': 'first'    # This will find the first competitor's URL
        }

        # Only aggregate columns that are actually present in the stacked data.
        cols_to_agg = {k: v for k, v in agg_funcs.items() if k in combined_df.columns}
        print(f"  -> Aggregating columns: {list(cols_to_agg.keys())}")

        agg_df = combined_df.groupby('Keyword').agg(cols_to_agg).reset_index()

        # Cast to str before splitting so a keyword that the spreadsheet reader
        # returned as a number (e.g. 2024) is counted as one word instead of
        # raising AttributeError on `.split()`.
        agg_df['Number of Words'] = agg_df["Keyword"].astype(str).str.split().str.len()

        print("\n🎉 Success! Aggregate metrics table created.")
        display(agg_df.head())

    except Exception as e:
        print(f"\n❌ An error occurred during aggregation: {e}")

### 4. Merge, Tag & Score 'em

In [None]:
import pandas as pd

# Build the scored gap table `gap_df_sorted`:
#   pivot ranks per domain -> merge with `agg_df` metrics -> keep keywords where
#   the client does not rank but at least one competitor does -> score and sort.
print("🧩🏷️ Joining, Tagging, & Scoring: Building the final analysis table...")

if 'combined_df' not in locals() or 'agg_df' not in locals() or 'your_domain' not in locals():
    print("❌ Required DataFrames ('combined_df', 'agg_df') or 'your_domain' variable not found. Please run the previous cells first.")
else:
    try:
        # 1. PIVOT ON POSITION
        # Wide matrix of per-domain *rankings* (best rank per keyword).
        print("  -> Pivoting on 'Position' to create competitor rank matrix...")
        pivot_df = combined_df.pivot_table(
            index='Keyword',
            columns='domain',
            values='Position',
            aggfunc='min'     # Get the *best* rank
        ).fillna(0) # A 0 here means "no rank"

        # 2. MERGE 'EM: Join the pivoted ranks with the aggregate metrics
        print("  -> Merging pivoted ranks with aggregate metrics...")
        analysis_df = pd.merge(
            pivot_df.reset_index(),  # (Keyword, your_domain_rank, comp1_rank, ...)
            agg_df,                  # (Keyword, Search Volume, CPC, Client URL, ...)
            on='Keyword',
            how='left'
        )

        # 3. TAG 'EM: Identify competitor columns and find the gaps
        competitor_cols = [col for col in pivot_df.columns if col != your_domain]
        analysis_df['competitor_count'] = (analysis_df[competitor_cols] > 0).sum(axis=1)

        # The client's export may not have been loaded, in which case there is
        # no client column to index. Treat that as "client ranks for nothing"
        # instead of failing with a bare KeyError.
        if your_domain in analysis_df.columns:
            client_rank = analysis_df[your_domain].fillna(0)
        else:
            print(f"⚠️ Warning: Your specified domain '{your_domain}' was not found in the data.")
            client_rank = pd.Series(0, index=analysis_df.index)

        # Filter for gaps:
        # - Your domain's rank is 0 (or NaN)
        # - At least one competitor ranks (count > 0)
        gap_df = analysis_df[
            (client_rank == 0) &
            (analysis_df['competitor_count'] > 0)
        ].copy()

        # 4. SCORE 'EM: Create the opportunity score
        gap_df['opportunity_score'] = gap_df['Search Volume'] * gap_df['competitor_count']

        # 5. SORT & ORGANIZE:
        metric_cols = [
            'Search Volume', 'opportunity_score', 'competitor_count',
            'Keyword Difficulty', 'CPC', 'Number of Words', 'Keyword Intents',
            'SERP Features by Keyword', 'Position Type', 'Number of Results'
        ]
        url_cols = ['Client URL', 'Competitor URL']

        # Keep only the columns that actually exist in this run's data.
        final_metric_cols = [col for col in metric_cols if col in gap_df.columns]
        final_url_cols = [col for col in url_cols if col in gap_df.columns]
        domain_cols = [your_domain] + competitor_cols

        # Column order: keyword, metrics, per-domain ranks, then URLs last.
        final_cols = (
            ['Keyword'] +
            final_metric_cols +
            [col for col in domain_cols if col in gap_df.columns] +
            final_url_cols
        )

        gap_df_sorted = gap_df[final_cols].sort_values(by='opportunity_score', ascending=False)

        print("\n🎉 Success! Master gap analysis table created and scored.")
        print("-" * 30)

        print("Top 20 Content Opportunities (Head):")
        display(gap_df_sorted.head(20))

    except Exception as e:
        print(f"\n❌ An error occurred during this step: {e}")

### 5. Restrict Size

In [None]:
import pandas as pd

# Keep only the first ROW_LIMIT rows of the sorted gap table for export.
print(f"✂️ Truncating results to the top {ROW_LIMIT} opportunities...")

if 'gap_df_sorted' in locals():
    # Slice first, then measure both frames for the before/after report.
    final_df_for_export = gap_df_sorted.head(ROW_LIMIT)
    original_count, new_count = len(gap_df_sorted), len(final_df_for_export)
    print(f"✅ DataFrame truncated from {original_count:,} rows to {new_count:,} rows.")
else:
    print("❌ 'gap_df_sorted' not found. Please run the 'Tag \'Em' cell first.")

### 6. Export to a Beautifully Formatted Excel File 🎨

In [None]:
import pandas as pd
from pathlib import Path
import openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font, Alignment
from openpyxl.worksheet.table import Table, TableStyleInfo
import ipywidgets as widgets
from IPython.display import display
import platform
import subprocess
import os

def export_gap_analysis_to_excel(job: str, df: pd.DataFrame, client_domain=None):
    """
    Exports the GAPalyzer DataFrame to a formatted Excel file inside the
    'output' folder and displays a button that opens that folder.

    Args:
        job: Session name; the file is written to 'output/{job}_gap_analysis.xlsx'.
        df: The table to export. An empty DataFrame is skipped with a message.
        client_domain: Header of the client's rank column, which is always kept
            at the default width. When omitted, the notebook-level
            `your_domain` variable is used if it exists; if neither is
            available the special case is simply not applied.

    Returns:
        None.
    """
    if df.empty:
        print("⚠️ DataFrame is empty, skipping file export.")
        return

    # Fall back to the notebook global without raising NameError when it has
    # not been defined (e.g. the export cell is run on a fresh kernel).
    if client_domain is None:
        client_domain = globals().get("your_domain")

    output_dir = Path("output")
    output_dir.mkdir(parents=True, exist_ok=True)

    excel_filename = output_dir / f"{job}_gap_analysis.xlsx"

    print(f"🎨 Formatting and exporting data to Excel: {excel_filename}")

    with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='GAP_Analysis')

        worksheet = writer.sheets['GAP_Analysis']

        # 1. Create an Excel Table for banded rows and filtering
        table_range = f"A1:{get_column_letter(worksheet.max_column)}{worksheet.max_row}"
        table = Table(displayName="GapAnalysisTable", ref=table_range)
        style = TableStyleInfo(name="TableStyleMedium9", showFirstColumn=False,
                               showLastColumn=False, showRowStripes=True, showColumnStripes=False)
        table.tableStyleInfo = style
        worksheet.add_table(table)

        # 2. Define consistent column widths
        width_map = {
            "Keyword": 50,
            "Search Volume": 18,
            "competitor_count": 18,
            "opportunity_score": 18,
            "Client URL": 50,
            "Competitor URL": 50
        }
        default_width = 15 # For competitor domains

        # 3. Apply formatting to the header row and size each column
        for col_idx, column_cell in enumerate(worksheet[1], 1):
            column_letter = get_column_letter(col_idx)
            header_text = str(column_cell.value)

            # A. Format header cell
            column_cell.font = Font(bold=True)
            column_cell.alignment = Alignment(horizontal='center', vertical='center')

            # B. Set column width. Matching is a case-insensitive substring
            # test, so the first width_map key contained in the header wins.
            width = default_width
            for key, w in width_map.items():
                if key.lower() in header_text.lower():
                    width = w
                    break

            # The client domain column always keeps the standard width, even if
            # its name happens to contain one of the width_map keys.
            if client_domain is not None and header_text == client_domain:
                width = default_width

            worksheet.column_dimensions[column_letter].width = width

        # 4. Apply text wrapping and vertical alignment to all data cells
        for row in worksheet.iter_rows(min_row=2):
            for cell in row:
                cell.alignment = Alignment(wrap_text=True, vertical='top')

    print(f"✅ Success! Files saved in the '{output_dir}' folder.")

    # --- Helper to open the output folder ---
    def _open_folder(path_str: str = "."):
        """Open `path_str` in the operating system's file browser."""
        folder_path = Path(path_str).resolve()
        print(f"Attempting to open folder: {folder_path}")
        if not folder_path.exists() or not folder_path.is_dir():
            print(f"❌ Error: Path is not a valid directory: {folder_path}")
            return
        system = platform.system()
        try:
            if system == "Windows":
                os.startfile(folder_path)
            elif system == "Darwin":
                subprocess.run(["open", folder_path])
            else:
                subprocess.run(["xdg-open", folder_path])
        except Exception as e:
            print(f"❌ Failed to open folder. Please navigate to it manually. Error: {e}")

    # --- Display the "Open Folder" button ---
    button = widgets.Button(
        description="📂 Open Output Folder",
        tooltip=f"Open {output_dir.resolve()}",
        button_style='success'
    )
    # Open the same directory the file was written to.
    button.on_click(lambda b: _open_folder(str(output_dir)))
    display(button)

# --- Main Logic ---
# Export the truncated table produced by the "Restrict Size" cell, if it exists.
if 'final_df_for_export' in locals():
    export_gap_analysis_to_excel(job, final_df_for_export)
else:
    print("❌ 'final_df_for_export' not found. Please run the 'Truncate' cell first.")

# Todo
- Make it work with Excel or CSV files
- Bring your own CSV (Botify, GSC, etc) for joined rows & PageRank and other signals
- A way to take ad hoc metrics into account with opportunity scores
- Botify API integration in place of bring your own CSV
- Write each deliverable to its own folder so clients don't see each other's files when the "Open Output Folder" button is pressed