In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pipulate import pip
import secretsauce
import nest_asyncio
import keys
# NOTE(review): presumably patches asyncio so nested event loops can run inside
# the notebook's already-running loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

# Session/job identifier; it is the first argument passed to pip.api_key() in the keys cell.
job = "gapalyzer-01" # Give your session a unique name

# Here are the Keys

In [None]:

# Hand the Google key from the local `keys` module to pipulate for this job.
# NOTE(review): what api_key() does with it (storage, validation) is not visible
# in this notebook — confirm in pipulate.
pip.api_key(job, key=keys.google)
# Keep the Botify token at notebook scope. None of the visible cells read it yet;
# the Todo list at the end mentions a future Botify API integration.
botify_token = keys.botify


## Here are your Foes

### Save all of These

In [None]:
import nbformat
from pathlib import Path

def get_competitors_from_notebook(notebook_filename="GAPalyzer.ipynb"):
    """Read the domain list from the notebook cell tagged 'url-list-input'.

    The notebook is parsed with nbformat (as version 4). The first cell whose
    metadata tags include 'url-list-input' supplies the list: every non-blank
    line that does not start with '#' contributes one entry, with anything
    from the first '#' onward removed and surrounding whitespace stripped.

    Args:
        notebook_filename: Path to the .ipynb file to parse.

    Returns:
        The domains in the order they appear in the cell. An empty list is
        returned (and a message printed) when no tagged cell exists or when
        any error occurs while reading or parsing.
    """
    try:
        with Path(notebook_filename).open('r', encoding='utf-8') as handle:
            notebook = nbformat.read(handle, as_version=4)

        for cell in notebook.cells:
            # Skip everything until the tagged input cell is reached.
            if "url-list-input" not in cell.metadata.get("tags", []):
                continue

            found = []
            for raw_line in cell.source.splitlines():
                stripped = raw_line.strip()
                # Blank lines and full-line comments carry no domain.
                if not stripped or stripped.startswith('#'):
                    continue
                # Drop a trailing inline comment, keep only the domain part.
                found.append(raw_line.split('#')[0].strip())
            return found

        print("⚠️ Warning: Could not find a cell tagged with 'url-list-input'.")
        return []
    except Exception as e:
        # Best-effort: report the problem and let the caller handle "no domains".
        print(f"❌ Error reading domains from notebook: {e}")
        return []

# --- Main Logic ---
# Print one SEMrush organic-positions report URL for each domain listed in
# the 'url-list-input' cell, numbered from 1.
print("🚀 Generating SEMrush URLs for GAP analysis...")

url_template = "https://www.semrush.com/analytics/organic/positions/?db=us&q={domain}&searchType=domain"
domains = get_competitors_from_notebook()

if domains:
    print(f"✅ Found {len(domains)} competitor domains. Click the links below to open each report:")
    print("-" * 30)
    for position, domain in enumerate(domains, start=1):
        full_url = url_template.format(domain=domain)
        print(f"{position}. {domain}:\n   {full_url}\n")
else:
    # Nothing to link to: the tagged cell was empty, missing, or unreadable.
    print("🛑 No domains found. Please add competitor domains to the 'url-list-input' cell and re-run.")

### Stack 'Em, Join 'Em & Tag 'Em! 🥞🧠📊
#### 1. Stack 'Em (combines all downloads into one table)

In [None]:
import pandas as pd
from pathlib import Path
import os
import nbformat # Make sure this import is here

def get_all_domains_from_notebook(notebook_filename="GAPalyzer.ipynb"):
    """Return every domain listed in the notebook's 'url-list-input' cell.

    Parses `notebook_filename` with nbformat (as version 4) and looks for the
    first cell tagged 'url-list-input'. Each non-blank line of that cell that
    is not a '#' comment yields one domain; text from the first '#' onward is
    discarded and whitespace is trimmed.

    Args:
        notebook_filename: Path to the .ipynb file to parse.

    Returns:
        List of domain strings in cell order. Returns [] and prints a message
        if no tagged cell is found or if reading/parsing raises any exception.
    """
    try:
        with open(Path(notebook_filename), 'r', encoding='utf-8') as source_file:
            parsed = nbformat.read(source_file, as_version=4)

        # Lazily scan the cells so only the first tagged one is ever examined.
        tagged_cells = (
            candidate for candidate in parsed.cells
            if "url-list-input" in candidate.metadata.get("tags", [])
        )
        input_cell = next(tagged_cells, None)
        if input_cell is None:
            print("⚠️ Warning: Could not find a cell tagged with 'url-list-input'.")
            return []

        entries = (text.strip() for text in input_cell.source.splitlines())
        return [
            entry.split('#')[0].strip()   # cut off any trailing inline comment
            for entry in entries
            if entry and not entry.startswith('#')
        ]
    except Exception as e:
        # Best-effort: surface the error text and fall back to "no domains".
        print(f"❌ Error reading domains from notebook: {e}")
        return []

# --- Main Logic ---
# Find every .xlsx file in ~/Downloads whose name, up to the
# '-organic.Positions' marker, matches a domain from the 'url-list-input'
# cell; tag each file's rows with that domain and stack them into `combined_df`.
print("🥞 Stacking 'Em: Consolidating all SEMrush Excel files...")

# 1. Get all domains from the notebook cell
all_domains = get_all_domains_from_notebook()

if not all_domains:
    print("🛑 No domains found. Please add your client and competitor domains to the 'url-list-input' cell.")
else:
    # 2. THE NEW CONVENTION: The first domain is always the client's domain.
    your_domain = all_domains[0]
    print(f"✅ Client domain identified as: {your_domain}")

    downloads_dir = Path.home() / "Downloads"
    # Path.glob yields entries in arbitrary, filesystem-dependent order. Sort them
    # so the stacked row order — and anything order-sensitive built on it, such as
    # a 'first' aggregation — is identical on every run.
    all_excel_files = sorted(downloads_dir.glob("*.xlsx"))

    list_of_dataframes = []
    for file_path in all_excel_files:
        try:
            # Everything before the marker is taken as the domain; a file without
            # the marker yields its full name, which will not match any domain.
            domain_name = file_path.name.split('-organic.Positions')[0]
            # Only process files that are in our domain list
            if domain_name in all_domains:
                df = pd.read_excel(file_path)
                df['domain'] = domain_name
                list_of_dataframes.append(df)
                print(f"  -> ✅ Loaded and tagged {file_path.name}")
        except Exception as e:
            # One unreadable file should not abort the whole stack.
            print(f"  -> ⚠️ Could not process file {file_path.name}: {e}")

    if list_of_dataframes:
        combined_df = pd.concat(list_of_dataframes, ignore_index=True)
        print("\n🎉 Success! All relevant Excel files have been consolidated.")
        display(combined_df.head())
    else:
        print("🤷 No matching Excel files were found in the Downloads folder.")

#### 2. Join 'Em

In [None]:
import pandas as pd

print("🧩 Joining 'Em: Pivoting data to map keywords against competitors...")

# Needs `combined_df` and `your_domain` from the stacking cell.
if 'combined_df' in locals() and 'your_domain' in locals():
    try:
        # Build a Keyword x domain matrix of Search Volume; cells with no data become 0.
        pivot_data = combined_df[['Keyword', 'domain', 'Search Volume']]
        pivoted_df = pd.pivot_table(
            pivot_data,
            index='Keyword',
            columns='domain',
            values='Search Volume',
            aggfunc='first',
        )
        pivoted_df = pivoted_df.fillna(0)

        # Reorder columns to put the client's domain first
        if your_domain not in pivoted_df.columns:
            print(f"⚠️ Warning: Your specified domain '{your_domain}' was not found in the data.")
        else:
            cols = [your_domain, *(col for col in pivoted_df.columns if col != your_domain)]
            pivoted_df = pivoted_df[cols]

        print("\n🎉 Success! Data has been pivoted into a competitive matrix.")
        display(pivoted_df.head())

    except Exception as e:
        print(f"\n❌ An error occurred during the pivot process: {e}")
else:
    print("❌ 'combined_df' or 'your_domain' not found. Please run the 'Stack \'Em' cell first.")

In [None]:
import pandas as pd

def build_keyword_analysis(pivoted_df, combined_df, your_domain):
    """Attach per-keyword search volume and a competitor count to the pivot matrix.

    Args:
        pivoted_df: Keyword-indexed matrix with one column per domain. A value
            greater than 0 in a domain column is counted as that domain
            having the keyword.
        combined_df: Stacked export rows; must contain 'Keyword' and
            'Search Volume' columns.
        your_domain: Name of the client's column in ``pivoted_df``.

    Returns:
        A new DataFrame holding the columns of ``pivoted_df`` plus
        'Search Volume' (the largest volume found for the keyword in
        ``combined_df``) and 'competitor_count' (how many non-client domain
        columns are > 0). Each keyword of ``pivoted_df`` appears at most once.

    Raises:
        ValueError: if ``your_domain`` is not a column of ``pivoted_df``.
    """
    if your_domain not in pivoted_df.columns:
        raise ValueError(
            f"domain '{your_domain}' has no column in the pivoted data; "
            "load its export in the stacking step first"
        )

    # Collapse to exactly one volume per keyword. drop_duplicates() on
    # (Keyword, Search Volume) pairs would keep several rows for a keyword whose
    # volume differs (or is missing) between exports, and the index merge below
    # would then duplicate that keyword's row.
    search_volumes = combined_df.groupby('Keyword')[['Search Volume']].max()

    # Merge Search Volume back into the pivoted table
    analysis_df = pivoted_df.merge(search_volumes, left_index=True, right_index=True)

    # Identify competitor columns
    competitor_cols = [col for col in pivoted_df.columns if col != your_domain]

    # Count competitor rankings
    analysis_df['competitor_count'] = (analysis_df[competitor_cols] > 0).sum(axis=1)
    return analysis_df


print("🏷️ Tagging 'Em: Identifying and scoring content gaps...")

if 'pivoted_df' not in locals() or 'combined_df' not in locals() or 'your_domain' not in locals():
    print("❌ Required DataFrames or 'your_domain' variable not found. Please run the previous cells first.")
else:
    try:
        analysis_df = build_keyword_analysis(pivoted_df, combined_df, your_domain)

        # Find the gaps where your domain ranks 0
        gap_df = analysis_df[analysis_df[your_domain] == 0].copy()

        # Score the opportunity: volume weighted by how many competitors have the keyword
        gap_df['opportunity_score'] = gap_df['Search Volume'] * gap_df['competitor_count']

        # Sort by the highest opportunity score
        gap_df_sorted = gap_df.sort_values(by='opportunity_score', ascending=False)

        print("\n🎉 Success! Content gaps have been identified and scored.")
        print("-" * 30)

        print("Top 20 Content Opportunities:")
        display(gap_df_sorted.head(20))

    except Exception as e:
        print(f"\n❌ An error occurred during the 'Tag \'Em' process: {e}")

### Step 4: Export to a Beautifully Formatted Excel File 🎨

# Todo
- Make it work with Excel or CSV files
- Bring your own CSV (Botify, GSC, etc) for joined rows & PageRank and other signals
- A way to take ad hoc metrics into account with opportunity scores
- Botify API integration in place of bring your own CSV