In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pipulate import pip
import secretsauce
import nest_asyncio
# NOTE(review): `keys` looks like a project-local module holding credentials
# (keys.google and keys.botify are read further down) — confirm it is kept out
# of version control.
import keys
# Jupyter already runs an asyncio event loop; nest_asyncio.apply() patches
# asyncio so that loop can be re-entered by code run from notebook cells.
nest_asyncio.apply()

# Identifier passed to pip.api_key() below.
job = "gapalyzer-01" # Give your session a unique name

# Here are the Keys

In [None]:

# Pass the Google key from the local `keys` module to pipulate for this job.
# NOTE(review): what pip.api_key does with the key (store, validate, configure
# a client) is not visible in this notebook — confirm against pipulate.
pip.api_key(job, key=keys.google)
# The Botify token is only bound to a name here; nothing in the visible cells
# consumes it yet.
botify_token = keys.botify


## Here are your Foes

### Save all of These

In [None]:
import nbformat
from pathlib import Path

def get_competitors_from_notebook(notebook_filename="GAPalyzer.ipynb"):
    """Return the domains listed in the notebook cell tagged 'url-list-input'.

    The notebook is read with nbformat (as version 4) and the first cell whose
    metadata tags contain 'url-list-input' is used. Within that cell's source,
    blank lines and lines whose first non-space character is '#' are ignored;
    on every other line, anything from the first '#' onward is dropped and the
    remainder is stripped of surrounding whitespace.

    Args:
        notebook_filename: Path to the .ipynb file to inspect.

    Returns:
        A list of domain strings in cell order. An empty list is returned (and
        a message printed) when no tagged cell exists or when any exception is
        raised while reading or parsing the notebook.
    """
    try:
        with Path(notebook_filename).open('r', encoding='utf-8') as handle:
            notebook = nbformat.read(handle, as_version=4)

        # Only the first tagged cell counts; later tagged cells are ignored.
        tagged_cell = next(
            (
                candidate
                for candidate in notebook.cells
                if "url-list-input" in candidate.metadata.get("tags", [])
            ),
            None,
        )
        if tagged_cell is None:
            print("⚠️ Warning: Could not find a cell tagged with 'url-list-input'.")
            return []

        found = []
        for raw_line in tagged_cell.source.splitlines():
            stripped = raw_line.strip()
            # Skip blanks and whole-line comments.
            if not stripped or stripped.startswith('#'):
                continue
            # Drop any trailing inline comment.
            found.append(raw_line.split('#')[0].strip())
        return found
    except Exception as exc:
        # Best-effort by design: report the problem and fall back to no domains.
        print(f"❌ Error reading domains from notebook: {exc}")
        return []

# --- Main Logic ---
# Print one SEMrush "organic positions" link per competitor domain read from
# the notebook's 'url-list-input' cell.
print("🚀 Generating SEMrush URLs for GAP analysis...")

url_template = "https://www.semrush.com/analytics/organic/positions/?db=us&q={domain}&searchType=domain"
domains = get_competitors_from_notebook()

if domains:
    print(f"✅ Found {len(domains)} competitor domains. Click the links below to open each report:")
    print("-" * 30)
    # Number the links starting at 1 for display.
    for position, domain in enumerate(domains, start=1):
        full_url = url_template.format(domain=domain)
        print(f"{position}. {domain}:\n   {full_url}\n")
else:
    print("🛑 No domains found. Please add competitor domains to the 'url-list-input' cell and re-run.")

#### Stack 'Em: Turn 'em to Rows

In [None]:
import pandas as pd
from pathlib import Path
import os

# Filename fragment this notebook uses to recognise an export and to split the
# domain off the front of its name.
SEMRUSH_EXPORT_MARKER = "-organic.Positions"


def semrush_domain_from_filename(filename):
    """Extract the domain prefix from an organic-positions export filename.

    Args:
        filename: Bare file name (no directory), e.g.
            "example.com-organic.Positions-us.xlsx".

    Returns:
        The text before the first "-organic.Positions", or None when the marker
        is absent, the prefix is empty, or the name starts with "~$" (the
        prefix Excel uses for the temporary lock file of an open workbook).
    """
    domain, marker, _rest = filename.partition(SEMRUSH_EXPORT_MARKER)
    if not marker or not domain or filename.startswith("~$"):
        return None
    return domain


print("🚀 Consolidating all SEMrush Excel files from Downloads...")

try:
    # 1. Define the path to your Downloads folder
    downloads_dir = Path.home() / "Downloads"

    # 2. Find every .xlsx file. glob() order depends on the filesystem, so sort
    #    to make the row order of the combined frame reproducible.
    all_excel_files = sorted(downloads_dir.glob("*.xlsx"))

    # 3. Keep only files that look like SEMrush exports. Without this filter an
    #    unrelated workbook would be loaded and tagged with its whole filename
    #    as the "domain", polluting the combined dataset.
    tagged_files = [
        (file_path, semrush_domain_from_filename(file_path.name))
        for file_path in all_excel_files
    ]
    skipped_files = [file_path for file_path, domain in tagged_files if domain is None]
    semrush_files = [(file_path, domain) for file_path, domain in tagged_files if domain is not None]

    if skipped_files:
        print(f"⏭️ Skipping {len(skipped_files)} .xlsx file(s) that are not SEMrush exports.")

    list_of_dataframes = []

    if not semrush_files:
        # An empty Downloads folder is an expected situation, not an
        # "unexpected error", so report it directly.
        print(f"🛑 No '*{SEMRUSH_EXPORT_MARKER}*.xlsx' files found in {downloads_dir}.")
    else:
        print(f"🔍 Found {len(semrush_files)} Excel files to process.")

        # 4. Load each export and tag its rows with the domain from the filename
        for file_path, domain_name in semrush_files:
            try:
                df = pd.read_excel(file_path)
                df['domain'] = domain_name
                list_of_dataframes.append(df)
                print(f"  -> ✅ Loaded and tagged {file_path.name} with domain '{domain_name}'")
            except Exception as e:
                # Best-effort: one unreadable file should not stop the others.
                print(f"  -> ⚠️ Could not process file {file_path.name}: {e}")

        # 5. Concatenate all the individual DataFrames into one master DataFrame
        if list_of_dataframes:
            print("\nConcatenating all DataFrames into a single master dataset...")
            combined_df = pd.concat(list_of_dataframes, ignore_index=True)

            print("\n🎉 Success! All Excel files have been consolidated.")
            print("-" * 30)

            # 6. Display info and both ends of the combined DataFrame
            print("Combined DataFrame Info:")
            combined_df.info()

            print("\nCombined DataFrame Head (First 5 Rows):")
            display(combined_df.head())

            # The tail shows rows from the last file loaded
            print("\nCombined DataFrame Tail (Last 5 Rows):")
            display(combined_df.tail())
        else:
            print("🤷 No valid Excel files were processed.")

except Exception as e:
    print(f"\n❌ An unexpected error occurred: {e}")

### Step 3: Stack 'Em, Join 'Em & Tag 'Em! 🥞🧠📊

In [None]:
import pandas as pd

print("🧩 Joining 'Em: Pivoting data to map keywords against competitors...")

# --- USER CUSTOMIZATION ---
# ⚠️ IMPORTANT: Set this to the client's domain, spelled exactly as it appears
# in the 'domain' column (the prefix taken from the export filenames).
your_domain = "your-client-domain.com"
# -------------------------

if 'combined_df' in locals():
    try:
        # Narrow to the three columns the pivot needs.
        pivot_data = combined_df[['Keyword', 'domain', 'Search Volume']]

        # One row per keyword, one column per domain, cells hold Search Volume.
        # 'first' picks a single value if a keyword/domain pair repeats.
        # Keyword/domain combinations with no value are filled with 0.
        pivoted_df = pivot_data.pivot_table(
            index='Keyword',
            columns='domain',
            values='Search Volume',
            aggfunc='first',
        ).fillna(0)

        # Move the client's column to the front when it is present.
        if your_domain not in pivoted_df.columns:
            print(f"⚠️ Warning: Your specified domain '{your_domain}' was not found in the data.")
        else:
            other_domains = [name for name in pivoted_df.columns if name != your_domain]
            pivoted_df = pivoted_df[[your_domain, *other_domains]]

        print("\n🎉 Success! Data has been pivoted into a competitive matrix.")
        print("-" * 30)

        print("Pivoted DataFrame Info:")
        pivoted_df.info()

        print("\nPivoted DataFrame Head (First 5 Rows):")
        display(pivoted_df.head())

    except Exception as err:
        print(f"\n❌ An error occurred during the pivot process: {err}")
else:
    print("❌ 'combined_df' not found. Please run the 'Stack \'Em' cell first.")

### Step 4: Export to a Beautifully Formatted Excel File 🎨

# Todo
- Make it work with Excel or CSV files
- Bring your own CSV (Botify, GSC, etc.) for joined rows, PageRank, and other signals
- A way to take ad hoc metrics into account with opportunity scores
- Botify API integration in place of bring your own CSV