In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to local .py files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pipulate import pip
import nest_asyncio
# Patch asyncio so nested event loops are allowed; must run before any code
# that starts its own loop (presumably needed because Jupyter already runs one).
nest_asyncio.apply()
import keys  # local secrets module; `keys.google` and `keys.botify` are read in a later cell

# `job` identifies this session: it is passed to pip.set()/pip.api_key() and
# also names the per-job folder under downloads/.
job = "gapalyzer-01" # Give your session a unique name

In [None]:
# --- ⚙️ Workflow Configuration ---
# Tunable knobs for this run; values are kept small so iterations stay fast.
ROW_LIMIT = 3000  # Maximum number of rows in the final report
COMPETITOR_LIMIT = 3  # Maximum number of competitors to process; a falsy value is reported as "all competitors"
BROWSER_DOWNLOAD_PATH = "~/Downloads"  # Folder where the browser saves downloaded files

# Echo the active settings so the run's output records them.
print(f"✅ Configuration set: Final report will be limited to {ROW_LIMIT} rows.")
print(
    f"✅ Configuration set: Processing will be limited to the top {COMPETITOR_LIMIT} competitors."
    if COMPETITOR_LIMIT
    else "✅ Configuration set: Processing all competitors."
)

# Here are the Keys

In [None]:

# Hand the Google key from the local `keys` module to Pipulate for this job.
# NOTE(review): the exact effect is defined by pip.api_key — presumably it
# configures AI/API access for the session; confirm against Pipulate.
pip.api_key(job, key=keys.google)
# Keep the Botify token in a notebook variable; none of the cells visible in
# this section read it, so it is presumably consumed by a later step.
botify_token = keys.botify


## Here are your Foes

### Save all of These

In [None]:
import nbformat
from pathlib import Path

def get_competitors_from_notebook(notebook_filename="GAPalyzer.ipynb"):
    """Read the competitor domain list out of a notebook file.

    The notebook is parsed with nbformat (as version 4) and the first cell
    whose metadata tags include 'url-list-input' supplies the list: one entry
    per line, with blank lines and lines starting with '#' skipped and any
    trailing '# ...' text removed.

    Args:
        notebook_filename: Path of the notebook to read.

    Returns:
        list[str]: The entries found. An empty list is returned (after
        printing a message) when no tagged cell exists or when any error
        occurs while reading/parsing the notebook.
    """
    try:
        with open(Path(notebook_filename), 'r', encoding='utf-8') as handle:
            notebook = nbformat.read(handle, as_version=4)

        # Only the first tagged cell is ever consulted.
        tagged_cell = next(
            (c for c in notebook.cells if "url-list-input" in c.metadata.get("tags", [])),
            None,
        )
        if tagged_cell is None:
            print("⚠️ Warning: Could not find a cell tagged with 'url-list-input'.")
            return []

        entries = []
        for raw_line in tagged_cell.source.splitlines():
            stripped = raw_line.strip()
            if not stripped or stripped.startswith('#'):
                continue  # blank line or full-line comment
            # Keep only the text before an inline '#' comment.
            entries.append(raw_line.split('#')[0].strip())
        return entries
    except Exception as e:
        # Best-effort by design: report the problem and fall back to "no domains".
        print(f"❌ Error reading domains from notebook: {e}")
        return []

# --- Main Logic ---
# Print one SEMrush "organic positions" report link per competitor domain
# listed in the notebook's 'url-list-input' cell.
print("🚀 Generating SEMrush URLs for GAP analysis...")

url_template = "https://www.semrush.com/analytics/organic/positions/?db=us&q={domain}&searchType=domain"
domains = get_competitors_from_notebook()

if domains:
    print(f"✅ Found {len(domains)} competitor domains. Click the links below to open each report:")
    print("-" * 30)
    for position, domain in enumerate(domains, start=1):
        # Numbered entry: the domain on one line, its report URL indented below.
        print(f"{position}. {domain}:\n   {url_template.format(domain=domain)}\n")
else:
    print("🛑 No domains found. Please add competitor domains to the 'url-list-input' cell and re-run.")

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import os
import shutil
from pathlib import Path
import glob

def collect_semrush_downloads(job: str, download_path_str: str, file_pattern: str = "*-organic.Positions*.xlsx"):
    """
    Moves downloaded SEMRush files matching a pattern from the user's download
    directory to a job-specific 'downloads/{job}/' folder relative to the
    current working directory.

    Both the given pattern and its '.csv' counterpart (the pattern with
    '.xlsx' replaced by '.csv') are searched. Files whose name already exists
    in the destination are left untouched in the download directory.

    Args:
        job (str): The current job ID (e.g., "gapalyzer-01").
        download_path_str (str): The user's default browser download path (e.g., "~/Downloads").
        file_pattern (str): The glob pattern to match SEMRush files.

    Returns:
        None. Progress and errors are reported with print(); a failure to move
        one file does not stop the others.
    """
    print("📦 Starting collection of new SEMRush downloads...")

    # 1. Define source and destination paths (expanduser handles "~").
    source_dir = Path(download_path_str).expanduser()
    destination_dir = Path("downloads") / job

    # 2. Create the destination directory if it doesn't exist
    destination_dir.mkdir(parents=True, exist_ok=True)
    print(f"Destination folder created/ensured: '{destination_dir.resolve()}'")

    # 3. Find files in the source directory matching the pattern(s).
    # dict.fromkeys de-duplicates while keeping order: if file_pattern has no
    # ".xlsx" in it, the derived CSV pattern is identical and must not be
    # globbed a second time.
    patterns = list(dict.fromkeys([file_pattern, file_pattern.replace(".xlsx", ".csv")]))

    # Escape the directory part so glob metacharacters in the download path
    # (e.g. "Downloads [work]") are matched literally; only the file pattern
    # itself is interpreted as a glob.
    escaped_source = glob.escape(str(source_dir))
    files_to_move = []
    for pattern in patterns:
        files_to_move.extend(glob.glob(os.path.join(escaped_source, pattern)))

    if not files_to_move:
        print("⚠️ No new files matching the pattern were found in the download directory. Skipping move.")
        return

    # 4. Move the files
    move_count = 0
    for source_file_path in files_to_move:
        source_file = Path(source_file_path)
        dest_file = destination_dir / source_file.name

        # Never overwrite: an existing destination file may have been edited
        # by hand. Skipped silently to avoid noisy output on re-runs.
        if dest_file.exists():
            continue

        try:
            # str() keeps shutil.move working on Python versions that predate
            # full path-like support in that function.
            shutil.move(str(source_file), str(dest_file))
            print(f"  -> Moved: {source_file.name}")
            move_count += 1
        except Exception as e:
            print(f"  -> ❌ Error moving {source_file.name}: {e}")

    print(f"✅ Collection complete. {move_count} new files moved to '{destination_dir}'.")
    
# --- Execute the function in the notebook ---
# Runs each time this cell executes; relies on `job` and BROWSER_DOWNLOAD_PATH
# having been set by the setup and configuration cells.
collect_semrush_downloads(job, BROWSER_DOWNLOAD_PATH)

In [None]:
import nltk
# Request the NLTK 'stopwords' and 'punkt' resources; quiet=True is passed to
# keep the downloader's console output out of the notebook.
# NOTE(review): no cell visible in this section uses them — presumably needed
# by later keyword-processing steps; confirm.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import itertools
from pathlib import Path
from IPython.display import display, Markdown

# NOTE: This cell relies on `job` (e.g., "gapalyzer-01") and `pip` from the setup cell.

# --- Per-job folder that holds the collected SEMRush exports ---
semrush_gap_analysis_dir = Path("downloads") / job

# --- Filename patterns for the two supported export formats ---
file_patterns = [
    "*-organic.Positions*.xlsx",
    "*-organic.Positions*.csv"
]

# sorted() consumes the chained glob generators directly and returns a list.
all_downloaded_files = sorted(itertools.chain.from_iterable(
    semrush_gap_analysis_dir.glob(pattern) for pattern in file_patterns
))


def _competitor_label(file_name):
    """Return the text before the first '-organic.' (stripped), or the whole name if that marker is absent."""
    prefix, marker, _rest = file_name.partition("-organic.")
    return prefix.strip() if marker else file_name


# --- Display Results ---
if not all_downloaded_files:
    display(Markdown(f"⚠️ **Warning:** No SEMRush files found in `{semrush_gap_analysis_dir}/`.\n(Looking for `*-organic.Positions*.xlsx` or `*.csv`)"))
else:
    # Heading and summary line first, then one numbered entry per file.
    markdown_output = [
        "## 💾 Found Downloaded Files",
        f"✅ **{len(all_downloaded_files)} files** ready for processing in `{semrush_gap_analysis_dir}/`\n",
    ]
    markdown_output.extend(
        f"{position}. **`{_competitor_label(found.name)}`** ({found.suffix.upper()})"
        for position, found in enumerate(all_downloaded_files, start=1)
    )
    display(Markdown("\n".join(markdown_output)))

    # Path objects are converted to plain strings before being stored, so the
    # value handed to Pipulate is JSON-serializable.
    all_downloaded_files_as_str = list(map(str, all_downloaded_files))

    # Store the file list in the Pipulate pipeline for the next step.
    pip.set(job, 'semrush_files', all_downloaded_files_as_str)

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import pandas as pd
from tldextract import extract
import itertools
from pathlib import Path
from IPython.display import display

# --- SUPPORT FUNCTION (1-to-1 Transplant) ---
# NOTE: This function requires 'tldextract' to be installed (which you've handled).
def extract_registered_domain(url):
    """
    Extracts the registered domain (domain.suffix) from a URL/hostname.

    Args:
        url (str): A URL or bare hostname, passed to tldextract's `extract`.

    Returns:
        str: "domain.suffix" when both parts are present. If either part is
        empty (e.g. "localhost" or an IP address has no public suffix), only
        the non-empty part is returned, so the result never carries a stray
        leading or trailing dot.
    """
    extracted = extract(url)
    # Join only the parts that exist; a plain f"{domain}.{suffix}" would yield
    # "localhost." for suffix-less hosts and never compare equal to the host.
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

# --- MAIN LOGIC ---
# Loads every SEMRush export found in downloads/{job}/, labels each row with
# the domain its file belongs to, splits URLs into client vs. competitor
# columns, stacks everything into one DataFrame (`df2`) and stores it in the
# Pipulate pipeline as JSON.
#
# Names read from earlier cells: `job`, `pip`, and (optionally) `client_domain`.

semrush_gap_analysis_dir = Path("downloads") / job

# The client's registered domain is the key for separating client rows from
# competitor rows. `client_domain` is optional; notebook cells run at module
# scope, so globals() is the namespace to check.
if 'client_domain' in globals():
    semrush_lookup = extract_registered_domain(client_domain)
else:
    # Placeholder fallback: unless the client really is this domain, every
    # row will be classified as a competitor, so say so out loud.
    semrush_lookup = "nixos.org"
    print(f"⚠️ 'client_domain' is not defined; using '{semrush_lookup}' as the client domain.")


print("Creating a great big DataFrame...")

# 1. Find both .xlsx and .csv exports; sorted() consumes the chained globs.
file_patterns = ["*-organic.Positions*.xlsx", "*-organic.Positions*.csv"]
all_semrush_files = sorted(itertools.chain.from_iterable(
    semrush_gap_analysis_dir.glob(pattern) for pattern in file_patterns
))

# cdict maps registered domain -> label parsed from the file name.
cdict = {}
list_of_dfs = []
print("Loading SEMRush files: ", end="")

# 2. Loop through all found files
for j, data_file in enumerate(all_semrush_files):
    # Pick the reader that matches the file type.
    is_excel = data_file.suffix.lower() == '.xlsx'
    read_func = pd.read_excel if is_excel else pd.read_csv

    # The label is everything before "-organic" in the file name, with "_"
    # turned back into "/" (and "///" into "://") and surrounding dots removed.
    nend = data_file.stem.index("-organic")
    xlabel = data_file.stem[:nend].replace("_", "/").replace("///", "://").strip('.')

    just_domain = extract_registered_domain(xlabel)
    cdict[just_domain] = xlabel

    # Load data
    df = read_func(data_file)

    # Every row of one file shares the same label (a bare registered domain,
    # or the fuller sub-domain/path label when the file name carries one).
    df["Domain"] = xlabel

    # The client/competitor decision is constant for the whole file, so assign
    # whole columns instead of testing row by row. Besides being faster, this
    # also works for an export with zero rows, where a row-wise apply() hands
    # back a DataFrame and the column assignment fails.
    is_client = xlabel == semrush_lookup
    df["Client URL"] = df["URL"] if is_client else None
    df["Competitor URL"] = None if is_client else df["URL"]

    list_of_dfs.append(df)
    print(f"{j + 1} ", end="", flush=True)

print() # Newline after the loading count

if list_of_dfs:
    # Stack the per-file frames; ignore_index gives the result one unique
    # 0..n-1 index instead of repeating each file's own row numbers.
    df2 = pd.concat(list_of_dfs, ignore_index=True)

    # Values gathered for later Excel layout: the longest label, scaled by 7.
    max_length = max(len(value) for value in cdict.values())
    row1_height = max_length * 7 # Not used in this cell; kept for later formatting steps

    rows, columns = df2.shape
    print()
    print(f"Rows: {rows:,}")
    print(f"Cols: {columns:,}")
    print()

    # Row count per domain label, as a quick sanity check of what was loaded.
    display(df2["Domain"].value_counts())

    # Store the result in the pipeline
    pip.set(job, 'semrush_master_df_json', df2.to_json(orient='records'))

else:
    print("Please put the CSVs in place.")

# Todo
- Move every file that matches the SEMrush `.csv` or `.xlsx` filename pattern from the browser's downloads folder to a location relative to the notebook
- Make it work with either Excel or CSV files