In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pipulate import pip
import nest_asyncio
nest_asyncio.apply()
import keys

job = "gapalyzer-01" # Give your session a unique name

In [None]:
# --- ⚙️ Workflow Configuration ---
# Tunable constants for the whole notebook. The prints below only echo the chosen values.
ROW_LIMIT = 3000  # Row limit for the final report; kept low for fast iteration
COMPETITOR_LIMIT = 3  # Max number of competitors to process; a falsy value (0/None) means "all competitors"
BROWSER_DOWNLOAD_PATH = "~/Downloads"  # The default directory where your browser downloads files ("~" is expanded later)

print(f"✅ Configuration set: Final report will be limited to {ROW_LIMIT} rows.")
# Truthiness check: any non-zero limit reports a cap, otherwise all competitors are processed.
if COMPETITOR_LIMIT:
    print(f"✅ Configuration set: Processing will be limited to the top {COMPETITOR_LIMIT} competitors.")
else:
    print(f"✅ Configuration set: Processing all competitors.")

# Here are the Keys

In [None]:

# Hand the Google key from the local `keys` module to Pipulate for this job.
# NOTE(review): what pip.api_key does with it is defined in pipulate, not shown here.
pip.api_key(job, key=keys.google)
# Botify token from the local `keys` module; no cell visible in this section reads it yet.
botify_token = keys.botify


## Here are your Foes

### Save all of These

In [None]:
import nbformat
from pathlib import Path

def get_competitors_from_notebook(notebook_filename="GAPalyzer.ipynb"):
    """Return the domain list found in the notebook cell tagged 'url-list-input'.

    The notebook file is parsed with nbformat (v4). The first cell carrying the
    tag is used: blank lines and lines starting with '#' are ignored, and any
    trailing '# comment' on a line is cut off.

    Args:
        notebook_filename: Path of the notebook to read.

    Returns:
        list[str]: One entry per usable line. An empty list is returned (after
        printing a message) when no tagged cell exists or when reading fails.
    """
    try:
        with open(Path(notebook_filename), 'r', encoding='utf-8') as handle:
            notebook = nbformat.read(handle, as_version=4)

        # Lazily scan for the first tagged cell; None means nothing matched.
        tagged_cell = next(
            (c for c in notebook.cells if "url-list-input" in c.metadata.get("tags", [])),
            None,
        )
        if tagged_cell is None:
            print("⚠️ Warning: Could not find a cell tagged with 'url-list-input'.")
            return []

        collected = []
        for raw_line in tagged_cell.source.splitlines():
            trimmed = raw_line.strip()
            if not trimmed or trimmed.startswith('#'):
                continue  # blank line or full-line comment
            collected.append(raw_line.split('#')[0].strip())
        return collected
    except Exception as e:
        print(f"❌ Error reading domains from notebook: {e}")
        return []

# --- Main Logic ---
# Print one SEMrush "organic positions" link per competitor domain listed in the notebook.
print("🚀 Generating SEMrush URLs for GAP analysis...")

url_template = "https://www.semrush.com/analytics/organic/positions/?db=us&q={domain}&searchType=domain"
domains = get_competitors_from_notebook()

if domains:
    print(f"✅ Found {len(domains)} competitor domains. Click the links below to open each report:")
    print("-" * 30)
    for i, domain in enumerate(domains):
        # Fill the domain into the report URL and print it as a numbered entry.
        full_url = url_template.format(domain=domain)
        print(f"{i+1}. {domain}:\n   {full_url}\n")
else:
    print("🛑 No domains found. Please add competitor domains to the 'url-list-input' cell and re-run.")

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import os
import shutil
from pathlib import Path
import glob

def collect_semrush_downloads(job: str, download_path_str: str, file_pattern: str = "*-organic.Positions*.xlsx"):
    """
    Moves downloaded SEMRush files matching a pattern from the user's download
    directory to a job-specific 'downloads/{job}/' folder, resolved relative to
    the current working directory.

    Both the given pattern and its '.csv' counterpart (every '.xlsx' in the
    pattern replaced by '.csv') are searched. A file is never overwritten: when
    a file with the same name already exists in the destination, the download is
    left where it is and reported in the summary.

    Args:
        job (str): The current job ID (e.g., "gapalyzer-01").
        download_path_str (str): The user's default browser download path (e.g., "~/Downloads").
        file_pattern (str): The glob pattern to match SEMRush files.

    Returns:
        None. Progress and results are reported via print().
    """
    print("📦 Starting collection of new SEMRush downloads...")

    # 1. Source (with "~" expanded) and destination (relative to the working directory).
    source_dir = Path(download_path_str).expanduser()
    destination_dir = Path("downloads") / job

    # 2. Create the destination directory if it doesn't exist
    destination_dir.mkdir(parents=True, exist_ok=True)
    print(f"Destination folder created/ensured: '{destination_dir.resolve()}'")

    # 3. Collect candidates for the .xlsx pattern and its .csv twin.
    # dict.fromkeys de-duplicates the patterns (they are identical when the
    # pattern contains no ".xlsx"); the set de-duplicates the matched paths.
    patterns = list(dict.fromkeys([file_pattern, file_pattern.replace(".xlsx", ".csv")]))
    files_to_move = sorted(
        {Path(match) for pattern in patterns for match in glob.glob(str(source_dir / pattern))}
    )

    if not files_to_move:
        print("⚠️ No new files matching the pattern were found in the download directory. Skipping move.")
        return

    # 4. Move the files, never overwriting something already collected.
    move_count = 0
    skipped_names = []
    for source_file in files_to_move:
        dest_file = destination_dir / source_file.name

        if dest_file.exists():
            # Protects files the user may have edited after an earlier collection.
            skipped_names.append(source_file.name)
            continue

        try:
            shutil.move(str(source_file), str(dest_file))
            print(f"  -> Moved: {source_file.name}")
            move_count += 1
        except Exception as e:
            # Best effort: one failing file must not stop the remaining moves.
            print(f"  -> ❌ Error moving {source_file.name}: {e}")

    print(f"✅ Collection complete. {move_count} new files moved to '{destination_dir}'.")
    if skipped_names:
        # Make the skip visible instead of silently leaving files behind.
        print(
            f"ℹ️ {len(skipped_names)} file(s) left in '{source_dir}' because they already exist "
            f"in the destination: {', '.join(skipped_names)}"
        )

# --- Execute the function in the notebook ---
# Uses the `job` and `BROWSER_DOWNLOAD_PATH` values assigned in the earlier setup cells.
collect_semrush_downloads(job, BROWSER_DOWNLOAD_PATH)

In [None]:
import nltk
# Fetch the NLTK 'stopwords' and 'punkt' resources up front.
# NOTE(review): no cell visible in this section uses them — presumably a later step does; confirm.
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import itertools
from pathlib import Path
from IPython.display import display, Markdown

# NOTE: `job` must already be defined by an earlier cell (e.g., "gapalyzer-01").

# Job-specific folder that holds the collected SEMRush exports.
semrush_gap_analysis_dir = Path("downloads") / job

# Filename patterns for the two export formats that are accepted.
file_patterns = ["*-organic.Positions*.xlsx", "*-organic.Positions*.csv"]

# Glob once per pattern, flatten the results, and sort them for a stable listing.
all_downloaded_files = sorted(
    itertools.chain.from_iterable(map(semrush_gap_analysis_dir.glob, file_patterns))
)

if not all_downloaded_files:
    display(Markdown(f"⚠️ **Warning:** No SEMRush files found in `{semrush_gap_analysis_dir}/`.\n(Looking for `*-organic.Positions*.xlsx` or `*.csv`)"))
else:
    # Build a small Markdown report: a heading, a summary line, then one numbered entry per file.
    markdown_output = [
        "## 💾 Found Downloaded Files",
        f"✅ **{len(all_downloaded_files)} files** ready for processing in `{semrush_gap_analysis_dir}/`\n",
    ]

    for i, file in enumerate(all_downloaded_files):
        # The competitor's domain is whatever precedes "-organic." in the file name;
        # fall back to the whole name if the marker is missing.
        cut = file.name.find("-organic.")
        domain_name = file.name if cut == -1 else file.name[:cut].strip()
        markdown_output.append(f"{i + 1}. **`{domain_name}`** ({file.suffix.upper()})")

    display(Markdown("\n".join(markdown_output)))

    # Path objects are not JSON-serializable, so the pipeline receives plain strings.
    all_downloaded_files_as_str = list(map(str, all_downloaded_files))

    # Remember the file list in the Pipulate pipeline for the next step.
    pip.set(job, 'semrush_files', all_downloaded_files_as_str)

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import pandas as pd
from tldextract import extract
import itertools
from pathlib import Path
from IPython.display import display

# --- SUPPORT FUNCTION (1-to-1 Transplant) ---
# NOTE: This function requires 'tldextract' to be installed (which you've handled).
def extract_registered_domain(url):
    """
    Extracts the registered domain (domain.suffix) from a URL/hostname.

    Uses tldextract's `extract` and joins its `domain` and `suffix` parts.
    Empty parts are skipped, so a host without a public suffix (e.g.
    "localhost") yields "localhost" rather than "localhost." with a dangling dot.

    Args:
        url: URL or hostname string.

    Returns:
        str: "domain.suffix", or just the non-empty part when one is missing.
    """
    extracted = extract(url)
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

# --- MAIN LOGIC ADAPTATION ---
# Loads every collected SEMRush export for this job, labels each row with the
# domain taken from its file name, and stacks everything into one DataFrame (df2).
#
# Required from earlier cells: job. Optional: client_domain.

# Folder holding this job's SEMRush exports.
semrush_gap_analysis_dir = Path("downloads") / job

# Registered domain of the client; rows whose "Domain" equals it are treated as
# client rows, everything else as competitor rows.
# Falls back to "nixos.org" when `client_domain` has not been defined.
semrush_lookup = extract_registered_domain(client_domain) if 'client_domain' in locals() else "nixos.org"


print("Creating a great big DataFrame...")

# 1. Find both .xlsx and .csv exports, sorted for a deterministic load order.
file_patterns = ["*-organic.Positions*.xlsx", "*-organic.Positions*.csv"]
all_semrush_files = sorted(itertools.chain.from_iterable(
    semrush_gap_analysis_dir.glob(pattern) for pattern in file_patterns
))

# cdict maps registered domain -> label derived from the file name.
cdict = {}
list_of_dfs = []
print("Loading SEMRush files: ", end="")

# 2. Loop through all found files
for j, data_file in enumerate(all_semrush_files):
    # Pick the reader that matches the file type.
    is_excel = data_file.suffix.lower() == '.xlsx'
    read_func = pd.read_excel if is_excel else pd.read_csv

    # The label is the part of the file name before "-organic"; "_" stands in for
    # "/" in file names, so "___" is turned back into "://".
    nend = data_file.stem.index("-organic")
    xlabel = data_file.stem[:nend].replace("_", "/").replace("///", "://").strip('.')

    just_domain = extract_registered_domain(xlabel)
    cdict[just_domain] = xlabel

    df = read_func(data_file)

    # The original if/else assigned `xlabel` on both branches (when the label
    # equals the registered domain the two values are the same string).
    df["Domain"] = xlabel

    # "Domain" is constant within a file, so client-vs-competitor is decided once
    # per file instead of once per row via DataFrame.apply(axis=1). This is
    # equivalent for non-empty files and also works for empty ones.
    # NOTE: the comparison uses the full label, so a client file named after a
    # subdomain (e.g. "www.example.com") is NOT recognised as the client.
    is_client_file = xlabel == semrush_lookup
    df["Client URL"] = df["URL"] if is_client_file else None
    df["Competitor URL"] = None if is_client_file else df["URL"]

    list_of_dfs.append(df)
    print(f"{j + 1} ", end="", flush=True)

print() # Newline after the loading count

if list_of_dfs:
    # Stack all files; ignore_index gives df2 a unique 0..n-1 index instead of
    # repeating each file's own row numbers.
    df2 = pd.concat(list_of_dfs, ignore_index=True)

    # Ported values for a later Excel layout step (row1_height is unused for now).
    # The original's bf.open_dir_widget / project_customizations / proceed() logic
    # is intentionally omitted here because those names are not defined in this notebook.
    max_length = max(len(value) for value in cdict.values())
    row1_height = max_length * 7

    rows, columns = df2.shape
    print()
    print(f"Rows: {rows:,}")
    print(f"Cols: {columns:,}")
    print()

    # Rows loaded per domain, as a quick sanity check.
    display(df2["Domain"].value_counts())

    # Store the result in the pipeline
    pip.set(job, 'semrush_master_df_json', df2.to_json(orient='records'))

else:
    print("Please put the CSVs in place.")

# Todo
- Move every file that matches the SEMrush `.csv` or `.xlsx` filename pattern from the browser's download folder to a location relative to the notebook
- Make the workflow accept either Excel or CSV files

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import pandas as pd
from pathlib import Path
from IPython.display import display
from collections import defaultdict # Needed if cdict is modified to be a defaultdict

# --- PATH DEFINITIONS ---
# Requires from earlier cells: job, df2 (master DataFrame), cdict, semrush_lookup.
competitors_csv_file = Path("data") / f"{job}_competitors.csv"
# to_csv does not create missing folders, so make sure "data/" exists first.
competitors_csv_file.parent.mkdir(parents=True, exist_ok=True)

# --- ADAPTED PIVOTING LOGIC ---

print("Pivoting data. Keyword count per competitor...\n")

# One row per keyword, one column per domain, cell = best (lowest) position.
pivot_df = df2.pivot_table(index='Keyword', columns='Domain', values='Position', aggfunc='min')

# Move the client column to the front (port of bf.move_column_to_front).
if semrush_lookup in pivot_df.columns:
    cols = [semrush_lookup] + [col for col in pivot_df.columns if col != semrush_lookup]
    pivot_df = pivot_df[cols]
else:
    print(f"⚠️ Warning: Client domain '{semrush_lookup}' not found in pivot table columns.")


# All domain columns (client included, when present).
competitors = list(pivot_df.columns)

# Number of competitors ranking for each keyword. Excluding the client by name
# (rather than "skip the first column") keeps the count right even when the
# client column is absent and the first column is a competitor.
competitor_columns = [col for col in competitors if col != semrush_lookup]
pivot_df['Competitors Positioning'] = pivot_df[competitor_columns].notna().sum(axis=1)

# Load the competitor table from disk if it exists, otherwise create it from cdict.
if competitors_csv_file.exists():
    df_competitors = pd.read_csv(competitors_csv_file)
    # Empty CSV fields are read as NaN; normalise them to ''.
    df_competitors['Title'] = df_competitors['Title'].fillna('')
    df_competitors['Matched Title'] = df_competitors['Matched Title'].fillna('')
    print(f"✅ Loaded {len(df_competitors)} existing competitor records.")
else:
    df_competitors = pd.DataFrame(list(cdict.items()), columns=['Domain', 'Column Label'])

    # Initialize 'Title' and 'Matched Title' columns explicitly
    df_competitors['Title'] = ''
    df_competitors['Matched Title'] = ''

    df_competitors.to_csv(competitors_csv_file, index=False)
    print(f"✅ Created new competitor file at '{competitors_csv_file}'.")

# Print keyword counts per column (for visual confirmation).
counts = pivot_df.describe().loc['count']
if not counts.empty:
    # Column widths for an aligned listing.
    max_digits = len(str(len(counts)))
    max_index_width = max(len(str(index)) for index in counts.index)

    # Only non-NaN counts take part in the width calculation; fall back to 0 if all are NaN.
    valid_counts = [count for count in counts if pd.notna(count)]
    max_count_width = max([len(f"{int(count):,}") for count in valid_counts] or [0])

    for i, (index, count) in enumerate(counts.items(), start=1):
        counter_str = str(i).zfill(max_digits)
        count_str = f"{int(count):,}" if pd.notna(count) else 'NaN'
        print(f"{counter_str}: {index:<{max_index_width}} - {count_str:>{max_count_width}}")
else:
    print("❌ No data to count after pivot table creation.")

# Rows and columns summary
rows, columns = df2.shape
rows2, columns2 = pivot_df.shape
print("\nThere is some natural deduping from pivot.\n")
print(f"Rows (master df): {rows:,}")
print(f"Rows (pivot df): {rows2:,} ({rows:,} - {rows2:,} = {rows - rows2:,} dupes removed.)")
print(f"Cols: {columns2:,}") # Use columns2 for the pivot_df column count

display(pivot_df)

# Store the results in the pipeline. orient='records' leaves out the index, so the
# Keyword index is turned into a column first — otherwise the stored rows would
# carry positions without the keyword they belong to.
pip.set(job, 'keyword_pivot_df', pivot_df.reset_index().to_json(orient='records'))
pip.set(job, 'competitors_df', df_competitors.to_json(orient='records'))

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import pandas as pd
import nest_asyncio
import asyncio
from pathlib import Path
from tldextract import extract
import wordninja
import httpx
import re
from collections import defaultdict # Already imported in a previous cell

# NOTE: This cell assumes 'job', 'semrush_lookup', 'df_competitors', 
#       and 'competitors_csv_file' are defined in prior cells.
# We also assume 'df_competitors' was loaded from 'competitors_csv_file' in the previous step.

# --- PATH DEFINITION FOR FILTER FILE ---
# Consolidating working files to the 'data' directory.
filter_file = Path("data") / f"{job}_filter_keywords.csv"


# --- REQUIRED SUPPORT FUNCTIONS (Surgically Ported from botifython.py) ---

# Request headers for the httpx calls below. These were module-level globals in
# botifython.py, so they are redefined here.
# The User-Agent is a desktop Chrome-on-macOS browser string.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'
headers = {'User-Agent': user_agent}

def extract_registered_domain(url):
    """Extracts the registered domain (domain.suffix) from a URL/hostname.

    Uses tldextract's `extract` and joins its `domain` and `suffix` parts.
    Empty parts are skipped, so a host without a public suffix (e.g.
    "localhost") yields "localhost" rather than "localhost." with a dangling dot.

    Args:
        url: URL or hostname string.

    Returns:
        str: "domain.suffix", or just the non-empty part when one is missing.
    """
    extracted = extract(url)
    return ".".join(part for part in (extracted.domain, extracted.suffix) if part)

def get_title_from_html(html_content):
    """Return the text of the first <title> element in `html_content`, or '' if there is none.

    The HTML is parsed with BeautifulSoup using the built-in 'html.parser'.
    """
    from bs4 import BeautifulSoup

    title_tag = BeautifulSoup(html_content, 'html.parser').find('title')
    if not title_tag:
        return ''
    return title_tag.text

def match_domain_in_title(domain, title):
    """Finds the domain's first label inside the title, allowing whitespace between letters.

    Example: domain "barenecessities.com" matches "Bare Necessities" in a title.
    The comparison is case-insensitive and the match is returned exactly as it is
    written in the title (surrounding whitespace stripped).

    Args:
        domain: Domain name; only the part before the first '.' is used.
        title: Page title to search. None, non-strings and '' yield ''.

    Returns:
        str: The matched text from the title, or '' when there is no match.
    """
    if not isinstance(title, str) or not title:
        return ''

    base_domain = domain.split('.')[0]
    if not base_domain:
        return ''

    # Every character may be followed by optional whitespace. re.escape keeps
    # characters that are regex operators (e.g. '+') literal instead of letting
    # them alter or break the pattern.
    pattern = ''.join(re.escape(c) + r'\s*' for c in base_domain)
    match = re.search(pattern, title, re.IGNORECASE)
    return match.group(0).strip() if match else ''

async def async_check_url(url, domain, timeout):
    """Fetch `url` (following redirects) and pull the page title from the response.

    Args:
        url: Address to request.
        domain: Domain passed on to match_domain_in_title for the title match.
        timeout: Timeout handed to httpx.AsyncClient.

    Returns:
        tuple: (final_url, title, matched_title, True) on an HTTP 200 response;
        (url, None, None, False) for any other status or when the request raises.
        Failures are printed, never raised.
    """
    failure = (url, None, None, False)
    try:
        async with httpx.AsyncClient(follow_redirects=True, headers=headers, timeout=timeout) as client:
            response = await client.get(url)

            if response.status_code != 200:
                print(f"Status Code {response.status_code} for {url}")
                return failure

            final_url = str(response.url)
            if final_url != url:
                print(f"Redirected to {response.url} for {url}")

            title = get_title_from_html(response.text)
            return final_url, title, match_domain_in_title(domain, title), True
    except httpx.RequestError as e:
        print(f"Request failed for {url}: {str(e)}")
    except Exception as e:
        print(f"An unexpected error occurred for {url}: {str(e)}")
    return failure

def test_domains(domains, timeout=120):
    """Orchestrates async checks for a list of domains."""
    print(f"Giving up to {timeout} seconds to visit all sites...")
    tasks = [async_check_url(f'https://{domain}', domain, timeout) for domain in domains]
    results = asyncio.run(async_test_domains(domains, tasks))
    
    domain_results = {}
    for domain, result in zip(domains, results):
        # Handle exceptions gracefully as in the original bf.test_domains (part of the transplant)
        if isinstance(result, Exception):
            domain_results[domain] = {'url': None, 'title': None, 'matched_title': None}
        else:
            domain_results[domain] = {'url': result[0], 'title': result[1], 'matched_title': result[2]}
    return domain_results

async def async_test_domains(domains, tasks):
    """Await all `tasks` together and return their results in order.

    Exceptions raised by individual tasks are returned in the result list
    instead of being raised. `domains` is accepted but not used here.
    """
    gathered = await asyncio.gather(*tasks, return_exceptions=True)
    return gathered

def split_domain_name(domain):
    """Break a run-together domain name into space-separated words using wordninja."""
    return ' '.join(wordninja.split(domain))

# --- MAIN WORKFLOW LOGIC ---
# Requires from earlier cells: job, df_competitors, competitors_csv_file.

print("Visiting competitor homepages for title tags for filters...\n")

# Original logic required to run async in Jupyter
nest_asyncio.apply()

# Normalise both title columns up front: missing values become '' and matched
# titles are lowercased. Guarding against NaN keeps the .str accessor and the
# string handling further down from failing on missing values.
df_competitors['Title'] = df_competitors['Title'].fillna('')
df_competitors['Matched Title'] = df_competitors['Matched Title'].fillna('').astype(str).str.lower()

# Domains that still have no title.
needs_titles = df_competitors[df_competitors['Title'] == ''].copy()

if not needs_titles.empty:
    # 1. Scrape Titles
    print(f"Gathering Titles for {len(needs_titles)} domains.")
    results = test_domains(needs_titles['Domain'].tolist())

    # 2. One row per visited domain; failed lookups are stored as ''.
    new_data_df = pd.DataFrame({
        'Domain': list(results.keys()),
        'Title': [info['title'] if info['title'] else '' for info in results.values()],
        'Matched Title': [info['matched_title'] if info['matched_title'] else '' for info in results.values()],
    })

    # combine_first: values from the fresh scrape win, all other rows/columns are
    # kept from the existing table. Done without inplace=True so a failure in the
    # middle cannot leave df_competitors stuck with 'Domain' as its index, which
    # would break a re-run of this cell.
    df_competitors = (
        new_data_df.set_index('Domain')
        .combine_first(df_competitors.set_index('Domain'))
        .reset_index()
    )

    # Lowercase and persist the updated data
    df_competitors['Matched Title'] = df_competitors['Matched Title'].fillna('').astype(str).str.lower()
    competitors_csv_file.parent.mkdir(parents=True, exist_ok=True)
    df_competitors.to_csv(competitors_csv_file, index=False)
    print(f"✅ Updated competitor titles and saved to '{competitors_csv_file}'.")


# --- Create Keyword Filters ---

# Remove '.com' from both lists to create more generic keyword filters
extracted_domains = [extract_registered_domain(domain).replace('.com', '') for domain in df_competitors['Domain']]
matched_titles = [
    title.replace('.com', '')
    for title in df_competitors['Matched Title'].fillna('').astype(str).tolist()
    if title
]

# Split domain names using wordninja (e.g., 'barenecessities' -> 'bare necessities')
split_domains = [split_domain_name(domain) for domain in extracted_domains]

# Combine all lists, strip whitespace, and deduplicate
combined_list = [x.strip() for x in extracted_domains + matched_titles + split_domains if x]
combined_list = sorted(set(combined_list))

# Persist to an external filter file only once, so later user edits are never overwritten.
if not filter_file.exists():
    filter_file.parent.mkdir(parents=True, exist_ok=True)  # to_csv does not create folders
    df_filter = pd.DataFrame(combined_list, columns=['Filter'])
    df_filter.to_csv(filter_file, index=False)
    print(f"✅ Created initial keyword filter file at '{filter_file}' for user editing.")
else:
    print(f"☑️ Keyword filter file already exists at '{filter_file}'. Skipping creation.")

# Store the final competitors DF in the pipeline
pip.set(job, 'competitors_df', df_competitors.to_json(orient='records'))

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import pandas as pd
from IPython.display import display

# NOTE: This cell requires 'df2' (the stacked SEMRush data) and 'job' from earlier cells.

print("Adjusting SEMRush columns that were not part of competitor-columns pivot...")

# Aggregating function per metric, producing one summary row per keyword:
# - 'min' for Position: the best rank across all domains that rank.
# - 'max' for Search Volume/Number of Results/Timestamp: the highest value reported.
# - 'sum' for Traffic/Traffic Cost: the total across all rows.
# - 'mean' for Difficulty/CPC/Competition/Traffic (%): the average across all rows.
# - 'first' for the remaining columns (URLs, Intents, SERP Features, ...): the first value found.
agg_funcs = {
    'Position': 'min',
    'Search Volume': 'max',
    'CPC': 'mean',
    'Traffic': 'sum',
    'Traffic (%)': 'mean',
    'Traffic Cost': 'sum',
    'Keyword Difficulty': 'mean',
    'Previous position': 'first',
    'Competition': 'mean',
    'Number of Results': 'max',
    'Timestamp': 'max',
    'SERP Features by Keyword': 'first',
    'Keyword Intents': 'first',
    'Position Type': 'first',
    'URL': 'first',
    'Competitor URL': 'first',
    'Client URL': 'first'
}

# agg() raises KeyError for any column that is not in the frame, so only the
# columns actually present in the exports are aggregated; the rest are reported.
missing_columns = [col for col in agg_funcs if col not in df2.columns]
if missing_columns:
    print(f"⚠️ Warning: Columns missing from the SEMRush data, skipped: {', '.join(missing_columns)}")
available_agg_funcs = {col: func for col, func in agg_funcs.items() if col in df2.columns}

# Apply the aggregation across the combined dataset (df2)
agg_df = df2.groupby('Keyword').agg(available_agg_funcs).reset_index()

# Derived metric: number of whitespace-separated words in the keyword.
# astype(str) covers keywords that were read as numbers (e.g. "2024" from Excel).
agg_df['Number of Words'] = agg_df["Keyword"].astype(str).str.split().str.len()

# 'Position' is redundant here: per-domain positions already live in pivot_df.
agg_df = agg_df.drop(columns=['Position'], errors='ignore')

print("Table of aggregates prepared.")

display(agg_df)

# Store the aggregated metrics in the pipeline
pip.set(job, 'keyword_aggregate_df_json', agg_df.to_json(orient='records'))

In [None]:
# %% editable=true slideshow={"slide_type": ""}
import pandas as pd
from pathlib import Path
from IPython.display import display

# NOTE: This cell relies on 'job', 'pivot_df', and 'agg_df' from earlier cells.

# --- PATH DEFINITION ---
# Same path as in the title-scraping cell; re-declared so this cell does not
# depend on that cell's `filter_file` variable.
filter_file = Path("data") / f"{job}_filter_keywords.csv"

# --- REQUIRED SUPPORT FUNCTION (Surgically Ported from botifython.py) ---

def reorder_columns_surgical(df, priority_column, after_column):
    """
    Surgical port of bf.reorder_columns: Moves a column immediately after a specified column.

    Args:
        df: DataFrame whose columns should be rearranged.
        priority_column: Name of the column to move.
        after_column: Name of the column that `priority_column` should follow.

    Returns:
        A DataFrame with the new column order. `df` itself is returned unchanged
        when either column is missing (a warning is printed) or when both names
        are the same column.
    """
    if priority_column not in df.columns:
        print(f"⚠️ Warning: Priority column '{priority_column}' not found for reorder.")
        return df
    if after_column not in df.columns:
        print(f"⚠️ Warning: After column '{after_column}' not found for reorder.")
        return df
    if priority_column == after_column:
        # A column cannot follow itself; without this guard the anchor lookup
        # below raises ValueError because the anchor has just been removed.
        return df

    # Take the column out, then put it back right behind the anchor.
    columns = [col for col in df.columns if col != priority_column]
    columns.insert(columns.index(after_column) + 1, priority_column)
    return df[columns]


import re  # re.escape is used below; imported here so this cell does not rely on another cell's import

print("Merging Pivot Data with Aggregate Data...")

# 1. Join the per-domain positions (pivot_df, indexed by Keyword) with the
#    per-keyword aggregates (agg_df) on the Keyword column.
pivotmerge_df = pd.merge(pivot_df.reset_index(), agg_df, on='Keyword', how='left')

print("Pivot and Aggregate Data Joined.\n")
rows, columns = pivotmerge_df.shape
print(f"Rows: {rows:,}")
print(f"Cols: {columns:,}")

# --- FILTERING LOGIC ---

print("\nBrand and Negative Filters being applied...")
# 2. Optionally Filter Brand & Negative Keywords
if filter_file.exists():
    df_filter = pd.read_csv(filter_file, header=0)

    # Terms as stripped strings. Blank/whitespace-only terms are dropped: a lone
    # space, for instance, would otherwise match every multi-word keyword.
    kw_filter = [
        term
        for term in (str(f).strip() for f in df_filter["Filter"].dropna().tolist())
        if term
    ]

    if kw_filter:
        # Escape each term so it is matched literally, then OR them together.
        pattern = '|'.join(re.escape(keyword) for keyword in kw_filter)

        # Keep rows whose Keyword does NOT contain any filter term (case-insensitive).
        filtered_df = pivotmerge_df[~pivotmerge_df["Keyword"].str.contains(pattern, case=False, na=False)]
        print(f"✅ Filter applied using {len(kw_filter)} terms from '{filter_file}'.")
    else:
        filtered_df = pivotmerge_df
        print("⚠️ Filter file exists but contains no terms. Skipping filter application.")
else:
    filtered_df = pivotmerge_df
    print(f"☑️ No filter file found at '{filter_file}'. Skipping negative filtering.")

rows_filtered, columns_filtered = filtered_df.shape
print(f"Rows: {rows_filtered:,} ({rows:,} - {rows_filtered:,} = {rows - rows_filtered:,} rows removed)")


# --- REORDERING AND FINAL POLISH ---

# 3. Chained reorders; later calls build on the positions set by earlier ones.
temp_df = filtered_df.copy()

temp_df = reorder_columns_surgical(temp_df, "Search Volume", after_column="Keyword")
temp_df = reorder_columns_surgical(temp_df, "Number of Words", after_column="CPC")
temp_df = reorder_columns_surgical(temp_df, "CPC", after_column="Number of Words")
temp_df = reorder_columns_surgical(temp_df, "Number of Results", after_column="Position Type")
temp_df = reorder_columns_surgical(temp_df, "Timestamp", after_column="Number of Results")
temp_df = reorder_columns_surgical(temp_df, "Competitor URL", after_column="Client URL")

# 4. Final arrangement: Keyword and Search Volume lead, everything else keeps the
#    order produced above. (The original computed this same order three times.)
rest_of_columns = [col for col in temp_df.columns if col not in ['Keyword', 'Search Volume']]
new_column_order = ['Keyword', 'Search Volume'] + rest_of_columns
filtered_df = temp_df[new_column_order]

# Highest search volume first; drop metrics that are not wanted in the final table.
# errors="ignore" tolerates exports in which one of these columns is absent.
arranged_df = (
    filtered_df
    .sort_values(by='Search Volume', ascending=False)
    .drop(columns=["Previous position", "Traffic", "Traffic (%)", "Traffic Cost"], errors="ignore")
)

print("\nFinal Keyword Table Prepared.")

# Store the final result in the pipeline
pip.set(job, 'filtered_gap_analysis_df_json', arranged_df.to_json(orient='records'))

display(arranged_df)