## Setup Requirements for GSC API Access

Before running this script, you'll need to set up:

1. **Google Cloud Platform (GCP) Project**:
   - Create or use an existing GCP project at [console.cloud.google.com](https://console.cloud.google.com)
   - Enable the **Google Search Console API** in your project

2. **Service Account**:
   - Go to IAM & Admin > Service Accounts in GCP
   - Create a new service account with a descriptive name
   - Download the JSON key file and save it in your working folder as `service-account-key.json`
   - Copy the service account email address into a text file named `service_account.txt` in the same working folder

3. **GSC Permissions**:
   - Open Google Search Console for your site
   - Go to Settings > Users and Permissions
   - Add the service account email as a user with "Full" permissions
   - Wait a few minutes for permissions to propagate

This gives your script API access to your GSC data without requiring browser-based authentication each time it runs.

!pip install google-api-python-client

In [None]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
import json
import sys
import os


# Minimum position a query's best ranking must have to be kept by the
# striking-distance cell (it filters on position >= threshold); for example,
# 11 keeps queries ranking at position 11 or higher.
threshold = 31
# Folder that holds the service account files and the SEMrush export.
working_folder = "/home/mike/repos/pipulate/notebooks"  # Leave off the trailing slash
# CSV exported from SEMrush Keyword Overview; loaded later with pd.read_csv.
semrush_file = f"{working_folder}/semrush_bulk.csv"

def check_requirements(folder=None):
    """Verify that the working folder and the service account files exist.

    Args:
        folder: Directory to check. When omitted, the module-level
            ``working_folder`` is used.

    Raises:
        SystemExit: With exit code 1 when anything is missing, after the
            missing items have been printed.
    """
    if folder is None:
        folder = working_folder

    required_paths = [
        ("Working folder", folder),
        ("Service account email file", f'{folder}/service_account.txt'),
        ("Service account key file", f'{folder}/service-account-key.json'),
    ]
    missing_items = [
        f"{label}: {path}"
        for label, path in required_paths
        if not os.path.exists(path)
    ]

    # Report what is missing instead of unconditionally claiming success
    if missing_items:
        print("Error: The following required files or locations are missing:")
        for item in missing_items:
            print(f"  - {item}")
        print("\nPlease ensure all required files are in place and adjust the paths if needed.")
        sys.exit(1)

    print("✓ All required files and locations are present")

# Validate the environment before touching any files
check_requirements()

# The service account email lives in a one-line text file in the working folder
email_path = f'{working_folder}/service_account.txt'
with open(email_path, 'r') as email_file:
    service_account_email = email_file.read().strip()

In [None]:


# Service account key file (JSON) stored in the working folder
SERVICE_ACCOUNT_FILE = f'{working_folder}/service-account-key.json'

# OAuth scopes requested for the credentials
SCOPES = ['https://www.googleapis.com/auth/webmasters']

# Build credentials from the key file, then the API client on top of them
credentials = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)
webmasters_service = build('webmasters', 'v3', credentials=credentials)

# Ask the API which sites (properties) this service account can access
site_list = webmasters_service.sites().list().execute()
site_entries = site_list.get('siteEntry', [])

print("Sites accessible to this service account:")
for entry in site_entries:
    print(f"URL: {entry.get('siteUrl')}, Permission: {entry.get('permissionLevel')}")

print(f"\nTotal sites: {len(site_entries)}")
# Uncomment to show which account was used:
# print(f"Service account email: {service_account_email}")

In [None]:
# ======================================================================
# CELL: Find Most Recent GSC Data Date
# ======================================================================

import time
from datetime import datetime, timedelta

# Use the first site returned by the sites().list() call in the previous cell.
# The 'siteEntry' key may be missing when the account has no properties, so
# fail with an actionable message instead of a bare KeyError/IndexError.
available_sites = site_list.get('siteEntry', [])
if not available_sites:
    raise RuntimeError(
        "No Search Console sites are accessible to this service account. "
        "Add the service account email as a user under Settings > Users and "
        "Permissions in Google Search Console, wait a few minutes, and re-run."
    )
target_site = available_sites[0]['siteUrl']
print(f"Finding most recent data for site: {target_site}")

# Function to check if data exists for a specific date
def check_date_has_data(service, site_url, check_date):
    """Report whether a Search Analytics query returns any row for one date.

    Args:
        service: API client exposing ``searchanalytics().query(...)``.
        site_url: Site to query.
        check_date: Date object; formatted as YYYY-MM-DD for the request.

    Returns:
        True when the response contains at least one row, otherwise False.
    """
    day = check_date.strftime('%Y-%m-%d')

    # Minimal probe: a single-day window, one dimension, at most one row
    probe = dict(startDate=day, endDate=day, dimensions=['query'], rowLimit=1)

    analytics = service.searchanalytics()
    response = analytics.query(siteUrl=site_url, body=probe).execute()
    returned_rows = response.get('rows', [])
    return len(returned_rows) > 0

# Begin at yesterday and step back one day at a time
current_date = datetime.now().date() - timedelta(days=1)
max_days_to_check = 10  # Upper bound on how many days are probed
days_checked = 0

print(f"Starting with date: {current_date}")

# Probe day by day; the else branch runs only if no probed day had data
while days_checked < max_days_to_check:
    print(f"Checking if data exists for {current_date}...", end=" ")

    if not check_date_has_data(webmasters_service, target_site, current_date):
        print("✗ No data")
        current_date -= timedelta(days=1)
        days_checked += 1
        time.sleep(0.5)  # Short pause between API calls
        continue

    print("✓ Data found!")
    most_recent_data_date = current_date
    print(f"\nMost recent GSC data available is for: {most_recent_data_date}")
    break
else:
    print(f"Warning: Couldn't find data in the last {max_days_to_check} days")
    # current_date was decremented after the final probe, so adding one day
    # gives the oldest date that was actually checked
    most_recent_data_date = current_date + timedelta(days=1)

# Default range: starts 3 days before the most recent date found
default_end_date = most_recent_data_date
default_start_date = default_end_date - timedelta(days=3)

print("Recommended date range for full queries:")
print(f"  Start: {default_start_date}")
print(f"  End: {default_end_date}")

In [None]:
# ======================================================================
# CELL: Query GSC Data with 3-Day Date Range and Convert to DataFrame
# ======================================================================

import pandas as pd

# Lift every pandas display limit so printed frames are never truncated
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

# Start the query window 3 days before the end date
default_start_date = default_end_date - timedelta(days=3)

start_date_str = default_start_date.strftime('%Y-%m-%d')
end_date_str = default_end_date.strftime('%Y-%m-%d')

# Request body for the Search Analytics query, broken down by query and page
request = {
    'startDate': start_date_str,
    'endDate': end_date_str,
    'dimensions': ['query', 'page'],
    'rowLimit': 25000,  # Rows requested per page
    'startRow': 0,
}

print(f"Querying data for site: {target_site}")
print(f"Date range: {start_date_str} to {end_date_str}")

# ======================================================================
# FUNCTION: Fetch All GSC Data (with pagination)
# ======================================================================

def fetch_all_gsc_data(service, site_url, request, max_rows=100000):
    """
    Fetch rows from the Google Search Console API, following pagination.

    Pages are requested by advancing ``startRow`` until the API returns no
    rows or ``max_rows`` rows have been collected. The caller's ``request``
    dict is not modified; a copy is used for paging.

    Args:
        service: The GSC API service object
        site_url: The site URL to query
        request: The query request body
        max_rows: Maximum total rows to return (default: 100000)

    Returns:
        List of row data from GSC, containing at most ``max_rows`` rows
    """
    # Page on a copy so the caller's request keeps its original startRow
    page_request = dict(request)
    all_rows = []
    start_row = 0

    while len(all_rows) < max_rows:
        page_request['startRow'] = start_row
        response = service.searchanalytics().query(siteUrl=site_url, body=page_request).execute()

        current_rows = response.get('rows', [])
        if not current_rows:
            # An empty page means there is nothing left to fetch
            break

        # Advance by what the API returned, even if fewer rows are kept below
        start_row += len(current_rows)

        # Trim the final page so the max_rows limit is honoured exactly
        remaining = max_rows - len(all_rows)
        all_rows.extend(current_rows[:remaining])

        print(f"Fetched {len(all_rows)} rows so far...")

    print(f"Completed fetching {len(all_rows)} total rows")
    return all_rows


# Run the paginated query defined above
all_data = fetch_all_gsc_data(webmasters_service, target_site, request)

# Convert the API response to a DataFrame
df = pd.DataFrame(all_data)

if df.empty:
    # No rows came back: create the columns the rest of the notebook reads so
    # the statistics below report zeros instead of raising KeyError.
    print("Warning: GSC returned no rows for this site and date range")
    df = pd.DataFrame(columns=['clicks', 'impressions', 'ctr', 'position', 'query', 'page'])
elif 'keys' in df.columns:
    # 'keys' follows the request's dimension order: query first, then page
    df['query'] = df['keys'].apply(lambda x: x[0])
    df['page'] = df['keys'].apply(lambda x: x[1])
    df = df.drop('keys', axis=1)

# Ensure metrics are numeric types
for col in ['clicks', 'impressions', 'position', 'ctr']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col])

# Convert CTR to percentage for better readability
if 'ctr' in df.columns:
    df['ctr'] = df['ctr'] * 100

# Preview the DataFrame
print("\nDataFrame Preview (30 rows):")
print(df.head(30))

# Print basic stats about the data
print(f"\nTotal number of rows: {len(df)}")
print(f"Unique queries: {df['query'].nunique()}")
print(f"Unique pages: {df['page'].nunique()}")
print(f"Total clicks: {df['clicks'].sum()}")
print(f"Total impressions: {df['impressions'].sum()}")
print(f"Average position: {df['position'].mean():.2f}")
print(f"Average CTR: {df['ctr'].mean():.2f}%")

In [None]:
# ======================================================================
# CELL: Find Striking Distance Keywords
# ======================================================================

print(f"Finding striking distance keywords (position >= {threshold})...")

# Best (lowest) position each query reaches on any page
query_best_position = df.groupby('query')['position'].min().reset_index()

# Keep the queries whose best position is still at or beyond the threshold
at_or_beyond_threshold = query_best_position['position'] >= threshold
striking_distance_queries = query_best_position.loc[at_or_beyond_threshold, 'query'].tolist()
print(f"Found {len(striking_distance_queries)} queries with all positions >= {threshold}")

# All query/page rows belonging to those queries
striking_df = df.loc[df['query'].isin(striking_distance_queries)].copy()

# One row per query: total clicks and impressions, mean position across pages,
# ordered so the highest-impression queries come first
striking_agg = (
    striking_df
    .groupby('query')
    .agg({'clicks': 'sum', 'impressions': 'sum', 'position': 'mean'})
    .reset_index()
    .sort_values('impressions', ascending=False)
)

print("\nStriking Distance Keywords (30 rows):")
print(striking_agg.head(30))
print(f"\nTotal striking distance keywords: {len(striking_agg)}")

# Full keyword list in the same impressions-descending order
all_striking_keywords = striking_agg['query'].tolist()
print(f"Prepared {len(all_striking_keywords)} keywords for processing")

In [None]:
# ======================================================================
# CELL: Clean Keywords for SEMrush
# ======================================================================

# Set to True to use NLTK analysis, False to use only basic cleaning.
# analyze_keyword_complexity() returns "skipped_analysis" while this is False.
USE_NLTK_ANALYSIS = False  # Easy toggle

# If NLTK is enabled, try to import it; when the import fails the toggle is
# switched back off so the rest of the cell falls back to basic cleaning.
if USE_NLTK_ANALYSIS:
    try:
        import nltk
        from nltk.tokenize import word_tokenize
        # Uncomment to download necessary data (first time only)
        # nltk.download('punkt')
    except ImportError:
        print("NLTK not installed. Run: pip install nltk")
        USE_NLTK_ANALYSIS = False
        
# Basic cleaning function
def clean_keywords(keywords_list):
    """Normalise raw search queries into a keyword list.

    For each keyword: straight and curly double quotes are removed,
    whitespace is collapsed, characters other than letters, digits, spaces,
    '-' and '.' are dropped, and "a vs b" comparisons are reduced to the
    (lowercased) first term.

    Args:
        keywords_list: Iterable of keyword strings, in priority order.

    Returns:
        A ``(cleaned_keywords, too_complex)`` tuple. ``cleaned_keywords``
        keeps input order with duplicates removed; ``too_complex`` holds the
        original keywords that were rejected because they have more than
        5 words or contain nothing usable after cleaning.
    """
    quote_chars = '"\u201c\u201d'  # straight plus left/right curly double quotes
    allowed_punctuation = (' ', '-', '.')

    cleaned_keywords = []
    too_complex = []
    already_emitted = set()

    for keyword in keywords_list:
        # Remove quotes and normalize spacing
        cleaned = ''.join(c for c in keyword if c not in quote_chars)
        cleaned = ' '.join(cleaned.split())

        # Check if too long (more than 5 words might be problematic)
        if len(cleaned.split()) > 5:
            too_complex.append(keyword)
            continue

        # Drop special characters but keep the punctuation that is allowed
        cleaned = ''.join(c for c in cleaned if c.isalnum() or c in allowed_punctuation)

        # Remove "vs" comparisons as they often get flagged
        if " vs " in cleaned.lower():
            cleaned = cleaned.lower().split(" vs ")[0]  # Take just the first term

        # Stripping characters can leave doubled or edge spaces behind
        cleaned = ' '.join(cleaned.split())

        if not cleaned:
            # Nothing usable is left, so report it rather than emit a blank line
            too_complex.append(keyword)
            continue

        # Different raw queries can clean to the same keyword; keep the first
        if cleaned in already_emitted:
            continue
        already_emitted.add(cleaned)
        cleaned_keywords.append(cleaned)

    return cleaned_keywords, too_complex

# More advanced analysis using NLTK (if enabled)
def analyze_keyword_complexity(keyword):
    """Classify one keyword using NLTK tokenization.

    Returns "skipped_analysis" when USE_NLTK_ANALYSIS is off. Otherwise the
    first matching label is returned, checked in this order: "too_long"
    (more than 5 tokens), "special_chars" (any non-alphanumeric token),
    "comparison" (contains "vs" or "versus"), else "ok".
    """
    if USE_NLTK_ANALYSIS:
        words = word_tokenize(keyword.lower())

        if len(words) > 5:
            verdict = "too_long"
        elif not all(word.isalnum() for word in words):
            verdict = "special_chars"
        elif "vs" in words or "versus" in words:
            verdict = "comparison"
        else:
            verdict = "ok"
    else:
        verdict = "skipped_analysis"

    return verdict

# Clean the full striking-distance keyword list
print(f"Processing {len(all_striking_keywords)} keywords...")
cleaned_keywords, removed_keywords = clean_keywords(all_striking_keywords)

# Optional NLTK pass: record every cleaned keyword that is not rated "ok"
if USE_NLTK_ANALYSIS:
    verdicts = ((kw, analyze_keyword_complexity(kw)) for kw in cleaned_keywords)
    complexity_results = {kw: verdict for kw, verdict in verdicts if verdict != "ok"}

    print("\nKeyword complexity analysis:")
    for kw, verdict in complexity_results.items():
        print(f"- '{kw}': {verdict}")

# First 100 cleaned keywords, one per line
top_100_cleaned = cleaned_keywords[:100]
semrush_formatted = "\n".join(top_100_cleaned)

print(f"\nCleaned {len(all_striking_keywords)} keywords to {len(cleaned_keywords)}")
print(f"Removed {len(removed_keywords)} complex keywords")
print("Selected top 100 for SEMrush")

print("\nTop 100 keywords for SEMrush (copy/paste ready):")
print("Run the next cell to copy/paste the keywords into SEMrush")
# print(semrush_formatted)

# To keep a copy on disk instead, uncomment:
# with open('semrush_keywords.txt', 'w') as f:
#     f.write(semrush_formatted)

In [None]:
# Print the newline-separated keyword list built in the previous cell so it
# can be copied and pasted into SEMrush.
print(semrush_formatted)

In [None]:
# The SEMrush export must exist before the cells below can load it
missing_items = (
    []
    if os.path.exists(semrush_file)
    else [f"SEMrush data file: {semrush_file}"]
)

if missing_items:
    print("Error: The following required files or locations are missing:")
    print("\n".join(f"  - {item}" for item in missing_items))
    print("\nPlease ensure all required files are in place and adjust the paths if needed:")
    print("1. Place your SEMrush data file in the working folder and update 'semrush_file' path")
    sys.exit(1)
    


In [None]:
# Load the SEMrush export and show a quick overview of its contents
df_semrush = pd.read_csv(semrush_file)
print(df_semrush.columns)
print(df_semrush.head())
print(df_semrush.describe())
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would add a stray "None" line
df_semrush.info()
print(df_semrush.shape)
print(df_semrush.size)

In [None]:
# ======================================================================
# CELL: Join GSC Striking Distance Keywords with SEMrush Data
# ======================================================================
import numpy as np

# Lowercase the SEMrush keywords so they line up with the GSC queries
df_semrush['Keyword'] = df_semrush['Keyword'].str.lower()

# Keep one row per keyword: repeated keywords in the export would otherwise
# multiply rows in the left join below and inflate every count
df_semrush_clean = df_semrush.drop_duplicates(subset='Keyword').copy()

# Left join keeps every striking distance keyword, matched or not
merged_df = striking_agg.merge(
    df_semrush_clean,
    left_on='query',
    right_on='Keyword',
    how='left'
)

# Count real join matches before the NaNs are filled in; a keyword that
# SEMrush reports with Volume 0 is still a match
total_keywords = len(striking_agg)
matched_keywords = int(merged_df['Keyword'].notna().sum())
match_rate = (matched_keywords / total_keywords) * 100 if total_keywords else 0.0

# Fill NaN values for the SEMrush metrics (some keywords may not be in SEMrush)
merged_df['Volume'] = merged_df['Volume'].fillna(0).astype(int)
merged_df['Keyword Difficulty'] = merged_df['Keyword Difficulty'].fillna(0)
merged_df['CPC (USD)'] = merged_df['CPC (USD)'].fillna(0.0)

# Create an opportunity score (basic version)
# High impressions + decent position + high volume + lower difficulty = better opportunity
# NOTE(review): with threshold = 31 every aggregated position is >= 31, so the
# (30 - position) term below is always clipped to 0 — confirm whether the 30
# should track `threshold`.
merged_df['Opportunity Score'] = (
    merged_df['impressions'] * 0.4 +                # More impressions is good
    merged_df['Volume'] * 0.4 +                     # More volume is good
    (100 - merged_df['Keyword Difficulty']) * 0.1 + # Lower difficulty is good
    (30 - merged_df['position']).clip(lower=0) * 0.1 # Better position is good
)

# Sort by opportunity score
merged_df = merged_df.sort_values('Opportunity Score', ascending=False)

# Display the results
print("Top Opportunities (Keywords with SEMrush data):")
columns_to_show = ['query', 'impressions', 'position', 'Volume', 'Keyword Difficulty', 'CPC (USD)', 'Opportunity Score']
print(merged_df[columns_to_show].head(30))

# Summarize match rate
print(f"\nKeyword Match Statistics:")
print(f"Total striking distance keywords: {total_keywords}")
print(f"Keywords found in SEMrush: {matched_keywords} ({match_rate:.1f}%)")

# Prioritized keywords: only those with a non-zero SEMrush volume
semrush_matched_keywords = merged_df[merged_df['Volume'] > 0].copy()

print(f"\nTop 20 Opportunity Keywords (with SEMrush data):")
for idx, row in semrush_matched_keywords.head(20).iterrows():
    print(f"{row['query']} - Vol: {row['Volume']} - KD: {row['Keyword Difficulty']} - Pos: {row['position']:.1f}")

In [None]:
# ======================================================================
# CELL: Generate SEO Title Optimization Template
# ======================================================================

# Function to generate the SEO table and template
def generate_seo_template(df, num_keywords=20, filter_list=None):
    """Build a prompt template containing a table of top keyword opportunities.

    Args:
        df: DataFrame with the columns 'query', 'Volume', 'Keyword Difficulty',
            'position' and 'Opportunity Score'.
        num_keywords: Maximum number of table rows.
        filter_list: Terms to exclude; a row is dropped when its query
            contains a term (case-insensitive, matched literally rather than
            as a regular expression). Defaults to ["mike levin"].

    Returns:
        The template as a single string: the keyword table followed by the
        analysis instructions.
    """
    # Default filter list if none provided
    if filter_list is None:
        filter_list = ["mike levin"]

    # Only rows with volume > 0, minus anything matching a filter term
    filtered_df = df[df['Volume'] > 0].copy()
    for term in filter_list:
        # regex=False keeps terms such as "c++" from being parsed as patterns;
        # na=False treats missing queries as non-matching
        is_excluded = filtered_df['query'].str.contains(term, case=False, regex=False, na=False)
        filtered_df = filtered_df[~is_excluded]

    # Sort by Opportunity Score and take top N
    filtered_df = filtered_df.sort_values('Opportunity Score', ascending=False).head(num_keywords)

    # Define column widths
    col_widths = {
        'keyword': 30,
        'relevance': 10,
        'volume': 8,
        'kd': 6,
        'position': 10,
        'opportunity': 12
    }

    # Generate the table header with proper column widths
    header = f"| {'Keyword'.ljust(col_widths['keyword'])} | {'Relevance'.center(col_widths['relevance'])} | {'Volume'.center(col_widths['volume'])} | {'KD'.center(col_widths['kd'])} | {'Position'.center(col_widths['position'])} | {'Opportunity'.center(col_widths['opportunity'])} |"
    separator = f"|{'-' * (col_widths['keyword'] + 2)}|{'-' * (col_widths['relevance'] + 2)}|{'-' * (col_widths['volume'] + 2)}|{'-' * (col_widths['kd'] + 2)}|{'-' * (col_widths['position'] + 2)}|{'-' * (col_widths['opportunity'] + 2)}|"

    # Collect the lines and join once instead of growing a string in the loop
    table_lines = [header, separator]

    for _, row in filtered_df.iterrows():
        keyword = row['query']
        # Truncate long keywords with ellipsis
        if len(keyword) > col_widths['keyword'] - 1:
            keyword = keyword[:col_widths['keyword'] - 4] + "..."

        volume = str(int(row['Volume']))
        kd = f"{row['Keyword Difficulty']:.1f}"
        position = f"{row['position']:.1f}"
        opportunity = f"{row['Opportunity Score']:.1f}"

        # The relevance column is a placeholder ("?%") to be filled in later
        table_lines.append(
            f"| {keyword.ljust(col_widths['keyword'])} | {'?%'.center(col_widths['relevance'])} | {volume.rjust(col_widths['volume'])} | {kd.rjust(col_widths['kd'])} | {position.rjust(col_widths['position'])} | {opportunity.rjust(col_widths['opportunity'])} |"
        )

    table = "\n".join(table_lines) + "\n"

    # Generate the complete template
    template = f"""
# SEO Title & Permalink Optimization Template

## Top Keyword Opportunities

{table}
---

# Paste article here

---

Analyze this article draft and the provided SEO data table. 

1. Identify the 3-5 striking distance keywords from the table that best match the article's actual content and themes.

2. For each matching keyword, rate its relevance to the article content on a scale of 0-100%.

3. Based on the most relevant, high-opportunity keywords, suggest:
   - An engaging title in Title Case (60-70 characters max)
   - A permalink slug using hyphenated lowercase (3-5 words max)
   - Brief rationale explaining why this keyword strategy makes sense
   
4. The title should accurately represent the article content while incorporating the target keyword(s) naturally.

5. Consider user intent: would someone searching this term be satisfied by this content?
"""

    return template

# Build the template from the merged GSC + SEMrush data, allowing up to 100
# table rows (the function's own default is 20), and print it.
seo_template = generate_seo_template(merged_df, num_keywords=100)
print(seo_template)