In [10]:
import random
import time
from urllib.parse import urljoin, urlparse

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pytrends.request import TrendReq
from rake_nltk import Rake
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

nltk.download('stopwords')

# === STEP 1: Scrape Page with Selenium + BS4 ===
target_url = "https://www.netguru.com/services/web-development"
# Alternative targets that can be swapped in for target_url:
#   https://radixweb.com/services/web-development
#   https://www.scnsoft.com/web-development
#   https://www.appnovation.com/services/web-development

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
try:
    driver.get(target_url)
    # Fixed pause so client-side rendering can finish before the DOM is read.
    time.sleep(5)
    html = driver.page_source
finally:
    # Always shut the browser down, even when navigation fails, so no
    # headless Chrome process is left running.
    driver.quit()

soup = BeautifulSoup(html, "lxml")


def _is_internal_link(href, base_url):
    """Return True when *href*, resolved against *base_url*, is on the same host."""
    try:
        resolved_host = urlparse(urljoin(base_url, href)).netloc
        return resolved_host == urlparse(base_url).netloc
    except ValueError:
        # Malformed hrefs (e.g. a broken IPv6 literal) are treated as external.
        return False


# Extract SEO Elements
# soup.title is None when the page has no <title>; fall back to "".
title = soup.title.text.strip() if soup.title is not None else ""
meta_tag = soup.find("meta", {"name": "description"})
# .get() tolerates a description tag that lacks a content attribute.
meta_desc = (meta_tag.get("content") or "").strip() if meta_tag else ""
h_tags = [tag.text.strip() for tag in soup.find_all(["h1", "h2", "h3"])]
body_text = ' '.join(p.text.strip() for p in soup.find_all("p"))
blog_tags = [tag.text.strip() for tag in soup.find_all("a", class_="tag")]
# Relative hrefs are resolved against target_url before the host comparison,
# so "/services/x" counts as internal just like a fully-qualified URL.
internal_links = [
    a['href']
    for a in soup.find_all("a", href=True)
    if _is_internal_link(a['href'], target_url)
]

# Headings plus paragraph text form the corpus used for keyword extraction.
text_content = ' '.join(h_tags) + " " + body_text

# === STEP 2: Keyword Extraction using RAKE with min/max length filter ===
rake = Rake(stopwords=stopwords.words('english'))
rake.extract_keywords_from_text(text_content)
raw_keywords = rake.get_ranked_phrases()

# Keep only 2 to 4 word phrases, normalised to trimmed lowercase.
candidate_keywords = [
    kw.strip().lower() for kw in raw_keywords if 2 <= len(kw.split()) <= 4
]

# Deduplicate while keeping the order returned by get_ranked_phrases().
# A set would shuffle the phrases, making any later slice of `keywords`
# an arbitrary subset that differs from run to run.
keywords = list(dict.fromkeys(candidate_keywords))

# === STEP 3: Simulated Google Trends + Volume via Text Frequency ===
# Shared Google Trends client; get_keyword_metrics() reuses it for every lookup.
# NOTE(review): tz is presumably the timezone offset in minutes (360 = US
# Central) — confirm against the pytrends documentation.
pytrends = TrendReq(hl='en-US', tz=360)

def get_keyword_metrics(keyword: str, text_source: str):
    """Return ``(volume, difficulty, cpc)`` for *keyword*.

    Volume is the mean Google Trends interest over the last 12 months when
    the module-level ``pytrends`` client returns data for the keyword. If
    the lookup fails for any reason, or returns no data, volume falls back
    to a simulated score: 10 points per case-insensitive occurrence of the
    keyword in *text_source*, capped at 100.

    Difficulty and CPC are random placeholders (uniform in 20-70 and
    0.5-3.0 respectively, rounded to 2 decimals), not measured values.

    Args:
        keyword: Phrase to look up.
        text_source: Text used for the frequency-based fallback.

    Returns:
        A ``(volume, difficulty, cpc)`` tuple of ``(int, float, float)``.
    """
    # Try real data from PyTrends
    try:
        pytrends.build_payload([keyword], cat=0, timeframe='today 12-m')
        data = pytrends.interest_over_time()
        if data.empty or keyword not in data.columns:
            raise ValueError("No data")
        volume = int(data[keyword].mean())
    except Exception:
        # Best-effort fallback on any lookup failure. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be stopped.
        count = text_source.lower().count(keyword.lower())
        volume = min(count * 10, 100)

    # Simulate difficulty and CPC
    difficulty = round(random.uniform(20, 70), 2)
    cpc = round(random.uniform(0.5, 3.0), 2)
    return volume, difficulty, cpc

# === STEP 4: Categorize and Compile Final Data ===
def categorize_keyword(kw):
    """Classify a keyword by its whitespace-separated word count.

    Args:
        kw: Keyword phrase.

    Returns:
        "Short-tail" for up to two words, "Mid-tail" for exactly three,
        and "Long-tail" for four or more.
    """
    length = len(kw.split())
    # `<= 2` rather than `== 2` so one-word (or empty) input is not
    # mislabelled as "Long-tail" by the catch-all branch.
    if length <= 2:
        return "Short-tail"
    if length == 3:
        return "Mid-tail"
    return "Long-tail"

# One report row per keyword: keyword, source URL, the three metrics, type.
rows = []
for keyword in keywords[:100]:  # cap the report at the first 100 keywords
    metrics = get_keyword_metrics(keyword, text_content)
    rows.append([keyword, target_url, *metrics, categorize_keyword(keyword)])

# === STEP 5: Save to Excel ===
report_columns = ["Keyword", "Source URL", "Volume", "Difficulty", "CPC", "Type"]
df = pd.DataFrame(rows, columns=report_columns)
df.to_excel("Webapp_development_keywords_report-4.xlsx", index=False)

print("✅ Report generated successfully: Webapp_development_keywords_report-4.xlsx")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)
  df = df.fillna(False)


✅ Report generated successfully: Webapp_development_keywords_report-4.xlsx


  df = df.fillna(False)
