In [1]:
!pip install selenium beautifulsoup4 lxml nltk spacy rake-nltk pytrends pandas openpyxl
!python -m nltk.downloader stopwords
!python -m spacy download en_core_web_sm


Collecting rake-nltk
  Obtaining dependency information for rake-nltk from https://files.pythonhosted.org/packages/3b/e5/18876d587142df57b1c70ef752da34664bb7dd383710ccf3ccaefba2aa0c/rake_nltk-1.0.6-py3-none-any.whl.metadata
  Downloading rake_nltk-1.0.6-py3-none-any.whl.metadata (6.4 kB)
Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 262.6 kB/s eta 0:00:49
     --------------------------------------- 0.1/12.8 MB 391.3 kB/s eta 0:00:33
     --------------------------------------- 0.1/12.8 MB 544.7 kB/s eta 0:00:24
      -------------------------------------- 0.2/12.8 MB 737.3 kB/s eta 0:00:18
      -------------------------------------- 0.3/12.8 MB 827.5 kB/s eta 0:00:16
      -------------------------------------- 0.3/12.8 MB 863.3 kB/s eta 0:00:15
     - ------------------------------------- 0.4/12.8 MB 928.4 kB/s eta 0:00:14
     - ------------------------------------- 0.4

In [2]:
!pip install trends




In [1]:
import random
import time
from urllib.parse import urljoin, urlparse

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pytrends.request import TrendReq
from rake_nltk import Rake
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Fetch the NLTK stop-word corpus only when it is not installed yet, so that
# re-running the notebook does not hit the network or print download logs.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# === STEP 1: Scrape Page with Selenium + BS4 ===
target_url = "https://www.talentica.com/blogs/custom-saas-development/"
PAGE_LOAD_WAIT_SECONDS = 5  # fixed pause after navigation before reading the DOM

options = Options()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
try:
    driver.get(target_url)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    html = driver.page_source
finally:
    # Always shut the browser down, even when navigation fails, so that no
    # orphaned Chrome / chromedriver process is left behind.
    driver.quit()

soup = BeautifulSoup(html, "lxml")

# Extract SEO Elements
# A page may lack a <title> or a description <meta>; fall back to "" instead
# of raising AttributeError / KeyError.
title = soup.title.text.strip() if soup.title else ""
meta_tag = soup.find("meta", {"name": "description"})
meta_desc = (meta_tag.get("content") or "").strip() if meta_tag else ""
h_tags = [tag.text.strip() for tag in soup.find_all(["h1", "h2", "h3"])]
body_text = ' '.join([p.text.strip() for p in soup.find_all("p")])
blog_tags = [tag.text.strip() for tag in soup.find_all("a", class_="tag")]

# Internal links are links that stay on the same host as the scraped page.
# Relative hrefs are resolved against the page URL first; the previous check
# (`target_url in href`) only matched links containing the full article URL.
site_host = urlparse(target_url).netloc
internal_links = []
for anchor in soup.find_all("a", href=True):
    absolute_href = urljoin(target_url, anchor['href'])
    if urlparse(absolute_href).netloc == site_host:
        internal_links.append(absolute_href)

# Text fed to keyword extraction: headings first, then paragraph text.
text_content = ' '.join(h_tags) + " " + body_text

# === STEP 2: Keyword Extraction using RAKE with min/max length filter ===
rake = Rake(stopwords=stopwords.words('english'))
rake.extract_keywords_from_text(text_content)
raw_keywords = rake.get_ranked_phrases()  # ranked list, strongest phrases first

# Keep only 2 to 4 word phrases, normalised to stripped lower case
MIN_PHRASE_WORDS, MAX_PHRASE_WORDS = 2, 4
candidate_phrases = [
    kw.strip().lower()
    for kw in raw_keywords
    if MIN_PHRASE_WORDS <= len(kw.split()) <= MAX_PHRASE_WORDS
]

# Deduplicate while preserving RAKE's ranking. `list(set(...))` would shuffle
# the phrases, so a later "first N keywords" slice would pick an arbitrary,
# run-dependent subset instead of the top-ranked ones.
keywords = list(dict.fromkeys(candidate_phrases))

# === STEP 3: Simulated Google Trends + Volume via Text Frequency ===
# Difficulty and CPC are simulated with the `random` module; seed it so the
# generated report is reproducible across "Restart Kernel -> Run All".
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Shared Google Trends client used by get_keyword_metrics().
pytrends = TrendReq(hl='en-US', tz=360)

def get_keyword_metrics(keyword, text_source):
    """Return ``(volume, difficulty, cpc)`` for a keyword phrase.

    Volume is the mean Google Trends interest over the last 12 months when
    the module-level ``pytrends`` client returns data for ``keyword``.
    Otherwise it is simulated from how often the phrase occurs in
    ``text_source`` (10 points per occurrence, capped at 100).

    Difficulty and CPC are NOT real measurements: they are random
    placeholders drawn from [20, 70] and [0.5, 3.0], rounded to 2 decimals.

    Args:
        keyword: Keyword phrase to look up.
        text_source: Text used for the frequency-based fallback.

    Returns:
        Tuple ``(volume, difficulty, cpc)``.
    """
    volume = None
    try:
        pytrends.build_payload([keyword], cat=0, timeframe='today 12-m')
        data = pytrends.interest_over_time()
        if not data.empty and keyword in data.columns:
            volume = int(data[keyword].mean())
    except Exception:
        # Best-effort lookup: any failure (network, rate limit, bad response)
        # falls back to the simulation below. `Exception` rather than a bare
        # `except:` so Ctrl-C / SystemExit still interrupt a long run.
        volume = None

    if volume is None:
        # Simulate volume by frequency of keyword in text. An empty keyword
        # counts as 0 occurrences (str.count("") would return len(text) + 1).
        count = text_source.lower().count(keyword.lower()) if keyword else 0
        volume = min(count * 10, 100)

    # Simulate difficulty and CPC
    difficulty = round(random.uniform(20, 70), 2)
    cpc = round(random.uniform(0.5, 3.0), 2)
    return volume, difficulty, cpc

# === STEP 4: Categorize and Compile Final Data ===
def categorize_keyword(kw):
    """Classify a keyword phrase by its whitespace-separated word count.

    Args:
        kw: Keyword phrase.

    Returns:
        "Short-tail" for up to 2 words, "Mid-tail" for exactly 3 words,
        "Long-tail" for 4 or more words.
    """
    length = len(kw.split())
    # `<= 2` rather than `== 2`: a one-word (or empty) phrase must not fall
    # through to the "Long-tail" branch.
    if length <= 2:
        return "Short-tail"
    if length == 3:
        return "Mid-tail"
    return "Long-tail"

# One report row per keyword: phrase, source page, the three metrics, and the
# tail category. Metrics are looked up before the phrase is categorised.
rows = [
    [phrase, target_url, *get_keyword_metrics(phrase, text_content), categorize_keyword(phrase)]
    for phrase in keywords[:50]  # Limit to 50
]

# === STEP 5: Save to Excel ===
report_columns = ["Keyword", "Source URL", "Volume", "Difficulty", "CPC", "Type"]
df = pd.DataFrame(rows, columns=report_columns)
df.to_excel("custom_saas_keywords_report.xlsx", index=False)

print("✅ Report generated successfully: custom_saas_keywords_report.xlsx")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


✅ Report generated successfully: custom_saas_keywords_report.xlsx
