# In [None]:
import logging
import re
import time
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from openpyxl import Workbook
from pytrends.request import TrendReq
from rake_nltk import Rake

# -------------------------
# STEP 1: SCRAPE WEBSITE
# -------------------------
def scrape_seo_elements(url):
    """Fetch *url* and extract its on-page SEO elements.

    Args:
        url: Absolute URL (including scheme) of the page to scrape.

    Returns:
        A dict with the keys:
            "title": text of the <title> element, or "" if absent.
            "meta_desc": content of <meta name="description">, or "".
            "headings": list of the text of every h1/h2/h3 element.
            "body_content": text of every <p> element joined by spaces.
            "tags": text of <a rel="tag"> links, falling back to <a>
                elements whose class contains "tag".
            "internal_links": absolute URLs of links on the same host as
                *url* (relative hrefs are resolved against *url*).

    Raises:
        requests.RequestException: on a network failure, a timeout, or a
            non-success HTTP status.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly instead of extracting keywords from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    # get_text() always returns a str; .string is None when <title> has
    # more than one child node, which would break later string joins.
    title = soup.title.get_text(strip=True) if soup.title else ""

    # The meta tag may exist without a content attribute.
    meta_tag = soup.find("meta", {"name": "description"})
    meta_desc = meta_tag.get("content", "") if meta_tag else ""

    headings = [h.get_text(strip=True) for h in soup.find_all(['h1', 'h2', 'h3'])]
    body_content = " ".join(p.get_text(strip=True) for p in soup.find_all('p'))

    # Blog Tags (Assume tags have <a rel='tag'> or 'tag' in class)
    tags = [tag.get_text(strip=True) for tag in soup.find_all("a", {"rel": "tag"})]
    if not tags:
        tags = [tag.get_text(strip=True) for tag in soup.find_all("a", class_=re.compile("tag"))]

    # Compare hosts rather than substring-matching the raw href, so that
    # relative links count as internal and external URLs that merely
    # mention this domain (e.g. share links) do not.
    page_host = urlparse(url).netloc.lower()
    internal_links = []
    for anchor in soup.find_all("a", href=True):
        try:
            absolute = urljoin(url, anchor["href"].strip())
            link_host = urlparse(absolute).netloc.lower()
        except ValueError:
            # Malformed href that urllib cannot parse: skip it.
            continue
        if link_host == page_host:
            internal_links.append(absolute)

    return {
        "title": title,
        "meta_desc": meta_desc,
        "headings": headings,
        "body_content": body_content,
        "tags": tags,
        "internal_links": internal_links
    }

# -------------------------
# STEP 2: KEYWORD EXTRACTION
# -------------------------
def extract_keywords(text, min_keywords=300):
    """Extract RAKE key phrases from *text* and bucket them by word count.

    Args:
        text: The text to analyse. Empty or whitespace-only text yields
            three empty lists.
        min_keywords: Upper bound on the number of unique ranked phrases
            considered before classification. (The name is historical:
            phrases cannot be invented, so fewer may be returned.)

    Returns:
        A tuple ``(short_tail, mid_tail, long_tail)`` of lists holding the
        phrases with exactly 2 words, exactly 3 words, and 4 or more words
        respectively, each in ranking order. Single-word phrases are
        discarded.
    """
    if not text or not text.strip():
        return [], [], []

    r = Rake(stopwords=stopwords.words('english'))
    r.extract_keywords_from_text(text)

    # dict.fromkeys removes duplicates while preserving the ranking order.
    # The earlier "repeat the list to reach the minimum" step was dropped:
    # it was undone by this de-duplication and divided by zero when no
    # phrases were found.
    phrases = list(dict.fromkeys(r.get_ranked_phrases()))[:min_keywords]

    # Classify into short, mid, long tail
    short_tail, mid_tail, long_tail = [], [], []
    for kw in phrases:
        wc = len(kw.split())
        if wc == 2:
            short_tail.append(kw)
        elif wc == 3:
            mid_tail.append(kw)
        elif wc >= 4:
            long_tail.append(kw)

    return short_tail, mid_tail, long_tail

# -------------------------
# STEP 3: PYTRENDS KEYWORD ANALYSIS
# -------------------------
def analyze_keywords_pytrends(keywords, max_keywords=50,
                              timeframe='today 12-m', geo='IN'):
    """Look up Google Trends interest for the first *max_keywords* keywords.

    Args:
        keywords: Sequence of keyword strings.
        max_keywords: Only ``keywords[:max_keywords]`` are looked up.
        timeframe: Timeframe string passed to ``build_payload``.
        geo: Region code passed to ``build_payload``.

    Returns:
        A list of ``[keyword, volume, difficulty, cpc]`` rows, one per
        keyword looked up and in the same order. ``volume`` is the mean of
        the interest-over-time series (0 when the series is empty).
        ``difficulty`` and ``cpc`` are placeholder values derived from
        ``volume``, not real SEO metrics. A keyword whose lookup fails
        gets the row ``[keyword, 0, 0, 0]``.
    """
    batch = keywords[:max_keywords]
    if not batch:
        # Nothing to look up, so skip creating the Trends client.
        return []

    pytrends = TrendReq(hl='en-US', tz=330)
    data = []

    for kw in batch:
        try:
            pytrends.build_payload([kw], timeframe=timeframe, geo=geo)
            time.sleep(1)  # avoid rate limit

            df_trend = pytrends.interest_over_time()
            volume = df_trend[kw].mean() if not df_trend.empty else 0
            difficulty = round((100 - volume) / 10, 2)  # Fake metric (proxy)
            cpc = round(volume / 10, 2)  # Fake CPC proxy (for demonstration)
            data.append([kw, volume, difficulty, cpc])
        except Exception as exc:
            # Best-effort: keep the row so the output stays aligned with the
            # input, but report the failure. Catching Exception (not a bare
            # except) lets Ctrl-C and SystemExit stop the loop.
            logging.warning("Google Trends lookup failed for %r: %s", kw, exc)
            data.append([kw, 0, 0, 0])
    return data

# -------------------------
# STEP 4: FINAL REPORT
# -------------------------
def create_final_report(url, short_tail, mid_tail, long_tail,
                        output_path="Spain Keywords.xlsx", max_keywords=50):
    """Score the keywords with Google Trends and write an Excel report.

    Keywords are processed in the order short-tail, mid-tail, long-tail.
    Only the first *max_keywords* of the combined list are analysed and
    written; the rest are left out of the report.

    Args:
        url: Source page URL, written to the "Source_URL" column.
        short_tail: List of short-tail keyword strings.
        mid_tail: List of mid-tail keyword strings.
        long_tail: List of long-tail keyword strings.
        output_path: Path of the .xlsx file to write.
        max_keywords: Maximum number of keywords sent to
            ``analyze_keywords_pytrends``.

    Returns:
        None. The report is written to *output_path* and its first ten
        rows are printed.
    """
    all_keywords = (
        [(kw, "Short-tail") for kw in short_tail] +
        [(kw, "Mid-tail") for kw in mid_tail] +
        [(kw, "Long-tail") for kw in long_tail]
    )
    keywords_list = [kw for kw, _ in all_keywords]
    trends_data = analyze_keywords_pytrends(keywords_list, max_keywords)

    # zip() stops at the shorter sequence, so keywords beyond the analysed
    # batch are dropped; say so instead of truncating silently.
    if len(trends_data) < len(all_keywords):
        print(f"Note: only the first {len(trends_data)} of "
              f"{len(all_keywords)} keywords were analysed.")

    final_data = [
        [kw, url, volume, difficulty, cpc, tail_type]
        for (kw, tail_type), (_, volume, difficulty, cpc)
        in zip(all_keywords, trends_data)
    ]

    df = pd.DataFrame(final_data, columns=["Keyword", "Source_URL", "Volume", "Difficulty", "CPC", "Tail-Type"])
    df.to_excel(output_path, index=False)
    print(f"\n✅ {output_path}")
    print(df.head(10))


# -------------------------
# STEP 5: RUNNING ALL STEPS
# -------------------------
if __name__ == "__main__":
    # Script entry point: scrape one page, extract keywords, write the report.
    url = "https://teaspoonofadventure.com/eat-in-spain/"
    print(f"Scraping SEO data from: {url} ...")
    seo_data = scrape_seo_elements(url)
    print("Title:", seo_data['title'])
    print("Meta Description:", seo_data['meta_desc'])
    print("Headings Found:", seo_data['headings'][:5])
    print("Tags:", seo_data['tags'])

    print("\nExtracting keywords...")
    # Title, meta description, headings and body text are analysed as one
    # document. A scraped field can be None (e.g. a <title> without a single
    # text child), so coerce to "" to keep str.join from raising TypeError.
    text_parts = [
        seo_data['title'],
        seo_data['meta_desc'],
        *seo_data['headings'],
        seo_data['body_content'],
    ]
    text_to_analyze = " ".join(part or "" for part in text_parts)
    short_tail, mid_tail, long_tail = extract_keywords(text_to_analyze)

    print(f"Short-tail: {len(short_tail)}, Mid-tail: {len(mid_tail)}, Long-tail: {len(long_tail)}")
    create_final_report(url, short_tail, mid_tail, long_tail)
