# In [None]:  (Jupyter notebook cell marker left over from export)
import re
import time
from datetime import datetime

import nltk
import pandas as pd
import requests
import schedule
from bs4 import BeautifulSoup
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.util import ngrams
from rake_nltk import Rake
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Setup (one-time): download the NLTK tokenizer models and stopword corpus.
# Newer NLTK releases load 'punkt_tab' where older ones load 'punkt';
# requesting both keeps sentence tokenization working on either. A resource
# id unknown to the installed NLTK version is reported by the downloader and
# skipped rather than raising.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Maldives travel pages to mine for keywords; scrape_and_analyze() requests
# each of them once per run.
urls = [
    "https://www.tripadvisor.in/Attractions-g293953-Activities-Maldives.html",
    "https://www.holidify.com/pages/maldives-with-kids-1307.html",
    "https://www.makemytrip.com/tripideas/places/maldives",
    "https://www.thrillophilia.com/places-to-visit-in-maldives-for-honeymoon",
    "https://traveltriangle.com/blog/places-to-visit-in-maldives/",
    "https://www.thomascook.in/international-tourism/places-to-visit-in-maldives",
    "https://www.gokitetours.com/top-rated-tourist-attractions-in-maldives/",
    "https://www.honeymoondreams.co.uk/category/destinations/indian-ocean/maldives/",
    "https://www.yatra.com/international-tour-packages/honeymoon-holidays-in-maldives",
    "https://www.godigit.com/international-travel-insurance/tourist-places/places-to-visit-in-maldives",
]

def clean_text(text):
    """Reduce *text* to ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is
    dropped, each run of whitespace is collapsed to one space, and leading
    and trailing whitespace is removed.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', letters_and_spaces)
    return single_spaced.strip()

def extract_keywords_rake(content, top_n=20, min_words=2, max_words=4):
    """Return the best-ranked, title-cased RAKE phrases found in *content*.

    Args:
        content: Text to extract key phrases from.
        top_n: Maximum number of keywords to return (default 20, i.e. the
            top 20 per URL).
        min_words: Fewest words a phrase may have to be kept (default 2).
        max_words: Most words a phrase may have to be kept (default 4).

    Returns:
        A list of at most ``top_n`` distinct keywords, in the order RAKE
        ranked them.
    """
    r = Rake()
    r.extract_keywords_from_text(content)

    in_range = (
        phrase.title()
        for phrase in r.get_ranked_phrases()
        if min_words <= len(phrase.split()) <= max_words
    )
    # The ranked list can name the same phrase more than once; dict.fromkeys
    # drops the repeats while keeping the first (best-ranked) occurrence, so
    # duplicates no longer use up slots in the top-N cut.
    unique = list(dict.fromkeys(in_range))
    return unique[:top_n]

def classify_tail_length(kw):
    """Label a keyword by its number of whitespace-separated words.

    Args:
        kw: Keyword phrase.

    Returns:
        "short-tail" for up to two words, "mid-tail" for exactly three
        words and "long-tail" for four or more.
    """
    wc = len(kw.split())
    # `<=` rather than `==`: a one-word (or empty) keyword used to fall
    # through to the final branch and be reported as "long-tail".
    if wc <= 2:
        return "short-tail"
    if wc == 3:
        return "mid-tail"
    return "long-tail"

def scrape_and_analyze():
    """Scrape every page in ``urls`` and save the extracted keywords to Excel.

    For each URL the text of all ``<p>`` elements is joined, normalised with
    ``clean_text`` and passed to ``extract_keywords_rake``. Every keyword
    becomes one row holding the keyword, its source URL, empty Volume /
    Difficulty / CPC placeholders and the label from
    ``classify_tail_length``. The rows are written to a timestamped
    ``maldives_keywords_scraped_<YYYYmmdd_HHMM>.xlsx`` file.

    A URL that fails for any reason (network error, HTTP error status,
    parsing or extraction error) is reported on stdout and skipped, so one
    bad page does not abort the run.
    """
    print(f"\nRunning keyword scraping at {datetime.now()}...\n")
    columns = ["Keyword", "Source URL", "Volume", "Difficulty", "CPC", "Type"]
    all_keywords = []

    for url in urls:
        try:
            response = requests.get(url, timeout=10)
            # Without this check an HTTP error page (403, 404, 5xx, ...)
            # would be parsed and mined for "keywords" like real content.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Extract main textual content
            text = " ".join(p.get_text() for p in soup.find_all('p'))
            # NOTE(review): clean_text strips punctuation before extraction,
            # which presumably removes phrase boundaries RAKE would otherwise
            # use -- confirm whether raw text gives better phrases.
            cleaned = clean_text(text)

            # Extract keywords
            keywords = extract_keywords_rake(cleaned)

            all_keywords.extend(
                {
                    "Keyword": kw,
                    "Source URL": url,
                    "Volume": "",       # To be filled using external SEO tools
                    "Difficulty": "",
                    "CPC": "",
                    "Type": classify_tail_length(kw),
                }
                for kw in keywords
            )

        except Exception as e:
            # Deliberately broad: this is the per-URL boundary of a
            # best-effort batch job; the failure is reported, not hidden.
            print(f"Failed to scrape {url}: {e}")

    # Save to Excel. Naming the columns keeps the header row in the sheet
    # even when every URL failed and there are no rows at all.
    df = pd.DataFrame(all_keywords, columns=columns)
    filename = f"maldives_keywords_scraped_{datetime.now().strftime('%Y%m%d_%H%M')}.xlsx"
    df.to_excel(filename, index=False)
    print(f"\n✅ Keyword extraction complete. Saved to {filename}")

def main():
    """Run one scrape immediately, then repeat it every day at 09:00.

    Blocks forever, checking for a due job once a minute, until the process
    is interrupted (Ctrl-C / kernel interrupt).
    """
    # ⚙️ Schedule the script to run daily at 9 AM
    schedule.every().day.at("09:00").do(scrape_and_analyze)

    # Run immediately once
    scrape_and_analyze()

    # Keep script running
    try:
        while True:
            schedule.run_pending()
            time.sleep(60)
    except KeyboardInterrupt:
        print("\nScheduler stopped.")


# NOTE(review): a trailing "Step 7: Create Excel" snippet used to follow the
# loop. It was removed because it could never execute (it sat after the
# infinite loop), built its DataFrame from a `data` variable that is not
# defined anywhere in this file, wrote "Dubai_Final_Keyword_Report.xlsx" and
# then printed a different, hard-coded Maldives path. scrape_and_analyze()
# already writes the Excel report for every run.

# Guarded so that importing this module defines the functions without
# starting the endless scheduler loop.
if __name__ == "__main__":
    main()

