In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re

# Tokenizer data: newer NLTK releases load the sentence tokenizer from
# 'punkt_tab', older ones from 'punkt'. Fetch both so word_tokenize() below
# works regardless of the installed NLTK version (an unknown resource name is
# reported by the downloader but does not raise).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
# A set gives O(1) membership tests when filtering tokens later on.
stop_words = set(stopwords.words('english'))

# List of URLs to scrape
urls = [
    "https://www.tripadvisor.in/Tourism-g295424-Dubai_Emirate_of_Dubai-Vacations.html",
    "https://thesophisticatedlife.com/blog/dubai-travel-guide-first-time-visitors/",
    "https://www.expedia.co.in/Dubai.dx6053839"
]

# Function to extract visible text
def get_visible_text(soup):
    """Return the text of *soup* with non-rendered tags removed.

    Every ``script``, ``style`` and ``noscript`` element is removed from
    *soup* in place (the caller's tree is modified), then the remaining text
    is joined with single spaces and every run of whitespace is collapsed to
    one space. Leading/trailing whitespace is collapsed but not stripped.
    """
    hidden_tag_names = ["script", "style", "noscript"]
    for hidden_tag in soup(hidden_tag_names):
        hidden_tag.decompose()

    raw_text = soup.get_text(separator=' ')
    collapsed = re.sub(r'\s+', ' ', raw_text)
    return collapsed

# Function to scrape each website
def scrape_website(url, timeout=30):
    """Download *url* and pull out the text fields used for keyword analysis.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        A dict of strings with the keys ``"title"``, ``"meta_desc"``,
        ``"h1"``, ``"h2"``, ``"h3"`` and ``"body"`` (in that order). Multiple
        headings of the same level are joined with ``" | "``; missing fields
        are empty strings.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(res.content, "html.parser")

    def joined_headings(tag_name):
        return " | ".join(h.get_text(strip=True) for h in soup.find_all(tag_name))

    # ``soup.title.string`` is None for an empty <title> or one with nested
    # markup, so calling .strip() on it would raise; get_text() is always a str.
    title_tag = soup.title
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Headings are read before get_visible_text(), which removes
    # script/style/noscript elements from the tree in place.
    data = {
        "title": title,
        "meta_desc": "",
        "h1": joined_headings("h1"),
        "h2": joined_headings("h2"),
        "h3": joined_headings("h3"),
        "body": get_visible_text(soup)
    }

    # Meta description
    meta_tag = soup.find("meta", attrs={"name": "description"})
    if meta_tag and meta_tag.get("content"):
        data["meta_desc"] = meta_tag["content"].strip()

    return data

# Collect all content
# Each page contributes all of its scraped fields (title, meta description,
# headings, body) as one space-separated string.
page_texts = []
for url in urls:
    try:
        scraped = scrape_website(url)
    except requests.RequestException as e:
        # One unreachable or slow site should not abort the whole analysis.
        print(f"Error scraping {url}: {e}")
        continue
    page_texts.append(" ".join(scraped.values()))
combined_text = " ".join(page_texts)

# Preprocess and tokenize
words = nltk.word_tokenize(combined_text.lower())
# Keep purely alphabetic tokens longer than two characters that are not
# English stop words.
filtered = [
    w for w in words if w.isalpha() and w not in stop_words and len(w) > 2
]

# Generate bigrams and trigrams
def generate_ngrams(tokens, n):
    """Return every run of *n* consecutive items of *tokens* as a tuple.

    The result is a list of tuples in order of appearance. It is empty when
    *n* is not positive or when *tokens* holds fewer than *n* items.
    """
    if n <= 0:
        return []
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]

bigrams = generate_ngrams(filtered, 2)
trigrams = generate_ngrams(filtered, 3)

# Convert ngrams to strings
bigram_phrases = list(map(' '.join, bigrams))
trigram_phrases = list(map(' '.join, trigrams))

# Count frequencies of single words, two-word and three-word phrases together
keyword_counts = Counter(filtered)
keyword_counts.update(bigram_phrases)
keyword_counts.update(trigram_phrases)

# Convert to DataFrame: one row per distinct keyword
df_keywords = pd.DataFrame(list(keyword_counts.items()), columns=['keyword', 'frequency'])


def _tail_type(keyword):
    """Label a keyword by its word count: 1 short, 2 mid, 3+ long."""
    word_count = len(keyword.split())
    if word_count >= 3:
        return "long-tail"
    if word_count == 2:
        return "mid-tail"
    return "short-tail"


# Add tail type and estimated competition score. The score is a stand-in, not
# a measured value: it is the dense rank of the frequency, so the most
# frequent keywords get competition 1.
df_keywords["tail_type"] = df_keywords["keyword"].apply(_tail_type)
df_keywords["competition"] = df_keywords["frequency"].rank(ascending=False, method='dense')

# Sort by competition rank first, then by tail-type label
final_keywords = df_keywords.sort_values(by=["competition", "tail_type"], ascending=[True, True])

# Save to Excel
final_keywords.to_excel("keyword_analysis_report.xlsx", index=False)

print("✅ Keyword analysis report saved as 'keyword_analysis_report.xlsx'")
from pytrends.request import TrendReq
import pandas as pd

# Fetch Google Trends interest-over-time for a fixed list of Dubai travel
# keywords and collect one column per keyword.
pytrends = TrendReq()
keywords = [
    "dubai travel guide", "best desert safari dubai", "burj khalifa tickets", "dubai honeymoon packages",
    "luxury hotels dubai", "dubai shopping mall", "things to do dubai", "dubai itinerary 5 days",
    "burj al arab", "dubai visa process", "dubai city tour", "dubai marina cruise",
    "dubai desert adventure", "dubai frame tickets", "skydiving in dubai", "dubai food tour",
    "dubai travel tips", "cheap hotels dubai", "dubai visa on arrival", "burj khalifa view"
]
all_data = pd.DataFrame()

# NOTE(review): each keyword is requested in its own payload, so presumably
# every column is scaled independently of the others - confirm against the
# pytrends documentation before comparing columns.
for kw in keywords:
    try:
        pytrends.build_payload([kw])
        data = pytrends.interest_over_time()
        # Keywords with no trend data are skipped rather than added as empty columns.
        if not data.empty:
            all_data[kw] = data[kw]
    except Exception as e:
        # Best effort: report the failing keyword and carry on with the rest.
        print(f"Failed for {kw}: {e}")

# 💾 Save to CSV
# The file holds CSV text, so it gets a .csv extension; the previous ".xlsx"
# name produced a file that spreadsheet programs reject as a corrupt workbook.
all_data.to_csv(r"/content/Dubai_Keyword_Report.csv")
all_data.head()

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
import pandas as pd
import random
import re
import time

# Setup (one-time)
# Newer NLTK releases load the tokenizer from 'punkt_tab', older ones from
# 'punkt'; fetch both so word_tokenize() works with either (an unknown
# resource name is reported by the downloader but does not raise).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Step 1: URLs to scrape (display name -> page address)
urls = {
    "TripAdvisor": "https://www.tripadvisor.in/Attractions-g295424-Activities-Dubai_Emirate_of_Dubai.html",
    "SophisticatedLife": "https://thesophisticatedlife.com/blog/dubai-travel-guide-first-time-visitors/",
    "Expedia": "https://www.expedia.co.in/Dubai.dx6053839",
    "Holidify": "https://www.holidify.com/pages/travel-tips-to-dubai-2429.html",
    "WanderOn": "https://wanderon.in/blogs/adventure-sports-in-dubai"
}

# Step 2: Headless Chrome Setup
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)

# Step 3: Scrape and clean text
page_texts = []
try:
    for url in urls.values():
        try:
            driver.get(url)
            # Fixed pause before reading the page source - presumably to let
            # script-rendered content finish loading.
            time.sleep(5)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            # Remove elements whose contents are code or styling rather than
            # visible text, so they do not leak into the keyword tokens.
            for hidden_tag in soup(["script", "style", "noscript"]):
                hidden_tag.decompose()
            page_texts.append(soup.get_text(separator=" "))
        except Exception as e:
            # Best effort: a page that fails to load is reported and skipped.
            print(f"Error scraping {url}: {e}")
finally:
    # Always shut the browser down, even on an interrupt, so no headless
    # Chrome process is left running.
    driver.quit()

all_data = " ".join(page_texts)

# Step 4: Text Preprocessing
text = all_data.lower()
# Replace every non-letter with a space instead of deleting it, so words that
# are joined only by punctuation ("hop-on/hop-off") stay separate tokens
# rather than fusing into one non-word ("hoponhopoff").
text = re.sub(r'[^a-zA-Z\s]', ' ', text)
text = re.sub(r'\s+', ' ', text)
tokens = word_tokenize(text)

# Drop English stop words and tokens of one or two characters.
stop_words = set(stopwords.words('english'))
filtered_tokens = [t for t in tokens if t not in stop_words and len(t) > 2]

# Step 5: Extract 2–4 word phrases containing "dubai"
# Imported here so this cell runs on its own.
from collections import Counter

# Count occurrences so the 200-phrase cut-off below keeps the most frequent
# phrases; slicing a set would keep an arbitrary, run-dependent selection.
phrase_counts = Counter()
for n in range(2, 5):
    for gram in ngrams(filtered_tokens, n):
        phrase = " ".join(gram)
        # Substring match: any phrase with a token containing "dubai" qualifies.
        if "dubai" in phrase:
            phrase_counts[phrase] += 1
phrases = set(phrase_counts)

# Step 6: Build Data with simulated metrics
# Source, volume, difficulty and CPC are random placeholders, NOT measured
# values; only the phrase and its word-count label come from the scraped text.
sources = list(urls)
difficulties = ["Low", "Medium", "High"]
data = []

for phrase, _count in phrase_counts.most_common(200):  # Limit to top 200 by frequency
    source = random.choice(sources)
    volume = random.randint(1500, 25000)
    difficulty = random.choice(difficulties)
    cpc = round(random.uniform(15.0, 40.0), 2)
    # Phrases here have 2-4 words: 2-3 count as mid-tail, 4 as long-tail.
    tail_type = "Mid-tail" if 2 <= len(phrase.split()) <= 3 else "Long-tail"
    data.append([phrase, source, volume, difficulty, f"₹{cpc:.2f}", tail_type])

# Step 7: Create Excel
# One row per keyword; column order matches the order of values in each row.
report_columns = [
    "Keyword",
    "Source URL",
    "Volume",
    "Difficulty",
    "CPC",
    "Type (Head/Long-tail)",
]
df = pd.DataFrame(data, columns=report_columns)
# index=False leaves the DataFrame's row index out of the spreadsheet.
df.to_excel("Dubai_Final_Keyword_Report.xlsx", index=False)

print("✅ Done. File saved as Dubai_Final_Keyword_Report.xlsx")
