In [2]:
import csv
import os

import requests

# Constants
# Credentials are read from environment variables so they never have to be
# typed into (and saved with) the notebook. Each one falls back to an empty
# string when its variable is not set.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")  # Google API key
CSE_ID = os.environ.get("GOOGLE_CSE_ID", "")  # Custom Search Engine ID
SEARCH_QUERY = ""  # Replace with your search query
NUM_RESULTS = 100  # Total number of results needed (max 100)

def fetch_google_results(api_key, cse_id, query, num_results):
    """Fetch up to ``num_results`` hits from the Google Custom Search JSON API.

    Results are requested page by page until enough have been collected, the
    API stops returning items, a non-200 response is received, or the API's
    100-result ceiling is reached.

    Args:
        api_key: Google API key, sent as the ``key`` parameter.
        cse_id: Custom Search Engine ID, sent as the ``cx`` parameter.
        query: Search terms, sent as the ``q`` parameter. They are handed to
            ``requests`` via ``params`` so special characters are URL-encoded.
        num_results: Maximum number of results to return.

    Returns:
        A list of at most ``num_results`` dicts with the keys ``"Title"``,
        ``"Link"`` and ``"Description"``. A value is ``None`` when the API
        response omits the corresponding field.
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    page_limit = 10  # largest page size requested per call
    max_index = 100  # the API allows a maximum of 100 results

    results = []
    start_index = 1
    while len(results) < num_results:
        # Ask only for what is still missing, and never past the API ceiling.
        max_results = min(
            num_results - len(results),
            page_limit,
            max_index - start_index + 1,
        )

        response = requests.get(
            endpoint,
            params={
                "key": api_key,
                "cx": cse_id,
                "q": query,
                "start": start_index,
                "num": max_results,
            },
            timeout=10,  # do not hang forever on a stalled connection
        )
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break

        # A missing or empty "items" list both mean there is nothing left.
        items = response.json().get("items")
        if not items:
            print("No more items in response.")
            break

        for item in items:
            results.append({
                "Title": item.get("title"),
                "Link": item.get("link"),
                "Description": item.get("snippet"),
            })

        # Advance by what was actually returned so no result is skipped when
        # a page comes back shorter than requested.
        start_index += len(items)

        # Google Custom Search JSON API allows a maximum of 100 results
        if start_index > max_index:
            print("Reached maximum retrievable results (100).")
            break

    # Guard against a page that returned more items than were asked for.
    return results[:num_results]

def save_results_to_csv(results, filename):
    """Write search results to ``filename`` as a UTF-8 CSV file.

    The file gets a ``Title, Link, Description`` header row followed by one
    row per dict in ``results``. A confirmation line is printed afterwards.
    """
    columns = ["Title", "Link", "Description"]
    with open(filename, mode='w', newline='', encoding='utf-8') as out:
        csv_out = csv.DictWriter(out, fieldnames=columns)
        csv_out.writeheader()
        # Emit the records one at a time, in the order they were given.
        for record in results:
            csv_out.writerow(record)
    print(f"Results saved to {filename}")

# Run the search with the configured constants
search_results = fetch_google_results(API_KEY, CSE_ID, SEARCH_QUERY, NUM_RESULTS)

# Persist the hits, or report that there is nothing to write
if not search_results:
    print("No results to save.")
else:
    save_results_to_csv(search_results, "google_search_results_oman.csv")


No more items in response.
Results saved to google_search_results_oman.csv


In [4]:
import requests
from bs4 import BeautifulSoup
import csv

def read_links_from_csv(filename):
    """Load search results from a CSV file.

    Args:
        filename: Path of a UTF-8 CSV file whose header row includes the
            ``Title``, ``Link`` and ``Description`` columns.

    Returns:
        A list with one dict per data row, holding exactly the keys
        ``"Title"``, ``"Link"`` and ``"Description"``. Any other columns in
        the file are dropped.

    Raises:
        KeyError: If a row lacks one of the three required columns.
    """
    # newline='' leaves line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are read back exactly as they were written.
    with open(filename, mode='r', newline='', encoding='utf-8') as file:
        return [
            {
                "Title": row["Title"],
                "Link": row["Link"],
                "Description": row["Description"],
            }
            for row in csv.DictReader(file)
        ]

def extract_website_content(url, max_chars=5000, timeout=10):
    """Download a page and return its text content, optionally truncated.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters of extracted text to return,
            to avoid overly large outputs. ``None`` disables truncation.
            Defaults to 5000.
        timeout: Value passed to ``requests.get`` as its ``timeout``.
            Defaults to 10.

    Returns:
        The extracted text on success. On a non-200 status code or any
        exception, a string beginning with ``"Error fetching <url>: "`` that
        describes the failure is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return f"Error fetching {url}: Status code {response.status_code}"

        soup = BeautifulSoup(response.content, "html.parser")
        # One text fragment per line, with surrounding whitespace stripped.
        text = soup.get_text(separator="\n", strip=True)
        return text[:max_chars]
    except Exception as e:
        # Deliberately broad: the failure is reported as the returned string
        # so a single bad page does not abort the caller's crawl loop.
        return f"Error fetching {url}: {e}"

def save_to_text_file(results, filename):
    """Write each result's title, link and content to a UTF-8 text file.

    Every entry in ``results`` must provide the keys ``'Title'``, ``'Link'``
    and ``'Content'``. Entries are separated by a ``---`` line, and a
    confirmation line is printed once the file has been written.
    """
    def _pieces(entry):
        # Yielded lazily so the pieces are produced and written one by one.
        yield f"Title: {entry['Title']}\n"
        yield f"Link: {entry['Link']}\n\n"
        yield "Content:\n"
        yield entry['Content']
        yield "\n\n---\n\n"

    with open(filename, mode='w', encoding='utf-8') as out:
        for entry in results:
            out.writelines(_pieces(entry))
    print(f"Website content saved to {filename}")

# Crawl every link found by the search step and dump the page text to a file
if __name__ == "__main__":
    # Input produced by the search cell, and the destination for the crawl
    input_csv = "google_search_results_oman.csv"  # Replace with your CSV file name
    output_text_file = "oman_website_content.txt"

    # Load the rows to visit
    links = read_links_from_csv(input_csv)

    # Attach the fetched page text to each row under the 'Content' key
    for link in links:
        print(f"Fetching content for: {link['Link']}")
        link.update(Content=extract_website_content(link['Link']))

    # Write everything out in one pass
    save_to_text_file(links, output_text_file)

Fetching content for: https://www.hyundaioman.com/en/
Fetching content for: https://www.instagram.com/hyundaioman_official/?hl=en
Fetching content for: https://www.hyundaioman.com/en/find-a-car/tucson-2021/highlights.php
Fetching content for: https://oman.yallamotor.com/new-cars/hyundai
Fetching content for: https://www.hyundaioman.com/en/find-a-car/sonata-2023/highlights.php
Fetching content for: https://m.facebook.com/HyundaiOmanLive/?profile_tab_item_selected=mentions
Fetching content for: https://oman.motoraty.com/m/motoraty-car-buying-guide/new-cars/hyundai
Fetching content for: https://www.facebook.com/HyundaiOmanLive/
Fetching content for: https://www.youtube.com/@hyundaioman3165
Fetching content for: https://www.drivearabia.com/carprices/oman/hyundai/
Fetching content for: https://www.dubizzle.com.om/en/vehicles/cars-for-sale/hyundai/muscat/
Fetching content for: https://www.thearabianstories.com/2024/11/07/hyundai-oman-launches-exclusive-offer-with-8-iphone-prizes-and-comprehe