In [9]:
import os
import requests
import json
import re
from datetime import datetime, timedelta

# Constants
# The API key is a secret, so it is read from the environment instead of being
# stored in the notebook (export GOOGLE_API_KEY before starting the kernel).
API_KEY = os.environ.get("GOOGLE_API_KEY", "")
CSE_ID = "2454833207fbd4f37"  # Replace with your Custom Search Engine ID
SEARCH_QUERY = "hyundai \"forum\" reviews"  # Replace with your search query
NUM_RESULTS = 100  # Total number of results needed (max 100)
SEARCH_FILE_FOLDER_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Google-Search-Results/"
PROJECT_NAME = "reputation-management"

if not API_KEY:
    print("Warning: GOOGLE_API_KEY is not set; Custom Search requests will fail without it.")


def fetch_google_results(api_key, cse_id, query, num_results, date_restrict=None):
    """Fetch up to ``num_results`` results from the Google Custom Search JSON API.

    Results are requested page by page (at most 10 per request) until enough
    have been collected, the API returns no items, a request fails, or the
    100-result ceiling is reached.

    Args:
        api_key: Google API key (sent as ``key``).
        cse_id: Custom Search Engine ID (sent as ``cx``).
        query: Search query text (sent as ``q``); percent-encoded by ``requests``.
        num_results: Total number of results wanted.
        date_restrict: Optional value for the ``dateRestrict`` parameter.

    Returns:
        A list of at most ``num_results`` dicts with the keys ``Title``,
        ``Link``, ``Description`` and ``Date``. ``Date`` is taken from the
        ``article:published_time`` / ``pubdate`` metatag when present,
        otherwise from a relative phrase in the snippet, otherwise ``"N/A"``.
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    page_limit = 10  # largest page requested per call
    max_retrievable = 100  # results beyond this index are not requested

    results = []
    start_index = 1
    while len(results) < num_results:
        # Ask only for what is still missing, and never reach past result #100.
        page_size = min(
            num_results - len(results),
            page_limit,
            max_retrievable - start_index + 1,
        )

        # A params dict lets requests percent-encode every value, so characters
        # such as '&', '#' or '+' in the query cannot corrupt the URL.
        params = {
            "key": api_key,
            "cx": cse_id,
            "q": query,
            "start": start_index,
            "num": page_size,
        }
        # Add date restriction if provided
        if date_restrict:
            params["dateRestrict"] = date_restrict

        try:
            # The timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(endpoint, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Error: request failed: {exc}")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break

        # A missing or empty "items" list both mean there is nothing left.
        items = response.json().get("items")
        if not items:
            print("No more items in response.")
            break

        for item in items:
            # Prefer the publication date from the pagemap metatags, if any.
            date = None
            for tag in item.get("pagemap", {}).get("metatags", []):
                date = tag.get("article:published_time") or tag.get("pubdate")
                if date:
                    break

            # Otherwise fall back to a relative date ("2 days ago") in the snippet.
            if not date:
                date = parse_relative_date(item.get("snippet", ""))

            results.append({
                "Title": item.get("title"),
                "Link": item.get("link"),
                "Description": item.get("snippet"),
                "Date": date or "N/A",  # Default to "N/A" if no date is found
            })

        # Advance by what was actually received so no result is skipped.
        start_index += len(items)

        # Google Custom Search JSON API allows a maximum of 100 results
        if start_index > max_retrievable:
            print("Reached maximum retrievable results (100).")
            break

    # Guard against a page that returned more items than were requested.
    return results[:num_results]


def parse_relative_date(text):
    """Turn a relative phrase such as '1 day ago' into a 'YYYY-MM-DD' string.

    The first "<number> <unit>(s) ago" phrase found in ``text`` (case
    insensitive; units: minute, hour, day, week, month, year) is subtracted
    from ``datetime.now()``. Months and years are approximated as 30 and 365
    days. Returns ``None`` when no such phrase is found, or when anything
    raises along the way (the error is printed).
    """
    pattern = r'(\d+)\s*(day|hour|minute|week|month|year)s?\s*ago'
    # One timedelta builder per unit the pattern can capture.
    delta_builders = {
        'day': lambda n: timedelta(days=n),
        'hour': lambda n: timedelta(hours=n),
        'minute': lambda n: timedelta(minutes=n),
        'week': lambda n: timedelta(weeks=n),
        'month': lambda n: timedelta(days=n * 30),  # Approximation
        'year': lambda n: timedelta(days=n * 365),  # Approximation
    }
    try:
        found = re.search(pattern, text, re.IGNORECASE)
        if not found:
            return None

        amount = int(found.group(1))
        build_delta = delta_builders.get(found.group(2).lower())
        if build_delta is None:
            return None

        moment = datetime.now() - build_delta(amount)
        return moment.strftime('%Y-%m-%d')  # Return only the date
    except Exception as e:
        print(f"Error parsing relative date: {e}")
        return None


def save_results_to_json(results, file_name, filename):
    """Write ``results`` as indented UTF-8 JSON to ``filename`` inside ``file_name``.

    Despite its name, ``file_name`` is the destination folder; it is created
    when it does not exist yet. Non-ASCII characters are written as-is rather
    than escaped. The final path is printed once the file has been written.
    """
    target_dir = file_name
    os.makedirs(target_dir, exist_ok=True)  # no error if the folder is already there

    destination = os.path.join(target_dir, filename)
    with open(destination, mode='w', encoding='utf-8') as handle:
        json.dump(results, handle, indent=4, ensure_ascii=False)

    print(f"Results saved to {destination}")


# Run the search; 'd2' is passed through as the API's dateRestrict value
DATE_RESTRICT = 'd2'
search_results = fetch_google_results(
    API_KEY,
    CSE_ID,
    SEARCH_QUERY,
    NUM_RESULTS,
    DATE_RESTRICT,
)

# Derive the output file name from the query, with spaces turned into dashes
file_name = SEARCH_QUERY.replace(' ', '-')
output_filename = f"{PROJECT_NAME}-gsr-{file_name}.json"

# Only write the JSON file when the search returned something
if not search_results:
    print("No results to save.")
else:
    save_results_to_json(search_results, SEARCH_FILE_FOLDER_PATH, output_filename)


Reached maximum retrievable results (100).
Results saved to /home/madhavbpanicker/Documents/Scrape_project/Google-Search-Results/reputation-management-gsr-hyundai-"forum"-reviews.json


In [None]:
import pandas as pd
import requests
import time
from datetime import datetime

# ScraperAPI endpoint and API key
# HTTPS is used because the API key travels as a query parameter and must not
# be sent in clear text.
SCRAPER_API_URL = "https://api.scraperapi.com"
# The key is a secret, so it is read from the environment instead of being
# stored in the notebook (export SCRAPERAPI_KEY before starting the kernel).
# NOTE: this rebinds API_KEY, which the search cell also defines for the Google
# key; re-run the search cell's constants before searching again.
API_KEY = os.environ.get("SCRAPERAPI_KEY", "")
if not API_KEY:
    print("Warning: SCRAPERAPI_KEY is not set; ScraperAPI requests will fail without it.")
#Folder Path and Project Name Definition
RAW_DATA_FOLDER_PATH="/home/madhavbpanicker/Documents/Scrape_project/Website-Data-Raw/"
PROJECT_NAME="reputation-management"
SEARCH_FILE_FOLDER_PATH="/home/madhavbpanicker/Documents/Scrape_project/Google-Search-Results/"

# Function to fetch page content using ScraperAPI
def fetch_content(url):
    """Download ``url`` through ScraperAPI and return the response body.

    The request goes to ``SCRAPER_API_URL`` with ``API_KEY`` and the target
    ``url`` as query parameters and a 30-second timeout. Returns the response
    text on HTTP 200. Any other status code, or any exception raised along
    the way, is reported with ``print`` and ``None`` is returned instead.
    """
    try:
        query = {"api_key": API_KEY, "url": url}
        reply = requests.get(SCRAPER_API_URL, params=query, timeout=30)

        # Anything but 200 counts as a failed fetch.
        if reply.status_code != 200:
            print(f"Failed to fetch {url}: {reply.status_code}")
            return None

        return reply.text
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Read the JSON file of search results written by the search cell
input_file = f"{SEARCH_FILE_FOLDER_PATH}{output_filename}"
# convert_dates/dtype are switched off so every value stays exactly as stored.
# With the defaults, read_json may turn the "Date" column into Timestamps,
# which the json.dump call at the end cannot serialize.
df = pd.read_json(input_file, orient='records', convert_dates=False, dtype=False)

# Build the output path and make sure its folder exists *before* scraping, so
# a bad path fails immediately instead of after every page has been fetched.
output_file = f"{RAW_DATA_FOLDER_PATH}{PROJECT_NAME}-scraped_results-{SEARCH_QUERY.replace(' ','-')}.json"
os.makedirs(RAW_DATA_FOLDER_PATH, exist_ok=True)

# Add columns for page content and fetch date
df['page_content'] = ""
df['fetch_date'] = ""

# Loop through each URL and fetch content
for index, row in df.iterrows():
    url = row['Link']
    print(f"Fetching content for: {url}")

    # Fetch content (None when the fetch failed) and record today's date
    content = fetch_content(url)
    fetch_date = datetime.now().strftime('%Y-%m-%d')  # Get current date in YYYY-MM-DD format

    # Update DataFrame
    df.at[index, 'page_content'] = content
    df.at[index, 'fetch_date'] = fetch_date

    time.sleep(2)  # pause between requests

# Convert DataFrame to a list of dictionaries (records) for JSON serialization
df_records = df.to_dict(orient='records')

# Save the search results plus page content to a new JSON file
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(df_records, f, ensure_ascii=False, indent=4)
print(f"Scraping completed. Results saved to {output_file}")

Fetching content for: https://www.hyundai-forums.com/threads/car-and-driver-santa-fe-hybrid-review.720456/
Fetching content for: https://www.santacruzforums.com/threads/you-can-carry-4x8-materials.15898/latest
Fetching content for: https://www.hyundaiperformance.com/threads/06-sonata-getting-access-to-ac-mounting-fastener.127070/latest
Fetching content for: https://www.tucson-forum.com/threads/factory-washer-fluid.3958/latest
Fetching content for: https://www.palisadeforum.com/threads/camping-out-in-my-2023-hyundai-palisade-xrt.8373/
Fetching content for: https://www.tucson-forum.com/threads/windshield-phenomenon.3957/latest
Fetching content for: https://www.palisadeforum.com/threads/help-i-am-stranded-and-60-miles-from-dealer.8375/
Fetching content for: https://www.newtiburon.com/threads/high-rise-spoiler-needs-paint-tlc.484361/
Fetching content for: https://www.ioniqforum.com/threads/2025-ioniq-5-can-supercharge-more-quickly-than-earlier-models.51737/latest
Fetching content for: http