In [1]:
import os
import requests
import json
import re
from datetime import datetime, timedelta
import pandas as pd
import time
import boto3
from io import StringIO

# Constants
# NOTE(review): every credential below is blank in this source (presumably
# redacted before sharing) and has to be filled in before the cell can work.
GOOGLE_API_KEY = ""  # Replace with your Google API key
CSE_ID = ""  # Replace with your Custom Search Engine ID
NUM_RESULTS = 100  # Total number of results needed (max 100)
# Folder that receives one Google-search-results JSON file per company.
SEARCH_FILE_FOLDER_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Google-Search-Results/"
# Folder that receives the scraped page content, one JSON file per company.
RAW_DATA_FOLDER_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Website-Data-Raw/"
# Prefix used in every output file name.
PROJECT_NAME = "reputation-management"
# NOTE(review): plain http, so the api_key query parameter sent by
# fetch_content travels unencrypted - consider an https endpoint (confirm the
# service supports it).
SCRAPER_API_URL = "http://api.scraperapi.com"
SCRAPER_API_KEY = ""
S3_BUCKET_NAME = ""
DATE_RESTRICT = "d2"  # Sent as dateRestrict so only results from the past two days are shown
# One search ("<company> reviews") is run for each entry.
CAR_COMPANIES = [
    "Honda", "Nissan","Subaru", "Mazda", "Chevrolet", "Buick", "Volkswagen"
]

# Function to fetch results from Google Custom Search API
def fetch_google_results(api_key, cse_id, query, num_results, date_restrict=None):
    """Collect up to ``num_results`` hits from the Google Custom Search JSON API.

    Results are requested page by page (at most 10 per request) until enough
    have been gathered, the API returns no ``items``, a request fails, or the
    next start index would pass 100.

    Args:
        api_key: Google API key, sent as the ``key`` parameter.
        cse_id: Custom Search Engine ID, sent as the ``cx`` parameter.
        query: Search text, sent as the ``q`` parameter.
        num_results: Maximum number of results to return.
        date_restrict: Optional value for the ``dateRestrict`` parameter.

    Returns:
        A list of dicts with the keys "Title", "Link", "Description" and
        "Date". "Date" comes from the ``article:published_time`` or
        ``pubdate`` metatag when present, otherwise from a relative phrase in
        the snippet, otherwise it is "N/A".
    """
    endpoint = "https://www.googleapis.com/customsearch/v1"
    results = []
    start_index = 1
    while len(results) < num_results:
        page_size = min(num_results - len(results), 10)

        # Let requests build the query string so that spaces, '&', '#', etc.
        # in the query are percent-encoded instead of corrupting the URL.
        params = {
            "key": api_key,
            "cx": cse_id,
            "q": query,
            "start": start_index,
            # Ask only for what is still missing; without this every page
            # returns the API's default count and the total can overshoot.
            "num": page_size,
        }
        if date_restrict:
            params["dateRestrict"] = date_restrict

        try:
            response = requests.get(endpoint, params=params, timeout=30)
        except requests.RequestException as exc:
            # Only the exception type is printed: the exception text may
            # contain the request URL, which carries the API key.
            print(f"Error: request failed ({type(exc).__name__})")
            break

        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break

        data = response.json()
        if "items" not in data:
            print("No more items in response.")
            break

        for item in data["items"][:page_size]:
            date = None
            # Use the first metatag entry that carries a publication date.
            for tag in item.get("pagemap", {}).get("metatags", []):
                date = tag.get("article:published_time") or tag.get("pubdate")
                if date:
                    break

            # Fall back to phrases such as "3 days ago" in the snippet.
            if not date:
                date = parse_relative_date(item.get("snippet", ""))

            results.append({
                "Title": item.get("title"),
                "Link": item.get("link"),
                "Description": item.get("snippet"),
                "Date": date or "N/A",
            })

        start_index += page_size

        if start_index > 100:
            print("Reached maximum retrievable results (100).")
            break

    return results

# Function to parse relative dates
def parse_relative_date(text):
    """Turn a phrase like "3 days ago" inside *text* into a date string.

    The first "<number> <unit>(s) ago" phrase found (case-insensitive; units
    day, hour, minute, week, month, year) is subtracted from the current
    local time. A month counts as 30 days and a year as 365 days.

    Returns:
        The resulting date formatted as YYYY-MM-DD, or None when no such
        phrase is present or when anything goes wrong (the error is printed).
    """
    # Units timedelta has no keyword for, expressed as a number of days.
    approximate_days = {'month': 30, 'year': 365}
    try:
        found = re.search(r'(\d+)\s*(day|hour|minute|week|month|year)s?\s*ago', text, re.IGNORECASE)
        if found is None:
            return None

        amount = int(found.group(1))
        unit_name = found.group(2).lower()

        if unit_name in approximate_days:
            offset = timedelta(days=amount * approximate_days[unit_name])
        else:
            # day/hour/minute/week map directly onto timedelta's plural keywords.
            offset = timedelta(**{f"{unit_name}s": amount})

        return (datetime.now() - offset).strftime('%Y-%m-%d')
    except Exception as e:
        print(f"Error parsing relative date: {e}")
        return None

# Function to save results to JSON
def save_results_to_json(results, folder_path, filename):
    """Write *results* as indented UTF-8 JSON to ``folder_path/filename``.

    The folder is created when it does not exist yet. Non-ASCII characters
    are written as-is rather than escaped. The destination path is printed
    once the file has been written.
    """
    os.makedirs(folder_path, exist_ok=True)
    destination = os.path.join(folder_path, filename)
    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(results, handle, ensure_ascii=False, indent=4)
    print(f"Results saved to {destination}")

# Function to fetch page content
def fetch_content(url):
    """Download *url* through the scraper API and return the response text.

    The request goes to SCRAPER_API_URL with SCRAPER_API_KEY and the target
    URL as query parameters, using a 30 second timeout.

    Returns:
        The response body on HTTP 200; None on any other status code or when
        an exception occurs (the reason is printed in both cases).
    """
    try:
        query = {"api_key": SCRAPER_API_KEY, "url": url}
        reply = requests.get(SCRAPER_API_URL, params=query, timeout=30)
        if reply.status_code != 200:
            print(f"Failed to fetch {url}: {reply.status_code}")
            return None
        return reply.text
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to upload file to S3
def upload_to_s3(file_path, bucket_name, s3_folder):
    """Upload the file at *file_path* to ``s3://bucket_name/s3_folder/<file name>``.

    Upload errors are caught and printed; nothing is returned.

    NOTE(review): the access key and secret are empty strings in this source,
    presumably redacted - confirm how credentials are meant to be supplied.
    """
    client_options = {
        'aws_access_key_id': '',
        'aws_secret_access_key': '',
        'region_name': 'ap-south-1',
    }
    s3 = boto3.client('s3', **client_options)
    try:
        # The object keeps the local file's base name inside the target folder.
        base_name = os.path.basename(file_path)
        s3_file_key = f"{s3_folder}/{base_name}"
        s3.upload_file(file_path, bucket_name, s3_file_key)
        print(f"File uploaded to S3: s3://{bucket_name}/{s3_file_key}")
    except Exception as e:
        print(f"Error uploading to S3: {e}")

# Main workflow
# For every company: run the Google search, save the hits, scrape each hit
# through the scraper API, save the enriched records and upload them to S3.

# The scraped-results folder is written to with a plain open(); make sure it
# exists so the first run on a fresh machine does not fail.
os.makedirs(RAW_DATA_FOLDER_PATH, exist_ok=True)

for company in CAR_COMPANIES:
    search_query = f"{company} reviews"
    print(f"Fetching results for: {search_query}")

    # Fetch Google Search results
    search_results = fetch_google_results(GOOGLE_API_KEY, CSE_ID, search_query, NUM_RESULTS, DATE_RESTRICT)

    if not search_results:
        print(f"No results for {company}.")
        continue

    # Save search results to JSON
    query_slug = search_query.replace(' ', '-')
    file_name = f"{PROJECT_NAME}-gsr-{query_slug}.json"
    save_results_to_json(search_results, SEARCH_FILE_FOLDER_PATH, file_name)

    # Enrich the in-memory results directly instead of re-reading the file
    # through pandas: a DataFrame round trip applies type inference (missing
    # values become NaN, and date-like columns may be parsed into timestamps
    # that json.dump cannot serialise).
    scraped_records = []
    for result in search_results:
        url = result['Link']
        print(f"Fetching content for: {url}")
        record = dict(result)
        record['page_content'] = fetch_content(url)  # None when the fetch failed
        record['fetch_date'] = datetime.now().strftime('%Y-%m-%d')
        scraped_records.append(record)
        time.sleep(2)  # pause between requests

    # Save updated results to JSON
    output_file = os.path.join(RAW_DATA_FOLDER_PATH, f"{PROJECT_NAME}-scraped_results-{query_slug}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(scraped_records, f, ensure_ascii=False, indent=4)
    print(f"Scraping completed for {company}. Results saved to {output_file}")

    # Upload to S3
    upload_to_s3(output_file, S3_BUCKET_NAME, "Website-Data-Raw")

print("All companies processed.")

Fetching results for: Honda reviews
Reached maximum retrievable results (100).
Results saved to /home/madhavbpanicker/Documents/Scrape_project/Google-Search-Results/reputation-management-gsr-Honda-reviews.json
Fetching content for: https://honda-tech.com/
Fetching content for: https://www.youtube.com/channel/UCX13wwdzipRhH001W3X0WyQ
Fetching content for: https://www.suburbanhonda.com/customer-reviews.htm
Fetching content for: https://www.dealerrater.com/dealer/Valley-Honda-review-14326/
Fetching content for: https://www.dchacademyhonda.com/our-reviews
Fetching content for: https://www.dealerrater.com/dealer/Tempe-Honda-review-15494/
Fetching content for: https://www.valleyhonda.com/about-us/customer-testimonials/
Fetching content for: https://www.dealerrater.com/dealer/Millennium-Honda-review-15415/
Fetching content for: https://www.hondacityli.com/about-us/customer-testimonials/
Error fetching https://www.hondacityli.com/about-us/customer-testimonials/: HTTPConnectionPool(host='api.sc

KeyboardInterrupt: 

In [5]:
import os
import json
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup

# Constants
# Input: the scraped-results JSON files; output: one sentence-level JSON file
# per input file.
RAW_DATA_FOLDER_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Website-Data-Raw/"
PROCESSED_OUTPUT_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Website-Data-Trimmed/"

# Ensure the output folder exists (no error if it is already there)
os.makedirs(PROCESSED_OUTPUT_PATH, exist_ok=True)

# Function to clean HTML content
def clean_html(content):
    """Return the text content of an HTML string.

    <script> and <style> elements are removed first; the remaining text
    fragments are stripped and joined with single spaces.
    """
    parsed = BeautifulSoup(content, "html.parser")
    # Script and style bodies are code, not page text, so drop them entirely.
    for unwanted in parsed.find_all(["script", "style"]):
        unwanted.decompose()
    return parsed.get_text(" ", strip=True)

# Function to process JSON files
def process_json_files(folder_path, output_path):
    """Split the page content of every JSON file in *folder_path* into sentences.

    Each ``.json`` file is expected to hold a list of records. Records without
    ``page_content`` are skipped; for the others the HTML is cleaned, split
    with ``sent_tokenize`` and one output entry is produced per sentence,
    carrying the record's Title, Link, Description, Date and fetch_date
    ("N/A" when a field is absent). The entries are written to *output_path*
    under the input file name with ".json" replaced by "_processed.json".
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".json"):
            continue

        print(f"Processing file: {file_name}")

        # Load the JSON file
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as source:
            records = json.load(source)

        processed_entries = []
        for record in records:
            raw_html = record.get("page_content")
            # Nothing to split when the page could not be fetched.
            if not raw_html:
                continue

            # Metadata repeated on every sentence of this record.
            shared_fields = {
                "Title": record.get("Title", "N/A"),
                "Link": record.get("Link", "N/A"),
                "Description": record.get("Description", "N/A"),
                "Date": record.get("Date", "N/A"),
                "fetch_date": record.get("fetch_date", "N/A"),
            }
            for sentence in sent_tokenize(clean_html(raw_html)):
                processed_entries.append({**shared_fields, "Sentence": sentence})

        # Save the processed entries to a new JSON file
        output_name = file_name.replace(".json", "_processed.json")
        output_file = os.path.join(output_path, output_name)
        with open(output_file, 'w', encoding='utf-8') as out_f:
            json.dump(processed_entries, out_f, ensure_ascii=False, indent=4)
        print(f"Processed data saved to: {output_file}")

# Run the sentence-splitting pass over every raw scraped-results file
process_json_files(RAW_DATA_FOLDER_PATH, PROCESSED_OUTPUT_PATH)

print("All files processed.")


Processing file: reputation-management-scraped_results-Kia-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Website-Data-Trimmed/reputation-management-scraped_results-Kia-reviews_processed.json
Processing file: reputation-management-scraped_results-Toyota-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Website-Data-Trimmed/reputation-management-scraped_results-Toyota-reviews_processed.json
Processing file: reputation-management-scraped_results-Hyundai-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Website-Data-Trimmed/reputation-management-scraped_results-Hyundai-reviews_processed.json
All files processed.


In [1]:
import os
import json
from bs4 import BeautifulSoup

# Load SpaCy model for sentence detection
# An OSError from spacy.load is treated as "model not installed": the model is
# downloaded once and the load is retried.
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model 'en_core_web_sm' not found. Downloading now...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Constants
# Input: the scraped-results JSON files; output: one sentence-level JSON file
# per input file.
RAW_DATA_FOLDER_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Website-Data-Raw/"
PROCESSED_OUTPUT_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Processed-Data/"

# Ensure the output folder exists (no error if it is already there)
os.makedirs(PROCESSED_OUTPUT_PATH, exist_ok=True)

# Function to clean HTML content
def clean_html(content):
    """Return the text content of an HTML string.

    <script> and <style> elements are removed first; the remaining text
    fragments are stripped and joined with single spaces.
    """
    parsed = BeautifulSoup(content, "html.parser")
    # Script and style bodies are code, not page text, so drop them entirely.
    for unwanted in parsed.find_all(["script", "style"]):
        unwanted.decompose()
    return parsed.get_text(" ", strip=True)

# Function to split text into sentences using SpaCy
def extract_sentences(text):
    """Run *text* through the loaded SpaCy pipeline and return its sentences.

    Returns:
        A list with the stripped text of every sentence SpaCy detects, in
        document order.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text.strip())
    return sentences

# Function to process JSON files
def process_json_files(folder_path, output_path):
    """Split the page content of every JSON file in *folder_path* into sentences.

    Each ``.json`` file is expected to hold a list of records. Records without
    ``page_content`` are skipped; for the others the HTML is cleaned, split
    with ``extract_sentences`` and one output entry is produced per sentence,
    carrying the record's Title, Link, Description, Date and fetch_date
    ("N/A" when a field is absent). The entries are written to *output_path*
    under the input file name with ".json" replaced by "_processed.json".
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".json"):
            continue

        print(f"Processing file: {file_name}")

        # Load the JSON file
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as source:
            records = json.load(source)

        processed_entries = []
        for record in records:
            raw_html = record.get("page_content")
            # Nothing to split when the page could not be fetched.
            if not raw_html:
                continue

            # Metadata repeated on every sentence of this record.
            shared_fields = {
                "Title": record.get("Title", "N/A"),
                "Link": record.get("Link", "N/A"),
                "Description": record.get("Description", "N/A"),
                "Date": record.get("Date", "N/A"),
                "fetch_date": record.get("fetch_date", "N/A"),
            }
            for sentence in extract_sentences(clean_html(raw_html)):
                processed_entries.append({**shared_fields, "Sentence": sentence})

        # Save the processed entries to a new JSON file
        output_name = file_name.replace(".json", "_processed.json")
        output_file = os.path.join(output_path, output_name)
        with open(output_file, 'w', encoding='utf-8') as out_f:
            json.dump(processed_entries, out_f, ensure_ascii=False, indent=4)
        print(f"Processed data saved to: {output_file}")

# Run the SpaCy sentence-splitting pass over every raw scraped-results file
process_json_files(RAW_DATA_FOLDER_PATH, PROCESSED_OUTPUT_PATH)

print("All files processed.")


Processing file: reputation-management-scraped_results-Kia-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data/reputation-management-scraped_results-Kia-reviews_processed.json
Processing file: reputation-management-scraped_results-Toyota-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data/reputation-management-scraped_results-Toyota-reviews_processed.json
Processing file: reputation-management-scraped_results-Hyundai-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data/reputation-management-scraped_results-Hyundai-reviews_processed.json
All files processed.


In [2]:
import os
import json
from bs4 import BeautifulSoup

# Load the SpaCy model (downloaded on first use if it is not installed).
# NOTE(review): none of the functions in this cell reference `nlp`; paragraphs
# are split with plain string operations, so this load looks unnecessary here
# - confirm nothing else relies on it before removing.
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model 'en_core_web_sm' not found. Downloading now...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Constants
# Input: the scraped-results JSON files; output: one paragraph-level JSON file
# per input file.
RAW_DATA_FOLDER_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Website-Data-Raw/"
PROCESSED_OUTPUT_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para/"

# Ensure the output folder exists (no error if it is already there)
os.makedirs(PROCESSED_OUTPUT_PATH, exist_ok=True)

# Function to clean HTML content
def clean_html(content):
    """Return the text of an HTML string with one text fragment per line.

    <script> and <style> elements are removed. Every remaining text fragment
    is stripped, has its internal whitespace collapsed to single spaces, and
    is placed on its own line.

    The fragments used to be joined with spaces, which erased every element
    boundary: the newline split performed afterwards in this cell then only
    found line breaks that happened to sit inside a text node, so a page came
    out as one blob (or was cut at arbitrary source line wraps). Emitting one
    fragment per line gives that split real boundaries to work with.

    NOTE(review): inline elements (<a>, <b>, ...) also start a new fragment,
    so the pieces can be finer than visual paragraphs.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Remove script and style elements
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()
    # stripped_strings yields each non-empty text fragment already stripped;
    # split()/join removes newlines and runs of spaces inside a fragment.
    fragments = (" ".join(fragment.split()) for fragment in soup.stripped_strings)
    return "\n".join(fragments)

# Function to split text into paragraphs
def extract_paragraphs(text):
    """Split *text* on newlines and return the non-blank lines, stripped.

    Lines that are empty or contain only whitespace are dropped.
    """
    paragraphs = []
    for line in text.split("\n"):
        trimmed = line.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs

# Function to process JSON files
def process_json_files(folder_path, output_path):
    """Split the page content of every JSON file in *folder_path* into paragraphs.

    Each ``.json`` file is expected to hold a list of records. Records without
    ``page_content`` are skipped; for the others the HTML is cleaned, split
    with ``extract_paragraphs`` and one output entry is produced per
    paragraph, carrying the record's Title, Link, Description, Date and
    fetch_date ("N/A" when a field is absent). The entries are written to
    *output_path* under the input file name with ".json" replaced by
    "_processed.json".
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".json"):
            continue

        print(f"Processing file: {file_name}")

        # Load the JSON file
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as source:
            records = json.load(source)

        processed_entries = []
        for record in records:
            raw_html = record.get("page_content")
            # Nothing to split when the page could not be fetched.
            if not raw_html:
                continue

            # Metadata repeated on every paragraph of this record.
            shared_fields = {
                "Title": record.get("Title", "N/A"),
                "Link": record.get("Link", "N/A"),
                "Description": record.get("Description", "N/A"),
                "Date": record.get("Date", "N/A"),
                "fetch_date": record.get("fetch_date", "N/A"),
            }
            for paragraph in extract_paragraphs(clean_html(raw_html)):
                processed_entries.append({**shared_fields, "Paragraph": paragraph})

        # Save the processed entries to a new JSON file
        output_name = file_name.replace(".json", "_processed.json")
        output_file = os.path.join(output_path, output_name)
        with open(output_file, 'w', encoding='utf-8') as out_f:
            json.dump(processed_entries, out_f, ensure_ascii=False, indent=4)
        print(f"Processed data saved to: {output_file}")

# Run the paragraph-splitting pass over every raw scraped-results file
process_json_files(RAW_DATA_FOLDER_PATH, PROCESSED_OUTPUT_PATH)

print("All files processed.")


Processing file: reputation-management-scraped_results-Kia-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para/reputation-management-scraped_results-Kia-reviews_processed.json
Processing file: reputation-management-scraped_results-Toyota-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para/reputation-management-scraped_results-Toyota-reviews_processed.json
Processing file: reputation-management-scraped_results-Hyundai-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para/reputation-management-scraped_results-Hyundai-reviews_processed.json
All files processed.


In [3]:
import os
import json
from bs4 import BeautifulSoup

# Load the SpaCy model (downloaded on first use if it is not installed).
# NOTE(review): none of the functions in this cell reference `nlp`; paragraphs
# are split and filtered with plain string operations, so this load looks
# unnecessary here - confirm nothing else relies on it before removing.
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("SpaCy model 'en_core_web_sm' not found. Downloading now...")
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Constants
# Input: the scraped-results JSON files; output: one JSON file per input file
# holding only the paragraphs that pass the relevance filter.
RAW_DATA_FOLDER_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Website-Data-Raw/"
PROCESSED_OUTPUT_PATH = "/home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para-Relevance/"

# Ensure the output folder exists (no error if it is already there)
os.makedirs(PROCESSED_OUTPUT_PATH, exist_ok=True)

# Function to clean HTML content
def clean_html(content):
    """Return the text of an HTML string with one text fragment per line.

    <script> and <style> elements are removed. Every remaining text fragment
    is stripped, has its internal whitespace collapsed to single spaces, and
    is placed on its own line.

    The fragments used to be joined with spaces, which erased every element
    boundary: the newline split performed afterwards in this cell then only
    found line breaks that happened to sit inside a text node, so a page came
    out as one blob and the keyword filter accepted or rejected whole pages
    rather than individual passages. Emitting one fragment per line gives
    the split real boundaries to work with.

    NOTE(review): inline elements (<a>, <b>, ...) also start a new fragment,
    so the pieces can be finer than visual paragraphs.
    """
    soup = BeautifulSoup(content, "html.parser")
    # Remove script and style elements
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()
    # stripped_strings yields each non-empty text fragment already stripped;
    # split()/join removes newlines and runs of spaces inside a fragment.
    fragments = (" ".join(fragment.split()) for fragment in soup.stripped_strings)
    return "\n".join(fragments)

# Function to split text into paragraphs
def extract_paragraphs(text):
    """Split *text* on newlines and return the non-blank lines, stripped.

    Lines that are empty or contain only whitespace are dropped.
    """
    paragraphs = []
    for line in text.split("\n"):
        trimmed = line.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs

# Function to filter relevant paragraphs
def is_relevant_paragraph(paragraph):
    """Report whether *paragraph* contains any customer-related keyword.

    Matching is a case-insensitive substring test, so a keyword also matches
    inside longer words (for example "help" inside "helpful").

    Returns:
        True if at least one keyword occurs in the paragraph, otherwise False.
    """
    # Keywords indicating customer-related content
    customer_keywords = (
        "I think", "I feel", "I believe", "experience", "review", "feedback", "service",
        "support", "help", "recommend", "suggest", "advise", "share",
    )
    lowered_paragraph = paragraph.lower()
    for keyword in customer_keywords:
        if keyword.lower() in lowered_paragraph:
            return True
    return False

# Function to process JSON files
def process_json_files(folder_path, output_path):
    """Extract the relevant paragraphs from every JSON file in *folder_path*.

    Each ``.json`` file is expected to hold a list of records. Records without
    ``page_content`` are skipped; for the others the HTML is cleaned and split
    with ``extract_paragraphs``, and one output entry is produced for every
    paragraph accepted by ``is_relevant_paragraph``, carrying the record's
    Title, Link, Description, Date and fetch_date ("N/A" when a field is
    absent). The entries are written to *output_path* under the input file
    name with ".json" replaced by "_processed.json".
    """
    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".json"):
            continue

        print(f"Processing file: {file_name}")

        # Load the JSON file
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as source:
            records = json.load(source)

        processed_entries = []
        for record in records:
            raw_html = record.get("page_content")
            # Nothing to split when the page could not be fetched.
            if not raw_html:
                continue

            # Metadata repeated on every kept paragraph of this record.
            shared_fields = {
                "Title": record.get("Title", "N/A"),
                "Link": record.get("Link", "N/A"),
                "Description": record.get("Description", "N/A"),
                "Date": record.get("Date", "N/A"),
                "fetch_date": record.get("fetch_date", "N/A"),
            }
            # Keep only the paragraphs that pass the keyword filter.
            processed_entries.extend(
                {**shared_fields, "Paragraph": paragraph}
                for paragraph in extract_paragraphs(clean_html(raw_html))
                if is_relevant_paragraph(paragraph)
            )

        # Save the processed entries to a new JSON file
        output_name = file_name.replace(".json", "_processed.json")
        output_file = os.path.join(output_path, output_name)
        with open(output_file, 'w', encoding='utf-8') as out_f:
            json.dump(processed_entries, out_f, ensure_ascii=False, indent=4)
        print(f"Processed data saved to: {output_file}")

# Run the relevance-filtered paragraph pass over every raw scraped-results file
process_json_files(RAW_DATA_FOLDER_PATH, PROCESSED_OUTPUT_PATH)

print("All files processed.")


Processing file: reputation-management-scraped_results-Kia-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para-Relevance/reputation-management-scraped_results-Kia-reviews_processed.json
Processing file: reputation-management-scraped_results-Toyota-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para-Relevance/reputation-management-scraped_results-Toyota-reviews_processed.json
Processing file: reputation-management-scraped_results-Hyundai-reviews.json
Processed data saved to: /home/madhavbpanicker/Documents/Scrape_project/Processed-Data-Para-Relevance/reputation-management-scraped_results-Hyundai-reviews_processed.json
All files processed.
