## G2 Review Scraper

### Before running: make sure the saved HTML files are in `datas/<platform>/`, set the `platform` variable to match, then run the code

In [35]:
import os
import csv
from bs4 import BeautifulSoup

# Accumulator for the scraped reviews: one dict per review is appended here
# by the parsing loop, and the whole list is written out as CSV at the end.
dataset: list = []

def map_rating(stars_class):
    """Translate a ``stars-N`` CSS class name into a star rating.

    ``N`` counts half stars, so ``"stars-0"`` maps to 0 and ``"stars-10"``
    maps to 5. Only the eleven exact names ``stars-0`` .. ``stars-10`` are
    recognised; any other value yields ``None``.
    """
    # Position in this tuple == number of half stars in the class name.
    half_star_scale = (0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)
    lookup = {
        f"stars-{half_stars}": value
        for half_stars, value in enumerate(half_star_scale)
    }
    return lookup.get(stars_class)

# Iterate over each HTML file in the "datas" folder
platform = "ZipRecruiter"
folder_path = f"datas/{platform}/"


def _extract_review(review_container):
    """Build one dataset row from a single review container element.

    Every field falls back to "" when the corresponding markup is missing,
    so one malformed review card cannot abort the whole scrape.

    Args:
        review_container: BeautifulSoup tag wrapping one review.

    Returns:
        dict with keys 'review_time', 'positive_review', 'negative_review',
        'problems_solved' and 'rating'.
    """
    # The date text sits in a <time> tag nested in the date span; either
    # level may be absent, so check both before reading the text.
    date_span = review_container.find('span', class_='x-current-review-date')
    time_tag = date_span.time if date_span is not None else None
    review_time = time_tag.get_text(strip=True) if time_tag is not None else ""

    # All free-text answers share one class, so query once and pick by
    # position (presumably "like best" / "dislike" / "problems solved" --
    # verify against the page markup).
    paragraphs = review_container.find_all('p', class_='formatted-text')
    positive_review = paragraphs[0].get_text(strip=True) if paragraphs else ""
    negative_review = paragraphs[1].get_text(strip=True) if len(paragraphs) > 1 else ""
    # Only take the last paragraph when it is a distinct third-or-later one;
    # otherwise it would just repeat the positive/negative text.
    problems_solved = paragraphs[-1].get_text(strip=True) if len(paragraphs) > 2 else ""

    # The rating is encoded in the last CSS class of the ".stars" element
    # (e.g. "stars-9"); a card without that element gets an empty rating.
    stars_element = review_container.select_one('.stars')
    stars_classes = stars_element.get('class') if stars_element is not None else None
    rating = map_rating(stars_classes[-1]) if stars_classes else None

    return {
        'review_time': review_time,
        'positive_review': positive_review,
        'negative_review': negative_review,
        'problems_solved': problems_solved,
        # Unknown star classes map to None; store "" so the CSV cell is blank
        'rating': rating if rating is not None else "",
    }


# Sorted so the row order of the output is reproducible between runs
# (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(folder_path)):
    # Skip anything that is not an HTML file
    if not filename.endswith(".html"):
        continue

    # Load the HTML file
    file_path = os.path.join(folder_path, filename)
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all review containers. Passing the full multi-class string makes
    # BeautifulSoup match the class attribute value exactly as written.
    review_containers = soup.find_all('div', class_='paper paper--white paper--box mb-2 position-relative border-bottom')

    # Add one row per review container to the dataset
    for review_container in review_containers:
        review_data = _extract_review(review_container)
        dataset.append(review_data)

# Write every collected review to output/<platform>.csv
output_folder = "output"
fieldnames = ['review_time', 'positive_review', 'negative_review', 'problems_solved', 'rating']
output_filename = os.path.join(output_folder, f'{platform}.csv')

# Create the destination folder if needed; a no-op when it already exists
os.makedirs(output_folder, exist_ok=True)

# newline='' lets the csv module control line endings itself
with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # One CSV row per review dict, in the order they were scraped
    for row in dataset:
        writer.writerow(row)

print("Dataset saved as", output_filename)

Dataset saved as output/ZipRecruiter.csv
