In [3]:
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from bs4 import BeautifulSoup
import time
import random
from fake_useragent import UserAgent
import numpy as np  # For NaN values

# Checkpoint file: load_progress() reads a page number from it and
# save_progress() overwrites it with one.
progress_file = "scraping_progress.txt"
# CSV file that save_data() writes the scraped rows to.
output_file = "popular_on_ambitionbox.csv"

# Function to load the last progress
def load_progress():
    """Return the page number stored in ``progress_file``.

    Returns:
        The stored page number as an ``int`` (never less than 1), or 1 when
        the file is missing, empty, or does not contain a valid integer.
    """
    # EAFP: open directly instead of exists()+open(), which can race with
    # the file being removed in between.
    try:
        with open(progress_file, "r") as f:
            saved = f.read().strip()
    except FileNotFoundError:
        return 1  # Default start page if no progress is saved

    try:
        page = int(saved)
    except ValueError:
        # An interrupted write can leave the file empty or truncated;
        # restart from the first page instead of crashing on int("").
        print(f"Ignoring unreadable progress file {progress_file!r}; starting from page 1.")
        return 1

    # Page numbers start at 1; guard against a stray 0 or negative value.
    return max(page, 1)

# Function to save progress
def save_progress(page):
    """Overwrite ``progress_file`` with the text form of ``page``."""
    with open(progress_file, mode="w") as checkpoint:
        print(page, end="", file=checkpoint)

# Function to save data periodically
def save_data(df):
    """Append the rows of ``df`` to ``output_file``.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls build up a single CSV. Appending avoids
    re-reading and rewriting the whole file on every call, and keeps rows
    already on disk untouched (no dtype re-inference by ``pd.read_csv``).

    Args:
        df: DataFrame holding the new rows. Its columns are assumed to be in
            the same order as the header already in the file, since rows are
            appended positionally.
    """
    needs_header = (
        not os.path.exists(output_file) or os.path.getsize(output_file) == 0
    )
    df.to_csv(output_file, mode="a", header=needs_header, index=False)

# Browser configuration: one UserAgent source and one ChromeOptions object.
# The random user-agent string is picked once, here, when the tuple is built.
ua = UserAgent()
chrome_options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--headless",
    "--disable-gpu",
    "--no-sandbox",
    "--disable-blink-features=AutomationControlled",
    f"user-agent={ua.random}",
):
    chrome_options.add_argument(_chrome_flag)

# Initialize driver
def get_driver():
    """Build and return a Chrome WebDriver configured with ``chrome_options``."""
    driver_binary = ChromeDriverManager().install()
    chrome_service = Service(driver_binary)
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

driver = get_driver()

# The progress file holds the next page to scrape (1 when nothing is saved).
start_page = load_progress()
end_page = 500  # Define the total number of pages to scrape

# Per-column buffers for companies scraped since the last flush to disk.
names, ratings, rating_counts, company_info, highly_rated_for, critically_rated_for = [], [], [], [], [], []

# Same list objects, in the order _parse_card() returns its values. The
# buffers are emptied with .clear() (never rebound) so this tuple stays valid.
_BUFFERS = (names, ratings, rating_counts, company_info, highly_rated_for, critically_rated_for)


def _text_or_nan(tag):
    """Return the stripped text of a parsed tag, or NaN when the tag is missing."""
    return tag.text.strip() if tag else np.nan


def _rated_for(card, header_class):
    """Return the comma-joined rating values listed under ``header_class``.

    Looks in every rating-comparison block of ``card`` for a header span with
    the given class and collects the text of the rating-values span found
    after it. Headers without a value span are skipped, so the join only ever
    sees strings. Returns NaN when nothing was collected.
    """
    values = []
    for block in card.find_all('div', class_="companyCardWrapper__ratingComparisonWrapper"):
        header = block.find('span', class_=header_class)
        if header is None:
            continue
        value = header.find_next('span', class_="companyCardWrapper__ratingValues")
        if value is not None:
            values.append(value.text.strip())
    return ", ".join(values) if values else np.nan


def _parse_card(card):
    """Return one company's field values as a tuple ordered like ``_BUFFERS``."""
    return (
        _text_or_nan(card.find('h2')),
        _text_or_nan(card.find('div', style="height:auto;padding-bottom:1px;")),
        _text_or_nan(card.find('span', class_="companyCardWrapper__companyRatingCount")),
        _text_or_nan(card.find('span', class_="companyCardWrapper__interLinking")),
        _rated_for(card, "companyCardWrapper__ratingHeader--high"),
        _rated_for(card, "companyCardWrapper__ratingHeader--critical"),
    )


def _open_listing_page(browser, page, retries=3):
    """Load listing page ``page`` and wait until company cards are present.

    Each attempt requests the URL again, so a retry is a fresh load rather
    than a second wait on the same failed one.

    Raises:
        RuntimeError: when every one of the ``retries`` attempts failed.
    """
    url = f"https://www.ambitionbox.com/list-of-companies?campaign=desktop_nav&page={page}"
    for attempt in range(1, retries + 1):
        try:
            browser.get(url)
            WebDriverWait(browser, 20).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "companyCardWrapper"))
            )
            return
        except Exception as exc:
            if attempt == retries:
                raise RuntimeError(
                    f"Failed to load page {page} after {retries} attempts."
                ) from exc
            print(f"Retrying page {page} (Attempt {attempt}/{retries})...")
            time.sleep(5)  # Wait before retrying


def _flush_buffers(next_page):
    """Write buffered rows with save_data(), then record ``next_page``.

    The progress file is updated only after the data is written, so it never
    points past rows that are still only in memory.
    """
    if names:
        save_data(pd.DataFrame({
            'name': names,
            'rating': ratings,
            'Total Rating': rating_counts,
            'Company Info': company_info,
            'Highly Rated For': highly_rated_for,
            'Critically Rated For': critically_rated_for
        }))
        for buffer in _BUFFERS:
            buffer.clear()
    save_progress(next_page)


# Scraping loop
try:
    for page in range(start_page, end_page + 1):
        print(f"Scraping page {page}...")
        try:
            _open_listing_page(driver, page)
            time.sleep(random.uniform(5, 10))  # Add randomized delay

            soup = BeautifulSoup(driver.page_source, "lxml")
            # Parse the whole page before touching the buffers: a parsing
            # error then drops the page instead of leaving the six column
            # lists with different lengths.
            page_rows = [
                _parse_card(card)
                for card in soup.find_all('div', class_="companyCardWrapper")
            ]
        except Exception as e:
            # Best effort: report, restart the browser, move on to the next page.
            print(f"Error on page {page}: {e}")
            driver.quit()
            driver = get_driver()  # Restart the driver
            continue

        for row in page_rows:
            for buffer, value in zip(_BUFFERS, row):
                buffer.append(value)

        if page % 10 == 0:  # Save data every 10 pages
            try:
                _flush_buffers(page + 1)
            except Exception as e:
                # Rows stay buffered and progress is not advanced, so the
                # next flush tries again with everything collected so far.
                print(f"Could not save data at page {page}: {e}")
finally:
    # Always close the browser, including on Ctrl-C or an unexpected error.
    driver.quit()

# Save remaining data and mark the whole range as done
_flush_buffers(end_page + 1)

print("Scraping completed successfully.")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 