# **Web Scraping from CareerBliss**

**Group Members:** Michael DeMasi, Katherine Yang, Tolani Oshinusi, Jeremy Cho

Based on our research, we will scrape employee reviews from multiple high- and low-attrition companies.

- High Attrition Companies:
  - Apple
  - Amazon
  - Facebook
  - Tesla
  - AMD
  - ServiceNow
  - Mastercard
  - Abbott Laboratories

- Low Attrition Companies:
  - ConocoPhillips
  - Chevron
  - Lockheed Martin
  - Comcast
  - Boeing
  - ExxonMobil
  - Progressive
  - Honeywell

In [None]:
# Imports for web scraping
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

In [None]:
# WARNING: This cell takes about 30 minutes to run, the csv has been previously
# generated for use in the main notebook!
#
# Perform web scraping to collect reviews from CareerBliss and generate a dataframe.
# For every company below, each review-listing page is fetched, the review blocks
# are parsed, and the complete rows are written to "careerbliss_reviews.csv".

# Column order of the resulting dataframe / CSV file.
REVIEW_COLUMNS = ["review_id", "company_name", "job_title", "company_id", "job_title_id", "review", "attrition_level"]

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang this long-running cell indefinitely.
REQUEST_TIMEOUT = 30

# companies to consider based on research
high_attrition_companies = ["apple", "amazon", "facebook", "tesla-motors", "amd", "servicenow", "mastercard", "abbott-laboratories"]
low_attrition_companies = ["conocophillips", "chevron", "lockheed-martin", "comcast", "boeing", "exxonmobil", "progressive", "honeywell"]

# companies with their maximum page numbers on careerbliss
companies = {
    "apple": 49,
    "amazon": 51,
    "facebook": 5,
    "tesla-motors": 10,
    "amd": 17,
    "servicenow": 2,
    "mastercard": 4,
    "abbott-laboratories": 33,
    "conocophillips": 6,
    "chevron": 21,
    "lockheed-martin": 87,
    "comcast": 63,
    "boeing": 59,
    "exxonmobil": 16,
    "progressive": 5,
    "honeywell": 35
}

# Send a browser-like User-Agent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
}


def _parse_review(review_block, attrition_level):
    """Build one output row from a single review block.

    Args:
        review_block: a parsed ``div.company-reviews`` element.
        attrition_level: "high" or "low", the label of the company being scraped.

    Returns:
        A dict keyed by the output column names, or ``None`` when the title
        link, the comment paragraph, or any of their values is missing/empty.
    """
    # All identifying metadata is stored as data-* attributes on the title link,
    # so a single lookup is enough.
    title_link = review_block.find("a", class_="job-title header5 twocentChromeExt")
    comment = review_block.find("p", class_="comments foggy")
    if title_link is None or comment is None:
        return None

    row = {
        "review_id": title_link.get("data-reviewid"),
        "company_name": title_link.get("data-company"),
        "job_title": title_link.get("data-jobtitle"),
        "company_id": title_link.get("data-companyid"),
        "job_title_id": title_link.get("data-jobtitleid"),
        "review": comment.text.strip(),
        "attrition_level": attrition_level,
    }
    # Only keep complete rows: every field must be present and non-empty.
    return row if all(row.values()) else None


# Rows are accumulated in a plain list and turned into a dataframe once at the
# end; growing a dataframe with pd.concat per review copies it every time.
review_rows = []
high_attrition_lookup = set(high_attrition_companies)

# Iterate through each company and their pages
for company, max_pages in companies.items():
    base_url = f"https://www.careerbliss.com/{company}/reviews/"
    print(company)
    attrition_level = "high" if company in high_attrition_lookup else "low"

    # Page 0 is the bare listing URL; pages 1..max_pages use the ?page= query.
    # The upper bound is inclusive so the last listed page is fetched as well.
    # NOTE(review): the bare URL and ?page=1 presumably return the same
    # listing; duplicates are removed by review_id before saving.
    for page in range(max_pages + 1):
        print(page)
        url = base_url if page == 0 else f"{base_url}?page={page}"

        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # Give up on this company but keep everything scraped so far.
            print(f"Failed to fetch page {page}: {e}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch page {page}: Status code {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the required fields from each review block
        for review in soup.find_all("div", class_="company-reviews"):
            try:
                row = _parse_review(review, attrition_level)
            except Exception as e:
                # Best effort: one malformed review must not abort the whole run.
                print(f"Error: {e}")
                continue
            if row is not None:
                review_rows.append(row)

        # add delay between requests to prevent being blocked
        time.sleep(1)

# Build the dataframe once; passing the columns keeps the header even when
# nothing was scraped.
reviews_df = pd.DataFrame(review_rows, columns=REVIEW_COLUMNS)

# Drop reviews that were collected more than once (e.g. overlapping pages).
reviews_df = reviews_df.drop_duplicates(subset="review_id").reset_index(drop=True)

# Save the DataFrame to a CSV file
reviews_df.to_csv("careerbliss_reviews.csv", index=False)