# Google Jobs Scraper

## Dependencies

In [None]:
import requests
import json
import time
import random

import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime, timedelta

import sys
from pathlib import Path

# Make the project root importable so `config.config` resolves below.
# The root is taken to be the parent of the current working directory, i.e. this
# only works when the notebook is run from a direct subfolder of the repo
# (e.g. 'notebooks/'); nothing is actually searched or auto-detected.
repo_root = Path().resolve().parent  # parent of the current working directory
sys.path.append(str(repo_root))

from config.config import get_environment

from config.config import data_import_json, data_export_json, data_import_pandas, data_export_pandas

## ENV

In [None]:
# Load the run configuration through the project helper.
# NOTE(review): presumably reads ../environments/env.json relative to the
# notebook's working directory — confirm against config.config.get_environment.
ENV = get_environment(
    env_path="../environments",
    env_name="env.json"
)

# Run parameters shared by the scrape / parse / export cells below.
# Alternative kept for reference: derive the date at run time instead of config.
# content_date = datetime.now().date() + timedelta(days=0)
content_date = ENV['CONTENT_DATE']  # passed as `content_date` to the import/export helpers
website = ENV['SOURCE']['NAME']     # passed as `website` to the import/export helpers
version = ENV['VERSION']            # passed as `version` to the import/export helpers

## Mining

In [None]:
def scrape_job(
        scrape_date: datetime,
        website: str,
        version: int,
        session: "requests.Session",
        page: int,
        max_retry: int=3,
        time_sleep_min: int=1,
        time_sleep_max: int=3,
        timeout: int=10
    ):
    """Download one page of Google Careers job results and archive it.

    The page is fetched with browser-like headers, the container holding the
    "Jobs search results" heading is extracted, and that container is handed
    to ``data_export_json`` together with some metadata.

    Args:
        scrape_date: Forwarded as ``content_date`` to ``data_export_json``.
        website: Forwarded as ``website`` to ``data_export_json``.
        version: Forwarded as ``version`` to ``data_export_json``.
        session: Session used for the first attempt. After a failed attempt a
            fresh session is created locally; the caller's session object is
            never replaced or closed.
        page: 1-based results page number, used in the URL query string.
        max_retry: Total number of attempts (must be >= 1).
        time_sleep_min: Lower bound, in seconds, of the random delay that
            precedes every attempt (including the first).
        time_sleep_max: Upper bound, in seconds, of that delay.
        timeout: Per-request timeout in seconds passed to ``session.get``.

    Returns:
        The BeautifulSoup element (parent ``div`` of the results heading)
        containing the job listings of the page.

    Raises:
        ValueError: If ``max_retry`` is below 1, if the last attempt returns a
            non-200 status code, or if the expected results markup is missing.
        Exception: Whatever the last failed request attempt raised.
    """
    if max_retry < 1:
        # Without this guard the loop would not run and `response` would be unbound.
        raise ValueError(f"max_retry must be at least 1, got {max_retry}")

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-US,en;q=0.9,id;q=0.8',
        'cache-control': 'max-age=0',
        'downlink': '1.35',
        'priority': 'u=0, i',
        'rtt': '300',
        'sec-ch-prefers-color-scheme': 'dark',
        'sec-ch-ua': '"Chromium";v="142", "Google Chrome";v="142", "Not_A Brand";v="99"',
        'sec-ch-ua-arch': '"x86"',
        'sec-ch-ua-bitness': '"64"',
        'sec-ch-ua-form-factors': '"Desktop"',
        'sec-ch-ua-full-version': '"142.0.7444.60"',
        'sec-ch-ua-full-version-list': '"Chromium";v="142.0.7444.60", "Google Chrome";v="142.0.7444.60", "Not_A Brand";v="99.0.0.0"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-model': '""',
        'sec-ch-ua-platform': '"Windows"',
        'sec-ch-ua-platform-version': '"19.0.0"',
        'sec-ch-ua-wow64': '?0',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'x-browser-channel': 'stable',
    }

    url_scrape = f'https://www.google.com/about/careers/applications/jobs/results?page={page}'

    print(f"Scraping Page: {page} | URL: {url_scrape}")

    # Sessions created here for retries are owned by this function and must be
    # closed by it; the caller's session is left untouched.
    replacement_session = None
    try:
        for attempt in range(1, max_retry + 1):
            # Random pause before every attempt to avoid hammering the site.
            time.sleep(random.uniform(time_sleep_min, time_sleep_max))
            try:
                response = session.get(
                    url_scrape,
                    headers=headers,
                    timeout=timeout,
                )
                # Validate Status Code
                if response.status_code == 200:
                    break
                raise ValueError(f"Status Code: {response.status_code} | URL: {url_scrape}")

            except Exception as e:
                if attempt == max_retry:
                    # Out of attempts: propagate with the original traceback.
                    raise
                print(f"Retry request: {url_scrape} | Retry count: {attempt} | Error: {e}")
                # Start the next attempt on a fresh session (new connection pool).
                if replacement_session is not None:
                    replacement_session.close()
                replacement_session = requests.Session()
                session = replacement_session
    finally:
        # The response body is already fully read at this point, so closing
        # the retry session does not affect parsing below.
        if replacement_session is not None:
            replacement_session.close()

    soup = BeautifulSoup(response.content, "html.parser")

    # The results live in the div that wraps the "Jobs search results" heading.
    heading = soup.find(name="h2", string="Jobs search results")
    if heading is None:
        raise ValueError(f"'Jobs search results' heading not found | URL: {url_scrape}")
    soup2 = heading.find_parent("div")
    if soup2 is None:
        raise ValueError(f"Results container not found | URL: {url_scrape}")

    # The status element's aria-label is stored as the `total_jobs` metadata.
    status = soup2.select_one('div[role="status"]')
    if status is None:
        raise ValueError(f"Results status element not found | URL: {url_scrape}")
    total_jobs = status.get('aria-label')

    data_export_json(
        data=soup2,
        website=website,
        folder_name='scraper',
        version=version,
        content_date=scrape_date, # "0000-00-00"
        additional_info=f"scrape-page{page}",
        metadata={
            "total_jobs": total_jobs,
            "page": page,
            "url_scrape": url_scrape
        }
    )

    return soup2

In [None]:
def _optional_text(extract):
    """Run ``extract()`` and return its result, or ``None`` if markup is missing.

    A missing element shows up as ``AttributeError``/``TypeError`` on ``None``;
    the error is printed so that gaps stay visible in the notebook output.
    """
    try:
        return extract()
    except (TypeError, AttributeError) as e:
        print(e)
        return None


def parse_job_list(
        soup: BeautifulSoup,
        page: int
    ):
    """Extract one record per job listing from a scraped results page.

    Listings are located through their "Learn more" ``span``. The id, url and
    name are required (missing markup raises); location, level, level
    description and qualifications are optional and become ``None`` when the
    corresponding markup is absent, so a single incomplete listing does not
    abort the whole page.

    Args:
        soup: Parsed results container for one page.
        page: Page number, used only in the progress output.

    Returns:
        A list of dicts with the keys ``id``, ``url``, ``name``, ``location``,
        ``level``, ``level_description`` and ``qualification``.
    """
    base_url = "https://www.google.com/about/careers/applications/"
    parsed_list = []

    for item in soup.find_all(name="span", string="Learn more"):
        # The first anchor after the "Learn more" label carries the job link.
        href = item.find_next(name="a").get("href")
        job_url = f"{base_url}{href}"

        print(f"Parsing Page: {page} | Item: {len(parsed_list)+1} | URL: {job_url}")
        card = item.find_parent(name="li")

        # Required fields: fail loudly if the listing structure changed.
        # The id is the second ';'-separated token of the first div's `jsdata`.
        job_id = card.select_one("div").attrs["jsdata"].split(";")[1]
        job_name = (
            item.find_parent(name="div")
            .find_parent(name="div")
            .find_parent(name="div")
            .find(name="h3")
            .text
        )

        # Optional fields: each is anchored on an icon or heading inside the card.
        job_loc = _optional_text(
            lambda: card.find(name="i", string="place").find_next(name="span").text
        )

        # Look the level icon up once; both level fields hang off it.
        level_icon = card.find(name="i", string="bar_chart")
        job_lvl = _optional_text(
            lambda: level_icon.find_next(name="span").text
        )
        job_lvl_desc = _optional_text(
            lambda: level_icon.find_next(name="span")
            .find_next(name="div")
            .find(name="h2")
            .find_next("div")
            .text
        )

        # Qualifications are flattened into a single "- a - b - c" string.
        job_qua = _optional_text(
            lambda: " ".join(
                f"- {li.text}"
                for li in card.find(name="h4", string="Minimum qualifications")
                .find_next(name="ul")
                .select("li")
            )
        )

        parsed_list.append({
            "id": job_id,
            "url": job_url,
            "name": job_name,
            "location": job_loc,
            "level": job_lvl,
            "level_description": job_lvl_desc,
            "qualification": job_qua,
        })

    return parsed_list

## Execute

In [None]:
# Scrape and Parse
#
# Mode switches:
#   reparse_only     - only re-parse pages already stored by the scraper; stop
#                      at the first page that cannot be loaded (takes priority).
#   continue_scraper - load stored pages where possible and scrape the rest.
#   neither          - scrape every page from the site.
reparse_only = True
continue_scraper = False
session = requests.Session()
parsed_list = []
page = 0

try:
    while True:
        page += 1

        # A fresh scrape is needed unless a stored page should be tried first.
        need_scrape = not (reparse_only or continue_scraper)

        if not need_scrape:
            try:
                json_data = data_import_json(
                    website=website,
                    folder_name='scraper',
                    version=version,
                    content_date=content_date,
                    additional_info=f"scrape-page{page}"
                )
                soup = BeautifulSoup(
                    json_data["data"],
                    "html.parser"
                )

            except Exception as e:
                if reparse_only:
                    # No (readable) stored page left: nothing more to re-parse.
                    print(e)
                    break
                # Continue mode: fall back to scraping this page.
                need_scrape = True

        if need_scrape:
            soup = scrape_job(
                session=session,
                page=page,
                scrape_date=content_date,
                website=website,
                version=version,
            )

        # Parse
        parsed_list_temp = parse_job_list(
            soup=soup,
            page=page
        )
        parsed_list.extend(parsed_list_temp)

        # The last page has no "next page" link.
        if not soup.select('a[aria-label="Go to next page"]'):
            break

except Exception as e:
    # Best effort: report the failure and still export what was parsed so far.
    print(e)

# Export once, whether the loop finished normally or stopped on an error.
df_parse = pd.DataFrame(parsed_list)
data_export_pandas(
    df_output=df_parse,
    website=website,
    content_date=content_date,
    version=version,
    folder_name='parser',
    additional_info='parsed',
    incl_excel=True
)