---
format:
    html:
        embed-resources: true
---

# Crawling 

Use SerpApi to query Google Jobs and collect the job postings it returns.

In [1]:
from serpapi import GoogleSearch

In [None]:
import json
# Load credentials from a local JSON file so the key itself is not written
# in the notebook (the path below appears to have been redacted).
with open('retracted') as f:
    keys = json.load(f)
# The SerpApi key is stored under the 'serpapi' entry of that file.
API_KEY = keys['serpapi']

# Utility function

Create a utility function that runs one Google Jobs search through SerpApi and exports the raw response as a JSON file.

In [None]:
from datetime import datetime

# Define function

def search_google_jobs(search_query="data scientist", next_page_token=False, verbose=False):
    """Run one Google Jobs search through SerpApi and save the raw response.

    The response dictionary is written, unmodified, to
    ``data/<timestamp>-<query with spaces replaced by '-'>.json``.

    Parameters
    ----------
    search_query : str
        Query text. It is sent to the ``google_jobs`` engine exactly as given;
        only the output file name uses the hyphenated form.
    next_page_token : str, False or None
        Pagination token returned by a previous call. Any falsy value
        (``False``, ``None``, ``""``) requests the first page.
    verbose : bool
        When true, also print the response as indented JSON.

    Returns
    -------
    tuple
        ``(next_page_token, result_dict)``. The token is read from
        ``result_dict["serpapi_pagination"]["next_page_token"]`` and is
        ``None`` when the response does not contain one.
    """
    # Local import: `os` is not imported at the top of the notebook.
    import os

    # Spaces are replaced for the file name only, so the API still receives
    # the query the caller actually asked for.
    timestamp = str(datetime.now()).replace(" ", "-")
    query_slug = search_query.replace(" ", "-")
    # NOTE(review): the timestamp keeps its ':' characters, which Windows does
    # not allow in file names -- confirm the target platform.
    output = "data/" + timestamp + "-" + query_slug + ".json"

    params = {
        'api_key': API_KEY,
        'q': search_query,
        'hl': 'en',
        'gl': 'us',
        "num": 10,
        'engine': 'google_jobs',
    }
    # Only forward a real token; the last page yields None, which must not be
    # sent as if it were a pagination token.
    if next_page_token:
        params['next_page_token'] = next_page_token

    # Does the search and extraction
    search = GoogleSearch(params)
    result_dict = search.get_dict()

    if 'error' in result_dict:
        print("ERROR FOUND IN SEARCH")
        print(result_dict['error'])

    if verbose:
        print(json.dumps(result_dict, indent=2))

    # The response is saved even when it carries an error, as before.
    os.makedirs("data", exist_ok=True)
    with open(output, 'w') as f:
        json.dump(result_dict, f, indent=2)

    next_page_token = result_dict.get("serpapi_pagination", {}).get("next_page_token")
    return next_page_token, result_dict

# Go over each job title

Run the search for each job title, paging through the results 10 jobs at a time (up to three pages per title), and clean up the returned titles.

In [None]:
# Job titles used as search queries; each one is searched in turn by
# iterate_jobs.
job_titles = [
    "Data Scientist",
    "Machine Learning Engineer",
    "Artificial Intelligence Specialist",
    "Data Analyst",
    "Business Intelligence Analyst",
    "Research Scientist (AI-ML)",
    "Deep Learning Engineer",
    "NLP Engineer (Natural Language Processing)",
    "Computer Vision Engineer",
    "Data Engineer",
    "Applied Scientist",
    "Quantitative Analyst (Quant)",
    "AI Solutions Architect",
    "Statistician",
    "Big Data Engineer",
    "Data Science Consultant",
    "Automation Engineer",
    "Analytics Manager",
    "Operations Research Analyst",
    "Robotics Engineer",
    "Bioinformatics Data Scientist",
    "Financial Data Scientist",
    "Customer Insights Analyst",
    "Marketing Data Analyst",
    "Data Strategy Manager",
    "Cloud AI Engineer",
    "Computational Scientist",
    "Fraud Detection Specialist",
    "Risk Analyst",
    "Data Architect"
]

In [76]:
import time

def _clean_title(title):
    """Remove ' / ' separators and collapse runs of whitespace in a job title."""
    return " ".join(title.replace(" / ", "").split())


def iterate_jobs(titles, pages=3):
    """Search every job title, page through the results and clean the titles.

    For each title, up to ``pages`` consecutive result pages are requested
    through ``search_google_jobs``, pausing one second after every request.
    Paging for a title stops early when a response carries no next-page token.
    Each result's ``"title"`` is cleaned in place in the returned response
    dictionary and printed together with its ``"company_name"``.

    Parameters
    ----------
    titles : iterable of str
        Search queries, one per job title.
    pages : int
        Maximum number of result pages to request per title.

    Returns
    -------
    list of str
        The cleaned titles of all results, in the order they were printed.
    """
    cleaned_titles = []
    num = 0  # running count of searches across all titles
    for job in titles:
        next_page_token = False  # falsy token: start from the first page
        for page in range(pages):
            num += 1
            header = "Search- " + str(num)
            if page == 0:
                # The job title is printed once, ahead of its first search.
                header = job + "\n" + header
            print(header)
            next_page_token, result_dict = search_google_jobs(
                search_query=job, next_page_token=next_page_token)
            # A response without 'jobs_results' (for example an error
            # response) is treated as an empty page instead of raising KeyError.
            for result in result_dict.get('jobs_results', []):
                result["title"] = _clean_title(result["title"])
                print(result["title"] + " : " + result["company_name"])
                cleaned_titles.append(result["title"])
            time.sleep(1)
            if not next_page_token:
                # No further page is available for this title.
                break
    return cleaned_titles

In [78]:
# Run the crawl for every title in job_titles; each search response is
# written as a JSON file under data/ by search_google_jobs.
cleaned_titles = iterate_jobs(job_titles)

Data Scientist
Search- 1
Staff Data Scientist, Personalization & Shopping : Pinterest
Consulting Data Scientist 4 : Oracle
Associate Data Scientist, West : Dataiku
Sr. Data Scientist, AI Engineering : Charter Communications
Data Scientist (Sr/Staff Software Engineer) (Remote - US) : BNSF Railway
Data Scientist II – QuantumBlack, AI by McKinsey : McKinsey & Company
Senior Data Scientist - 25418 New : Energy Acuity
Data Scientist (L5) , Performance Marketing : Netflix
Associate Data Scientist, West : Dataiku
PRINCIPAL DATA SCIENTIST : Lumen
Search- 2
[Remote] Senior Data Scientist I, GTM : Dandy
Senior Data Scientist : Cardinal Health
Business Intelligence Data Scientist : Bluesky
Sr Data Scientist : SPECTRUM
Senior Data Scientist, Innovation Lab - Remote : Experian
[Remote] Data Scientist (Sr/Staff Software Engineer) (Remote - US) : BNSF Railway
Lead Data Scientist, Search (REMOTE) : Dicks Sporting Goods
Data Scientist Jobs : Tailored Access, LLC
Data Scientist-POC : Sardine
Data Scient