### Imports

In [1]:
from bs4 import BeautifulSoup, NavigableString
from selenium import webdriver
from multiprocessing import Process
import csv
import pandas as pd
import datetime
import ssl

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

#### URL Formation

In [2]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time

locations = []
query = []
pageSize = 20

with open('location.txt', 'r') as file:
    for line in file:
        locations.append(line.strip()) 

with open('q.txt', 'r') as file:
    for line in file:
        query.append(line.strip())

with open('pageSize.txt', 'r') as file:
    for line in file:
        pageSize = int(line.strip())
        if pageSize in [10, 20, 50, 100]:
            print(f"Page size {pageSize} is correct.")
        else:
            print("Get a correct page size")

URLS_dict = {}
baseurl = "https://www.dice.com/jobs?"

for location in locations:
    URLS_dict[location] = []
    for job in query:
        for page in range(1, 7): 
            url = f"{baseurl}q={job}&location={location}&page={page}&pageSize={pageSize}"
            print(url)
            URLS_dict[location].append(url)

# Print the total number of URLs for each location
for location, urls in URLS_dict.items():
    print(f"Location: {location}, Total URLs: {len(urls)}")


Page size 100 is correct.
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=1&pageSize=100
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=2&pageSize=100
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=3&pageSize=100
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=4&pageSize=100
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=5&pageSize=100
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=6&pageSize=100
https://www.dice.com/jobs?q=Data%20Engineer&location=San%20Francisco,%20CA,%20USA&page=1&pageSize=100
https://www.dice.com/jobs?q=Data%20Engineer&location=San%20Francisco,%20CA,%20USA&page=2&pageSize=100
https://www.dice.com/jobs?q=Data%20Engineer&location=San%20Francisco,%20CA,%20USA&page=3&pageSize=100
https://www.dice.com/jobs?q=Data%20Engineer&locati

### Scraping Job

In [3]:

def scrape_dice_jobs(soup,location):
    employment_types = []
    job_titles = []
    posted_dates = []
    job_descriptions = []
    company_links = []
    company_names = []

    job_cards = soup.find_all('dhi-search-card', {'class': 'ng-star-inserted'})

    for card in job_cards:
        employment_type = card.find('span', {'data-cy': 'search-result-employment-type'}).text.strip()
        job_title = card.find('a', class_='card-title-link normal').text.strip()
        posted_date = card.find('span', class_='posted-date').text.strip()
        job_description = card.find('div', class_='card-description').text.strip()
        company_link = card.find('a', class_='ng-star-inserted')['href']
        company_name = card.find('a', class_='ng-star-inserted').text.strip()
        
        employment_types.append(employment_type)
        job_titles.append(job_title)
        posted_dates.append(posted_date)
        job_descriptions.append(job_description)
        company_links.append(company_link)
        company_names.append(company_name)

    data = {
        'Job Title': job_titles,
        'Company Name': company_names,
        'Job Description': job_descriptions,
        'Location': location.replace("%20", " "),
        'Employment Type': employment_types,
        'Posted Date': posted_dates,
        'Company Link': company_links
    }
    return pd.DataFrame(data)


### Main function

In [4]:
for location, urls in URLS_dict.items():
    print(location)
    location=location.replace("%20", " ")
    concatenated_df = pd.DataFrame()
    for url in urls:
        print(url)
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(20)
        page_source = driver.page_source
        driver.quit()
        soup = BeautifulSoup(page_source, 'html.parser')
        df = scrape_dice_jobs(soup, location)
        print(df.shape)
        concatenated_df = pd.concat([concatenated_df, df])
        print(concatenated_df.shape)

    timestamp = datetime.datetime.now().strftime("%Y%m%d")
    filename = f'dice_jobs_{location}_{timestamp}.csv'
    concatenated_df.to_csv(filename, index=True)
    print("File Created",filename)

San%20Francisco,%20CA,%20USA
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=1&pageSize=100
(100, 7)
(100, 7)
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=2&pageSize=100
(100, 7)
(200, 7)
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=3&pageSize=100
(100, 7)
(300, 7)
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=4&pageSize=100
(100, 7)
(400, 7)
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=5&pageSize=100
(100, 7)
(500, 7)
https://www.dice.com/jobs?q=Data%20Analytics&location=San%20Francisco,%20CA,%20USA&page=6&pageSize=100
(100, 7)
(600, 7)
https://www.dice.com/jobs?q=Data%20Engineer&location=San%20Francisco,%20CA,%20USA&page=1&pageSize=100
(100, 7)
(700, 7)
https://www.dice.com/jobs?q=Data%20Engineer&location=San%20Francisco,%20CA,%20USA&page=2&pageSize=100
(0, 7)
(700, 7)
https:/

WebDriverException: Message: disconnected: Unable to receive message from renderer
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=124.0.6367.62)
Stacktrace:
0   chromedriver                        0x0000000100cee8dc chromedriver + 4368604
1   chromedriver                        0x0000000100ce6d70 chromedriver + 4337008
2   chromedriver                        0x000000010090ac04 chromedriver + 289796
3   chromedriver                        0x00000001008f5230 chromedriver + 201264
4   chromedriver                        0x00000001008f4f5c chromedriver + 200540
5   chromedriver                        0x00000001008f3eac chromedriver + 196268
6   chromedriver                        0x0000000100914bd0 chromedriver + 330704
7   chromedriver                        0x00000001009857b8 chromedriver + 792504
8   chromedriver                        0x0000000100941ab4 chromedriver + 514740
9   chromedriver                        0x000000010094250c chromedriver + 517388
10  chromedriver                        0x0000000100cb2df8 chromedriver + 4124152
11  chromedriver                        0x0000000100cb7be8 chromedriver + 4144104
12  chromedriver                        0x0000000100c987c0 chromedriver + 4016064
13  chromedriver                        0x0000000100cb8518 chromedriver + 4146456
14  chromedriver                        0x0000000100c8a274 chromedriver + 3957364
15  chromedriver                        0x0000000100cd7e60 chromedriver + 4275808
16  chromedriver                        0x0000000100cd7fdc chromedriver + 4276188
17  chromedriver                        0x0000000100ce69d0 chromedriver + 4336080
18  libsystem_pthread.dylib             0x0000000182c31034 _pthread_start + 136
19  libsystem_pthread.dylib             0x0000000182c2be3c thread_start + 8
