# SCRAPER

## Initialization

### Install dependencies

In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.




### Import libraries

In [2]:
import requests
import csv
import nltk
import re

from numpy import savetxt, array
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\My
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Development of Scraper

### Testing scraper

In [3]:
## Getting webpage content by http request
url = 'https://www.dice.com/jobs/q-Android+Developer-jobs'
response = requests.get(url)
print('Status code: ', response.status_code)

Status code:  200


In [4]:
## Creating parser
soup = BeautifulSoup(response.text,  "html.parser")

## Check text content
print(soup.find('title'))

<title>Android Developer Jobs | Dice.com</title>


In [5]:
## Retrieving job titles
jobs_titles= soup.find_all(class_='hidden jcl-accessibility-text sc-dhi-job-search-job-card-layout-full')

print(jobs_titles[0].text)

Job Title - Android Developer (NC*)


In [6]:
## Retrieving job descriptions
jobs_descriptions = soup.find_all(class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full')

print(jobs_descriptions[0].text)

Experis IT/ManpowerGroup has partnered with a leading Financial Services organization for an Android Developer to assist their team. This will be a Dynamic Work Model. Job Title: Android Developer Location: Durham/Raleigh, NC - Dynamic Work Model However, we are not considering candidates who plan to travel for extended periods. Job Description: We are seeking a skilled Android Developer to join our team in Raleigh, North Carolina. While local candidates are preferred, we are open to considering


### Testing website traversal

In [7]:
## Getting webpage content by http request
url = 'https://www.dice.com/jobs/browsejobs/q-title-djt-A-jobs'
response = requests.get(url)
print('Status code: ', response.status_code)

## Creating parser
main_soup = BeautifulSoup(response.text,  "html.parser")

Status code:  200


In [None]:
## Getting job categories
categories = [category.text for category in main_soup.find_all(class_='mR5') if len(category.text) == 1]
print(categories)
print(len(categories)/8)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
3.25


In [9]:
## Getting job category links
categories_links = [url.replace(url.split('-')[-2],category) for category in categories]

print(categories_links)

['https://www.dice.com/jobs/browsejobs/q-title-djt-A-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-B-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-C-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-D-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-E-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-F-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-G-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-H-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-I-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-J-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-K-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-L-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-M-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-N-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-O-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-P-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-Q-job

In [10]:
## Getting job titles per category link
response = requests.get(categories_links[0])
print('Status code: ', response.status_code)

## Creating parser
category_soup = BeautifulSoup(response.text,  "html.parser")

Status code:  200


In [11]:
## Getting job titles and job links
## Output is a list with array of each element (job_title,job_title_link)
job_titles_in_category = [(job_title.text,job_title['href']) for job_title in category_soup.find_all(class_='mR5 browse-job-detail')]

print(job_titles_in_category)

[('Advisory - Identity and Access Management- Senior Consu', '/jobs/q-Advisory+%26%2345+Identity+and+Access+Management%26%2345+Senior+Consu-jobs'), ('Android Developer', '/jobs/q-Android+Developer-jobs'), ('Automation Engineer', '/jobs/q-Automation+Engineer-jobs'), ('AEM Developer', '/jobs/q-AEM+Developer-jobs'), ('Azure DevOps Engineer', '/jobs/q-Azure+DevOps+Engineer-jobs'), ('Angular Developer', '/jobs/q-Angular+Developer-jobs'), ('Application Developer', '/jobs/q-Application+Developer-jobs'), ('Axiom Developer', '/jobs/q-Axiom+Developer-jobs'), ('AWS/Azure Cloud Security Engineer (Remote - Delivery Ce', '/jobs/q-AWS%26%2347Azure+Cloud+Security+Engineer+%28Remote+%26%2345+Delivery+Ce-jobs'), ('Appian Developer', '/jobs/q-Appian+Developer-jobs'), ('Azure Cloud Engineer', '/jobs/q-Azure+Cloud+Engineer-jobs'), ('AWS Cloud Architect', '/jobs/q-AWS+Cloud+Architect-jobs'), ('AWS Data Engineer', '/jobs/q-AWS+Data+Engineer-jobs'), ('Actimize Developer', '/jobs/q-Actimize+Developer-jobs'), (

In [12]:
## Generating job links
root_url = 'https://www.dice.com'
job_titles_links = [root_url+job_title[1] for job_title in job_titles_in_category]

print(job_titles_links)

['https://www.dice.com/jobs/q-Advisory+%26%2345+Identity+and+Access+Management%26%2345+Senior+Consu-jobs', 'https://www.dice.com/jobs/q-Android+Developer-jobs', 'https://www.dice.com/jobs/q-Automation+Engineer-jobs', 'https://www.dice.com/jobs/q-AEM+Developer-jobs', 'https://www.dice.com/jobs/q-Azure+DevOps+Engineer-jobs', 'https://www.dice.com/jobs/q-Angular+Developer-jobs', 'https://www.dice.com/jobs/q-Application+Developer-jobs', 'https://www.dice.com/jobs/q-Axiom+Developer-jobs', 'https://www.dice.com/jobs/q-AWS%26%2347Azure+Cloud+Security+Engineer+%28Remote+%26%2345+Delivery+Ce-jobs', 'https://www.dice.com/jobs/q-Appian+Developer-jobs', 'https://www.dice.com/jobs/q-Azure+Cloud+Engineer-jobs', 'https://www.dice.com/jobs/q-AWS+Cloud+Architect-jobs', 'https://www.dice.com/jobs/q-AWS+Data+Engineer-jobs', 'https://www.dice.com/jobs/q-Actimize+Developer-jobs', 'https://www.dice.com/jobs/q-Applications+Architect-jobs', 'https://www.dice.com/jobs/q-AWS+DevOps+Engineer-jobs', 'https://www.

In [13]:
## Loading content of job title link
job_title_url = job_titles_links[1]
job_title_response = requests.get(job_title_url)
print('Status code: ', response.status_code)

## Creating parser
job_title_soup = BeautifulSoup(job_title_response.text,  "html.parser")

Status code:  200


In [14]:
## Retrieving job descriptions
jobs_descriptions = [description.text for description in job_title_soup.find_all(class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full')]

print(jobs_descriptions[0])

Experis IT/ManpowerGroup has partnered with a leading Financial Services organization for an Android Developer to assist their team. This will be a Dynamic Work Model. Job Title: Android Developer Location: Durham/Raleigh, NC - Dynamic Work Model However, we are not considering candidates who plan to travel for extended periods. Job Description: We are seeking a skilled Android Developer to join our team in Raleigh, North Carolina. While local candidates are preferred, we are open to considering


In [15]:
## Getting number of pages
num_pages = int(soup.find(class_='sc-dhi-seds-pagination').text.split(' ')[-1])

## Generating pagination links
base_page_link = '?page='
page_links = [job_title_url+base_page_link+str(page) for page in range(2,num_pages+1)]

print(page_links)

['https://www.dice.com/jobs/q-Android+Developer-jobs?page=2', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=3', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=4', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=5', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=6', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=7', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=8', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=9', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=10', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=11', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=12', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=13', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=14', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=15', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=16', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=17', 'https://www.di

In [16]:
## Getting job descriptions for every page in job title listing
for n,page_link in enumerate(page_links):
    ## Loading content of job title link
    job_title_url = page_link
    job_title_response = requests.get(job_title_url)
    print('Status code: ', response.status_code)

    ## Creating parser
    job_title_soup = BeautifulSoup(job_title_response.text,  "html.parser")

    ## Retrieving job descriptions
    temp = [description.text for description in job_title_soup.find_all(class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full')]
    jobs_descriptions = jobs_descriptions + temp
    
    if n > 9: break ## Testing code for pages 2-12


Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200


In [17]:
print('Number of job descriptions for first 12 pages:',len(jobs_descriptions))

Number of job descriptions for first 12 pages: 240


## Dice.com scraper

### Defining helper functions

In [18]:
def soup_loader(url):
    ## Getting webpage content by http request
    ## Creating parser
    response = requests.get(url)
    soup = BeautifulSoup(response.text,  "html.parser")
    return soup, response.status_code

In [19]:
def get_categories(soup):
    ## Getting job categories
    return [category.text for category in soup.find_all(class_='mR5') if category.text == 1]

In [20]:
def get_category_links(categories_list):
    return [url.replace(url.split('-')[-2],category) for category in categories_list]

In [21]:
def get_job_title_and_link(soup):
    ## Getting job titles and job links
    ## Output is a list with array of each element (job_title,job_title_link)
    return [(job_title.text,job_title['href']) for job_title in soup.find_all(class_='mR5 browse-job-detail')]


In [22]:
def get_job_links(list):
    ## Generating job links
    root_url = 'https://www.dice.com'
    return [root_url+job_title[1] for job_title in list]

In [23]:
def get_num_pages(soup):
    ## Getting number of pages
    checker = soup.find(class_='sc-dhi-seds-pagination')
    if checker == None or checker == 1: return 1 ## Checking if no pages
    return int(soup.find(class_='sc-dhi-seds-pagination').text.split(' ')[-1])

In [24]:
def get_page_links(num_pages, job_title_url):
    ## Generating pagination links
    base_page_link = '?page='
    return [job_title_url+base_page_link+str(page) for page in range(2,num_pages+1)]

In [25]:
def get_job_descriptions(soup, stop_words = stopwords.words('english')):
    ## Retrieving job descriptions
    ## Applying NLP: lowercase, remove non-alphanum, remove stopwords
    descriptions = [description.text for description in soup.find_all(class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full')] 
    for n,desc in enumerate(descriptions):
        temp = re.sub(r"[^ a-zA-Z0-9]+",'', desc).lower().split(' ')
        temp = [word for word in temp if word not in stop_words]
        descriptions[n] = ''.join(temp)
    return descriptions


### Scraping

In [26]:
## Creating main soup
url = 'https://www.dice.com/jobs/browsejobs/q-title-djt-A-jobs' ## Starting point
main_soup,status = soup_loader(url)

## Getting job categories
categories = get_categories(main_soup)

## Getting job category links
categories_links = get_category_links(categories)

## Initializing contiainers
jobs_descriptions=[]
job_titles=[]

for category in categories_links:
    try:
        print('Category:', category)
            
        ## Getting job titles per category link
        category_soup,status = soup_loader(category)

        ## Getting job titles and job links
        ## Output is a list with array of each element (job_title,job_title_link)
        job_titles_in_category = get_job_title_and_link(category_soup)
        job_titles += [job_title[0] for job_title in job_titles_in_category]
        
        ## Generating job links
        job_titles_links = get_job_links(job_titles_in_category)

        for l,job_link in enumerate(job_titles_links): ## remove l after testing
            print(' Job Link:',job_link)
            
            ## Loading content of job title link
            job_title_soup,status = soup_loader(job_link)

            ## Retrieving job descriptions
            job_link_descriptions = get_job_descriptions(job_title_soup)
            
            ## Getting number of pages
            num_pages = get_num_pages(job_title_soup)
            
            print(f'       {1} out of {num_pages} pages processed')
            if num_pages == 1: 
                jobs_descriptions += [job_link_descriptions]
                print('       Only 1 page'); continue ## Next job title if only one page
            
            ## Generating pagination links
            page_links = get_page_links(num_pages, job_link)

            ## Getting job descriptions for page>1 in job title listing
            for n,page_link in enumerate(page_links):
                ## Loading content of job title link
                job_title_soup,status = soup_loader(page_link)

                ## Retrieving job descriptions
                job_link_descriptions += f' {get_job_descriptions(job_title_soup)}'
                
                print(f'       {n+2} out of {num_pages} pages processed')
                # if n+1 == 4: break ## Testing code for pages 2-5
            
            ## Hard limiting number of jobs to scrape 
            if len(job_link_descriptions) > 20: jobs_descriptions += [job_link_descriptions[:20]]
            else: jobs_descriptions += [job_link_descriptions]
            
        #     if l > 2 : break ## Testing code for 4 jobs
        # break ## Testing code
    except Exception as e:
        print(e)


Category: https://www.dice.com/jobs/browsejobs/q-title-djt-A-jobs
 Job Link: https://www.dice.com/jobs/q-Advisory+%26%2345+Identity+and+Access+Management%26%2345+Senior+Consu-jobs
       1 out of 1 pages processed
       Only 1 page
 Job Link: https://www.dice.com/jobs/q-Android+Developer-jobs
       1 out of 44 pages processed
       2 out of 44 pages processed
       3 out of 44 pages processed
       4 out of 44 pages processed
       5 out of 44 pages processed
       6 out of 44 pages processed
       7 out of 44 pages processed
       8 out of 44 pages processed
       9 out of 44 pages processed
       10 out of 44 pages processed
       11 out of 44 pages processed
       12 out of 44 pages processed
       13 out of 44 pages processed
       14 out of 44 pages processed
       15 out of 44 pages processed
       16 out of 44 pages processed
       17 out of 44 pages processed
       18 out of 44 pages processed
       19 out of 44 pages processed
       20 out of 44 pages proc

KeyboardInterrupt: 

In [None]:
## Saving as CSV
# descriptions = [job_descriptions] ## turning descriptions into shape (1,num_job_titles,num_job_descripsions)

with open('datasets//dice_jobs.csv', 'w') as f:
     
    # using csv.writer method from CSV package
    write = csv.writer(f)
     
    write.writerow(job_titles) ## columns
    write.writerow(jobs_descriptions) ## rows

In [None]:
len(jobs_descriptions)

4

In [None]:
print('# of job listings:',len(jobs_descriptions))

# of job listings: 61
