# SCRAPER

## Initialization

### Install dependencies

In [28]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.




### Import libraries

In [29]:
import requests
import csv
import nltk
import re

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\My
[nltk_data]     Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Development of Scraper

### Testing scraper

In [30]:
## Getting webpage content by http request
## Fetch one job-title listing page to check that a plain HTTP GET is answered.
url = 'https://www.dice.com/jobs/q-Android+Developer-jobs'
## requests waits indefinitely by default; the timeout (same value soup_loader
## uses further down) keeps this cell from hanging if the server never answers.
response = requests.get(url, timeout=180)
print('Status code: ', response.status_code)

Status code:  200


In [31]:
## Parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(response.text, features="html.parser")

## Sanity check: show the first <title> element of the page
print(soup.title)

<title>Android Developer Jobs | Dice.com</title>


In [32]:
## Retrieving job titles
## Select elements by the class string used for the hidden job-title text of
## each job card on the page parsed into `soup`.
jobs_titles= soup.find_all(class_='hidden jcl-accessibility-text sc-dhi-job-search-job-card-layout-full')

## Spot check: text of the first match (IndexError if nothing matched)
print(jobs_titles[0].text)

Job Title - Android Developer


In [33]:
## Retrieving job descriptions
## Select elements by the class string used for the job summary of each job
## card on the page parsed into `soup`.
jobs_descriptions = soup.find_all(class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full')

## Spot check: text of the first match (IndexError if nothing matched)
print(jobs_descriptions[0].text)

Position: Android Developer Location: Santa Cruz, CA (outer Silicon Valley) – position is hybrid (2 days per week onsite).   Exp: 10-12+yrs    Interview: 2 video conferences, then hiring decision. Our Client is seeking an Android Developer for an extendable 6-month contract position. This is a hybrid position – minimum of 2 days per week onsite in Santa Cruz, CA. Responsibilities: Professionalize, optimize, and document code which is "hacked together" that it is robust, scalable, and documented;


### Testing website traversal

In [34]:
## Getting webpage content by http request
## Browse-by-title index page for the letter "A"; the pages for the other
## letters are derived from this URL in the cells below.
url = 'https://www.dice.com/jobs/browsejobs/q-title-djt-A-jobs'
## requests waits indefinitely by default; the timeout keeps the cell from hanging.
response = requests.get(url, timeout=180)
print('Status code: ', response.status_code)

## Creating parser
main_soup = BeautifulSoup(response.text,  "html.parser")

Status code:  200


In [35]:
## Getting job categories
## Category labels are the 'mR5' elements whose text is exactly one character
categories = [
    tag.text
    for tag in main_soup.find_all(class_='mR5')
    if len(tag.text) == 1
]
print(categories)
## NOTE(review): the reason for dividing the count by 8 is not evident from this notebook
print(len(categories)/8)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
3.25


In [36]:
## Getting job category links
## Swap only the letter segment (the second-to-last '-'-separated part) of the
## start URL. Calling str.replace() on the whole URL would also rewrite every
## other occurrence of that letter elsewhere in the address.
_url_head, _old_letter, _url_tail = url.rsplit('-', 2)
categories_links = ['-'.join((_url_head, category, _url_tail)) for category in categories]

print(categories_links)

['https://www.dice.com/jobs/browsejobs/q-title-djt-A-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-B-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-C-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-D-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-E-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-F-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-G-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-H-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-I-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-J-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-K-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-L-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-M-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-N-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-O-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-P-jobs', 'https://www.dice.com/jobs/browsejobs/q-title-djt-Q-job

In [37]:
## Getting job titles per category link
## Load the first category page only; this cell is a traversal test.
## requests waits indefinitely by default; the timeout keeps the cell from hanging.
response = requests.get(categories_links[0], timeout=180)
print('Status code: ', response.status_code)

## Creating parser
category_soup = BeautifulSoup(response.text,  "html.parser")

Status code:  200


In [38]:
## Getting job titles and job links
## Each entry is a (job_title, job_title_link) tuple built from one
## 'mR5 browse-job-detail' element: its text and its href attribute
job_titles_in_category = [
    (anchor.text, anchor['href'])
    for anchor in category_soup.find_all(class_='mR5 browse-job-detail')
]

print(job_titles_in_category)

[('Advisory - Identity and Access Management- Senior Consu', '/jobs/q-Advisory+%26%2345+Identity+and+Access+Management%26%2345+Senior+Consu-jobs'), ('Android Developer', '/jobs/q-Android+Developer-jobs'), ('Automation Engineer', '/jobs/q-Automation+Engineer-jobs'), ('Azure DevOps Engineer', '/jobs/q-Azure+DevOps+Engineer-jobs'), ('AEM Developer', '/jobs/q-AEM+Developer-jobs'), ('Application Developer', '/jobs/q-Application+Developer-jobs'), ('Axiom Developer', '/jobs/q-Axiom+Developer-jobs'), ('Angular Developer', '/jobs/q-Angular+Developer-jobs'), ('AWS/Azure Cloud Security Engineer (Remote - Delivery Ce', '/jobs/q-AWS%26%2347Azure+Cloud+Security+Engineer+%28Remote+%26%2345+Delivery+Ce-jobs'), ('Appian Developer', '/jobs/q-Appian+Developer-jobs'), ('Actimize Developer', '/jobs/q-Actimize+Developer-jobs'), ('AWS Data Engineer', '/jobs/q-AWS+Data+Engineer-jobs'), ('Applications Architect', '/jobs/q-Applications+Architect-jobs'), ('Azure Cloud Engineer', '/jobs/q-Azure+Cloud+Engineer-job

In [39]:
## Generating job links
## The scraped hrefs start at '/jobs/...', so prefix each with the site root
root_url = 'https://www.dice.com'
job_titles_links = [root_url + href for _title, href in job_titles_in_category]

print(job_titles_links)

['https://www.dice.com/jobs/q-Advisory+%26%2345+Identity+and+Access+Management%26%2345+Senior+Consu-jobs', 'https://www.dice.com/jobs/q-Android+Developer-jobs', 'https://www.dice.com/jobs/q-Automation+Engineer-jobs', 'https://www.dice.com/jobs/q-Azure+DevOps+Engineer-jobs', 'https://www.dice.com/jobs/q-AEM+Developer-jobs', 'https://www.dice.com/jobs/q-Application+Developer-jobs', 'https://www.dice.com/jobs/q-Axiom+Developer-jobs', 'https://www.dice.com/jobs/q-Angular+Developer-jobs', 'https://www.dice.com/jobs/q-AWS%26%2347Azure+Cloud+Security+Engineer+%28Remote+%26%2345+Delivery+Ce-jobs', 'https://www.dice.com/jobs/q-Appian+Developer-jobs', 'https://www.dice.com/jobs/q-Actimize+Developer-jobs', 'https://www.dice.com/jobs/q-AWS+Data+Engineer-jobs', 'https://www.dice.com/jobs/q-Applications+Architect-jobs', 'https://www.dice.com/jobs/q-Azure+Cloud+Engineer-jobs', 'https://www.dice.com/jobs/q-AWS+Cloud+Architect-jobs', 'https://www.dice.com/jobs/q-Azure+Data+Engineer-jobs', 'https://www.

In [40]:
## Loading content of job title link
## Index 1 was the 'Android Developer' listing in the recorded run
job_title_url = job_titles_links[1]
## requests waits indefinitely by default; the timeout keeps the cell from hanging.
job_title_response = requests.get(job_title_url, timeout=180)
## Report the status of THIS request; printing `response` here would show the
## status left over from the earlier category request.
print('Status code: ', job_title_response.status_code)

## Creating parser
job_title_soup = BeautifulSoup(job_title_response.text,  "html.parser")

Status code:  200


In [41]:
## Retrieving job descriptions
## Keep only the text of each job-summary element of the loaded listing page
jobs_descriptions = [
    summary.text
    for summary in job_title_soup.find_all(
        class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full'
    )
]

print(jobs_descriptions[0])

Position: Android Developer Location: Santa Cruz, CA (outer Silicon Valley) – position is hybrid (2 days per week onsite).   Exp: 10-12+yrs    Interview: 2 video conferences, then hiring decision. Our Client is seeking an Android Developer for an extendable 6-month contract position. This is a hybrid position – minimum of 2 days per week onsite in Santa Cruz, CA. Responsibilities: Professionalize, optimize, and document code which is "hacked together" that it is robust, scalable, and documented;


In [42]:
## Getting number of pages
## Read the pagination from job_title_soup, i.e. the page that was loaded for
## job_title_url. Using `soup` from the first test cell only gives the right
## count while both happen to point at the same listing.
## The page count is taken as the last space-separated token of the pagination text.
num_pages = int(job_title_soup.find(class_='sc-dhi-seds-pagination').text.split(' ')[-1])

## Generating pagination links
## Page 1 is job_title_url itself, so the generated links start at page 2
base_page_link = '?page='
page_links = [job_title_url+base_page_link+str(page) for page in range(2,num_pages+1)]

print(page_links)

['https://www.dice.com/jobs/q-Android+Developer-jobs?page=2', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=3', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=4', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=5', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=6', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=7', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=8', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=9', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=10', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=11', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=12', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=13', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=14', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=15', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=16', 'https://www.dice.com/jobs/q-Android+Developer-jobs?page=17', 'https://www.di

In [43]:
## Getting job descriptions for every page in job title listing
## NOTE: this cell appends to jobs_descriptions, so running it twice duplicates entries.
for n,page_link in enumerate(page_links):
    ## Loading content of one results page.
    ## job_title_url is deliberately left untouched so the cell that builds
    ## page_links from it can be re-run safely.
    job_title_response = requests.get(page_link, timeout=180)
    ## Status of THIS request (not the stale `response` from an earlier cell)
    print('Status code: ', job_title_response.status_code)

    ## Creating parser
    job_title_soup = BeautifulSoup(job_title_response.text,  "html.parser")

    ## Retrieving job descriptions
    page_descriptions = [description.text for description in job_title_soup.find_all(class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full')]
    jobs_descriptions = jobs_descriptions + page_descriptions

    ## n counts from 0, so this stops after 11 iterations
    if n > 9: break ## Testing code for pages 2-12


Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200
Status code:  200


In [44]:
## Page 1 (loaded before the loop) plus the 11 pages (2-12) fetched by the loop above
print('Number of job descriptions for first 12 pages:',len(jobs_descriptions))

Number of job descriptions for first 12 pages: 240


## Dice.com scraper

### Defining helper functions

In [45]:
def soup_loader(url):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        A ``(soup, status_code)`` tuple: the BeautifulSoup tree built with
        "html.parser" from the response text, and the HTTP status code.
        The status code is returned as-is; it is not checked here.

    Raises:
        Any exception raised by ``requests.get`` (for example on timeout)
        propagates to the caller.
    """
    ## Getting webpage content by http request
    ## A Chrome-style User-Agent is sent with every request
    ## (presumably so the site serves the regular page -- TODO confirm)
    headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        }
    ## The context manager closes the response even if parsing raises
    with requests.get(url,
                      timeout=180,
                      headers=headers,
                      ) as response:
        ## Creating parser
        soup = BeautifulSoup(response.text,  "html.parser")
        return soup, response.status_code

In [46]:
def get_categories(soup):
    """Return the one-character category labels found in ``soup``.

    Args:
        soup: Parsed page exposing ``find_all(class_=...)``.

    Returns:
        list: Text of every 'mR5' element whose text is exactly one
        character long, in document order.
    """
    ## Getting job categories
    labels = []
    for element in soup.find_all(class_='mR5'):
        label = element.text
        ## Longer texts are other 'mR5' entries, not category letters
        if len(label) == 1:
            labels.append(label)
    return labels


In [47]:
def get_category_links(categories_list, url):
    """Build one browse URL per category from a template URL.

    The second-to-last '-'-separated segment of ``url`` (the letter in
    ``...-djt-A-jobs``) is swapped for each category. Only that segment is
    touched; a plain ``str.replace`` on the whole URL would also rewrite any
    other occurrence of the same text elsewhere in the address.

    Args:
        categories_list: Iterable of category labels. A one-character string
            also works, since iterating it yields that single label.
        url: Template URL containing at least one '-'.

    Returns:
        list: One URL string per category, in input order.

    Raises:
        IndexError: If ``url`` contains no '-' and there is at least one category.
    """
    ## Gets links for identified categories
    def _with_category(category):
        segments = url.split('-')
        segments[-2] = category  ## IndexError when there is no such segment
        return '-'.join(segments)

    return [_with_category(category) for category in categories_list]


In [48]:
def get_job_title_and_link(soup):
    """Collect the job titles listed on a category page with their links.

    Args:
        soup: Parsed category page exposing ``find_all(class_=...)``.

    Returns:
        list: One ``(job_title, job_title_link)`` tuple per
        'mR5 browse-job-detail' element: its text and its ``href`` value.
    """
    ## Getting job titles and job links
    pairs = []
    for anchor in soup.find_all(class_='mR5 browse-job-detail'):
        title = anchor.text
        link = anchor['href']
        pairs.append((title, link))
    return pairs


In [49]:
def get_job_links(list):
    """Turn scraped job entries into absolute dice.com URLs.

    Args:
        list: Sequence of entries whose item at index 1 is the link path,
            e.g. ``(job_title, job_title_link)`` tuples. The parameter name
            shadows the builtin inside this function.

    Returns:
        A new list holding 'https://www.dice.com' + link path for every entry.
    """
    ## Generating job links
    root_url = 'https://www.dice.com'
    links = []
    for entry in list:
        links.append(root_url + entry[1])
    return links


In [50]:
def get_num_pages(soup):
    """Return the number of result pages announced by a listing page.

    The count is read as the last whitespace-separated token of the
    'sc-dhi-seds-pagination' element's text.

    Args:
        soup: Parsed listing page exposing ``find(class_=...)``.

    Returns:
        int: The page count, or 1 when the pagination element is missing or
        its text is blank.

    Raises:
        ValueError: If the last token of the pagination text is not an integer.
    """
    ## Getting number of pages
    pagination = soup.find(class_='sc-dhi-seds-pagination')
    if pagination is None:
        return 1 ## No pagination element: single page
    ## split() without arguments ignores leading/trailing whitespace, so a
    ## trailing space or newline no longer produces an empty last token
    tokens = pagination.text.split()
    if not tokens:
        return 1 ## Pagination element present but blank
    return int(tokens[-1])


In [51]:
def get_page_links(num_pages, job_title_url):
    """Build the URLs of result pages 2..num_pages of a job-title listing.

    Args:
        num_pages: Total number of result pages.
        job_title_url: URL of the listing (its first page).

    Returns:
        list: ``job_title_url + '?page=' + N`` for N from 2 to ``num_pages``
        inclusive; empty when ``num_pages`` is below 2.
    """
    ## Generating pagination links
    base_page_link = '?page='
    links = []
    for page_number in range(2, num_pages + 1):
        links.append(job_title_url + base_page_link + str(page_number))
    return links


In [52]:
def get_job_descriptions(soup, stop_words = stopwords.words('english')):
    """Extract and clean the job summaries of one listing page.

    Each summary is normalised as follows: every run of characters other than
    ASCII letters and spaces is replaced by a single space (digits are removed
    too), the text is lower-cased and split on spaces, and tokens that are
    stop words or shorter than two characters are dropped.

    Args:
        soup: Parsed listing page exposing ``find_all(class_=...)``.
        stop_words: Iterable of lower-case words to drop. The default list is
            loaded once, when the function is defined.

    Returns:
        str: The cleaned summaries of the page, joined by single spaces into
        ONE string ('' when no summary element is found).
    """
    ## Retrieving job descriptions
    ## Applying NLP: lowercase, remove non-letters, remove stopwords
    ## Set membership avoids scanning the whole stop-word list for every token
    stop_word_set = set(stop_words)
    cleaned = []
    for element in soup.find_all(class_='job-summary-full p-reg-100 sc-dhi-job-search-job-card-layout-full'):
        tokens = re.sub(r"[^ a-zA-Z]+",' ', element.text).lower().split(' ')
        ## len > 1 also discards the empty strings produced by consecutive spaces
        kept = [word for word in tokens if len(word) > 1 and word not in stop_word_set]
        cleaned.append(' '.join(kept))
    return ' '.join(cleaned)


### Scraping

In [53]:
## Category index to extract
## Index into the list returned by get_categories (1 selected 'B' in the recorded run).
## The same value is used to build job_id values and the output CSV file name.
file=1

In [54]:

## Creating main soup
url = 'https://www.dice.com/jobs/browsejobs/q-title-djt-A-jobs' ## Starting point
main_soup,status = soup_loader(url)

## Getting job categories
## `file` picks ONE category label (a one-character string) out of the list
categories = get_categories(main_soup)[file]
print(categories)

## Getting job category links
## The label is wrapped in a list so the helper receives a list of labels
## instead of relying on iteration over the characters of a string
categories_links = get_category_links([categories], url)

## Initializing containers
jobs_descriptions=[]   ## one dict per job title: job_id, job_title, job_description
job_titles=[]          ## every job title seen, across all categories
base=100000            ## job_id = file*base + key
key=1                  ## incremented before use, so the first job_id ends in ...2

## Testing limits (same values as the hard-coded breaks they replace)
max_pages_per_title = 2       ## result pages read per job title, page 1 included
max_titles_per_category = 3   ## job titles processed per category

for category in categories_links:
    ## Best effort: any error aborts the rest of this category only
    try:
        print('Category:', category)

        ## Getting job titles per category link
        category_soup,status = soup_loader(category)

        ## Getting job titles and job links
        ## Output is a list with array of each element (job_title,job_title_link)
        job_titles_in_category = get_job_title_and_link(category_soup)
        job_titles += [job_title[0] for job_title in job_titles_in_category]

        ## Generating job links
        job_titles_links = get_job_links(job_titles_in_category)

        for l,job_link in enumerate(job_titles_links):
            print(' Job Link:',job_link)
            ## Cleaned text of every page read for this job title (one string per page)
            job_link_descriptions = []

            ## Loading content of job title link
            job_title_soup,status = soup_loader(job_link)

            ## Getting number of pages
            num_pages = get_num_pages(job_title_soup)
            n=1; key+=1
            print(f'       {1} out of {num_pages} pages processed')

            ## append (not +=): += with a string would add it character by character
            job_link_descriptions.append(get_job_descriptions(job_title_soup))

            if num_pages == 1:
                print('       Only 1 page')
            else:
                ## Generating pagination links
                page_links = get_page_links(num_pages, job_link)

                ## Getting job descriptions for page>1 in job title listing
                for page_link in page_links:

                    ## Loading content of job title link
                    job_title_soup,status = soup_loader(page_link)

                    ## Retrieving job descriptions
                    job_link_descriptions.append(get_job_descriptions(job_title_soup))

                    print(f'       {n+1} out of {num_pages} pages processed')
                    ## Hard limiting number of pages to scrape per job title
                    n+=1
                    if n >= max_pages_per_title: break

            ## Save the text of ALL pages read for this title (not just the last page).
            ## The title comes from the current category's own list: job_titles also
            ## holds titles of earlier categories, so indexing it with l would be wrong
            ## from the second category on.
            jobs_descriptions.append({'job_id':file*base+key,'job_title':job_titles_in_category[l][0],'job_description':' '.join(job_link_descriptions)})

            ## Checked for every title, including single-page ones
            if l + 1 >= max_titles_per_category: break ## Testing code
    except Exception as e:
        print(e)

B
Category: https://www.dice.com/jobs/browsejobs/q-title-djt-B-jobs
 Job Link: https://www.dice.com/jobs/q-Business+Analyst-jobs
       1 out of 137 pages processed
       2 out of 137 pages processed
 Job Link: https://www.dice.com/jobs/q-Business+Systems+Analyst-jobs
       1 out of 139 pages processed
       2 out of 139 pages processed
 Job Link: https://www.dice.com/jobs/q-Biometrics+Senior+Consultant%26%2345+Remote%26%2347Delivery+Center+Ro-jobs
       1 out of 15 pages processed
       2 out of 15 pages processed


In [55]:
## Saving as CSV
fieldnames = ['job_id','job_title','job_description']
with open(f'datasets//dice_jobs_{str(file)}.csv', 'w') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(jobs_descriptions)
