# Scrape Wuzzuf Job Website

This code scrapes the Wuzzuf job website, which lists roughly 5K jobs in Egypt, for the assignment "Improving Labor Market Matching in Egypt."

The Wuzzuf job website has detailed stats on the number of job applicants, the number of applicants reviewed by the employer, and the number of rejected applicants.  It is also entirely in English, which makes it easier to synthesize the data.  In addition, postings are extensively tagged with job categories and industry codes.  The data is subsequently output to a CSV file that is further processed and cleaned.

#### Created by Natalie Chun (20 September - 15 October 2017)

In [1]:
#key packages for scraping websites
import urllib
import urllib.request as urlrequest
from bs4 import BeautifulSoup
import sys

#standard packages for holding and analyzing datasets and outputting files
import pandas as pd
import numpy as np
import re
import datetime
import time
import csv

In [2]:
#get Egypt Jobs (only)
def get_WuzuffJobUrls():
    """Collect the URLs of the job postings listed in the Egypt search results.

    Starts at the first page of the Egypt-filtered search and keeps following
    the "next page" link until a page has none, sleeping 5 seconds between
    page requests so the website is not bombarded.

    Returns:
        list of str: unique job URLs (query string removed) in the order they
        were first encountered.  The number of URLs is also printed.
    """
    url = 'https://wuzzuf.net/search/jobs?start=0&filters%5Bcountry%5D%5B0%5D=Egypt'
    url_jobs = []
    # set used only for duplicate detection; url_jobs keeps the order
    seen = set()

    while True:
        # BeautifulSoup consumes the response while it is still open
        with urlrequest.urlopen(urlrequest.Request(url)) as response:
            soup = BeautifulSoup(response, 'html.parser')

        # objective is to get the links from the page and put it in a list to call and run through
        name_box = soup.find('div', attrs={'class': 'content-card card-has-jobs'})
        if name_box is None:
            # keep what has been collected so far instead of crashing
            print("No job listing container found on page:", url)
            break

        #obtain all of the urls associated with different jobs listed on the website (this only needs to be called once)
        for a in name_box.find_all('a', href=True):
            if 'https://wuzzuf.net/jobs/p/' in a['href']:
                job_url = a['href'].split('?')[0]
                if job_url not in seen:
                    seen.add(job_url)
                    url_jobs.append(job_url)

        # get the next set of job listings for this classification
        nextpg = name_box.find('li', attrs={'class': 'pag-next'})
        next_links = nextpg.find_all('a', href=True) if nextpg is not None else []
        if not next_links:
            # no "next" link means this was the last page
            break
        url = next_links[0]['href']
        # sleep so that we do not bombard the website with requests
        time.sleep(5)

    print(len(url_jobs))
    return(url_jobs)

In [None]:
# Crawl the search result pages and keep the job urls in memory.
# The next cell writes them out to a file so we do not have to continually call this if we do not want to
job_urls = get_WuzuffJobUrls()  

In [None]:
#write the urllinks to be stored in a file (one url per line, named after today's date)
datenow = time.strftime("%m%d%Y")
# Write the url text itself as UTF-8.  Wrapping the encoded bytes in str()
# would store the bytes repr (b'...') and turn non-ASCII characters into
# \x.. escape sequences.
with open('Wuzzuf_job_urls'+datenow+'.txt', 'w', newline='', encoding='utf-8') as file:
    for j in job_urls:
        file.write(j)
        file.write("\n")

In [8]:
# Reload the job urls from the file named after today's date.
datenow = time.strftime("%m%d%Y")
job_urls = []
with open('Wuzzuf_job_urls'+datenow+'.txt', 'r') as file:
    # first strip removes a leading b' wrapper, second strip removes the
    # closing quote and the newline
    job_urls.extend(line.strip('''b\'''').strip('''\'\n''') for line in file)
print(job_urls[0:5])

['https://wuzzuf.net/jobs/p/102624-Sales-Agent---Multinational-Insurance-Company-Allianz-Egypt-Cairo-Egypt', 'https://wuzzuf.net/jobs/p/102623-Offshore-Account----Call-Center-Representative---Business-International-Services-Cairo-Egypt', 'https://wuzzuf.net/jobs/p/102622-Marketing-Manager-Elshawa-Trading-Group-Dakahlia-Egypt', 'https://wuzzuf.net/jobs/p/100707-Senior-English-Instructor-Harvest-British-College-Alexandria-Egypt', 'https://wuzzuf.net/jobs/p/102616-photographer-Ben-Soliman-Giza-Egypt']


In [24]:
#this function takes the job urls and calls another function to scrape each job advertisement page
scrape_starttime = datetime.datetime.now()

def scrapeWuzzufPages(urls,cnt,replace):
    """Scrape every url in *urls* and write one CSV row per job posting.

    Rows go to 'Wuzzuf_jobdata_<datenow>.csv'.  A url whose page cannot be
    scraped is reported and skipped; no row is written for it.

    Args:
        urls: iterable of job posting urls to scrape.
        cnt: number of urls already processed before this call; only used as
            the starting value of the progress counter.
        replace: if True the file is overwritten and a header row is written
            first; otherwise rows are appended to the existing file.
    """
    filewrite = 'w' if replace is True else 'a+'
    # time the run from the call, not from when the cell was defined
    started = datetime.datetime.now()

    # utf-8 so non-ASCII text can be written and the file can be read back as utf-8
    with open('Wuzzuf_jobdata_'+datenow+'.csv', filewrite, newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        if replace is True:
            w.writerow(["download_date","job-title", 'job-company-name', "job-company-location", "postdate", "num_applicants","num_vacancies", "num_reviewed",
               "num_shortlist", "num_rejected", 'experience_needed', 'career_level', 'job_type', 'salary',
                        'education_level','gender','travel_frequency','languages','vacancies','job_roles','keywords','requirements','industries'])

        i = cnt
        for url in urls:
            try:
                w.writerow(get_WuzzufJobData(url))
            except Exception as err:
                # best effort: keep going, but say which page was lost and why
                # (Exception, not bare except, so Ctrl-C still stops the run)
                print('Skipped url {}: {!r}'.format(url, err))
            #sleep for a bit too make sure to not hit pages too often
            time.sleep(3)
            i += 1
            if i % 100 == 0:
                print('Processed urls: {}'.format(i))

        print("\nDone!\n Statuses Processed in %s"
              % (datetime.datetime.now() - started))

In [25]:
punctuation = [";",",","'","&"]

#Function keeps requesting page until it succeeds or it hits limit of 5 requests
def request_until_succeed(url, max_attempts=5, delay=5):
    """Fetch *url* and return the response body, retrying on failure.

    After a failed attempt the path component of the url is percent-encoded
    (in case the failure was caused by characters that are not valid in a
    url) and the request is retried after *delay* seconds.

    Args:
        url: address of the page to download.
        max_attempts: maximum number of requests before giving up.
        delay: seconds to wait between two attempts.

    Returns:
        bytes: the body of the first response with status code 200.

    Raises:
        urllib.error.URLError: if no attempt returned status code 200.
    """
    req = urlrequest.Request(url)
    last_error = None
    for attempt in range(1, max_attempts + 1):
        try:
            with urlrequest.urlopen(req) as response:
                if response.getcode() == 200:
                    return response.read()
                last_error = "unexpected status code %s" % response.getcode()
        except Exception as err:
            last_error = err
            print("Error for URL %s: %s" % (url, datetime.datetime.now()))
            try:
                urlsplit = list(urllib.parse.urlsplit(url))
                # '%' is kept safe so an already encoded path is not encoded twice
                urlsplit[2] = urllib.parse.quote(urlsplit[2], safe='/%')
                url = urllib.parse.urlunsplit(urlsplit)
                req = urlrequest.Request(url)
            except ValueError:
                # url could not be rebuilt; retry with the previous request
                print("Exception")
        if attempt < max_attempts:
            time.sleep(delay)

    raise urllib.error.URLError(
        "giving up on %s after %d attempts: %s" % (url, max_attempts, last_error))

#this page scrapes individual job advertisement pages
def get_WuzzufJobData(urlname):
    """Download one job advertisement page and return its data as a CSV row.

    The main job card and its post date are required; a page without them
    raises.  The summary, about-job, requirements and industries sections are
    optional and fall back to "NA" or an empty list when absent.

    Args:
        urlname: url of the job advertisement page.

    Returns:
        list: 23 values in the column order of the header row written by
        scrapeWuzzufPages: download date, title, company, location, post date,
        applicant statistics, summary fields, roles, keywords, requirements
        and industries.  Several text fields are utf-8 encoded bytes.
    """
    response = request_until_succeed(urlname)
    soup = BeautifulSoup(response, 'html.parser')

    #obtain main job data
    mainjobdata = soup.find('div', attrs={'class': 'job-main-card content-card'})
    jobinfo = {}
    for d in mainjobdata.find_all(['h1','a','span']):
        css = d.get('class')
        # only the first css class of a tag identifies the field
        if css and css[0] in ['job-title','job-company-name','job-company-location']:
            jobinfo[css[0]] = d.get_text().strip().encode('utf-8')

    #get stats on applicants (0 when the page does not show the figure)
    applicants = mainjobdata.find_all('div', attrs={'class': 'applicants-num'})
    num_applicants = applicants[0].get_text() if applicants else 0

    vacancies = mainjobdata.find_all('span', attrs={'class': 'vacancies-num'})
    num_vacancies = vacancies[0].get_text() if vacancies else 0

    # the stat boxes are read by position: seen, shortlisted, rejected
    stats = mainjobdata.find_all('div', attrs={'class': 'applicants-stat-num'})
    num_seen, num_shortlist, num_rejected = [
        stats[k].get_text() if k < len(stats) else 0 for k in range(3)]

    #get date when posted and download date
    post_date = mainjobdata.find('p', attrs={'class': 'job-post-date'})
    # NOTE(review): with %H the am/pm suffix (%p) is matched but does not
    # change the parsed hour -- confirm the title uses a 24-hour clock here.
    try:
        dateval = datetime.datetime.strptime(post_date['title'],'%A, %B %d, %Y at %H:%M%p')
    except ValueError:
        dateval = datetime.datetime.strptime(post_date['title'],'%A, %B %d, %Y at%I:%M%p')
    dateval = dateval.strftime('%Y-%m-%d %H:%M') # best time format for spreadsheet programs

    #obtain job summary information
    jobsumm = soup.find('div', attrs={'class': 'row job-summary'})
    jobsummdata = jobsumm.find_all(['dl']) if jobsumm is not None else []
    for d in jobsummdata:
        temp = re.sub(r'\s+',' ',d.get_text()).strip().split(":")
        if len(temp) < 2:
            # entry without a "label: value" pair -- nothing to record
            continue
        name = re.sub(r'\s',"_",temp[0].lower())
        value = temp[1].strip()
        if name in ['languages']:
            jobinfo[name] = value.split(',')
        elif name in ['salary']:
            if 'Negotiable' in value.split(','):
                jobinfo[name] = value.split(',')
            else:
                # drop commas from the salary text
                jobinfo[name] = [value.replace(',','')]
        else:
            jobinfo[name] = value

    #these columns are not consistent across jobs so need to take this into account
    columns = ['experience_needed','career_level','job_type','salary','education_level','gender','travel_frequency','languages','vacancies']
    for c in columns:
        jobinfo.setdefault(c, "NA")

    #obtain the job roles listed in the about-job card
    jobcard = soup.find('div', attrs={'class': "about-job content-card"})
    data = jobcard.find_all('div', attrs={'class': "labels-wrapper"}) if jobcard is not None else []
    jobinfo['roles'] = [role.get_text().strip() for d in data for role in d.find_all(['a'])]

    #obtain job requirements, key words, and industry indicators
    jobreqs = soup.find('div', attrs={'class': "job-requirements content-card"})
    keywords = []
    reqs = []
    if jobreqs is not None:
        data = jobreqs.find_all('meta', content=True)
        if data:
            # keywords come comma separated in the first meta tag
            temp = data[0]['content'].replace('◌ِ','')
            keywords = [t.encode('utf-8') for t in temp.split(', ')]

        for d in jobreqs.find_all('li'):
            temp = d.get_text().lower().strip('.')
            # remove every listed punctuation mark, not only ';'
            for p in punctuation:
                temp = temp.replace(p,'')
            reqs.append(temp.encode('utf-8'))
    jobinfo['keywords'] = keywords
    jobinfo['requirements'] = reqs

    industries = soup.find('div', attrs={'class': "industries labels-wrapper"})
    inds = []
    if industries is not None:
        inds = [ind.get_text().strip().encode('utf-8') for ind in industries.find_all(['a'])]
    jobinfo['industries'] = inds

    # now let us return the dictionary entries to write to a csv file.
    #Note that we may need to split so we do not have problem with commas
    job_data = [datenow,jobinfo['job-title'],jobinfo['job-company-name'],jobinfo['job-company-location'],dateval,num_applicants,num_vacancies,num_seen,
               num_shortlist,num_rejected,jobinfo['experience_needed'],jobinfo['career_level'],jobinfo['job_type'],jobinfo['salary'],
               jobinfo['education_level'],jobinfo['gender'],jobinfo['travel_frequency'],jobinfo['languages'],jobinfo['vacancies'],jobinfo['roles'],jobinfo['keywords'],jobinfo['requirements'],jobinfo['industries']]

    return(job_data)

In [26]:
# error test of the data: scrape a single posting and show the row it produces
test_url = job_urls[640]
print(test_url)
print(get_WuzzufJobData(test_url))

https://wuzzuf.net/jobs/p/101927-Social-Media-Secretary-Auto-Reda-Hamza-Cairo-Egypt
['11292017', b'Social Media Secretary', b'Auto Reda Hamza', b'Heliopolis, Cairo', '2017-11-25 16:22', '30', '1', '10', '3', 0, 'More than 3 years', 'Experienced (Non-Manager)', 'Full Time', ['Negotiable', ' مرتبات وحوافز'], 'NA', 'NA', 'NA', 'NA', '1 open position', ['Administration', 'Media/Journalism/Publishing'], [b'Admin', b'Administration', b'Secretary', b'Social Media', b'Media'], [], [b'Automotive']]


In [27]:
#scrape the data.  either start from beginning or just append to existing file
replace = False

if replace:
    scrapeWuzzufPages(job_urls,0,True)
else:
    # Resume: count the rows already in today's csv and continue from there.
    # NOTE(review): this assumes one csv row per url; urls that were skipped
    # during scraping wrote no row, so the resume position can lag behind.
    data = pd.read_csv('Wuzzuf_jobdata_'+datenow+'.csv',encoding='utf-8',usecols=['job-title'])
    print(len(data))
    if len(data) < len(job_urls):
        print(job_urls[len(data)])
        scrapeWuzzufPages(job_urls[len(data):],len(data),False)
    else:
        # indexing job_urls[len(data)] here would raise IndexError
        print("Nothing left to scrape: the csv already has a row for every url")

640
https://wuzzuf.net/jobs/p/101927-Social-Media-Secretary-Auto-Reda-Hamza-Cairo-Egypt
Processed urls: 700
Processed urls: 800
Processed urls: 900
Processed urls: 1000
Processed urls: 1100
Processed urls: 1200
Processed urls: 1300
Processed urls: 1400
Processed urls: 1500
Processed urls: 1600
Processed urls: 1700
Processed urls: 1800
Processed urls: 1900
Processed urls: 2000
Processed urls: 2100
Processed urls: 2200
Processed urls: 2300
Processed urls: 2400
Processed urls: 2500
Processed urls: 2600
Processed urls: 2700
Processed urls: 2800
Processed urls: 2900
Processed urls: 3000
Processed urls: 3100
Processed urls: 3200
Processed urls: 3300
Processed urls: 3400
Processed urls: 3500
Processed urls: 3600
Processed urls: 3700
Processed urls: 3800
Processed urls: 3900
Processed urls: 4000
Processed urls: 4100
Processed urls: 4200
Processed urls: 4300
Processed urls: 4400
Processed urls: 4500
Processed urls: 4600
Processed urls: 4700
Processed urls: 4800
Processed urls: 4900
Processed ur