# Scrape OLX Website

This was created for the paper "Improving Labor Market Matching in Egypt".  It does an initial scrape of the OLX websites which are seemingly one of the most comprehensive sites for online job postings that are available in Egypt.  This code scrapes the following OLX data:
1. Aggregate counts by region of ad postings
2. Aggregate counts by region of job ad postings
3. Counts by region and sector of job ad postings

The data is subsequently output into a csv file that can be read in for analysis.

Note:  In the future, consideration may be given to scraping the individual job ads which contain more detailed information on a) desired level of education b) job data c) experience d) position type.  However, in the initial assessment the difficulty of translating from Arabic into English and scraping the individual job ads was seen as too time consuming within the time that was available and allocated for this assignment.  Job ads seemingly are quite messy and a significant amount of time would be needed to clean the fields and obtain consistency.

The following package may be potentially useful for automatic translation (google)
https://gist.github.com/jseabold/1473363

In [1]:
import urllib.request as urlrequest
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import datetime
import time
import csv
datenow = time.strftime("%m%d%Y")

In [2]:
#this URL has the regions in which it is possible to grab all the locational based data for the jobs
def get_OLXregiondata():


    url = 'https://olx.com.eg/en/sitemap/regions/'
    req = urlrequest.Request(url)
    response = urlrequest.urlopen(req)
    soup = BeautifulSoup(response, 'html.parser')
    #print(soup)
    
    name_box = soup.find('div', attrs={'class': 'content text'})
    regions = name_box.find_all('div', attrs={'class':'bgef pding5_10 marginbott10 margintop20 clr'})
    subregions = name_box.find_all('div',attrs={'class':"clr marginbott10"})
    #print(len(regions))
    #print(len(subregions))

    regionname = []
    totalposts = []
    subregname = []
    subposts = []
    fregname = []
    fsubregname = []
    data = []
    
    for i, subreg in enumerate(subregions):
    #print(regions[i].get_text().strip())
        region = regions[i].get_text().strip()
        temp = region.split(' (')
        regionname.append(temp[0])
        totalposts.append(temp[1].strip(')')) 
        fregname.append(re.sub('\s+(\+\s)?','-',regionname[i].lower()))    
        #print(subreg)
        text = subreg.find_all('li')
        #print(text)
        for t in text:
            temp = t.get_text().strip()
            temp = temp.replace('\n','').split('(')
            #print(temp)
            subregname = temp[0].strip()
            subposts = temp[1].strip(')')
            #print(subregname,subposts)
            fsubregname = re.sub('''[\s(\+\s)?|\'|\.\s]''','-',subregname.lower())
            fsubregname = re.sub('[-](-)?(-)?','-',fsubregname)
            data.append([datenow,regionname[i],fregname[i],totalposts[i],subregname,fsubregname,subposts])
            #print("Region {}, Total Posts {}, Subregion {}, Subposts {}".format(regionname,totalposts,subregname,subposts))
            #want to export data to different csv
      
    #write out the data for 359 districts
    with open('OLX_regiondata_'+datenow+'.csv', 'w', newline='') as file:
        w = csv.writer(file)
        w.writerow(["download_date","region",'fregname','totalposts',"subregion",'fsubregname',"subposts"])
        for d in data:
            w.writerow(d) 
    file.close()

In [3]:
get_OLXregiondata()

In [4]:
#now our objective is to get the sectoral job data for each region-area (one million job postings)
dateval = '11302017'
data = pd.read_csv('OLX_regiondata_'+dateval+'.csv')
print(data.head())

   download_date region fregname  totalposts      subregion    fsubregname  \
0       11302017  Aswan    aswan        3278    Abou Simbel    abou-simbel   
1       11302017  Aswan    aswan        3278  Abou al-Reish  abou-al-reish   
2       11302017  Aswan    aswan        3278     Aswan City     aswan-city   
3       11302017  Aswan    aswan        3278       Basiliah       basiliah   
4       11302017  Aswan    aswan        3278          Daraw          daraw   

   subposts  
0        24  
1        54  
2      2311  
3         9  
4        72  


In [5]:
def get_OLXJobUrls(url):
    
    sector = {}
    href = {}
    
    req = urlrequest.Request(url)
    try:
        response = urlrequest.urlopen(req)
    #certain regions have no job postings
    except:
        return([sector,href])
        
        
    soup = BeautifulSoup(response, 'html.parser')

    #get counts of number of jobs in different areas
    name_box = soup.find_all('div', attrs={'class': 'wrapper'})
    
    for name in name_box:
        #print(name)
        newnames = name.find_all('a', attrs={'class' : 'topLink tdnone '})
        if len(newnames) > 0:
            for i, n in enumerate(newnames):
                #print(n)
                #print(n['href'])
                #print(n.find(href=True))
                #href.append(n.find('a', href=True))
                sect = n.find('span', attrs='link').get_text().strip()
                cnt = n.find('span', attrs='counter nowrap').get_text().strip().replace(',','')
                #export a tuple rather than dictionary
                sector[sect] = cnt
                href[sect] = n['href']
    #print(sector)
    #print(href)
    return([sector,href])

In [6]:
# lets first obtain the full country data to pull-in data on the different job categories
sector, href = get_OLXJobUrls('https://olx.com.eg/en/jobs-services/')

In [7]:
url = 'https://olx.com.eg/en/jobs-services/'+'ramses-ramses-extension'+'/'
print(url)
get_OLXJobUrls(url)

https://olx.com.eg/en/jobs-services/ramses-ramses-extension/


[{'Accounting': '7',
  'Architecture - Engineering': '1',
  'Construction': '2',
  'Consulting': '2',
  'Education': '3',
  'Executive': '1',
  'Hospitality': '1',
  'IT - Telecom': '1',
  'Jobs Wanted': '30',
  'Marketing - PR': '8',
  'Medical - Health': '3',
  'Other': '43',
  'Retail': '8',
  'Sales': '7',
  'Secretarial': '2'},
 {'Accounting': 'https://olx.com.eg/en/jobs-services/accounting/ramses-ramses-extension/',
  'Architecture - Engineering': 'https://olx.com.eg/en/jobs-services/architectureengineering/ramses-ramses-extension/',
  'Construction': 'https://olx.com.eg/en/jobs-services/construction/ramses-ramses-extension/',
  'Consulting': 'https://olx.com.eg/en/jobs-services/consulting/ramses-ramses-extension/',
  'Education': 'https://olx.com.eg/en/jobs-services/education/ramses-ramses-extension/',
  'Executive': 'https://olx.com.eg/en/jobs-services/executive/ramses-ramses-extension/',
  'Hospitality': 'https://olx.com.eg/en/jobs-services/hospitality/ramses-ramses-extension/

In [8]:
# now we want to loop through the key industries above and regions to investigate the counts of postings
# under each heading
# an easier way is to just loop through the general regions  
    
def write_OLXregionjobdata():
    
    #sector names
    sectornames = []
    for key, val in sector.items():
        sectornames.append(key)
    
    #write out the data for 359 districts
    with open('OLX_regionjobdata_'+datenow+'.csv', 'w', newline='') as file:
        w = csv.writer(file)
        w.writerow(["region","subregname"]+sectornames)

        #loop through 365 qism areas to get job data
        for i, reg in data.iterrows():
            fsubregname = reg['fsubregname']
            url = 'https://olx.com.eg/en/jobs-services/' + fsubregname + '/'
            subregsector, subreghref = get_OLXJobUrls(url)
            #now want to output this data into a csv file
            cntdata = []
            for key, val in sector.items():
                if key in subregsector:
                    cntdata.append(int(subregsector[key]))
                else:
                    cntdata.append(0)
            rowdata = [reg['region'],reg['subregion']]+cntdata
            if i % 20 == 0:
                print(rowdata)
            w.writerow(rowdata)
            # sleep to make sure not too many requests are being made
            time.sleep(1)
    file.close()

In [9]:
write_OLXregionjobdata()

['Aswan', 'Abou Simbel', 0, 0, 0, 0, 7, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Asyut', 'New Assiut', 3, 11, 27, 5, 84, 1, 14, 0, 0, 3, 0, 0, 1, 0, 0, 2, 0, 2]
['Alexandria', 'Bolkly', 2, 0, 8, 4, 11, 4, 1, 2, 0, 0, 1, 0, 2, 0, 1, 0, 0, 5]
['Alexandria', 'San Stefano', 2, 18, 5, 4, 24, 5, 9, 4, 0, 0, 2, 0, 0, 1, 1, 0, 0, 4]
['Red Sea', 'Qusair', 0, 1, 1, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0]
['Beheira', 'Wadi al-Natrun', 0, 0, 1, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]
['Giza', 'Kerdasa', 2, 0, 9, 1, 20, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1]
['Giza', 'Warraq', 26, 49, 168, 14, 251, 13, 23, 9, 17, 6, 8, 1, 18, 2, 6, 2, 12, 9]
['Suez', 'Arbaeen', 1, 2, 1, 1, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['Sharqia', 'Qareen', 0, 0, 1, 0, 25, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0]
['Cairo', 'Ain Shams', 78, 93, 300, 41, 670, 46, 62, 42, 28, 10, 29, 1, 34, 10, 9, 4, 19, 38]
['Cairo', 'New Cairo', 119, 212, 277, 140, 1075, 66, 170, 122, 57, 117, 49, 8, 116, 16, 59, 20, 41, 47]
['Q

## Scrape individual job advertisement pages

In [16]:
#gets all the job page urls from subregion-sector listings
def get_OLXJobPageUrls(url):
    
    urllist = []
    req = urlrequest.Request(url)
    response = urlrequest.urlopen(req)
    soup = BeautifulSoup(response, 'html.parser')
    
    #now find out the total number of pages available
    try:
        nextpage = soup.find('div',attrs={'class':'pager rel clr'})
        temp = nextpage.find('input',attrs={'type':"submit"})['class']
        totalpages = re.search(r'(\d+)',str(temp[1])).group(1)
        #print(temp,totalpages)
    except:
        totalpages = 1
    
    for i in range(1,int(totalpages)+1):
        #print("Enter")
        newurl = url
        if i > 1:
            newurl = url + '/?page='+str(i) 
        req = urlrequest.Request(newurl)
        response = urlrequest.urlopen(req)
        soup = BeautifulSoup(response, 'html.parser')
    
        #<div class="ads__item " onclick="window.location = 'https://olx.com.eg/ad/15-ID8nT4F.html'">
        adlinks = soup.find_all('div',attrs={'class':'ads__item__info'})
        #print(adlinks)
 
        #get all of the job ad page links
        for ad in adlinks:
            urllist.append(ad.find('a')['href'])
        #print(len(urllist))
        time.sleep(1)
    return(urllist)

In [10]:
#create file listing of all job page urls (should be about ~100K)

def write_OLXJobUrls(sector,datast=0):
    #sector names
    sectornames = []
    for key, val in sector.items():
        if key != 'Jobs Wanted':
            sectornames.append(key)
        
    #loop through 365 qism areas to get job data
    for i, reg in data[datast:].iterrows():
            
        print(reg['fsubregname'])
        #write each regional data to a separate file to reduce having to re-do downloads in case of failure
        with open('OLX_joburls_'+reg['fsubregname']+'_'+dateval+'.csv', 'w', newline='') as file:
            w = csv.writer(file)
            w.writerow(["sector","region","subregion","jobpageurl"])
            
            fsubregname = reg['fsubregname']
            url = 'https://olx.com.eg/en/jobs-services/' + fsubregname + '/'
            subregsector, subreghref = get_OLXJobUrls(url)
            
            #now want to grab each of the urls for each subreg and subsector
            for subregsector, href in subreghref.items():
                #print(href)
                urlpages = get_OLXJobPageUrls(href)
                #print(len(urlpages))
                for urlp in urlpages:
                    w.writerow([subregsector,reg['region'],reg['subregion'],urlp])
            if i % 20 == 0:
                print(subregsector,reg['region'],reg['subregion'],urlp)
        file.close()

In [20]:
print(sector)
write_OLXJobUrls(sector,datast=0)

{'Education': '4664', 'Hospitality': '12085', 'Jobs Wanted': '17559', 'Marketing - PR': '6856', 'Other': '55318', 'Retail': '3906', 'Sales': '7532', 'Secretarial': '3871', 'Accounting': '3154', 'Architecture - Engineering': '2563', 'Art - Design': '1721', 'Business Development': '225', 'Construction': '2692', 'Consulting': '589', 'Executive': '2064', 'HR - Recruiting': '790', 'IT - Telecom': '2853', 'Medical - Health': '2506'}
saqqara


KeyboardInterrupt: 

In [17]:
print(data[270:])

     download_date          region        fregname  totalposts  \
270       11302017      New Valley      new-valley         363   
271       11302017      New Valley      new-valley         363   
272       11302017      New Valley      new-valley         363   
273       11302017      New Valley      new-valley         363   
274       11302017      New Valley      new-valley         363   
275       11302017      New Valley      new-valley         363   
276       11302017       Beni Suef       beni-suef        7282   
277       11302017       Beni Suef       beni-suef        7282   
278       11302017       Beni Suef       beni-suef        7282   
279       11302017       Beni Suef       beni-suef        7282   
280       11302017       Beni Suef       beni-suef        7282   
281       11302017       Beni Suef       beni-suef        7282   
282       11302017       Beni Suef       beni-suef        7282   
283       11302017       Beni Suef       beni-suef        7282   
284       

In [9]:
url = 'https://olx.com.eg/en/jobs-services/retail/alexandria/'
urlsall = getOLXJobPageUrls(url)
print(urlsall)

NameError: name 'getOLXJobPageUrls' is not defined

In [11]:
#extract data from individual job ads 
import re
import googletrans

def request_until_succeed(url):
    req = urlrequest.Request(url)
    count = 1
    while count <= 5:
        try: 
            response = urlrequest.urlopen(req)
            if response.getcode() == 200:
                return(response)
        except Exception:
            print("Exception")
            time.sleep(1)
            print("Error for URL %s: %s" % (url, datetime.datetime.now()))
        count+=1
    return(None)

def get_OLXjobdata(url,sector,region,subregion):
    
    fields = ['Experience Level','Employment Type','Education Level','Type','Compensation']
    fielddata = {}
    
    #print(url)
    response = request_until_succeed(url)
    
    soup = BeautifulSoup(response, 'html.parser')
    
    tempurl = url.split('ad/')[1]
    urlshort = tempurl.replace('.html','')
    
    #get content for ad posting data and check if available as some are no longer available
    addata = soup.find('span',attrs={'class':'pdingleft10 brlefte5'})
    
    ### note want to add in the actual time download if we are to use the page views as proxy    
    datetimenow = time.strftime('%Y-%m-%d %H:%M')
    
    if addata is not None:
        #print(addata)
        addata = addata.get_text().strip()
        m = re.search(r"at (\d+:\d+, \d+ \w+ \d+), Ad ID: (\d+)",addata)
        date = m.group(1)
        adid = m.group(2)
        dateval = datetime.datetime.strptime(date,'%H:%M, %d %B %Y')
        adpostdate = dateval.strftime('%Y-%m-%d %H:%M') # best time format for spreadsheet programs
        #print(adid,adpostdate)

        #get main content related to job
        name_box = soup.find_all('div', attrs={'class': "clr descriptioncontent marginbott20"})
    
        for name in name_box:
            #print(name)
            newnames = name.find_all('td', attrs={'class' : 'col'})
            #print(newnames)
            for name in newnames:
                cat = name.find('th').get_text().strip()
                catval = name.find('td').get_text().strip()
                fielddata[cat] = catval
                #print(cat)
                #print(catval)

        #note that not all categories are always included in a job advertisement so we have to make sure there are contingencies
        for f in fields:
            if f not in fielddata:
                fielddata[f] = np.NAN
        
        content = soup.find('div', attrs={'class':"clr", 'id':'textContent'}).get_text().strip()
        translator = googletrans.Translator()
        try:
            translated = translator.translate(content)
            jobcontent = translated.text
        except:
            jobcontent = ""
    
        #format translated text 
        jobcontent = jobcontent.replace('\r',' ').replace('\n','>').encode('utf-8')
        #print(content,jobcontent)

        views = soup.find_all('div',attrs={'class':'pdingtop10'})
        #print(views)
        for v in views:
            if 'Views' in str(v):
                m = re.search(r"Views:<strong>(\d+)</strong>", str(v))
                num_views = m.group(1)
                #print(num_views)

        return(datetimenow, adid,urlshort,adpostdate,sector,region,subregion,jobcontent,num_views,fielddata['Experience Level'],fielddata['Employment Type'],
                fielddata['Education Level'],fielddata['Type'],fielddata['Compensation'])

    else:
        return(datetimenow, np.NAN,urlshort,np.NAN,sector,region,subregion,np.NAN,np.NAN,np.NAN,np.NAN,np.NAN,np.NAN,np.NAN)

In [12]:
get_OLXjobdata('https://olx.com.eg/en/ad/-ID8isW5.html',np.NAN,np.NAN,np.NAN)

('2017-12-07 19:18',
 '122611763',
 '-ID8isW5',
 '2017-12-03 17:08',
 nan,
 nan,
 nan,
 b'Production workers are required in a plastic factory>Salary 3000>There is accommodation for expatriates>Health and social insurance for employees and families>Upgrades and quick incentives>\xc2\xa0\xd9\x84\xd9\x84\xd8\xaa\xd9\x88\xd8\xa7\xd8\xb5\xd9\x84 \xd9\x81\xd9\x88\xd8\xb1\xd8\xb1\xd8\xb1\xd8\xb1\xd8\xb1\xd8\xb1\xd8\xa7 \xd8\xa7\xd9\x84\xd8\xa7\xd8\xb3\xd8\xaa\xd8\xa7\xd8\xb0\xd9\x87 \xd8\xa7\xd9\x86\xd8\xac\xd9\x8a olol 634 - show phone -',
 '592',
 'Entry level',
 'Full-time',
 'Diploma',
 'Employer',
 '3,000')

In [13]:
dateval = '12012017'

def write_OLXjobpagedata(datast=0):
    
    #sector names
    sectornames = []
    for key, val in sector.items():
        sectornames.append(key)
    
    urllist = {}
    
    op = 'a'
    if datast == 0:
        op = 'w'
    
    #write out the data for each job page
    with open('OLX_jobpagedata_'+dateval+'.csv', op, newline='') as file:
        w = csv.writer(file)
        if datast == 0:
            w.writerow(["downloaddate","id","url","postdate","sector","region","subregion","jobcontent","num_views","explevel","emptype","educlevel","employertype","compensation"])
    
        #loop through 365 qism areas to call the files containing the different urls
        for i, reg in data[datast:].iterrows():
            fsubregname = reg['fsubregname']
            print(fsubregname)
            urllist[fsubregname]=pd.read_csv('OLX_joburls_'+reg['fsubregname']+'_'+dateval+'.csv')
            for j, row in urllist[fsubregname].iterrows():
                temp = row['jobpageurl'].split('ad/')[1]
                newurl = 'https://olx.com.eg/en/ad/'+temp
                rowdata = get_OLXjobdata(newurl,row['sector'],row['region'],row['subregion'])
                w.writerow(rowdata)
                time.sleep(1)   
  
    file.close()

In [15]:
#try to resurvey giza-district (114, 127).  Last download sharq district.
write_OLXjobpagedata(datast=343)

fouh
hamoul
kafr-al-sheikh-city
motobas
qaleen
riyadh
sidi-salem
alamein
barany
dabaa
hammam
marina-el-alamein
marsa-matrouh
nagela
north-coast
salloum
siwa


In [114]:
# test the above function on one of the urls
url = 'https://olx.com.eg/en/ad/1600-ID8nedy.html'
region = np.NAN
subregion = np.NAN
sector = np.NAN
print(get_OLXjobdata(url,sector,region,subregion))

https://olx.com.eg/en/ad/1600-ID8nedy.html
123746888 2017-11-26 21:20
مطعم بشارع فؤاد - محطة الرمل - الاسكندرية
اجازة يوم اسبوعيا 
العمل 9 ساعات 
السن اقل من 40 عام
يشترط التفرغ
ارسل رسالة تحتوي على البيانات وسوف يتم الاتصال لتحديد ميعاد المقابلة Fouad Street Restaurant - Raml Station - Alexandria >Weekday vacation >Working 9 hours >Age less than 40 years >Full time required >Send a message containing the data and will be contacted to determine the appointment
2520
('123746888', '2017-11-26 21:20', nan, nan, nan, 'Fouad Street Restaurant - Raml Station - Alexandria >Weekday vacation >Working 9 hours >Age less than 40 years >Full time required >Send a message containing the data and will be contacted to determine the appointment', '2520', 'Entry level', 'Full-time', 'Diploma', 'Employer', '1,600')


In [116]:
print(sector, href)

nan {'Education': 'https://olx.com.eg/en/jobs-services/education/', 'Hospitality': 'https://olx.com.eg/en/jobs-services/hospitality/', 'Jobs Wanted': 'https://olx.com.eg/en/jobs-services/jobs-wanted/', 'Marketing - PR': 'https://olx.com.eg/en/jobs-services/marketingpr/', 'Other': 'https://olx.com.eg/en/jobs-services/jobs-other/', 'Retail': 'https://olx.com.eg/en/jobs-services/retail/', 'Sales': 'https://olx.com.eg/en/jobs-services/sales/', 'Secretarial': 'https://olx.com.eg/en/jobs-services/secretarial/', 'Accounting': 'https://olx.com.eg/en/jobs-services/accounting/', 'Architecture - Engineering': 'https://olx.com.eg/en/jobs-services/architectureengineering/', 'Art - Design': 'https://olx.com.eg/en/jobs-services/artdesign/', 'Business Development': 'https://olx.com.eg/en/jobs-services/business-development/', 'Construction': 'https://olx.com.eg/en/jobs-services/construction/', 'Consulting': 'https://olx.com.eg/en/jobs-services/consulting/', 'Executive': 'https://olx.com.eg/en/jobs-serv