# Web Scraping

In this notebook I will walk you through the process of web scraping using Python. I will use the following libraries:

In [1]:
import pandas as pd # to work with dataframes
import requests # to send http requests
from bs4 import BeautifulSoup # to parse html

I will scrape the website [Wuzzuf](https://wuzzuf.net/) to get the data science jobs in Egypt.


In [2]:
req = requests.get('https://wuzzuf.net/search/jobs/?a=hpb%7Cspbg&q=data%20science')
req

<Response [200]>

In [3]:
content = req.content

In [4]:
soup = BeautifulSoup(content, 'html.parser')

In [5]:
jobs = soup.find_all("div", class_='css-pkv5jc')


In [6]:
len(jobs)

15

In [7]:
jobs[0]

<div class="css-pkv5jc"><a href="https://wuzzuf.net/jobs/careers/Care-Dental-Egypt-27404" rel="noreferrer" target="_blank"><style data-emotion="css 17095x3">.css-17095x3{position:absolute;right:0;top:0;width:60px;height:60px;object-fit:contain;object-position:center center;}</style></a><style data-emotion="css laomuu">.css-laomuu{padding-right:60px;}</style><div class="css-laomuu"><style data-emotion="css m604qf">.css-m604qf{font-size:16px;font-weight:600;font-style:normal;letter-spacing:-0.4px;line-height:24px;color:#0055D9;margin:0;}</style><h2 class="css-m604qf"><style data-emotion="css o171kl">.css-o171kl{-webkit-text-decoration:none;text-decoration:none;color:inherit;}</style><a class="css-o171kl" href="https://wuzzuf.net/internship/1kma8fGoLuav-AI-Data-Science-Python-Developer-Intern-Sequel-Solutions-Cairo-Egypt" rel="noreferrer" target="_blank">AI &amp; Data Science Python Developer Intern</a></h2><style data-emotion="css d7j1kk">.css-d7j1kk{margin-bottom:8px;overflow:hidden;tex

In [8]:
job_title = jobs[0].find("h2", class_='css-m604qf').text
job_title

'AI & Data Science Python Developer Intern'

In [9]:
company_name = jobs[0].find("div", class_ = "css-d7j1kk").text.split(" - ")[0]
company_name


'Sequel Solutions'

In [10]:
country = jobs[0].find("span", class_ = "css-5wys0k").text.split(", ")[1].replace(" ", "")
country


'Egypt'

In [11]:
city = jobs[0].find("span", class_ = "css-5wys0k").text.split(", ")[0]
city

'Cairo'

In [12]:
date_of_offer = jobs[0].find("div", class_ = "css-4c4ojb").text
date_of_offer

'3 days ago'

In [13]:
job_type = jobs[0].find("span", class_="css-1ve4b75 eoyjyou0").text
job_type

'Internship'

In [14]:
work_location = jobs[0].find("span", class_="css-o1vzmt eoyjyou0").text
work_location

'Remote'

In [15]:
experience_level = jobs[0].find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[0]
experience_level

'Student'

In [16]:
yrs_exp = jobs[4].find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[1].split(" Yrs ")[0]
yrs_exp

'1 - 2'

In [17]:
industry_and_skills = ", ".join(jobs[0].find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[2:])
industry_and_skills

'IT/Software Development, Engineering - Telecom/Technology, Computer Science, Algorithms, Information Technology (IT), Python, Software, Software Development, Software Engineering, Programming'

In [19]:
wuzzuf = pd.DataFrame(columns=["job_title", "company_name", "country", "city", "job_type",
                               "work_location", "experience_level", "yrs_exp", "industry_and_skills"])

for i in range(100000):
    req_for = requests.get(f'https://wuzzuf.net/search/jobs/?a=hpb%7Cspbg&q=data%20science&start={i}')
    content_for = req_for.content
    soup_for = BeautifulSoup(content_for, 'html.parser')
    jobs_for = soup_for.find_all("div", class_='css-pkv5jc')
    if len(jobs_for) == 0:
        break
        
    
    for job in jobs_for:
        
        job_title = job.find("h2", class_='css-m604qf').text
        
        company_name = job.find("div", class_ = "css-d7j1kk").text.split(" - ")[0]
        
        country_select = job.find("span", class_ = "css-5wys0k").text.split(", ")[1].replace(" ", "")
        if country_select in ["Cairo", "Giza", "Sharqia", "Alexandria", "Damietta", "Dakahlia", "Fayoum", "Gharbia", 
                              "Ismailia", "KafrElSheikh", "Luxor", "Matruh", "Minya", "Monufya", "NewValley", "NorthSinai",
                              "PortSaid", "Qalubia", "Qena", "RedSea", "Sohag", "SouthSinai", "Suez"]:
            country = "Egypt"
            city = country_select
        else:
            country = country_select
            city = job.find("span", class_ = "css-5wys0k").text.split(", ")[0]
        
        job_type = job.find("span", class_="css-1ve4b75 eoyjyou0").text
        
        try:
            work_location = job.find("span", class_="css-o1vzmt eoyjyou0").text
        except:
            work_location = "Not Specified"
        
        experience_level = job.find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[0]
        
        if job.find("div", class_="css-1lh32fc").next_sibling.text.find("Yrs") != -1:
            yrs_exp = job.find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[1].split(" Yrs ")[0]
            industry_and_skills = ", ".join(job.find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[2:])
        else:
            yrs_exp = "Not Specified"
            industry_and_skills = ", ".join(job.find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[1:])
        
        wuzzuf.loc[len(wuzzuf)] = [job_title, company_name, country, city, job_type, work_location, experience_level, yrs_exp, industry_and_skills]
        
print("All Done")

All Done


In [20]:
wuzzuf

Unnamed: 0,job_title,company_name,country,city,job_type,work_location,experience_level,yrs_exp,industry_and_skills
0,AI & Data Science Python Developer Intern,Sequel Solutions,Egypt,Cairo,Internship,Remote,Student,0 - 1,"IT/Software Development, Engineering - Telecom..."
1,Senior Accountant,Data Science,Egypt,Giza,Full Time,On-site,Experienced,3 - 5,"Accounting/Finance, Administration, Banking, A..."
2,Telesales Representative & Data Collector,Diamond,Egypt,Giza,Full Time,On-site,Experienced,3 - 5,"R&D/Science, Sales/Retail, Sales Field, Telesa..."
3,Data Management Supervisor,Nahdet Misr Publishing Group,Egypt,Giza,Full Time,On-site,Experienced,6 - 8,"Operations/Management, Sales/Retail, Computer ..."
4,Data Analyst,Confidential,Egypt,Giza,Full Time,Hybrid,Entry Level,1 - 2,"Analyst/Research, Engineering - Telecom/Techno..."
...,...,...,...,...,...,...,...,...,...
1258,Sustainability Consultant,dss+,UnitedArabEmirates,Dubai,Full Time,Not Specified,Experienced,Not Specified,"Accounting/Finance, IT/Software Development, S..."
1259,Regional SHE Manager,S.C. Johnson & Son Inc,Egypt,Cairo,Full Time,Not Specified,Not specified,Not Specified,"Business Development, Operations/Management, S..."
1260,Front of House & Parent Liaison Officer - Rege...,schoolscompared,UnitedArabEmirates,Dubai,Full Time,Not Specified,Not specified,Not Specified,"Administration, Customer Service/Support, Educ..."
1261,Market Access Private Health Insurance Manager...,Johnson & Johnson,UnitedArabEmirates,Dubai,Full Time,Not Specified,Not specified,Not Specified,"Marketing/PR/Advertising, Sales/Retail, Pharma..."


In [21]:
wuzzuf.duplicated().sum()

0

In [22]:
wuzzuf.country.value_counts()

country
Egypt                 604
SaudiArabia           308
UnitedArabEmirates    274
Qatar                  54
Kuwait                 17
UnitedStates            3
Bahrain                 1
D.C.                    1
Germany                 1
Name: count, dtype: int64

In [23]:
wuzzuf.city.value_counts()

city
Cairo             447
Dubai             269
Riyadh            269
Giza              106
Doha               54
Alexandria         28
Makkah             25
Kuwait City        17
Sharqia            10
Jeddah              8
Abu Dhabi           4
Gharbia             3
Tabuk               3
Minya               2
Dakahlia            2
Khobar              2
Ras al-Khaimah      1
RedSea              1
Damietta            1
PortSaid            1
Monufya             1
Jazan               1
Munich              1
Washington          1
Ar Rifa'            1
Suez                1
Henderson           1
Qalubia             1
San Francisco       1
Trenton             1
Name: count, dtype: int64

In [24]:
wuzzuf.yrs_exp.value_counts()

yrs_exp
Not Specified    840
3 - 5             65
1 - 3             38
5 - 7             22
2 - 4             21
                ... 
2 - 20             1
3 - 13             1
4 - 4              1
8 - 15             1
3 - 16             1
Name: count, Length: 74, dtype: int64

In [25]:
wuzzuf[wuzzuf.city == ("Dakahlia")]

Unnamed: 0,job_title,company_name,country,city,job_type,work_location,experience_level,yrs_exp,industry_and_skills
210,Senior Software Tester,qTech.,Egypt,Dakahlia,Full Time,On-site,Experienced,3+,"IT/Software Development, Quality, Engineering ..."
416,Software Tester,qTech.,Egypt,Dakahlia,Full Time,On-site,Entry Level,1+,"IT/Software Development, Quality, Engineering ..."


In [26]:
wuzzuf.to_csv("data_science.csv")

## For functional programming

### Functions for encoding and decoding the search name 

In [27]:
def encode(s):
    encoded_text = s.replace(" ", "%20")
    encoded_text = encoded_text.replace("(","%28")
    encoded_text = encoded_text.replace(")","%29")
    return encoded_text

def decode(s):
    decoded_text = s.replace("%20", " ")
    decoded_text = decoded_text.replace("%28","(")
    decoded_text = decoded_text.replace("%29",")")
    return decoded_text

### Function for scraping

In [28]:
def scraping(search_name):
    encoded_text = encode(search_name)
    wuzzuf_func = pd.DataFrame(columns=["job_title", "company_name", "country", "city", "job_type", "work_location", "experience_level", "yrs_exp", "industry_and_skills"])
    
    for i in range(100000):
        req_for = requests.get(f'https://wuzzuf.net/search/jobs/?a=hpb%7Cspbg&q={encoded_text}&start={i}')
        content_for = req_for.content
        soup_for = BeautifulSoup(content_for, 'html.parser')
        jobs_for = soup_for.find_all("div", class_='css-pkv5jc')
        if len(jobs_for) == 0:
            break
            
            
        for job in jobs_for:
            job_title = job.find("h2", class_='css-m604qf').text
            
            company_name = job.find("div", class_ = "css-d7j1kk").text.split(" - ")[0]
            
            country_select = job.find("span", class_ = "css-5wys0k").text.split(", ")[1].replace(" ", "")
            
            if country_select in ["Cairo", "Giza", "Sharqia", "Alexandria", "Damietta", "Dakahlia", "Fayoum", "Gharbia", "Ismailia", "KafrElSheikh", "Luxor", "Matruh", "Minya", "Monufya", "NewValley", "NorthSinai", "PortSaid", "Qalubia", "Qena", "RedSea", "Sohag", "SouthSinai", "Suez"]:
                country = "Egypt"
                city = country_select
            else:
                country = country_select
                city = job.find("span", class_ = "css-5wys0k").text.split(", ")[0]
                
            job_type = job.find("span", class_="css-1ve4b75 eoyjyou0").text
            
            try:
                work_location = job.find("span", class_="css-o1vzmt eoyjyou0").text
            except:
                work_location = "Not Specified"
                
            experience_level = job.find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[0]
            if job.find("div", class_="css-1lh32fc").next_sibling.text.find("Yrs") != -1:
                yrs_exp = job.find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[1].split(" Yrs ")[0]
            else:
                yrs_exp = "Not Specified"
                
            industry_and_skills = ", ".join(job.find("div", class_="css-1lh32fc").next_sibling.text.split(" · ")[2:])
            
            wuzzuf_func.loc[len(wuzzuf_func)] = [job_title, company_name, country, city, job_type, work_location, experience_level, yrs_exp, industry_and_skills]
            
            
    decoded_text = decode(encoded_text)
    wuzzuf_func.to_csv(f"{decoded_text}.csv")
    print("All done and file has been saved :)")
    return wuzzuf_func

### Testing the function

In [29]:
game_developer = scraping("game developer")

All done and file has been saved :)


Here we get to the end of the notebook, I hope you enjoyed it and learnt something from it.

Thank you, and don't forget an upvote if you find it useful 😍
