<p style="text-align:center;">
<img src="https://github.com/digital-futures-academy/DataScienceMasterResources/blob/main/Resources/datascience-notebook-header.png?raw=true"
     alt="Digital Futures logo"
     style="margin-right: 10px;" />
</p>

### Extract Data Science job search data from reed.co.uk

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Reed Web Scraper

In [15]:
# webscraper helpers for reed.co.uk
def _parse_job_card(job):
    """Convert one job-card element into a dict with the six output fields.

    Missing pieces are recorded as ``None`` (or ``'No'`` for ``remote``)
    instead of raising, so that one malformed card cannot shift the columns
    of every later row.
    """
    title_tag = job.h2
    company_tag = job.find('a', class_='gtmJobListingPostedBy')

    # Only direct child *tags* of the first <ul> are considered; stray
    # whitespace text nodes between items are skipped.
    details_list = job.ul
    if details_list is None:
        details = []
    else:
        details = [item.text for item in details_list.find_all(recursive=False)]

    # Positional layout of the list: salary, location, contract type and,
    # optionally, a fourth remote-working entry.
    salary, location, contract = (details + [None] * 3)[:3]
    remote = details[3] if len(details) > 3 else 'No'

    return {
        'salary': salary,
        'title': title_tag.text if title_tag is not None else None,
        'company': company_tag.text if company_tag is not None else None,
        'location': location,
        'contract': contract,
        'remote': remote,
    }


def reed_webscraper(url, no_of_pages, timeout=30):
    """Scrape job cards from a reed.co.uk search-results URL into a DataFrame.

    Parameters
    ----------
    url : str
        Search-results URL. The page index is sent as the ``pageno`` query
        parameter on every request.
    no_of_pages : int
        Number of result pages to request (``pageno`` runs from 0 to
        ``no_of_pages - 1``).
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per ``<article>`` element found, with the columns
        ``salary``, ``title``, ``company``, ``location``, ``contract`` and
        ``remote``. ``remote`` is ``'No'`` when a card has no fourth list
        item. The frame is empty (but keeps its six columns) when no cards
        are found.

    Raises
    ------
    requests.HTTPError
        If any page responds with an HTTP error status.
    """
    columns = ['salary', 'title', 'company', 'location', 'contract', 'remote']
    records = []

    # iterate through every results page
    # NOTE(review): pageno starts at 0 here; if the site numbers its pages
    # from 1, page 0 may duplicate page 1 and the last page may be missed —
    # confirm against the live site before changing the range.
    for page in range(no_of_pages):

        # fail loudly on HTTP errors rather than parsing an error page as
        # if it were an empty result page
        response = requests.get(url, params={'pageno': page}, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # each <article> holds the information for one job post
        for job in soup.find_all('article'):
            records.append(_parse_job_card(job))

    # building from per-job records keeps every column the same length
    return pd.DataFrame(records, columns=columns)

### Create DataFrames

In [16]:
# data scientist search: request 14 result pages and collect them in one frame
url = (
    'https://www.reed.co.uk/jobs/data-scientist-jobs-in-united-kingdom'
    '?hideTrainingJobs=true&excludeSalaryDescriptions=16%2C32%2C64'
)
data_scientist_df = reed_webscraper(url, no_of_pages=14)

In [18]:
# (rows, columns) of the data scientist frame, as a quick sanity check
data_scientist_df.shape

(364, 6)

In [19]:
# data analyst search: request 109 result pages and collect them in one frame
url = (
    'https://www.reed.co.uk/jobs/data-analyst-jobs-in-united-kingdom'
    '?hideTrainingJobs=true&excludeSalaryDescriptions=16%2C32%2C64'
)
data_analyst_df = reed_webscraper(url, no_of_pages=109)

In [21]:
# (rows, columns) of the data analyst frame, as a quick sanity check
data_analyst_df.shape

(2943, 6)

In [22]:
# data engineer search: request 116 result pages and collect them in one frame
url = (
    'https://www.reed.co.uk/jobs/data-engineer-jobs-in-united-kingdom'
    '?hideTrainingJobs=true&excludeSalaryDescriptions=16%2C32%2C64'
)
data_engineer_df = reed_webscraper(url, no_of_pages=116)

In [24]:
# (rows, columns) of the data engineer frame, as a quick sanity check
data_engineer_df.shape

(3016, 6)

In [25]:
# machine learning engineer search: request 7 result pages and collect them in one frame
url = (
    'https://www.reed.co.uk/jobs/machine-learning-engineer-jobs-in-united-kingdom'
    '?hideTrainingJobs=true&excludeSalaryDescriptions=16%2C32%2C64'
)
ml_engineer_df = reed_webscraper(url, no_of_pages=7)

In [27]:
# (rows, columns) of the machine learning engineer frame, as a quick sanity check
ml_engineer_df.shape

(175, 6)

In [28]:
# data science search: request 43 result pages and collect them in one frame
url = (
    'https://www.reed.co.uk/jobs/data-science-jobs-in-united-kingdom'
    '?hideTrainingJobs=true&excludeSalaryDescriptions=16%2C32%2C64'
)
data_science_df = reed_webscraper(url, no_of_pages=43)

In [30]:
# (rows, columns) of the data science frame, as a quick sanity check
data_science_df.shape

(1075, 6)

In [31]:
# stack the five per-search frames into a single frame with a fresh 0..n-1 index
# NOTE(review): the searches may overlap, so the same posting could appear
# more than once — confirm whether de-duplication is needed downstream
df = pd.concat(
    [
        data_scientist_df,
        data_analyst_df,
        data_engineer_df,
        ml_engineer_df,
        data_science_df,
    ],
    ignore_index=True,
)

In [39]:
# (rows, columns) of the combined frame; rows should equal the sum of the five inputs
df.shape

(7573, 6)

In [40]:
# write the combined frame to reed_data.csv in the working directory, without the row index
df.to_csv(path_or_buf='reed_data.csv', index=False)