In [1]:
from bs4 import BeautifulSoup 
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv

In [2]:

# Launch a Chrome session; the same driver object is reused by the later scraping cells
driver = webdriver.Chrome()

# Load the KeyBank careers single-page application (SPA) and pause briefly before reading it
driver.get('https://keybank.wd5.myworkdayjobs.com/External_Career_Site')
time.sleep(0.5)

# Collect the clickable elements that swap the HTML shown by the SPA
pages = driver.find_elements(By.CLASS_NAME, 'css-1j096s0')

# Seed both result lists with the page that is displayed right after loading
landing_html = driver.page_source
html_text = [landing_html]
key_soup = [BeautifulSoup(landing_html, 'lxml')]

# The first element is skipped; every remaining one is clicked, then the
# refreshed HTML is stored raw (html_text) and parsed (key_soup)
for button in pages[1:]:
    button.click()
    time.sleep(0.25)
    refreshed_html = driver.page_source
    html_text.append(refreshed_html)
    key_soup.append(BeautifulSoup(refreshed_html, 'lxml'))

In [3]:
jobs = []
locations = []

# Walk every parsed KeyBank page and gather the raw tags; the text and href
# are only extracted later, when the csv rows are written
for soup in key_soup:
    # anchor tags with this class are the links to the job titles
    jobs.extend(soup.find_all('a', class_ = 'css-19uc56f'))

    # dd tags with this class hold the job locations and the dates posted
    locations.extend(soup.find_all('dd', class_='css-129m7dg'))

In [16]:
# Start the output file from scratch (mode 'w' truncates it) and write the column headings
header = ['Company', 'Job Title', 'Job Link', 'Salary', 'Location', 'Date Posted']
with open('Scraped Internships.csv', 'w', newline='') as file:
    csv_writer = csv.writer(file)
    csv_writer.writerow(header)

In [17]:
with open('Scraped Internships.csv', 'a', newline='') as file:
    csv_writer = csv.writer(file)

    # Row format: [Company, Job Title, Job Link, Salary, Location, Date Posted]
    # The indexing below assumes `locations` holds two entries per posting
    # (location first, then date posted), so posting n reads entries 2n and 2n + 1.
    for posting, n in zip(jobs, range(len(locations))):
        # keep only the part of the link text that precedes the first ' -'
        title = posting.text.split(' -')[0].strip()
        link = 'https://keybank.wd5.myworkdayjobs.com' + posting.get('href')
        where = locations[2 * n].text
        posted = locations[2 * n + 1].text
        csv_writer.writerow(['KeyBank', title, link, 'N/A', where, posted])

In [5]:
# SCE Internships
# URLs of the first 5 result pages of the SCE internship search
SCE_first5_pages = ['https://www.edisoncareers.com/job-search-results/?category=Internship&compliment=SCE',
                    'https://www.edisoncareers.com/job-search-results/?category=Internship&compliment=SCE&pg=2',
                    'https://www.edisoncareers.com/job-search-results/?category=Internship&compliment=SCE&pg=3',
                    'https://www.edisoncareers.com/job-search-results/?category=Internship&compliment=SCE&pg=4',
                    'https://www.edisoncareers.com/job-search-results/?category=Internship&compliment=SCE&pg=5']

# raw HTML and parsed soup for each page, in the same order as the URLs above
SCE_html = []
SCE_soup = []

for url in SCE_first5_pages:
    # load the page and pause a quarter of a second before reading its source
    driver.get(url)
    time.sleep(0.25)

    # store the page both as raw HTML and as a parsed tree
    page_html = driver.page_source
    SCE_html.append(page_html)
    SCE_soup.append(BeautifulSoup(page_html, 'lxml'))

In [6]:
# accumulators for the job-title tags and the location strings found on the SCE pages
SCE_jobs = []
SCE_locations = []

for soup in SCE_soup:
    # every div with class 'jobTitle' is kept as a tag (its text and link are read later)
    SCE_jobs.extend(soup.find_all('div', class_ = 'jobTitle'))

    # each listing wrapper contains a location div; only that div's text is kept
    for wrapper in soup.find_all('div', class_ = 'job-innerwrap g-cols'):
        location_div = wrapper.find('div', class_ = 'flex_column joblist-location fusion-layout-column fusion-one-fifth')
        SCE_locations.append(location_div.text)

In [7]:
def working_type(job_title):
    '''
    Parses the job title to determine if the job is remote, hybrid, or on-site

    The working type is expected as a trailing bracketed tag, e.g.
    'Engineering Intern [Hybrid]'. The tag is only split off when the title
    mentions one of the known working types AND actually contains ' ['.

    Parameters: job_title - the job title to be parsed (converted with str())
    Returns: job_title - the job title with the bracketed working type removed
                         (the unchanged title when no working type is found)
             working_type - the text inside the brackets, or 'N/A' when the
                            title carries no working type
    '''

    # convert the title to a string
    job_title = str(job_title)
    lowered = job_title.lower()

    # each keyword is tested on its own; the previous `'on-site' or 'onsite' in ...`
    # expression was always truthy, so every title fell into the split branch
    keywords = ('hybrid', 'remote', 'on-site', 'onsite')
    if any(keyword in lowered for keyword in keywords):
        split = job_title.split(' [')

        # a keyword can appear without a bracketed tag (e.g. 'Remote Sensing Intern');
        # only peel the tag off when the split really produced one
        if len(split) > 1:
            # [:-1] drops the final character of the tag, i.e. the closing ']'
            return split[0], split[1][:-1]

    # return just the job title otherwise
    return job_title, 'N/A'

In [8]:
# append the SCE listings to the csv file
with open('Scraped Internships.csv', 'a', newline='') as file:
    csv_writer = csv.writer(file)

    # row format matches the file's header row:
    # [Company, Job Title, Job Link, Salary, Location, Date Posted]
    for po, place in zip(SCE_jobs, SCE_locations):
        # parse the job title to determine if the job is remote, hybrid, or on-site
        title, work_type = working_type(po.text)

        # the link is read straight from the current tag (no index lookup back into SCE_jobs)
        link = 'https://www.edisoncareers.com' + po.find('a').get('href')

        # 'N/A' fills the Salary column so Location and Date Posted line up with the header;
        # the working type is appended to the location in parentheses
        line = ['Southern California Edison', title, link, 'N/A', place + ' (' + work_type + ')', 'N/A']
        csv_writer.writerow(line)

In [9]:
# load the Lenovo search results and pause briefly before reading the page
driver.get('https://jobs.lenovo.com/en_US/careers/SearchJobs/?13036=%5B12016802%5D&13036_format=6621&7715=%5B327885%5D&7715_format=3083&listFilterMode=1&jobRecordsPerPage=10&sort=relevancy')
time.sleep(0.5)

# seed the raw-HTML and parsed-soup lists with the first results page
first_html = driver.page_source
lenovo_html = [first_html]
lenovo_soup = [BeautifulSoup(first_html, 'lxml')]

# the pagination links are rendered at the top and the bottom of the page,
# so only the first half of the matches is kept
next_page_button = lenovo_soup[0].find_all('a', class_='list-controls__pagination__item paginationLink')
next_page_button = next_page_button[:len(next_page_button) // 2]

# follow at most 5 of the pagination links so the csv is not dominated by Lenovo listings
for link in next_page_button[:5]:
    # open the linked page and pause a quarter of a second before reading it
    driver.get(link['href'])
    time.sleep(0.25)

    # store the page both as raw HTML and as a parsed tree
    page_html = driver.page_source
    lenovo_html.append(page_html)
    lenovo_soup.append(BeautifulSoup(page_html, 'lxml'))

In [10]:
# accumulators for the title tags and subtitle tags found on the Lenovo pages
lenovo_jobs = []
lenovo_locations = []

for soup in lenovo_soup:
    # h3 title tags of the listings (each one wraps the link to the posting)
    lenovo_jobs.extend(soup.find_all('h3', class_='article__header__text__title article__header__text__title--4'))

    # subtitle divs, which carry the location and date-posted spans
    lenovo_locations.extend(soup.find_all('div', class_='article__header__text__subtitle'))

In [11]:
# append the Lenovo listings to the csv file
with open('Scraped Internships.csv', 'a', newline='') as file:
    csv_writer = csv.writer(file)

    # row format matches the file's header row:
    # [Company, Job Title, Job Link, Salary, Location, Date Posted]
    # (the loop variable is named `posting`, not `int`, so the builtin int()
    # stays usable by any cell that is run afterwards)
    for posting, sub in zip(lenovo_jobs, lenovo_locations):
        # break the subtitle up into its spans; span 0 is written as the
        # location and span 2 as the date posted
        subs = sub.find_all('span')

        # 'N/A' fills the Salary column so Location and Date Posted line up with the header
        line = ['Lenovo', posting.text.strip(), posting.find('a').get('href'), 'N/A', subs[0].text.strip(), subs[2].text.strip()]
        csv_writer.writerow(line)

In [12]:
# URLs of the first 5 result pages of the FedEx intern search
fedex_first_5 = ['https://careers.fedex.com/intern/jobs/categories/Intern?categories=Intern&page=1',
                'https://careers.fedex.com/intern/jobs/categories/Intern?categories=Intern&page=2',
                'https://careers.fedex.com/intern/jobs/categories/Intern?categories=Intern&page=3',
                'https://careers.fedex.com/intern/jobs/categories/Intern?categories=Intern&page=4',
                'https://careers.fedex.com/intern/jobs/categories/Intern?categories=Intern&page=5']

# raw HTML and parsed soup for each page, in the same order as the URLs above
fedex_html = []
fedex_soup = []

for url in fedex_first_5:
    # load the page and pause a quarter of a second before reading its source
    driver.get(url)
    time.sleep(0.25)

    # store the page both as raw HTML and as a parsed tree
    page_html = driver.page_source
    fedex_html.append(page_html)
    fedex_soup.append(BeautifulSoup(page_html, 'lxml'))

In [13]:
# create lists that can be used to store the details from the scraped data
fedex_jobs = []
fedex_locations = []
fedex_companies = []

# parse every captured FedEx page; iterating the list itself (instead of a
# fixed range(5)) keeps this cell valid if a different number of pages was captured
for soup in fedex_soup:

    # listings are looked up by id 'link-job-0' ... 'link-job-9'; matches with
    # empty text are skipped.
    # NOTE: the per-page results use their own names (job_links) rather than
    # `jobs` / `locations`, so the KeyBank lists built earlier in the notebook
    # are not overwritten when this cell runs.
    for idx in range(10):
        job_links = soup.find_all('a', id = 'link-job-' + str(idx))
        for job in job_links:
            if job.text != '':
                fedex_jobs.append(job)

    # find all the locations and add them to a list
    fedex_locations.extend(soup.find_all('span', class_ = 'label-value location'))

    # find all the companies and add them to a list
    fedex_companies.extend(soup.find_all('span', class_ = 'brand label-value'))

In [14]:
def reformat_location(location):
    '''
    Reformat the location to remove any extra spaces or newlines

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space and leading/trailing whitespace is dropped, so
    '\\n  Memphis,\\n    TN \\n' becomes 'Memphis, TN'.

    Parameters: location - the location string to be reformatted
    Returns: location - the reformatted location
    '''

    # split() with no argument splits on any whitespace run and discards empty
    # pieces, so joining with one space also removes the indentation that
    # followed each newline (replacing only '\n' left those spaces in place)
    return ' '.join(location.split())

In [15]:
# append the FedEx listings to the csv file
with open('Scraped Internships.csv', 'a', newline='') as file:
    csv_writer = csv.writer(file)

    # row format matches the file's header row:
    # [Company, Job Title, Job Link, Salary, Location, Date Posted]
    for job, location, company in zip(fedex_jobs, fedex_locations, fedex_companies):
        # 'N/A' in the fourth position fills the Salary column so Location and
        # Date Posted line up with the header; no posting date is scraped for FedEx
        line = [company.text.strip(), job.text, 'https://careers.fedex.com/' + job.get('href'), 'N/A', reformat_location(location.text), 'N/A']
        csv_writer.writerow(line)