In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Wayback Machine snapshots of the SpaceX careers listing, one per year.
# Keys are the snapshot year (as a string); values are the archived page URLs
# that get scraped. The snapshots listed here run from 2014 through 2018.

urls = {
    "2014": "https://web.archive.org/web/20140602022834/http://www.spacex.com/careers/list",
    "2015": "https://web.archive.org/web/20150607064025/http://www.spacex.com:80/careers/list",
    "2016": "https://web.archive.org/web/20160608061222/http://www.spacex.com:80/careers/list",
    "2017": "https://web.archive.org/web/20170609025251/http://www.spacex.com:80/careers/list",
    "2018": "https://web.archive.org/web/20180610152924/http://www.spacex.com:80/careers/list",
}

In [3]:
# gets the title of the job
def find_title(job):
    """Return the job title for one row of the careers table.

    The title is the text of the ``<a>`` element that ``job.find('a')``
    returns for the given row.
    """
    anchor = job.find('a')
    return anchor.text

In [4]:
# gets the location of the job in the form of a dictionary 
def find_location(job):
    """Return the job's location as a dict with "city", "state" and "country".

    The text of the row's job-location field is expected to look like
    ``"Cape Canaveral, FL, United States"``. It is split on commas so that
    multi-word cities stay intact; splitting on whitespace would turn that
    example into city "Cape" / state "Canaveral". Text that contains no comma
    at all falls back to the whitespace heuristic: first word is the city,
    second word the state, everything after that the country.

    Parts that are missing from the text come back as empty strings instead
    of raising IndexError.

    Raises:
        AttributeError: if the row has no job-location field at all.
    """
    text = job.find(class_="field field-name-field-job-location field-type-taxonomy-term-reference field-label-hidden").text

    if "," in text:
        # Collapse newlines / runs of spaces inside each comma-separated part.
        parts = [" ".join(piece.split()) for piece in text.split(",")]
    else:
        words = text.split()
        parts = words[:2]
        if len(words) > 2:
            parts.append(" ".join(words[2:]))

    # Pad so the three fields always exist, even for short or empty text.
    parts += [""] * (3 - len(parts))

    return {
        "city": parts[0],
        "state": parts[1],
        # Anything after the state is treated as the country.
        "country": ", ".join(parts[2:]),
    }

In [5]:
# gets the link to the job posting
def find_job_link(job):
    """Return the ``href`` value of the row's link.

    Only ``<a>`` elements that carry an ``href`` attribute are considered.
    """
    return job.find('a', href=True)['href']

In [6]:
# gets job requirement info from the link
def find_job_information(job_link, timeout=30):
    """Download a single job posting and return its "details" element.

    Args:
        job_link: URL of the job posting to fetch. It is passed straight to
            ``requests.get``, so it must be a full URL including the scheme.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The element with class "details" found by BeautifulSoup in the
        downloaded page, or ``None`` if the page has no such element.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout expires.
    """
    # requests waits indefinitely unless a timeout is given; bound the wait so
    # one stalled page cannot hang the whole notebook.
    response = requests.get(job_link, timeout=timeout)
    page = BeautifulSoup(response.text, 'html.parser')

    # The posting's requirement text is in the element with class "details".
    return page.find(class_="details")

#find_job_information('/web/20140602022834/http://www.spacex.com/careers/position/4653')

In [7]:
# Comments from the stack overflow post I adapted this from
#  the first argument to find tells it what tag to search for
#  the second you can pass a dict of attr->value pairs to filter
#  results that match the first tag
#  now rows contains each tr in the table (as a BeautifulSoup object)
#  and you can search them to pull out the times

# job_dict should be global, it will contain the info for 2014-2019
# job_number is made up for the purposes of this script, it is NOT the actual job number
job_dict = {}
job_number = 0

# the year is the key
for year, url in urls.items():
    # Download that year's archived listing. requests has no default timeout,
    # so set one to keep a stalled snapshot from hanging the notebook.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    # get the job table from that year...
    job_table = soup.find(class_="view-content")
    if job_table is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            "no 'view-content' job table found for {}: {}".format(year, url)
        )

    # ...where every <tr> is one job posting.
    jobs = list(job_table.find_all("tr"))

    # Process each row with the helper functions defined earlier. The location
    # is parsed once per row and reused for all three location fields.
    for job in jobs:
        title = find_title(job)
        location = find_location(job)
        job_dict[job_number] = dict(
            job_title=title,
            job_location_city=location["city"],
            job_location_state=location["state"],
            job_location_country=location["country"],
            job_post_year=year,
        )
        job_number += 1

    # TODO: job_link and job_information are not collected yet. To add them,
    # store find_job_link(job) under a "job_link" key for each row and then
    # fill "job_information" with
    # find_job_information('https://web.archive.org' + link); note that this
    # costs one extra HTTP request per posting.


In [8]:
# Spot-check the scrape: show the record stored under job_number 0.
print(job_dict[0])

{'job_title': 'Materials Test Technician', 'job_location_city': 'Hawthorne', 'job_location_state': 'CA', 'job_location_country': 'United States', 'job_post_year': '2014'}


In [110]:
# Write the scraped jobs to an Excel workbook: one row per job_number,
# one column per field of the job record.

df = pd.DataFrame.from_dict(job_dict, orient='index')
df.to_excel('spacex_jobs_2014-2019.xlsx')