---
##### Goal of Script: Pull data from a website 

How do candidates identify potential companies during a job search?

One method is to read online articles such as Fortune's "Best Workplaces for Millennials" ([link](https://www.greatplacetowork.com/best-workplaces/Millennials/2016)). I created a script that retrieves (i.e. web scrapes) company information directly from the website.

* Fortune will continue to use the same html structure in the future
* Fortune will continue to allow web scraping i.e. the site isn't blocked 
* A company can only occur once on the Fortune list every year

In [3]:
# Shell escape: prints the version of whichever `python` executable the shell resolves on PATH
!python --version

Python 3.7.3


In [1]:
#pip install selenium 

In [2]:
#pip show selenium

In [3]:
#download and move the chromedriver.exe file to the same location as the jupyter notebook
#used the stable version ChromeDriver 107.0.5304.62 from the website https://chromedriver.chromium.org/home

In [4]:
import os
import pandas as pd 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Page on data.cms.gov that this notebook scrapes
cms_url = "https://data.cms.gov/provider-compliance/cost-report/hospital-provider-cost-report/api-docs/"

# configure webdriver
options = Options()
# Hide the GUI. The flag is passed directly instead of through the `options.headless`
# setter: the setter only appended this same "--headless" argument, and it is
# deprecated/removed in newer Selenium 4 releases, so this form works on all of them.
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")  # set window size to native GUI size
options.add_argument("start-maximized")  # ensure window is full-screen

# get path of chromedriver (expected in the current working directory);
# os.path.join supplies the platform's separator instead of a hard-coded backslash
path = os.path.join(os.getcwd(), 'chromedriver.exe')
print("chromedriver location: {}".format(path))

# access data.cms.gov website
s = Service(path)
driver = webdriver.Chrome(service=s, options=options)
driver.get(cms_url)
print("driver information: {}".format(driver))

chromedriver location: C:\Users\mmuno\Desktop\GitHub\Capstone\chromedriver.exe
driver information: <selenium.webdriver.chrome.webdriver.WebDriver (session="435cd35e2be0b17c76f056b54ca6491f")>


In [7]:
# Size of the scraped table: body rows give the row count, header cells the column count
table_xpath = "/html/body/div/div/div/div/div/div/div[2]/div[2]/div/div/table"
body_rows = driver.find_elements(by=By.XPATH, value=table_xpath + "/tbody/tr")
header_cells = driver.find_elements(by=By.XPATH, value=table_xpath + "/thead/tr/th")
rows = len(body_rows)
cols = len(header_cells)

In [8]:
# Webscrape cms website: one XPath lookup per (row, column) position of the table body.
# XPath positions are 1-based, hence the ranges starting at 1.
cell_xpath = "/html/body/div/div/div/div/div/div/div[2]/div[2]/div/div/table/tbody/tr[{}]/td[{}]"

version_ls = [
    [
        # first (and expected only) element matching this cell position
        driver.find_elements(by=By.XPATH, value=cell_xpath.format(row_no, col_no))[0].text
        for col_no in range(1, cols + 1)
    ]
    for row_no in range(1, rows + 1)
]

# create dataframe with webscraped information from cms website
version_df = pd.DataFrame(version_ls, columns=['year', 'version'])
version_df

Unnamed: 0,year,version
0,2019,6ebd03b1-ff48-4994-94ed-0a54e90c1bd6
1,2018,90869abf-c649-4d65-84b3-4d6a1b568b69
2,2017,b2a1e8c3-62c3-4c47-94b2-5fa16b122a4d
3,2016,2981f550-653f-46a6-a5a5-06a3408eb245
4,2015,73e66edc-0b70-4e88-b1af-7d2b98a243f5
5,2014,9855c8b2-1514-47f8-a3ed-c34c2d41eed3
6,2013,7d1090ac-9d79-47d6-b42d-533a1f3edd7a
7,2012,9f3eee40-cdbc-4082-af3f-30e1807399b9
8,2011,db36eabb-2344-4053-a579-6fa48602bb29


In [None]:
# Export the scraped table as a CSV file, leaving out the DataFrame index column
version_df.to_csv(
    path_or_buf="URL_Versions.csv",
    index=False,
)

In [None]:
# Quit the entire browser and end the WebDriver session.
# quit() closes every window the driver opened, so no separate close() is needed;
# calling close() first could raise (e.g. window already gone on a cell re-run)
# and then skip quit(), leaving the chromedriver process running.
driver.quit()