## Libraries

In [102]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from bs4.element import Comment
import pandas as pd
import time
from datetime import datetime
import json
from tqdm import tqdm
import requests
import re
import pathlib
import janitor

## Base Webscraper Class

In [3]:
class BaseWebscraper():
    """
    description: shared helpers for downloading web pages, saving them under a
        data directory and parsing the saved html back into tables
    input:
        - data_directory: root folder that every save_folder is resolved against
    """

    # Seconds to wait for the server before the requests download gives up.
    REQUEST_TIMEOUT = 30
    # Parent tags whose text nodes are not treated as visible page text.
    HIDDEN_PARENT_TAGS = ('style', 'script', 'head', 'title', 'meta', '[document]', 'table')
    # Value returned by _find_table_by_title when nothing matches.
    NO_TABLE_FOUND = "no table found"

    def __init__(self, data_directory):
        self.parent_directory = pathlib.Path(data_directory)
        # Filled in by scrape(): the current url, its raw html and the parsed html.
        self.url = None
        self.html = None
        self.soup = None

    # Scraper ------------------------------------------------
    ## This section of the class handles the downloading of html or files from websites
    def _make_request_soup(self):
        """
        description: uses requests library to download html
        input: url defined by scrape
        output: html of webpage as bytes (also stored in self.html)
        raises: requests.HTTPError for a 4xx/5xx response, so an error page is
            never saved as if it were data
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        self.html = response.content
        return self.html

    def _make_selenium_soup(self):
        """
        description: uses selenium library (headless Chrome) to download html
        input: url defined by scrape
        output: html of webpage as str (also stored in self.html)
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        # NOTE(review): passing the driver path positionally is the Selenium 3
        # calling convention; newer Selenium releases expect a Service object -
        # confirm against the installed version.
        driver = webdriver.Chrome(
            ChromeDriverManager().install(),
            options=chrome_options
        )
        try:
            driver.get(self.url)
            self.html = driver.page_source
        finally:
            # Always shut the browser down, otherwise every call leaves a
            # Chrome process running.
            driver.quit()
        return self.html

    def _get_csv(self):
        """
        description: uses pandas library to download csv file from website
        input: url defined by scrape
        output: pandas dataframe
        """
        return pd.read_csv(self.url)

    def _tag_visible(self, element):
        """
        description: decides whether a text node counts as visible page text
        input: element - a text node taken from the parsed html
        output: False for comments and for text inside HIDDEN_PARENT_TAGS, else True
        """
        if element.parent.name in self.HIDDEN_PARENT_TAGS:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def scrape(self, url, scraper_type, name, save_folder):
        """
        description: saves html code from website to specified folder
        input:
            - url: url of the website
            - scraper_type: requests or selenium as a string
            - name: name of the file
            - save_folder: the name of the folder the file will save in with respect to data directory already defined
        output: saved html file on local disk; self.html and self.soup hold the
            downloaded page afterwards
        raises: ValueError when scraper_type is not "requests" or "selenium"
        """
        downloaders = {
            "requests": self._make_request_soup,
            "selenium": self._make_selenium_soup,
        }
        if scraper_type not in downloaders:
            # Fail before touching the network or disk instead of saving
            # whatever html was left over from a previous call.
            raise ValueError(
                'scraper_type must be "requests" or "selenium", got {!r}'.format(scraper_type)
            )

        self.url = url
        downloaders[scraper_type]()
        self.save_html(name, save_folder)
        # The download helpers only fetch html, so the parsed document that
        # _text_from_html relies on is built here.
        self.soup = BeautifulSoup(self.html, 'html.parser')

    def _text_from_html(self):
        """
        description: joins the visible text of the last scraped page
        input: self.soup set by scrape
        output: visible text nodes, stripped and joined with single spaces
        """
        texts = self.soup.find_all(string=True)
        visible_texts = filter(self._tag_visible, texts)
        return u" ".join(t.strip() for t in visible_texts)

    def _save_file(self, save_folder):
        """
        description: makes sure the target folder exists inside the data directory
        input: save_folder - folder path relative to the data directory
        output: the folder path (also stored in self.file_path)
        """
        self.file_path = self.parent_directory.joinpath(save_folder)
        self.file_path.mkdir(parents=True, exist_ok=True)
        return self.file_path

    def save_html(self, name, save_folder):
        """
        description: writes self.html to <data directory>/<save_folder>/<name>.html
        input:
            - name: file name without extension
            - save_folder: folder path relative to the data directory
        output: saved html file on local disk
        """
        self._save_file(save_folder)
        file_path_final = self.file_path.joinpath(name + ".html")

        html = self.html
        if isinstance(html, str):
            # selenium returns str while requests returns bytes; the file is
            # opened in binary mode, so text is encoded first.
            html = html.encode("utf-8")

        with open(file_path_final, 'wb') as f:
            f.write(html)

    def _save_txt(self, text, name, save_folder):
        """
        description: writes text to <data directory>/<save_folder>/<name>.txt as utf-8
        input:
            - text: string to write
            - name: file name without extension
            - save_folder: folder path relative to the data directory
        output: saved txt file on local disk
        """
        self._save_file(save_folder)
        file_path_final = self.file_path.joinpath(name + ".txt")

        with open(file_path_final, 'w', encoding='utf-8') as f:
            f.write(text)

    # Parser ----------------------------------------
    ## This section of the class handles parsing html files to get data

    def _make_parse_soup(self, file_name):
        """
        description: parses a saved html file
        input: file_name - path of the html file
        output: BeautifulSoup object built with html.parser
        """
        # Read bytes and let BeautifulSoup work out the encoding; text mode
        # would decode with the platform default, which can fail on pages
        # that were saved as raw bytes.
        with open(file_name, 'rb') as fp:
            return BeautifulSoup(fp, 'html.parser')

    def _get_table(self, table_element, option):
        """
        description: turns the <tr> rows of an html table into a dataframe
        input:
            - table_element: parsed element that contains the rows
            - option: 1 drops empty cells, 2 keeps every child of each row
                (including empty ones, so column positions are preserved)
        output: pandas dataframe; the first non-empty row supplies the column names
        raises: ValueError for any other option, IndexError when no row has cells
        """
        if option not in (1, 2):
            raise ValueError("option must be 1 or 2, got {!r}".format(option))

        rows = []
        for tr in table_element.find_all("tr"):
            cells = [child.text.replace('\n', '') for child in tr.children]
            if option == 1:
                cells = [cell for cell in cells if cell != ""]
            if cells:
                rows.append(cells)

        return pd.DataFrame(rows[1:], columns=rows[0])

    def _find_table_by_title(self, soup, tag, title):
        """
        description: finds the closest ancestor of a title string that contains a given tag
        input:
            - soup: parsed html to search
            - tag: tag name the ancestor must contain, e.g. "table"
            - title: regular expression matched against the text nodes
        output: the matching ancestor element, or the string "no table found"
            when the title or a suitable ancestor is missing
        """
        try:
            item = soup.find(string=re.compile(title))
            ls_parents = item.parents
        except AttributeError:
            # soup.find returned None (title not on the page) or soup itself
            # is not a parsed document.
            return self.NO_TABLE_FOUND

        for parent in ls_parents:
            if parent.find(tag):
                return parent
        return self.NO_TABLE_FOUND

    def get_files_in_folder(self, folder_path):
        """
        description: lists the files directly inside a folder
        input: folder_path - folder to look in
        output: sorted list of pathlib paths (sub-folders are left out)
        """
        path = pathlib.Path(folder_path)
        return sorted(e for e in path.iterdir() if e.is_file())
            
        

    

## Create OSHA class
- This class inherits functions from the base class
- Also added to the class are custom functions to handle the website that general functions don't cover



In [82]:
class OshaScraperParser(BaseWebscraper):
    """
    description: scraper/parser for the OSHA establishment search; adds the
        page-specific steps that the general functions of the base class don't cover
    input:
        - data_directory: root folder that every save_folder is resolved against
    """

    # Prefix put in front of the relative links found on the result page.
    BASE_URL = "http://www.osha.gov/pls/imis/"

    def __init__(self, data_directory):
        super().__init__(data_directory)

    # This part of scraper includes functions for parsing and returning the links from result page to scrape
    def parse_links(self, link):
        """
        description: collects the case links from a saved search-result page
        input: link - path of the saved result html file
        output: list of absolute urls for every link whose href contains "establishment"
        raises: ValueError when the page has fewer than two "table-responsive" divs
        """
        soup = self._make_parse_soup(link)
        ls_tables = soup.find_all("div", {"class": "table-responsive"})
        if len(ls_tables) < 2:
            raise ValueError("result table not found in {}".format(link))

        # The second "table-responsive" div is the one the links are read from.
        result_table = ls_tables[1]
        return [
            self.BASE_URL + anchor["href"]
            for anchor in result_table.find_all("a", href=True)
            if "establishment" in anchor["href"]
        ]

    def download_link_html(self, ls_links, save_folder="dollar_tree_cases"):
        """
        description: downloads every url with requests and saves it as html
        input:
            - ls_links: list of urls; the text after the first "=" names the file
            - save_folder: folder inside the data directory to save into
        output: one saved html file per url
        """
        for url in tqdm(ls_links):
            case_number = url.split("=")[1].strip()
            self.scrape(url, "requests", case_number, save_folder)

    # This part of scraper includes functions specific to osha for parsing and returning information for each case
    def _parse_inspection_information(self, line):
        """
        description: splits one "key: value" text node of the inspection table
        input: line - text node that contains a ":"
        output: the string "Related Activity" when that is the key, otherwise
            [key, value] with both parts stripped; when nothing follows the
            colon the value is looked up in the neighbouring html ("" if absent)
        """
        # partition keeps any further ":" inside the value intact.
        key, _, value = line.partition(":")
        key = key.strip()
        value = value.strip()

        if key == "Related Activity":
            return "Related Activity"
        if value != "":
            return [key, value]

        # need to go to sibling for value of the key
        try:
            if key == "Scope":
                value = line.parent.find_all("td")[0].text
            else:
                value = line.parent.next_sibling.text
        except (AttributeError, IndexError):
            value = ""
        return [key, value]

    def _get_inspection_information(self, inspection_table):
        """
        description: builds the one-row inspection information table
        input: inspection_table - element returned for the "Inspection Information" title
        output: pandas dataframe with an "address" column plus one column per
            "key: value" text node (the "Related Activity" key is skipped)
        """
        # The rows next to the second <hr> hold the establishment details;
        # row 1 is read as the address (row 0 appears to be the company name
        # and is not extracted).
        hr_2 = inspection_table.find_all("hr")[1]
        ls_name_tr = hr_2.parent.find_all("tr")
        address = ls_name_tr[1].get_text(separator=" ").strip().split("\n")[0]

        inspection_information_dict = {"address": address}
        for text_node in inspection_table.find_all(string=re.compile(":")):
            result = self._parse_inspection_information(text_node)
            if result == "Related Activity":
                continue
            inspection_information_dict[result[0]] = result[1]

        return pd.DataFrame([inspection_information_dict])

    def _get_related_activity_from_inspection_information(self, inspection_table):
        """
        description: locates the html around the "Related Activity" key
        input: inspection_table - element returned for the "Inspection Information" title
        output: the element three levels above the "Related Activity" text node,
            or None when the key is not present
        """
        for text_node in inspection_table.find_all(string=re.compile(":")):
            if self._parse_inspection_information(text_node) == "Related Activity":
                return text_node.parent.parent.parent
        return None

    def _get_violation_table(self, table_element):
        """
        description: turns the rows of the violation items table into a dataframe
        input: table_element - parsed element that contains the <tr> rows
        output: pandas dataframe; the first row supplies the column names
        """
        rows = []
        for tr in table_element.find_all("tr"):
            cells = [child.text.replace('\n', '').strip() for child in tr.children]
            # Only every second child (positions 1, 3, 5, ...) is kept.
            rows.append(cells[1::2])

        return pd.DataFrame(rows[1:], columns=rows[0])

    def parse_osha_datatables(self, file_path):
        """
        description: parses one saved case page into its four tables
        input: file_path - path of the saved case html file
        output: tuple (inspection information, related activity, violation
            summary, violation items); the first is always a dataframe, each
            of the others is a dataframe carrying the case "nr", or the string
            "no table" ("no_table" for violation items) when it could not be parsed
        raises: ValueError when the inspection information table is missing
        """
        soup = self._make_parse_soup(file_path)

        # isolate the section of html for each table ---------------

        ## Case Status and Nr
        case_status_node = soup.find(string=re.compile("Case Status"))
        if case_status_node is None:
            status_key, status_value = "Case Status", "info_not_on_page"
        else:
            status_key, _, status_value = case_status_node.partition(":")
            status_key = status_key.strip()
            status_value = status_value.strip()

        ## Inspection information
        inspection_information = self._find_table_by_title(soup, "table", "Inspection Information")
        if inspection_information is None or isinstance(inspection_information, str):
            # _find_table_by_title signals "not found" with None or a string.
            raise ValueError("Inspection Information table not found in {}".format(file_path))
        df_inspection_information = self._get_inspection_information(inspection_information)
        df_inspection_information[status_key] = status_value
        df_inspection_information_clean = df_inspection_information.clean_names()
        nr = df_inspection_information_clean["nr"].iloc[0]  # isolate unique id to put in other dataframes

        # The remaining tables are optional. Exception (not a bare except) is
        # caught so that Ctrl-C still stops a long run.

        ## Related Activity
        try:
            related_activity = self._get_related_activity_from_inspection_information(inspection_information)
            df_related_activity = self._get_table(related_activity, 1)
            df_related_activity_clean = (
                df_related_activity
                .clean_names()
                .add_prefix("related_activity_")
                .drop(columns="related_activity_related_activity_")
            )
            df_related_activity_clean["nr"] = nr  # add unique id
        except Exception:
            df_related_activity_clean = "no table"

        ## Violation Summary
        try:
            violation_summary = self._find_table_by_title(soup, "table", "Violation Summary")
            df_violation_summary = self._get_table(violation_summary, 2)
            # Option 2 keeps every child of a row, so only the odd positions are selected.
            df_violation_summary_clean = df_violation_summary.iloc[:, [1, 3, 5, 7, 9, 11, 13]]
            df_violation_summary_clean = df_violation_summary_clean.rename(
                columns={df_violation_summary.columns[1]: "violation_type"}
            )
            df_violation_summary_clean = df_violation_summary_clean.clean_names()
            df_violation_summary_clean["nr"] = nr  # add unique id
        except Exception:
            df_violation_summary_clean = "no table"

        ## Violation Items
        try:
            violation_items = self._find_table_by_title(soup, "table", "Violation Items")
            df_violation_items = self._get_violation_table(violation_items)
            df_violation_items_clean = (
                df_violation_items
                .rename(columns={"": "notes"})
                .clean_names()
            )
            df_violation_items_clean["nr"] = nr  # add unique id
        except Exception:
            df_violation_items_clean = "no_table"

        return (
            df_inspection_information_clean,
            df_related_activity_clean,
            df_violation_summary_clean,
            df_violation_items_clean,
        )
    

## Make an instance of the class and perform scraping steps

### Create instance of class

In [None]:
# Create the scraper; every save folder is resolved against ../data/.
osha_scraper = OshaScraperParser(data_directory="../data/")


### Download html of main page that has the links to each page

In [None]:
# Download the html of the search-result page and save it as
# html/links/dollar_tree_links.html inside the data directory.
osha_scraper.scrape(
    url=(
        "https://www.osha.gov/pls/imis/establishment.search"
        "?establishment=Dollar%20Tree&state=all&officetype=all&office=all&sitezip=100000"
        "&startmonth=08&startday=03&startyear=2012"
        "&endmonth=08&endday=03&endyear=2022"
        "&p_case=all&p_violations_exist=both&p_start=&p_finish=0"
        "&p_sort=12&p_desc=DESC&p_direction=Next&p_show=700"
    ),
    scraper_type="requests",
    name="dollar_tree_links",
    save_folder="html/links/",
)

### Parse main page and create list of urls to scrape

In [None]:
# Parse the result links from the search page saved above. The path is built
# from the scraper's data directory so it points at the file scrape() wrote,
# instead of a machine-specific absolute path.
links_page = osha_scraper.parent_directory / "html" / "links" / "dollar_tree_links.html"
ls_dollar_tree_urls = osha_scraper.parse_links(links_page)

### Download html page of each url link

In [None]:
# Download and save the html of every result link collected above.
osha_scraper.download_link_html(ls_links=ls_dollar_tree_urls)

### Parse all files and create 4 dataframes from result

In [100]:
# Return all html paths from the folder download_link_html saves into by
# default, resolved against the scraper's data directory.
cases_folder = osha_scraper.parent_directory / "dollar_tree_cases"
ls_dollar_tree_html = [
    path
    for path in osha_scraper.get_files_in_folder(cases_folder)
    if path.suffix == ".html"
]

# Parse each html file and collect every table that exists ------------------------------
# One list per table, in the order parse_osha_datatables returns them:
# inspection information, related activity, violation summary, violation items.
ls_collected_tables = [[], [], [], []]
for file in tqdm(ls_dollar_tree_html):
    parsed_tables = osha_scraper.parse_osha_datatables(file)
    for ls_frames, table in zip(ls_collected_tables, parsed_tables):
        # A missing table comes back as a string, so only dataframes are kept.
        # This also covers the first file, which used to be stored unchecked.
        if isinstance(table, pd.DataFrame):
            ls_frames.append(table)

# Concatenate once per table; a table that never appeared becomes an empty dataframe.
(
    df_final_inspection_information,
    df_final_related_activity,
    df_final_violation_summary,
    df_final_violation_items,
) = [
    pd.concat(ls_frames, ignore_index=True) if ls_frames else pd.DataFrame()
    for ls_frames in ls_collected_tables
]


    



100%|██████████| 660/660 [00:29<00:00, 22.47it/s]


### Save data to csv

In [104]:
# Write each combined table to its own csv file in the working directory.
for csv_name, df_final in (
    ("inspection_information.csv", df_final_inspection_information),
    ("related_activity.csv", df_final_related_activity),
    ("violation_summary.csv", df_final_violation_summary),
    ("violation_items.csv", df_final_violation_items),
):
    df_final.to_csv(csv_name)
