# HTML Parser

This Jupyter Notebook parses the HTML files in folder *scraped_data*.

In [6]:
# for html parsing:
from bs4 import BeautifulSoup, NavigableString, Tag
# for file importing and exporting:
import csv
import pandas as pd
# import json
import os
# other:
import re # regex
from IPython.display import clear_output

In [7]:
debug_prints = False

In [8]:
def scan_htmls(print_title):
    global debug_prints

    debug_prints = True

    # get list of all files in folder 'scraped_data'
    for root, dirs, files in os.walk('scraped_data'):
        document_list = files

    # document_list = ["KUNUNU_KH Software GmbH  Co.KG.csv_03.09.2019.html"]
    # document_list = ['KUNUNU_MathWorks.csv_28.02.2020.html', "KUNUNU_KH Software GmbH  Co.KG.csv_03.09.2019.html"]

    abort_execution = False
    skip_item = False

    # for company name sanitizing
    keepcharacters = (' ','.','_', '-')
    employee_desc = ['Arbeitnehmer', 'Mitarbeiter']

    count_of_htmls = len([item for item in document_list if item.endswith('.html')])
    html_counter = 1
    export_counter = 0
    company_data = pd.DataFrame(columns=['Unternehmen','Dokumentendatum', 'Dokumententyp','JA_von', 'JA_bis', 'Anzahl_MA', 'Umsatzerlöse', 'Bilanz_Aktiva', 'Bilanz_Passiva', 'GuV', 'Dateiname'])
    regexDates = re.compile(r'vom (\d{2}\.\d{2}\.\d{4}) bis zum (\d{2}\.\d{2}\.\d{4})')

    for item in document_list:
        clear_output(wait=True)
        print(print_title)
        print("Scanning document:", html_counter, "/", count_of_htmls, "--", item)

        temp_unternehmen = ""
        temp_dokumententyp = ""
        temp_JA_von = ""
        temp_JA_bis = ""
        temp_datum =""
        temp_anzahl_MA = ""
        temp_umsatz = ""
        temp_bilanzaktiva = ""
        temp_bilanzpassiva = ""
        temp_guv = ""
        temp_dateiname = ""

        if item.endswith('.html'):
            try:
                with open('scraped_data/'+item) as file:
                    soup = BeautifulSoup(file)
                skip_item = False
            except Exception as EError:
                print("An Error occured!", repr(EError))
                skip_item = True
        else:
            skip_item = True

        if not skip_item:
            html_counter = html_counter + 1

            # do some magic here

            temp_unternehmen = soup.find("h3", class_="z_titel").get_text(separator=" ")
            # for sanitizing the company name, sometimes there are tabs and shit
            #temp_Unternehmen = "".join(c for c in temp_Unternehmen if c.isalnum() or c in keepcharacters).rstrip()
            temp_unternehmen = " ".join(temp_unternehmen.split())
            temp_dokumententyp = soup.find("td", class_="info").get_text().split()[0]
            temp_datum = soup.find("td", class_="date").get_text(separator=" ")
            temp_JA_von = regexDates.findall(soup.find("td", class_="info").get_text())[0][0]
            temp_JA_bis = regexDates.findall(soup.find("td", class_="info").get_text())[0][1]
            temp_dateiname = item


            # find aktiva and passiva tables
            start = soup.find("h3", id=re.compile("^jp_Bilanz", re.IGNORECASE))
            temp_bilanz = []
            if start:
                for elem in start.next_siblings:
                    if elem.name == 'h3':
                        # print("found an h3:", elem.text)
                        break
                    if elem.name != 'table':
                        continue
                    # it's a <table> tag before the next <h3>
                    # print("table elem found")
                    temp_bilanz.append(elem)
                    # print("Länge von tmep_bilnaz:", len(temp_bilanz))
            else:
                print("no jp_Bilanz found")

            if len(temp_bilanz) > 1:
                pass
                # temp_bilanzaktiva = temp_bilanz[0]
                # temp_bilanzpassiva = temp_bilanz[1]
            if len(temp_bilanz) > 2: #there was a third table for GuV
                print("GuV found!")
                temp_guv = temp_bilanz[2]

            #find Umsatzerlöse
            print(soup.find_all(string=re.compile('Umsatz', re.IGNORECASE)))
            # if len(temp_bilanz) > 2: #there was a third table for GuV
            #     for elem in temp_guv.find('td', string=re.compile('Umsatz', re.IGNORECASE)):
            #             print("Umsatz found:", elem)


            # find count of employees
            ma_sentence_found = False
            search_element = soup.find_all('p')
            for elem in search_element:
                # if elem.text contains a string out of employee_desc
                if any(x in elem.text for x in employee_desc):
                    # if there are any br tag children
                    # https://stackoverflow.com/questions/5275359/using-beautifulsoup-to-extract-text-between-line-breaks-e-g-br-tags
                    for br in elem.find_all('br'):
                        next_s = br.next_sibling
                        if not (next_s and isinstance(next_s,NavigableString)):
                            continue
                        next2_s = next_s.next_sibling
                        if next2_s and isinstance(next2_s,Tag) and next2_s.name == 'br':
                            text = str(next_s)
                            if any(x in text for x in employee_desc):
                                if debug_prints:
                                    print("MA found ins brs:", " ".join(text.split()))
                                    print("MA found ins brs:", " ".join(text.split()))
                                temp_anzahl_MA = " ".join(text.split())
                                ma_sentence_found = True
                                break
                    if not ma_sentence_found:
                        if debug_prints:
                            print("MA not found in brs, but directly in p tag:", " ".join(elem.text.split()))
                        temp_anzahl_MA = " ".join(elem.text.split())

                    break

            print(temp_anzahl_MA)

            test = [temp_unternehmen,temp_datum,temp_dokumententyp,temp_JA_von,temp_JA_bis,
                    temp_anzahl_MA,temp_umsatz,temp_bilanzaktiva,temp_bilanzpassiva,temp_guv,temp_dateiname]
            # print(test)
            #company_data.append(test, ignore_index=True)
            company_data.loc[len(company_data)] = test
            export_counter = export_counter + 1
            # print(company_data.describe())
            #print(company_data)
            #company_data.to_pickle('output/company_attributes.pkl')
            # print(company_data.loc[0,'Anzahl_MA'])


            # print("Unternehmen:", temp_unternehmen)
            # print("Dokumentendatum:", temp_datum)
            # print("Dokumententyp:", temp_dokumententyp)
            # print("Daten von-bis:", temp_JA_von, temp_JA_bis)
            # print("Anzahl MA:", temp_anzahl_MA)
            # print("Umsatzerlöse:", temp_umsatz)
            # print("Bilanz Aktiva Länge:", len(temp_bilanzaktiva))
            # print("Bilanz Passiva Länge:", len(temp_bilanzpassiva))
            # print("GuV Länge:", len(temp_guv))
            # print("Dateiname:", temp_dateiname)

    company_data.to_csv('output/company_attributes.csv', index=False, encoding='utf-8', sep=';', quoting=csv.QUOTE_ALL)
    print("Done!")
    print("Found and exported values out of", export_counter, "html files.")

In [9]:
scan_htmls("title")
print("Länge")



title
Scanning document: 226 / 225 -- KUNUNU_Zoom7 GmbH.csv
Done!
Found and exported values out of 225 html files.
Länge


## Struktur
* Name des Unternehmens
* Typ des Abschlusses
* Jahresabschluss_von
* Jahresabschluss_bis
* Datum der Veröffentlichung
* Anzahl der MA
* (optional:) Umsatzerlöse
* Bilanz Aktiva
* Bilanz Passiva
* (optional:) GuV
* Dateiname
