# Bundesanzeiger Scraping

Der folgende Code scraped die Webseite https://www.bundesanzeiger.de/. 

## Imports:

In [1]:
# for html network requests and parsing:
from bs4 import BeautifulSoup
import requests
# for displaying the captcha images:
from PIL import Image
from io import BytesIO
from IPython.display import display, HTML
from IPython.display import clear_output
# for file importing and exporting:
import csv
import pandas as pd
import json
import os
# for waiting: 
from datetime import datetime
import time
# for audio:
import simpleaudio as sa
# other:
import re # regex
#from pathlib import Path


## Globale Variablen:

In [2]:
session_requests = requests.session()
session_id = ""
debug_prints = False

# Basisfunktionen:

## get_search_results

Diese Funktion stellt eine Anfrage an die Suchmaske von https://www.bundesanzeiger.de/ und gibt die Ergebnistabelle als Dataframe zurück.

Diese Funktion wird direkt aufgerufen.

In [3]:
def get_search_results(search_string):
    global session_id
    global debug_prints
    # genericsearch_param.part_id: 22 is for just Jahresabschlüsse
    payload = {'page.navid': 'to_detailsearch', 'global_data.designmode': 'eb', '(page.navid=to_quicksearchlist)': 'Suchen', 'genericsearch_param.part_id': '22', 'genericsearch_param.hitsperpage': '20'}
    payload.update({'genericsearch_param.fulltext': search_string})
    if session_id != "":
        payload.update({'session.sessionid': session_id})

    url = 'https://www.bundesanzeiger.de/ebanzwww/wexsservlet'
    time.sleep(0.1) # wait some seconds before the next request, to not overwhelm the server
    try:
        result = session_requests.get(url, params = payload)
    except Exception as e:
        print('Error in getting search result!', repr(e))
        result = None
    if debug_prints:
        print("get_searchresults url:", result.url)
        print("get_searchresults returned:", result.status_code, result.headers['content-type'])

    if result is not None:
        # Get html content
        soup = BeautifulSoup(result.text, "lxml")
        # Parse for session id, update variable
        session_id = soup.find('a', href=True)['href']
        session_id = session_id[session_id.find("session.sessionid=")+len("session.sessionid="):session_id.find("&", session_id.find("session.sessionid="))]
        if debug_prints:
            print("session_id:", session_id)

        # Parse html content
        result_table = soup.find("table", attrs={"class": "result", "summary": "Trefferliste"})
        rows = result_table.find_all("tr")
        table_contents = []
        for tr in rows:
            # header row
            if rows.index(tr) == 0 :
                row_cells = [ th.getText(separator=" ").strip() for th in tr.find_all('th') if th.getText().strip() != '' ] + ["session.sessionid=", "fts_search_list.selected", "fts_search_list.destHistoryId", "timestamp"]
            # data rows
            else:
                # get 'th' element text
                row_cells = ([ tr.find('th').getText(separator=" ") ] if tr.find('th') else [] ) + \
                [ td.getText(separator=" ").strip() for td in tr.find_all('td') if td.getText().strip() != '' ] + \
                [ a['href'][a['href'].find("session.sessionid=")+len("session.sessionid="):a['href'].find("&", a['href'].find("session.sessionid="))] for a in tr.find_all('a', href=True)] + \
                [ a['href'][a['href'].find("fts_search_list.selected=")+len("fts_search_list.selected="):a['href'].find("&", a['href'].find("fts_search_list.selected="))] for a in tr.find_all('a', href=True)] + \
                [ a['href'][a['href'].find("fts_search_list.destHistoryId=")+len("fts_search_list.destHistoryId="):] for a in tr.find_all('a', href=True)] + \
                [ time.ctime() ]
            if len(row_cells) > 1 :
                table_contents += [ row_cells ]

        # Convert to dataframe and set first row as headers
        df = pd.DataFrame.from_dict(table_contents)
        df.columns = df.iloc[0]
        df = df[1:]
        return df
    else:
        print('get_search_results had an erorr!')
        df = pd.DataFrame()
        return df


# Weitere Funktionen:

## scrape single company

In [4]:
# this function gets the result of the single company
def count_documents_of_company(company, from_year, print_title):
    global session_id
    global debug_prints

    # for company name sanitizing
    keepcharacters = (' ','.','_', '-')
    regex1 = re.compile(r'bis zum \d{2}\.\d{2}\.(\d{4})')
    csv_filename = 'scraped_data_counting/'+"".join(c for c in company if c.isalnum() or c in keepcharacters).rstrip()+'.csv'
    abort_execution = False
    rerun_function = False
    document_counter = 0
    current_counter = 0

    clear_output(wait=True)
    print(print_title)

    # get search results
    try:
        df = get_search_results(company)
    except Exception as e:
        print('get_seach_results returned error!', repr(e))
        df = None
        pass

    try:
        if isinstance(df, pd.DataFrame):
            df.to_csv(csv_filename, index=False, encoding='utf-8', sep=';', quoting=csv.QUOTE_ALL)
    except Exception as e:
        print('Dataframe could not be saved as CSV!', repr(e))
        df = None
        pass


    # count how many douments of search results are to be fetched
    if df is not None and isinstance(df, pd.DataFrame):
        for index, row in df.iterrows():
            if row[0] != "Suche - kein Suchergebnis" and isinstance(row[2], str):
                t = regex1.search(row[2])
                if t is not None:
                    if int(t.group()[-4:]) >= from_year:
                        document_counter = document_counter + 1
    print('>', document_counter, 'documents in scope!')
    return document_counter



# Ausführung:


In [5]:
filepath_merged_companies='../company_timestamps.json'

abort_execution = False

# get company data from file
try:
    with open(filepath_merged_companies) as json_file:
        company_names = json.load(json_file)
        merged_companies_len = len(company_names)
except Exception as e:
    print("couldn't read JSON file!", repr(e))
    pass


# get counter from file
try:
    with open('current_index.json', 'r') as file:
        save_list = json.load(file)
    counter_walking = save_list[0]
    counter_overall = save_list[1]
except Exception as e:
    counter_walking = 0
    counter_overall = 0

counter_curr = 0


# counter_walking = 0
# counter_overall = 0

counted_documents = []

# loop through all companies
for item in company_names:
    if counter_walking <= company_names.index(item):
        # get all douments for single comp any, save return value in variable
        counter_curr = count_documents_of_company(from_year=int(item[1]), company=item[0].strip(), print_title='=== Getting documents for company '+str(company_names.index(item))+'/'+str(merged_companies_len)+' ==='+'\n'+'===> documents currently in scope: '+str(counter_overall))
        counter_overall = counter_overall + counter_curr
        counted_documents.append((item[0].strip(), counter_curr))
        counter_walking = counter_walking + 1
        if counter_walking % 500 == 0:
            try:
                with open('counted_documents.json', 'w') as f:
                    json.dump(counted_documents, f)
            except Exception as e:
                print("saving counted_documents failed", repr(e))

            try:
                with open('counted_documents_sum.json', 'w') as f:
                    json.dump(counter_overall, f)
            except Exception as e:
                print("saving counted_documents_sum failed", repr(e))

            try:
                with open('current_index.json', 'w') as f:
                    save_list = [counter_walking, counter_overall]
                    json.dump(save_list, f)
            except Exception as e:
                print("index saving failed", repr(e))




print('> Done!', counter_overall, 'documents in scope!')
try:
    with open('counted_documents.json', 'w') as f:
        json.dump(counted_documents, f)
except Exception as e:
    print("saving counted_documents failed", repr(e))

try:
    with open('counted_documents_sum.json', 'w') as f:
        json.dump(counter_overall, f)
except Exception as e:
    print("saving counted_documents_sum failed", repr(e))

try:
    with open('current_index.json', 'w') as f:
        save_list = [counter_walking, counter_overall]
        json.dump(save_list, f)
except Exception as e:
    print("index saving failed", repr(e))




=== Getting documents for company 13908/13909 ===
===> documents currently in scope: 52700
> 2 documents in scope!
> Done! 52702 documents in scope!


In [11]:
try:
    with open('counted_documents.json', 'r') as file:
        counted_documents = json.load(file)
except Exception as e:
    print('could not open counted_documents.json!')

cols = ['Company', 'Documents in scope']

print(len(counted_documents))
print(counted_documents[0])

df = pd.DataFrame(counted_documents, columns=['Company', 'Documents in scope'])

print(df.describe())

13908
['wildstyle network GmbH', 12]
       Documents in scope
count        13908.000000
mean             3.789330
std              4.817626
min              0.000000
25%              0.000000
50%              2.000000
75%              6.000000
max             20.000000
