## Scarping insolvency data
Source: https://www.insolvenzbekanntmachungen.de/cgi-bin/bl_suche.pl

1. 'Uneingeschränkte Suche': search for all insolvency court but only for 2 weeks after publication date
2. 'Detail-Suche': 
    * input: insolvency court + company name OR residence of the debtor



In [1]:
# install required packages:
#!pip install requests
#!pip install bs4
#!pip install pandas

In [2]:
# import required packages:
import requests
from bs4 import BeautifulSoup
import json
import time
import pandas as pd

# 1. Get regions and insolvency court from the website:

In [3]:
## User input:

# possible placeholder for unclear input
# ?: 
# *:

URL = 'https://www.insolvenzbekanntmachungen.de/cgi-bin/bl_suche.pl'

# max. number of results per page
matchesperpage = 100

region='--+Alle+Bundesl%E4nder+--' # '--+Alle+Bundesl%E4nder+--' # 'Hessen'

In [4]:
# get all regions
with requests.Session() as s:
    s.headers={"User-Agent":"Mozilla/5.0"}
    s.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})
  
    res = s.post(URL)
    soup = BeautifulSoup(res.text, "lxml")
    
items = soup.find("select",{"name":"Bundesland"}).findAll("option")
regions = values = [item.get('value') for item in items][1:]

In [5]:
regions

['Baden-Württemberg',
 'Bayern',
 'Berlin',
 'Brandenburg',
 'Bremen',
 'Hamburg',
 'Hessen',
 'Mecklenburg-Vorpommern',
 'Niedersachsen',
 'Nordrhein-Westfalen',
 'Rheinland-Pfalz',
 'Saarland',
 'Sachsen',
 'Sachsen-Anhalt',
 'Schleswig-Holstein',
 'Thüringen']

In [6]:
# get all insolvency court:
insolvency_court = {}

for region in regions:
    with requests.Session() as s:
        s.headers={"User-Agent":"Mozilla/5.0"}
        s.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})
        payload = 'Suchfunktion=uneingeschr&Absenden=Suche+starten&' + \
                  'Bundesland=' + region + \
                  '&Gericht=--+Alle+Insolvenzgerichte+--&' \
                  'Datum1=&Datum2='\
                  '&Name=' + \
                  '&Sitz=' + \
                  '&Abteilungsnr=' + \
                  '&Registerzeichen=--&Lfdnr=' + \
                  '&Jahreszahl=--' + \
                  '&Registerart=--+keine+Angabe+--' + \
                  '&select_registergericht=' + \
                  '&Registergericht=--+keine+Angabe+--' + \
                  '&Registernummer=' + \
                  '&Gegenstand=--+Alle+Bekanntmachungen+innerhalb+des+Verfahrens+--' + \
                  '&matchesperpage=100' + \
                  '&page=1' + \
                  '&sortedby=Datum'
        res = s.post(URL, data = payload)
        soup = BeautifulSoup(res.text, "lxml")
        
        items = soup.find("select",{"name":"Gericht"}).findAll("option")
        insolvency_court[region] = [item.get('value') for item in items][1:]

In [7]:
insolvency_court

{'Baden-Württemberg': ['Aalen',
  'Baden-Baden',
  'Crailsheim',
  'Esslingen',
  'Freiburg',
  'Göppingen',
  'Hechingen',
  'Heidelberg',
  'Heilbronn',
  'Karlsruhe',
  'Konstanz',
  'Ludwigsburg',
  'Lörrach',
  'Mannheim',
  'Mosbach',
  'Offenburg',
  'Pforzheim',
  'Ravensburg',
  'Rottweil',
  'Stuttgart',
  'Tübingen',
  'Ulm',
  'Villingen-Schwenningen',
  'Waldshut-Tiengen'],
 'Bayern': ['Amberg',
  'Ansbach',
  'Aschaffenburg',
  'Augsburg',
  'Bamberg',
  'Bayreuth',
  'Coburg',
  'Deggendorf',
  'Fürth',
  'Hof',
  'Ingolstadt',
  'Kempten',
  'Landshut',
  'Memmingen',
  'Mühldorf',
  'München',
  'Neu-Ulm',
  'Nördlingen',
  'Nürnberg',
  'Passau',
  'Regensburg',
  'Rosenheim',
  'Schweinfurt',
  'Straubing',
  'Traunstein',
  'Weiden',
  'Weilheim',
  'Wolfratshausen',
  'Würzburg'],
 'Berlin': ['Charlottenburg',
  'Köpenick',
  'Lichtenberg',
  'Mitte',
  'Neukölln',
  'Pankow/Weißensee',
  'Schöneberg',
  'Spandau',
  'Tempelhof-Kreuzberg',
  'Tiergarten',
  'Weddin

# 2. define function: unlimited_search & detail_search

In [8]:
def unlimited_search(region='--+Alle+Bundesl%E4nder+--', matchesperpage=100):
    """
    Get all the available insolvencies from the homepage 'www.insolvenzbekanntmachungen.de' from the last 2 weeks.
    
    :param region (str): region of the insolvency process
    :param matchesperpage (int): number of max
    :return (list): list of all available insolvencies. 
    """
    
    URL = 'https://www.insolvenzbekanntmachungen.de/cgi-bin/bl_suche.pl'
    
    # 1. find max number of pages
    with requests.Session() as s:
        s.headers={"User-Agent":"Mozilla/5.0"}
        s.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})
        payload = 'Suchfunktion=uneingeschr&Absenden=Suche+starten&' + \
                  'Bundesland=' + region + \
                  '&Gericht=--+Alle+Insolvenzgerichte+--&' \
                  'Datum1=&Datum2='\
                  '&Name=' + \
                  '&Sitz=' + \
                  '&Abteilungsnr=' + \
                  '&Registerzeichen=--&Lfdnr=' + \
                  '&Jahreszahl=--' + \
                  '&Registerart=--+keine+Angabe+--' + \
                  '&select_registergericht=' + \
                  '&Registergericht=--+keine+Angabe+--' + \
                  '&Registernummer=' + \
                  '&Gegenstand=--+Alle+Bekanntmachungen+innerhalb+des+Verfahrens+--' + \
                  '&matchesperpage=' + str(matchesperpage) + \
                  '&page=1' + \
                  '&sortedby=Datum'
        res = s.post(URL, data = payload)
        soup = BeautifulSoup(res.text, "lxml")
        t = soup.select('center a')[-1].attrs['href']
        start_nr=t.find('&page=')
        end_nr=t.find('#Ergebnis')
        result_page_nr = int(t[start_nr+6:end_nr])
        
        # 2. go through all pages and get data
        start_time = time.time()

        insolvency = []

        with requests.Session() as s:
            s.headers={"User-Agent":"Mozilla/5.0"}
            s.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})

            for page in range(1, result_page_nr):

                #for region in regions:
                payload = 'Suchfunktion=uneingeschr&Absenden=Suche+starten&' + \
                          'Bundesland=' + region + \
                          '&Gericht=--+Alle+Insolvenzgerichte+--&' \
                          'Datum1=&Datum2='\
                          '&Name=' + \
                          '&Sitz=' + \
                          '&Abteilungsnr=' + \
                          '&Registerzeichen=--&Lfdnr=' + \
                          '&Jahreszahl=--' + \
                          '&Registerart=--+keine+Angabe+--' + \
                          '&select_registergericht=' + \
                          '&Registergericht=--+keine+Angabe+--' + \
                          '&Registernummer=' + \
                          '&Gegenstand=--+Alle+Bekanntmachungen+innerhalb+des+Verfahrens+--' + \
                          '&matchesperpage=' + str(matchesperpage) + \
                          '&page=' + str(page) + \
                          '&sortedby=Datum'
                res = s.post(URL, data = payload)
                soup = BeautifulSoup(res.text, "lxml")

                for item in soup.select("b li a"):
                    insolvency.append(item.get_text(strip=True))

        print("--- %s minutes ---" % ((time.time() - start_time)/ 60))
        print(len(insolvency))
        
        return insolvency

In [21]:
def detail_search(name = '', region = '', court = '', residence = ''):
    """
    Get insolvencies which are older than 2 weeks and could not be found within the unlimited_search() function. 
    
    :param name (str): name of the firm or real person who has initiated an insolvency process.
    :param region (str): region of the insolvency process
    :param court (str): court of the insolvency
    :param residence (str): residence of the debtor
    :return (list): list of all available insolvencies. 
    """

    URL = 'https://www.insolvenzbekanntmachungen.de/cgi-bin/bl_suche.pl'
    
    start_time = time.time()

    insolvency = []
    with requests.Session() as s:
            s.headers={"User-Agent":"Mozilla/5.0"}
            s.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})

            #for region in regions:
            payload = 'Suchfunktion=detail&Absenden=Suche+starten&' + \
                      'Bundesland=' + region + \
                      '&Gericht=' + court +\
                      '&Datum1=&Datum2='\
                      '&Name=' + name + \
                      '&Sitz=' + residence + \
                      '&Abteilungsnr=' + \
                      '&Registerzeichen=--&Lfdnr=' + \
                      '&Jahreszahl=--' + \
                      '&Registerart=--+keine+Angabe+--' + \
                      '&select_registergericht=' + \
                      '&Registergericht=--+keine+Angabe+--' + \
                      '&Registernummer=' + \
                      '&Gegenstand=--+Alle+Bekanntmachungen+innerhalb+des+Verfahrens+--' + \
                      '&matchesperpage=' + str(matchesperpage) + \
                      '&page=100' + \
                      '&sortedby=Datum'
            res = s.post(URL, data = payload)
            soup = BeautifulSoup(res.text, "lxml")

            for item in soup.select("b li a"):
                insolvency.append(item.get_text(strip=True))

    print("--- %s seconds ---" % ((time.time() - start_time)))
    print(len(insolvency))

    return insolvency

# 3. Example for both functions:

In [10]:
insolvency_data = unlimited_search(region='--+Alle+Bundesl%E4nder+--', matchesperpage=100)

--- 2.484098966916402 minutes ---
43700


In [22]:
detail_search(name='Senvion GmbH', region='Hamburg', court='Hamburg')

--- 0.10202217102050781 seconds ---
4


['2020-04-15Senvion GmbH, Hamburg, 67g IN 113/19, Registergericht Hamburg, HRB 137187',
 '2019-08-22Senvion GmbH, Hamburg, 67g IN 113/19, Registergericht Hamburg, HRB 137187',
 '2019-07-02Senvion GmbH, Hamburg, 67g IN 113/19, Registergericht Hamburg, HRB 137187',
 '2019-07-02Senvion GmbH, Hamburg, 67g IN 113/19, Registergericht Hamburg, HRB 137187']

# 4. Try to structure data in data frame:

In [27]:
real_person = pd.DataFrame({"Date": [], 
                            "Last Name": [],
                            "First Name": [],
                            "Address": [],
                            "Location": [],
                            "Reference Number": [],
                            "Raw Information": []}) 

In [28]:
companies = pd.DataFrame({"Date": [], 
                          "Company Name": [],
                          "Residence ": [],
                          "Reference Number": [],
                          "Raw Information": []}) 

In [30]:
for idx, entry in enumerate(insolvency_data):
    date = entry[0:10]
    entry_list = entry[10:].split(", ")

    # real person without address
    if len(entry_list) == 4:
        real_person.loc[idx] = [date, entry_list[0], entry_list[1], entry_list[2], '-', entry_list[3], entry]
    # real person with full address
    elif len(entry_list) == 5:
        real_person.loc[idx] = [date, entry_list[0], entry_list[1], entry_list[2], entry_list[3], entry_list[4], entry]
    # company name
    elif len(entry_list) == 3:
        companies.loc[idx] = [date, entry_list[0], entry_list[1], entry_list[2], entry]
    else:
        real_person.loc[idx] = [date, entry_list[0], entry_list[1], '?', '?', '?', entry]
        #print(entry_list)

In [31]:
companies.shape

(4908, 5)

In [32]:
real_person.shape

(38792, 7)

In [33]:
companies

Unnamed: 0,Date,Company Name,Residence,Reference Number,Raw Information
20,2020-05-30,Alexa Balko,Cottbus,64 IK 106/14,"2020-05-30Alexa Balko, Cottbus, 64 IK 106/14"
21,2020-05-30,Alexa Balko,Cottbus,64 IK 106/14,"2020-05-30Alexa Balko, Cottbus, 64 IK 106/14"
23,2020-05-30,Andreas Lux,Schönefeld,64 IK 286/14,"2020-05-30Andreas Lux, Schönefeld, 64 IK 286/14"
24,2020-05-30,Andreas Lux,Schönefeld,64 IK 286/14,"2020-05-30Andreas Lux, Schönefeld, 64 IK 286/14"
36,2020-05-30,Björn Thieme,Bad Liebenwerda,64 IK 692/18,"2020-05-30Björn Thieme, Bad Liebenwerda, 64 IK..."
...,...,...,...,...,...
43683,2020-05-18,Yusuf Sahin,Geislingen,4 IK 279/19,"2020-05-18Yusuf Sahin, Geislingen, 4 IK 279/19"
43684,2020-05-18,Yusuf Sahin,Geislingen,4 IK 279/19,"2020-05-18Yusuf Sahin, Geislingen, 4 IK 279/19"
43696,2020-05-18,Zecevic Jana,München,1513 IK 693/19,"2020-05-18Zecevic Jana, München, 1513 IK 693/19"
43698,2020-05-18,Zenebe Tebeje,Heidelberg,82 IK 211/19,"2020-05-18Zenebe Tebeje, Heidelberg, 82 IK 211/19"


In [34]:
real_person

Unnamed: 0,Date,Last Name,First Name,Address,Location,Reference Number,Raw Information
0,2020-05-31,Ali Panah,Mohamad Reza,Hannover,-,903 IK 1012/19 - 3 -,"2020-05-31Ali Panah, Mohamad Reza, Hannover, 9..."
1,2020-05-31,Ali Panah,Mohamad Reza,Hannover,-,903 IK 1012/19 - 3 -,"2020-05-31Ali Panah, Mohamad Reza, Hannover, 9..."
2,2020-05-31,Ali Panah,Mohamad Reza,Hannover,-,903 IK 1012/19 - 3 -,"2020-05-31Ali Panah, Mohamad Reza, Hannover, 9..."
3,2020-05-31,Burdorf,Dominika,Lilienthal,-,11 IK 76/20,"2020-05-31Burdorf, Dominika, Lilienthal, 11 IK..."
4,2020-05-31,Burdorf,Dominika,Lilienthal,-,11 IK 76/20,"2020-05-31Burdorf, Dominika, Lilienthal, 11 IK..."
...,...,...,...,...,...,...,...
43692,2020-05-18,Zahapschi,Vasile,Neulußheim,-,2 IK 1122/19,"2020-05-18Zahapschi, Vasile, Neulußheim, 2 IK ..."
43693,2020-05-18,Zahn,Irene,Kiel,-,24 IK 191/14,"2020-05-18Zahn, Irene, Kiel, 24 IK 191/14"
43694,2020-05-18,Zapf geb. Suhling,Marita,Ludwigshafen,-,3 a IK 136/14 Lu,"2020-05-18Zapf geb. Suhling, Marita, Ludwigsha..."
43695,2020-05-18,Zbierski,Thomas,23968 Wismar,-,580 IK 349/19,"2020-05-18Zbierski, Thomas, 23968 Wismar, 580 ..."


# 5. Save data

In [33]:
# save as .pkl file
real_person.to_pickle('Insolvency/real_person_'+ region +'.pkl') 
companies.to_pickle('Insolvency/companies_'+ region +'.pkl') 

In [34]:
# save as .json file
with open('Insolvency/insolvency_'+ region +'.json','w') as f:
   json.dump(list(insolvency),f)