Building dataset on German energie cooperatives

The goal of this project is to use the German national firm registry website ( https://www.unternehmensregister.de/ureg/search1.2.html;jsessionid=66A8A10BF06DB0484AF0D6797932F100.web02-1.) to gain information on energy cooperatives in Germany and build a dataset on when German energy cooperatives were founded.
From there, I will put 'Genossenschaft' a legal type of the firm, which is cooperative in German. The search terms 'energie', 'bürgenenergie', 'bürgenenergiegenossenschaft', 'energiegenossenschaft', 'solar', 'PV', 'strom', 'wasserkraft', 'windkraft' will be used to narrow the search down for energy related cooperatives. This will be done by using the search bar on the website. The goal of the code is to simulate clicking on the buttons on the user interface and put the documents for each firm found in the document basket of the website. This is necessary, because information on when the cooperatives are founded and registered can be found only in this document. After downloading the documents, the date of the Articles of Association for the cooperative in the document and the name of the cooperative will be exported in an excel file. The number of cooperatives founded in each year will also be plotted.


In [17]:
# importing the necessary packages
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
import time
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

In [None]:
url = 'https://www.unternehmensregister.de/ureg/search1.2.html;jsessionid=66A8A10BF06DB0484AF0D6797932F100.web02-1'
driver = webdriver.Chrome()
driver.get(url)
timeout = 10

#first the cookies for the website need to be accepted. buttons in each case are defined based on the code available in webdevtools
cookie_banner = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'cc_banner'))) #locates the cookie banner
allen_zustimmen_button = cookie_banner.find_element(By.ID, 'cc_all') #locates the button to accept the cookies
allen_zustimmen_button.click() #clicks on the button to accept the cookies

wait = WebDriverWait(driver, 10)
select_element = wait.until(EC.presence_of_element_located((By.ID, 'searchRegisterForm'))) #locates the part of the website where the search can be made based on different criteria

driver.execute_script("arguments[0].scrollIntoView();", select_element) #scrolls down on the page to make sure the code can locate the elements in the dropdown menu as well and interact with them

search_name = driver.find_element(By.XPATH, "//input[@id='searchRegisterForm:registerDataCompanyName']") #selects the field where the name of the firm can be specified
driver.execute_script("arguments[0].click();", search_name) #clicks on the field to be able to input words
search_name.send_keys("energie") #inputs the name energie to search for firms with 'energie' in their names


legal_form = driver.find_element(By.XPATH, "//select[@id='searchRegisterForm:registerDataLegalForm']") #selects the field where the legal form of the firm can be speficied from the given options
driver.execute_script("arguments[0].click();", legal_form) #clicks on the field to be able to make the selection

option_value_5 = wait.until(EC.element_to_be_clickable((By.XPATH, "//option[@value='5']"))) #the option with the value 5 means selecting 'cooperative' as legal form
option_value_5.click()

search_button = driver.find_element(By.NAME, 'searchRegisterForm:j_idt336') #this button is the search button at the bottom of the page which needs to be clicked on to submit the search
search_button.click() 

button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Registerinformationen anzeigen"))) #after submitting the search another banner pops up which needs to be closed with this button
button.click()


#the following code goes through all the firms found by the search and clicks on the button named 'AD' for all of them, which will put the document with their registry information in the document basket
index = 0 #indexing the elements is necesary, since after clicking an SI button the the user will be taken to the next site from where they need to return to the previous. unfortunately, with a simple for loop for the AD buttons, the code cannot locate them again after returning to the previous site. opening the SI button in another window could not work, because in that case the documents will not be put in the same document basket
site_count = 1  
while True:
    try:
        ad_buttons = WebDriverWait(driver, timeout).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='reglink' and contains(text(), 'AD')]")))

        if index >= len(ad_buttons):
            if site_count >= 13:  #if 13 or more sites have been processed, break the loop
                break
            try:
                next_site = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='next']/a[@class='page-nav']")))
                next_site.click()
                site_count += 1  
            except NoSuchElementException:
                print("Next button not found")
                break  

            index = 0  
            continue

        ad_buttons[index].click()

        try:
            zuruck_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'Zurück')]")))
            zuruck_button.click()
        except Exception:  
            index += 1  #move on to the next document
            continue

        index += 1
    except (StaleElementReferenceException, TimeoutException): 
        print("Exception occurred. Moving to the next document.")
        index += 1  #move on to the next document
        continue

       


In [19]:
import os
from bs4 import BeautifulSoup, NavigableString

In [20]:
#after all the documents were put in the basket, they were downloaded manually to the specified forder
#from there, the documents are defined 
folder_path = r'C:\Users\menta\Documents\german cooperatives_energie'
documents = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]

In [None]:
pip install PyPDF2

In [28]:
import pandas as pd
from PyPDF2 import PdfReader

In [35]:


data = []

for document in documents:
    with open(document, "rb") as f:
        pdf_reader = PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        # Assuming 'bezeichnung.aktuell' and 'aktuellesSatzungsdatum' are somewhere in the text
        name_start = text.find('a) Firma:') + len('a) Firma:')
        date_start = text.find('Satzung vom') + len('Satzung vom')
        name = text[name_start:name_start+50].strip()  # Adjust as needed
        date = text[date_start:date_start+20].strip()  # Adjust as needed
        if name and date:
            data.append([name, date])

df = pd.DataFrame(data, columns=['name', 'date'])

In [36]:
df.to_excel('C:/Users/menta/Documents/german_cooperatives3.xlsx', index=False)

In [37]:
print(df.date)

0        17.05.1992\nZuletzt
1        24.10.2012\nZuletzt
2       14.06.2023\nb) Sonst
3       14.01.2010\nb) Sonst
4        27.10.2009\nZuletzt
               ...          
352     be des aktuellen Reg
353    haftsregister\ndes Am
354     be des aktuellen Reg
355     be des aktuellen Reg
356      06.03.2013\nSatzung
Name: date, Length: 357, dtype: object


In [None]:
# getting data on  cooperatives for the search word 'energiegenossenschaft'
url = 'https://www.unternehmensregister.de/ureg/search1.2.html;jsessionid=66A8A10BF06DB0484AF0D6797932F100.web02-1'
driver = webdriver.Chrome()
driver.get(url)
timeout = 10

cookie_banner = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'cc_banner'))) 
allen_zustimmen_button = cookie_banner.find_element(By.ID, 'cc_all') 
allen_zustimmen_button.click() 

wait = WebDriverWait(driver, 10)
select_element = wait.until(EC.presence_of_element_located((By.ID, 'searchRegisterForm'))) 

driver.execute_script("arguments[0].scrollIntoView();", select_element)

search_name = driver.find_element(By.XPATH, "//input[@id='searchRegisterForm:registerDataCompanyName']") 
driver.execute_script("arguments[0].click();", search_name) 
search_name.send_keys("energiegenossenschaft") 


legal_form = driver.find_element(By.XPATH, "//select[@id='searchRegisterForm:registerDataLegalForm']") 
driver.execute_script("arguments[0].click();", legal_form) 

option_value_5 = wait.until(EC.element_to_be_clickable((By.XPATH, "//option[@value='5']")))
option_value_5.click()

search_button = driver.find_element(By.NAME, 'searchRegisterForm:j_idt336') 
search_button.click() 

button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Registerinformationen anzeigen"))) 
button.click()



index = 0
site_count = 1  
while True:
    try:
        si_buttons = WebDriverWait(driver, timeout).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='reglink' and contains(text(), 'AD')]")))

        if index >= len(si_buttons):
            if site_count >= 10:  
                break
            try:
                next_site = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='next']/a[@class='page-nav']")))
                next_site.click()
                site_count += 1  
            except NoSuchElementException:
                print("Next button not found")
                break  

            index = 0  
            continue

        si_buttons[index].click()

        try:
            zuruck_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'Zurück')]")))
            zuruck_button.click()
        except Exception:  
            index += 1  
            continue

        index += 1
    except (StaleElementReferenceException, TimeoutException):  
        print("Exception occurred. Moving to the next document.")
        index += 1  
        continue

       


In [39]:
folder_path = r'C:\Users\menta\Documents\cooperatives_energiegenossenschaft'
documents = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]

In [40]:

data = []

for document in documents:
    with open(document, "rb") as f:
        pdf_reader = PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        
        name_start = text.find('a) Firma:') + len('a) Firma:')
        date_start = text.find('Satzung vom') + len('Satzung vom')
        name = text[name_start:name_start+50].strip() 
        date = text[date_start:date_start+20].strip()  
        if name and date:
            data.append([name, date])

df = pd.DataFrame(data, columns=['name', 'date'])

In [41]:
df.to_excel('C:/Users/menta/Documents/german_cooperatives_energiegenossenschaft.xlsx', index=False)

In [None]:
#next search word: Bürgenenergie
url = 'https://www.unternehmensregister.de/ureg/search1.2.html;jsessionid=66A8A10BF06DB0484AF0D6797932F100.web02-1'
driver = webdriver.Chrome()
driver.get(url)
timeout = 10

cookie_banner = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'cc_banner'))) 
allen_zustimmen_button = cookie_banner.find_element(By.ID, 'cc_all') 
allen_zustimmen_button.click() 

wait = WebDriverWait(driver, 10)
select_element = wait.until(EC.presence_of_element_located((By.ID, 'searchRegisterForm'))) 

driver.execute_script("arguments[0].scrollIntoView();", select_element)

search_name = driver.find_element(By.XPATH, "//input[@id='searchRegisterForm:registerDataCompanyName']") 
driver.execute_script("arguments[0].click();", search_name)
search_name.send_keys("bürgerenergie") 


legal_form = driver.find_element(By.XPATH, "//select[@id='searchRegisterForm:registerDataLegalForm']")
driver.execute_script("arguments[0].click();", legal_form) 

option_value_5 = wait.until(EC.element_to_be_clickable((By.XPATH, "//option[@value='5']"))) 
option_value_5.click()

search_button = driver.find_element(By.NAME, 'searchRegisterForm:j_idt336')
search_button.click() 

button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Registerinformationen anzeigen")))
button.click()


index = 0 
site_count = 1  
while True:
    try:
        si_buttons = WebDriverWait(driver, timeout).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='reglink' and contains(text(), 'AD')]")))

        if index >= len(si_buttons):
            if site_count >= 5: 
                break
            try:
                next_site = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='next']/a[@class='page-nav']")))
                next_site.click()
                site_count += 1  
            except NoSuchElementException:
                print("Next button not found")
                break  

            index = 0  
            continue

        si_buttons[index].click()

        try:
            zuruck_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'Zurück')]")))
            zuruck_button.click()
        except Exception:  
            index += 1  
            continue

        index += 1
    except (StaleElementReferenceException, TimeoutException):  
        print("Exception occurred. Moving to the next document.")
        index += 1  
        continue

       


In [None]:
folder_path = r'C:\Users\menta\Documents\cooperatives_bürgerenergie'
documents = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]

In [None]:

data = []

for document in documents:
    with open(document, "rb") as f:
        pdf_reader = PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        
        name_start = text.find('a) Firma:') + len('a) Firma:')
        date_start = text.find('Satzung vom') + len('Satzung vom')
        name = text[name_start:name_start+50].strip() 
        date = text[date_start:date_start+20].strip()  
        if name and date:
            data.append([name, date])

df = pd.DataFrame(data, columns=['name', 'date'])

In [None]:
df.to_excel('C:/Users/menta/Documents/german_cooperatives_bürgenenergie.xlsx', index=False)

In [None]:
#same for 'Bürgenenergiegenossenschaft'
url = 'https://www.unternehmensregister.de/ureg/search1.2.html;jsessionid=66A8A10BF06DB0484AF0D6797932F100.web02-1'
driver = webdriver.Chrome()
driver.get(url)
timeout = 10

cookie_banner = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'cc_banner'))) 
allen_zustimmen_button = cookie_banner.find_element(By.ID, 'cc_all') 
allen_zustimmen_button.click() 

wait = WebDriverWait(driver, 10)
select_element = wait.until(EC.presence_of_element_located((By.ID, 'searchRegisterForm'))) 

driver.execute_script("arguments[0].scrollIntoView();", select_element)

search_name = driver.find_element(By.XPATH, "//input[@id='searchRegisterForm:registerDataCompanyName']")
driver.execute_script("arguments[0].click();", search_name)
search_name.send_keys("Bürgerenergiegenossenschaft") 


legal_form = driver.find_element(By.XPATH, "//select[@id='searchRegisterForm:registerDataLegalForm']")
driver.execute_script("arguments[0].click();", legal_form) 

option_value_5 = wait.until(EC.element_to_be_clickable((By.XPATH, "//option[@value='5']")))
option_value_5.click()

search_button = driver.find_element(By.NAME, 'searchRegisterForm:j_idt336')
search_button.click() 

button = wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Registerinformationen anzeigen")))
button.click()


index = 0
site_count = 1  
while True:
    try:
        si_buttons = WebDriverWait(driver, timeout).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='reglink' and contains(text(), 'AD')]")))

        if index >= len(si_buttons):
            if site_count >= 4:
                break
            try:
                next_site = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='next']/a[@class='page-nav']")))
                next_site.click()
                site_count += 1  
            except NoSuchElementException:
                print("Next button not found")
                break  

            index = 0  
            continue

        si_buttons[index].click()

        try:
            zuruck_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'Zurück')]")))
            zuruck_button.click()
        except Exception:  
            index += 1 
            continue

        index += 1
    except (StaleElementReferenceException, TimeoutException): ú
        print("Exception occurred. Moving to the next document.")
        index += 1 
        continue

       


In [None]:
folder_path = r'C:\Users\menta\Documents\cooperatives_bürgerenergiegenossenschaft'
documents = [os.path.join(folder_path, filename) for filename in os.listdir(folder_path)]

In [None]:

data = []

for document in documents:
    with open(document, "rb") as f:
        pdf_reader = PdfReader(f)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
        
        name_start = text.find('a) Firma:') + len('a) Firma:')
        date_start = text.find('Satzung vom') + len('Satzung vom')
        name = text[name_start:name_start+50].strip() 
        date = text[date_start:date_start+20].strip()  
        if name and date:
            data.append([name, date])

df = pd.DataFrame(data, columns=['name', 'date'])

df.to_excel('C:/Users/menta/Documents/german_cooperatives_bürgenenergiegenossenschaft.xlsx', index=False)

In [None]:
url = 'https://www.unternehmensregister.de/ureg/registerPortal.html;jsessionid=26BD13F6C3C3D45497CB521B12BD7C96.web04-1?submitaction=pathnav&page.24=page'
driver = webdriver.Chrome()
driver.get(url)
timeout = 10

#first the cookies for the website need to be accepted. buttons in each case are defined based on the code available in webdevtools
cookie_banner = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'cc_banner'))) #locates the cookie banner
allen_zustimmen_button = cookie_banner.find_element(By.ID, 'cc_all') #locates the button to accept the cookies
allen_zustimmen_button.click() #clicks on the button to accept the cookies




#the following code goes through all the firms found by the search and clicks on the button named 'SI' for all of them, which will put the document with their registry information in the document basket
index = 0 #indexing the elements is necesary, since after clicking an SI button the the user will be taken to the next site from where they need to return to the previous. unfortunately, with a simple for loop for the SI buttons, the code cannot locate them again after returning to the previous site. opening the SI button in another window could not work, because in that case the documents will not be put in the same document basket
site_count = 11
while True:
    try:
        si_buttons = WebDriverWait(driver, timeout).until(EC.presence_of_all_elements_located((By.XPATH, "//a[@class='reglink' and contains(text(), 'CD')]")))

        if index >= len(si_buttons):
            if site_count >= 47:  #if 13 or more sites have been processed, break the loop
                break
            try:
                next_site = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//div[@class='next']/a[@class='page-nav']")))
                next_site.click()
                site_count += 1  
            except NoSuchElementException:
                print("Next button not found")
                break  

            index = 0  
            continue

        si_buttons[index].click()

        try:
            zuruck_button = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//a[contains(text(), 'Zurück')]")))
            zuruck_button.click()
        except Exception:  
            index += 1  #move on to the next document
            continue

        index += 1
    except (StaleElementReferenceException, TimeoutException):  # Catch StaleElementReferenceException and TimeoutException
        print("Exception occurred. Moving to the next document.")
        index += 1  #move on to the next document
        continue

       


In [None]:
#same for strom, windkraft, solar, PV, wasserkraft