In [None]:
import sqlite3 as db
import pandas as pd
import re
import os


## 1. Load company names we want to extract XML files from

In [None]:
# Load every row of the SI_docs_needed table from the local SQLite database.
# The connection is closed in a `finally` block so it is released even when
# the query raises (e.g. the table does not exist).
conn = db.connect('database.db')
try:
    df = pd.read_sql_query('select * from SI_docs_needed', conn)
finally:
    conn.close()

# Remove rows that are exact duplicates across all columns.
df = df.drop_duplicates()

## 2. Check whether XML files have already been downloaded

In [None]:
# Directory where the documents are stored
doc_directory = r"Directory"

# Company names / reference numbers that already have a document in
# doc_directory, and those that do not.
matching_companies = []
matching_refnum = []
no_matching_companies = []
no_matching_refnum = []

# List the directory once: the listing does not depend on the row being
# checked, so re-reading it for every row only repeats the same disk access.
existing_doc_names = os.listdir(doc_directory)

# Walk the two needed columns in lockstep (one pair per DataFrame row).
for company_name, raw_reference in zip(df['Name'], df['Reference_number']):
    reference_number = str(raw_reference)

    # A document counts as present when the reference number appears anywhere
    # in a file name.
    # NOTE(review): this is a substring test, so a reference number contained
    # in a longer one would also match -- confirm against the file-naming scheme.
    found = any(reference_number in doc_name for doc_name in existing_doc_names)

    if found:
        matching_companies.append(company_name)
        matching_refnum.append(reference_number)
    else:
        no_matching_companies.append(company_name)
        no_matching_refnum.append(reference_number)

print("Xml files found for ", len(matching_companies), "companies")
print("Xml files not found for ", len(no_matching_companies), "companies")

## 3. Use Selenium to extract XML files from Handelsregister

In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time

# Pair each company that has no local document with its reference number,
# then tabulate the pairs as a two-column DataFrame.
combined_data = [
    (name, refnum)
    for name, refnum in zip(no_matching_companies, no_matching_refnum)
]
df_combined_data = pd.DataFrame(
    combined_data,
    columns=['Name', 'Reference_number'],
)

# Function to download documents
def _build_chrome_options(download_directory):
    """Return ChromeOptions for a headless session that saves downloads to ``download_directory``."""
    prefs = {
        "download.default_directory": download_directory,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        # Safe Browsing is left enabled (the value is True).
        "safebrowsing.enabled": True,
    }

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option("prefs", prefs)
    for argument in (
        "--headless",                    # Run Chrome in headless mode
        "--disable-gpu",                 # Disable GPU hardware acceleration
        "--no-sandbox",                  # Bypass OS security model
        "--disable-dev-shm-usage",       # Overcome limited resource problems
        "--remote-debugging-port=9222",  # This can sometimes help
    ):
        chrome_options.add_argument(argument)
    return chrome_options


def _click_si_document(driver, company):
    """Search the register for ``company`` and click the first result link containing 'SI'.

    Prints whether an 'SI' link was clicked or none was found. Selenium
    lookup/wait errors are not handled here; they propagate to the caller.
    """
    # Open the search page
    driver.get("https://www.handelsregister.de/rp_web/erweitertesuche.xhtml")

    # Enter the company name in the search box
    search = driver.find_element(By.NAME, "form:schlagwoerter")
    search.send_keys(company)

    # Select exact company name option
    search_exact_companyname = driver.find_element(
        By.XPATH, "//label[@for='form:schlagwortOptionen:2']"
    )
    search_exact_companyname.click()

    # Scroll the search button into view before clicking it
    button = driver.find_element(By.ID, "form:btnSuche")
    driver.execute_script("arguments[0].scrollIntoView(true);", button)
    button.click()

    # Wait (up to 10 s) for the results table to be present in the DOM
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//table[contains(@class, 'ergebnis')]"))
    )

    # Candidate document links on the results page
    document_links = driver.find_elements(By.XPATH, "//span[contains(@class, 'underlinedText')]")

    # Click the first candidate whose HTML mentions 'SI'; the for/else branch
    # runs only when no candidate matched.
    for link in document_links:
        if "SI" in link.get_attribute("outerHTML"):
            driver.execute_script("arguments[0].scrollIntoView(true);", link)
            link.click()
            print(f"Document link 'SI' clicked successfully for the company: {company}")
            break
    else:
        print(f"No 'SI' document found for the company: {company}")


def download_documents(
    df,
    download_directory=r"C:\Users\marma\Documents\INGENIERIA  INDUSTRIAL\2º MÁSTER\TFM\SI DOCS",
    chrome_driver_path=r'C:\Program Files\chromedriver.exe',
    pause_seconds=5,
):
    """Click the 'SI' document link on the register site for every company in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``Name`` column; each value is used as the search term.
    download_directory : str, optional
        Directory Chrome is configured to save downloads into. Defaults to the
        path that was previously hard-coded in this function.
    chrome_driver_path : str, optional
        Path to the chromedriver executable. Defaults to the previously
        hard-coded path.
    pause_seconds : float, optional
        Pause after each company before moving on (default 5, as before).

    Notes
    -----
    ``NoSuchElementException`` and ``TimeoutException`` are reported per
    company and the loop continues. Any other exception propagates, but the
    browser is always shut down first.
    NOTE(review): clicking the link is presumably what triggers the file
    download; this function does not verify that a file was saved.
    """
    service = Service(chrome_driver_path)

    # Initialize Chrome WebDriver with the options and executable path
    driver = webdriver.Chrome(
        options=_build_chrome_options(download_directory),
        service=service,
    )

    try:
        # Iterate over each company
        for company in df.Name:
            try:
                _click_si_document(driver, company)
            except (NoSuchElementException, TimeoutException) as e:
                print(f"Failed to click the document link for the company: {company} - Error: {str(e)}")

            time.sleep(pause_seconds)  # Adjust the sleep time as necessary
    finally:
        # Always release the browser, even if an unexpected error aborts the loop.
        driver.quit()

# Run the document download process for the rows of df_combined_data.
# This starts a Chrome WebDriver session and loads the register website once
# per company, pausing between companies.
download_documents(df_combined_data)
