In [1]:
import sqlite3 as db
import pandas as pd
import re


# List of common suffixes to remove
suffixes = ['gmbh', 'co', 'kg', 'inc', 'llc', 'ltd', 'ag', 'corporation', 'corp','deutschland','raffinerie','oel','werk','nord','sud','europa','holding','europe','se','oil','aluminium','leuna','trebsen']


## 1. Load databases and extract key words from company names for further comparison

In [2]:
#load heat production database
# If the first row contains headers, you can skip it using the 'header' parameter:
df_ref = pd.read_excel('points.xlsx')

# Ensure all columns are converted to strings before concatenation
df_ref['StreetNameAndNumber'] = df_ref['StreetNameAndNumber'].astype(str)
df_ref['PostalCode'] = df_ref['PostalCode'].astype(str)

# Perform the concatenation
df_ref['Address'] = df_ref['StreetNameAndNumber'] + ',' + df_ref['PostalCode'] + ', Deutschland'

df_ref = df_ref.drop(columns=['Unnamed: 0','SiteId','ID_Eurostat','ProcessInfo','NUTS1ID','NUTS3ID','geometry','StreetNameAndNumber','Country'])

print(len(df_ref))
df_ref.head()

370


Unnamed: 0,CompanyName,PostalCode,Subsector_Name,Latitude,Longitude,H2-Potential in TWh,Address
0,Shell Deutschland Oil GmbH Rheinland Raffineri...,50997,Refineries,50.854993,6.976846,1.5837,"Godorfer Hauptstr. 150,50997, Deutschland"
1,Shell Deutschland Oil GmbH Rheinland Raffineri...,50389,Refineries,50.81411,7.005756,10.2139,"Ludwigshafener Str. 1,50389, Deutschland"
2,RUHR OEL GMBH Werk Scholven,45896,Refineries,51.596033,7.028909,28.8769,"Pawiker Strasse 30,45896, Deutschland"
3,Ruhr Oel GmbH Werk Horst,45899,Refineries,51.539607,7.037203,1.894,"Johannastrasse 2-8,45899, Deutschland"
4,TOTAL Raffinerie Mitteldeutschland GmbH (Raffi...,6237,Refineries,51.289677,11.990582,5.1709,"Maienweg 1,6237, Deutschland"


##### Selecting main words from company names to search for them in the Handelsregister database

In [3]:
df = df_ref

# Function to clean company names
def clean_name(name):
    # Remove common suffixes and extra whitespace
    pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
    # Remove special characters (e.g., & . ,)
    name = re.sub(r'[&.,+()]', '', name)
    
    return re.sub(pattern, '', name, flags=re.IGNORECASE).strip()

# Apply the clean_name function to create a new column for comparison
df['CleanedName'] = df['CompanyName'].apply(clean_name)

print("length df: ", len(df))

length df:  370


In [4]:
#load handelsregister
conn = db.connect('latlongsdata.db')
df_h = pd.read_sql_query('select * from Lat_Long_Table_HandelsregisterV3', conn)
#df_h = pd.read_sql_query('select * from Lat_Long_Table_Handelsregister_Referencepaper', conn)
conn.close()
df_h = df_h.drop_duplicates(subset=['name','register_identifier','zip'], keep='first')
print(len(df_h))
#df_h = df_h.drop(columns=['level_0'])
df_h.head()

3551


Unnamed: 0,name,register_identifier,location_lat,location_long,location_address,zip,registered_address
0,MEDIAN Klinik Berlin-Kladow GmbH & Co. KG,HRA_21285,52.5177232,13.3248465,"Jeep König, 28-29, Franklinstraße, Charlottenb...",10587,"Franklinstraße 28-29, 10587 Berlin"
1,Am Hafen Rudow - West ESM Grundstücksverwaltun...,HRA_41528,52.4331859,13.4941859,"145, Seidelbastweg, Rudow, Neukölln, Berlin, 1...",12357,"Seidelbastweg 145, 12357 Berlin"
2,Waldow6A GmbH & Co. KG,HRA_50946,52.505358,13.4889231,"Yoga Barn, 5, Münsterlandstraße, Weitlingkiez,...",10317,"Münsterlandstraße 5, 10317 Berlin"
3,Wepag Liegenschaftsverwaltung GmbH & Co. Jüter...,HRA_62557,52.5885063,13.3663073,"45, Schillerstraße, Wilhelmsruh, Pankow, Berli...",13158,"Schillerstraße 45, 13158 Berlin"
4,Wepag Liegenschaftsverwaltung GmbH & Co. Berli...,HRA_62592,52.5885063,13.3663073,"45, Schillerstraße, Wilhelmsruh, Pankow, Berli...",13158,"Schillerstraße 45, 13158 Berlin"


In [5]:
df2 = df_h

# Function to clean company names
def clean_name(name):
    # Remove common suffixes and extra whitespace
    pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
    # Remove special characters (e.g., & . ,)
    name = re.sub(r'[&.,+()]', '', name)
    
    return re.sub(pattern, '', name, flags=re.IGNORECASE).strip()

# Apply the clean_name function to create a new column for comparison
df2['CleanedName'] = df2['name'].apply(clean_name)

print("length df: ", len(df2))

length df:  3551


## 2. Finding coincidences based on comparing company names' key words and then postal codes

### A. Compare names and then postal codes

In [6]:
df1 = df

df2 = df2

# Create an empty list to store merged rows
merged_rows = []

# Iterate through each row in df1
for i, row1 in df1.iterrows():
    words1 = set(row1['CleanedName'].split())
    
    # Compare with each row in df2
    for j, row2 in df2.iterrows():
        words2 = set(row2['CleanedName'].split())
        
        # If there's at least one common word, merge the rows
        if words1.intersection(words2):
            merged_row = {**row1, **row2}  # Merge the two rows into one dictionary
            merged_rows.append(merged_row)  # Add the merged row to the list

# Convert the list of merged rows into a new DataFrame
merged_df = pd.DataFrame(merged_rows)

In [7]:
#Keep rows where postalcodes coincide
filtered_df = merged_df[merged_df['zip'] == merged_df['PostalCode']]

### B. Comparing postal codes and then names

In [8]:
#comparing postal code
# Merge the two dataframes on the Postal code column
filtered_df2 = pd.merge(df_h, df_ref, left_on='zip', right_on='PostalCode', how='inner')

In [9]:
merged_df = filtered_df2
# Function to clean and split text into words
def clean_and_split(text):
    # Remove special characters and split into words
    words = re.sub(r'[^\w\s]', '', text).lower().split()
    return [word for word in words if word not in suffixes]

# Apply the function to both columns
merged_df['name_words'] = merged_df['name'].apply(clean_and_split)
merged_df['CompanyName_words'] = merged_df['CompanyName'].apply(clean_and_split)

# Function to check if there is any common word between two lists
def has_common_word(list1, list2):
    return any(word in list2 for word in list1)

# Filter the DataFrame
filteredmerged_df = merged_df[merged_df.apply(lambda row: has_common_word(row['name_words'], row['CompanyName_words']), axis=1)]

# Drop the helper columns if needed
filteredmerged_df = filteredmerged_df.drop(columns=['name_words', 'CompanyName_words'])
filteredmerged_df.drop_duplicates()

Unnamed: 0,name,register_identifier,location_lat,location_long,location_address,zip,registered_address,CleanedName_x,CompanyName,PostalCode,Subsector_Name,Latitude,Longitude,H2-Potential in TWh,Address,CleanedName_y
4,ArcelorMittal Eisenhüttenstadt GmbH,HRB_3883,52.1547005,14.6345007,"Werkstraße, Eisenhüttenstadt, Oder-Spree, Bran...",15890,"Werkstraße 1, 15890 Eisenhüttenstadt",ArcelorMittal Eisenhüttenstadt,ArcelorMittal Eisenhüttenstadt GmbH,15890,Mineral Processing,52.169490,14.626637,0.1075,"Straße 20 Nr. 6, 15890 Eisenhüttenstadt, Germa...",ArcelorMittal Eisenhüttenstadt
6,ArcelorMittal Eisenhüttenstadt GmbH,HRB_3883,52.1547005,14.6345007,"Werkstraße, Eisenhüttenstadt, Oder-Spree, Bran...",15890,"Werkstraße 1, 15890 Eisenhüttenstadt",ArcelorMittal Eisenhüttenstadt,ArcelorMittal Eisenhüttenstadt GmbH,15890,Metal Processing,52.166141,14.617682,0.8461,"Werkstraße 1,15890, Deutschland",ArcelorMittal Eisenhüttenstadt
7,ArcelorMittal Eisenhüttenstadt GmbH,HRB_3883,52.1547005,14.6345007,"Werkstraße, Eisenhüttenstadt, Oder-Spree, Bran...",15890,"Werkstraße 1, 15890 Eisenhüttenstadt",ArcelorMittal Eisenhüttenstadt,ArcelorMittal Eisenhüttenstadt GmbH,15890,"Steel, primary",52.166141,14.617682,4.6214,"Werkstraße 1,15890, Deutschland",ArcelorMittal Eisenhüttenstadt
9,LEIPA Georg Leinfelder GmbH,HRB_10892,53.07905485,14.3184909807109,"LEIPA Leinfelder Papier Werk Schwedt Süd, 34, ...",16303,"Kuhheide 34, 16303 Schwedt/Oder",LEIPA Georg Leinfelder,LEIPA Georg Leinfelder GmbH Werk Schwedt,16303,Paper and printing,53.079099,14.319766,0.9789,"Kuhheide 34,16303, Deutschland",LEIPA Georg Leinfelder Schwedt
12,LEIPA Group GmbH,HRB_9825,53.07905485,14.3184909807109,"LEIPA Leinfelder Papier Werk Schwedt Süd, 34, ...",16303,"Kuhheide 34, 16303 Schwedt/Oder",LEIPA Group,LEIPA Georg Leinfelder GmbH Werk Schwedt,16303,Paper and printing,53.079099,14.319766,0.9789,"Kuhheide 34,16303, Deutschland",LEIPA Georg Leinfelder Schwedt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472,Zement- und Kalkwerke Otterbein GmbH & Co. KG,HRA_431,50.59020975,9.50159388005987,"50, Hauptstraße, Müs, Großenlüder, Landkreis F...",36137,"Hauptstraße 50, 36137 Großenlüder",Zement- und Kalkwerke Otterbein,Zement- und Kalkwerke Otterbein GmbH & Co. KG,36137,Mineral Processing,50.584320,9.509245,0.0495,"Georg-Otterbein-Straße 123,36137, Deutschland",Zement- und Kalkwerke Otterbein
473,W & M Pappen GmbH & Co. KG,HRA_3938,51.91256665,9.25288328186393,"W&M Pappen, 2, Pappmühle, Elbrinxen, Lügde, Kr...",32676,"Pappmühle 2, 32676 Lügde",W M Pappen,W & M Pappen GmbH & Co. KG,32676,Paper and printing,51.912450,9.252620,0.0008,"Pappmühle 2,32676, Deutschland",W M Pappen
475,Mercer Stendal GmbH,HRB_2446,52.72810135,12.0069982604889,"Mercer Stendal, 1, Goldbecker Straße, Industri...",39596,"Goldbecker Str. 1, 39596 Arneburg",Mercer Stendal,Mercer Group,39596,Paper and printing,52.690620,11.985900,1.3197,"Goldbecker Strasse 1,39596, Deutschland",Mercer Group
477,Mercer Timber Products Stendal GmbH,HRB_28026,52.72810135,12.0069982604889,"Mercer Stendal, 1, Goldbecker Straße, Industri...",39596,"Goldbecker Straße 1, 39596 Arneburg",Mercer Timber Products Stendal,Mercer Group,39596,Paper and printing,52.690620,11.985900,1.3197,"Goldbecker Strasse 1,39596, Deutschland",Mercer Group


## C. Now we merge both (1filtered_df and 2filteredmerged_df)

In [10]:
filteredmerged_df = filteredmerged_df.drop(columns=['CleanedName_x', 'CleanedName_y'])

# Concatenate the two DataFrames
merged_dfh2pot = pd.concat([filteredmerged_df, filtered_df])

# Drop duplicate rows
# Optionally, you can specify which columns to consider for detecting duplicates
merged_dfh2pot.drop_duplicates(subset=['H2-Potential in TWh', 'CompanyName'], keep='first')
merged_dfh2pot = merged_dfh2pot.drop_duplicates()
len(merged_dfh2pot)

391

## 3. Check from which documents I can get information from the XML files available in the Handelsregister

In [11]:
#antes de runear esto asegurarme de que todos los si docs nuevos están en la carpeta de si docs from other tries 
import os

df = finaldf
df = df[['name','register_identifier']]

df = df.rename(columns={'name': 'Name', 'register_identifier': 'Reference_number'})

# Directory where the documents are stored
doc_directory = r"C:\Users\marma\Documents\INGENIERIA  INDUSTRIAL\2º MÁSTER\TFM\SI DOCS from other tries"

# Lists to store company names and reference numbers with and without matching documents
matching_companies = []
matching_refnum = []
no_matching_companies = []
no_matching_refnum = []

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    company_name = row['Name']
    reference_number = str(row['Reference_number'])

    # Flag to check if a matching document was found
    found = False

    # Search term is simply the reference number
    search_term = reference_number

    for doc_name in os.listdir(doc_directory):
        if search_term in doc_name:
            matching_companies.append(company_name)
            matching_refnum.append(reference_number)
            found = True
            break  # Exit loop if a match is found

    if not found:
        #print(f"No document found for {company_name} with reference number {reference_number}")
        no_matching_companies.append(company_name)
        no_matching_refnum.append(reference_number)

print("Xml files found for ", len(matching_companies), "companies")
print("Xml files not found for ", len(no_matching_companies), "companies")

NameError: name 'finaldf' is not defined

## 4. Extract GRUNDKAPITAL / STAMMKAPITAL / HAFTEINLAGE values and add them to the dataframe

In [None]:
import os
import xml.etree.ElementTree as et
import pandas as pd

# Assuming matching_companies and matching_refnum are already defined
combined_data = list(zip(matching_companies, matching_refnum))

# Create DataFrame from combined data
df_grundkapital = pd.DataFrame(combined_data, columns=['Name', 'Reference_number'])
stammkapital = []
grundkapital = []
hafteinlage = []

# Directory containing XML files
download_directory = r"C:\Users\marma\Documents\INGENIERIA  INDUSTRIAL\2º MÁSTER\TFM\SI DOCS from other tries"

# Define the namespaces
namespaces = {
    'xjustiz': 'http://www.xjustiz.de'
}

# Iterate over each company
for company in df_grundkapital.Name:
    # Get the reference number of the company
    reference_number = str(df_grundkapital[df_grundkapital['Name'] == company]['Reference_number'].iloc[0])
    
    xml_file_path = None
    for xml_file in os.listdir(download_directory):
        if xml_file.endswith(".xml") and reference_number in xml_file:
            xml_file_path = os.path.join(download_directory, xml_file)
            break  # Exit loop if a matching file is found

    if xml_file_path:
        with open(xml_file_path, "r", encoding="utf-8") as file:
            xml_content = file.read()
        
        xml_tree = et.fromstring(xml_content)

        # STAMMKAPITAL
        stammkapital_value = xml_tree.findall(".//xjustiz:fachdatenRegister/xjustiz:auswahl_zusatzangaben/xjustiz:kapitalgesellschaft/xjustiz:zusatzGmbH/xjustiz:stammkapital/xjustiz:zahl", namespaces)
        stammkapital.append(stammkapital_value[0].text if stammkapital_value else "0")

        # GRUNDKAPITAL
        grundkapital_value = xml_tree.findall(".//xjustiz:fachdatenRegister/xjustiz:auswahl_zusatzangaben/xjustiz:kapitalgesellschaft/xjustiz:zusatzAktiengesellschaft/xjustiz:grundkapital/xjustiz:hoehe/xjustiz:zahl", namespaces)
        grundkapital.append(grundkapital_value[0].text if grundkapital_value else "0")

        # HAFTEINLAGE - Extract all, sum them up, and append to the list
        hafteinlage_elements = xml_tree.findall(".//xjustiz:fachdatenRegister/xjustiz:auswahl_zusatzangaben/xjustiz:personengesellschaft/xjustiz:zusatzGmbH/xjustiz:datenKommanditist/xjustiz:hafteinlage/xjustiz:zahl", namespaces)
        if not hafteinlage_elements:
            hafteinlage_elements = xml_tree.findall(".//xjustiz:fachdatenRegister/xjustiz:auswahl_zusatzangaben/xjustiz:personengesellschaft/xjustiz:zusatzKG/xjustiz:datenKommanditist/xjustiz:hafteinlage/xjustiz:zahl", namespaces)
        
        total_hafteinlage = sum(float(el.text) for el in hafteinlage_elements)
        hafteinlage.append(str(total_hafteinlage) if hafteinlage_elements else "0")
    else:
        stammkapital.append("0")
        grundkapital.append("0")
        hafteinlage.append("0")

# Add the extracted data to the DataFrame
df_grundkapital['Stammkapital'] = stammkapital
df_grundkapital['Grundkapital'] = grundkapital
df_grundkapital['Hafteinlage'] = hafteinlage

# Display the final DataFrame
print(df_grundkapital)


In [None]:
df_grundkapitaldd = df_grundkapital.drop_duplicates()
len(df_grundkapitaldd)

In [None]:
# Adjust format of dataframe
df_grundkapital.rename(columns={'Reference_number': 'register_identifier'}, inplace=True)
combined_df = pd.merge(df_grundkapital, finaldf, on='register_identifier', how='inner')
combined_df = combined_df.drop(columns=['Name','CleanedName'])
combined_df = combined_df.drop_duplicates()
combined_df.head()

In [None]:
# Connect to SQLite database (or create it if it doesn't exist)
conn = db.connect('ubicando.db')

# Save the DataFrame to a table in the SQLite database
combined_df.to_sql('12h2pot', conn, if_exists='replace', index=False)

# Close the connection
conn.close()

In [None]:
len(combined_df)

#### 4. Save names for which results were not found for further searches

In [None]:
filtered_df_ref = df_ref[~df_ref['CompanyName'].isin(combined_df['CompanyName'])]

filtered_df_ref = filtered_df_ref.drop(columns=['Subsector_Name','H2-Potential in TWh'])
filtered_df_ref = filtered_df_ref.assign(source='Heatpot')

In [None]:
conn = db.connect('ubicando.db')

# Save the DataFrame to a table in the SQLite database
filtered_df_ref.to_sql('notmatching13', conn, if_exists='replace', index=False)

# Close the connection
conn.close()

In [None]:
combined_data = list(zip(no_matching_companies, no_matching_refnum))
combined_data = pd.DataFrame(combined_data, columns=['Name', 'Reference_number'])

conn = db.connect('ubicando.db')

# Save the DataFrame to a table in the SQLite database
combined_data.to_sql('SIdocsccantbefound', conn, if_exists='replace', index=False)

# Close the connection
conn.close()
