In [3]:
# Importing packages

import pandas as pd
import numpy  as np
from sklearn.cluster import KMeans
import seaborn as sns
import datetime 
import chardet


# tBaustoff Dataset

The tBaustoff dataset provides material mapping information with 10 columns. It connects to the ÖKOBAUDAT dataset through process UUIDs and contains end-of-life scenario information for various materials.

End-of-Life Scenarios: Different materials have different potential pathways

- Reuse: Direct application in new construction (highest value retention)
- Recycling: Processing into new material (partial value retention)
- Downcycling: Processing into lower-value applications
- Disposal: Landfilling or incineration (lowest circularity value)

## Data Wrangling

### Data Extraction

In [4]:
# detect files encoding
with open("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2024_I.csv", "rb") as f:
    result = chardet.detect(f.read(100000))  # Read first 100,000 bytes
    print(result)
# Importing data from CSV and converting to dataframe
obd_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/obd_merged.csv", delimiter=";", encoding = "utf-8-sig", low_memory=False)
tbau_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/tBaustoff_with_OBD_mapping.csv", delimiter=",", encoding = result["encoding"], low_memory=False)


{'encoding': 'ISO-8859-1', 'confidence': 0.7295400999999999, 'language': ''}


In [5]:
tbau_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 815 entries, 0 to 814
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   oekobaudatProcessUuid        728 non-null    object 
 1   oekobaudatDatastockUuid      731 non-null    object 
 2   oekobaudatDatastockName      731 non-null    object 
 3   productId                    815 non-null    int64  
 4   tBaustoffVersion             815 non-null    object 
 5   productName                  815 non-null    object 
 6   eolCategoryId                815 non-null    int64  
 7   eolCategoryName              815 non-null    object 
 8   eolScenarioUnbuiltReal       815 non-null    object 
 9   eolScenarioUnbuiltPotential  815 non-null    object 
 10  technologyFactor             815 non-null    float64
dtypes: float64(1), int64(2), object(8)
memory usage: 70.2+ KB


In [6]:
tbau_df.head(6)

Unnamed: 0,oekobaudatProcessUuid,oekobaudatDatastockUuid,oekobaudatDatastockName,productId,tBaustoffVersion,productName,eolCategoryId,eolCategoryName,eolScenarioUnbuiltReal,eolScenarioUnbuiltPotential,technologyFactor
0,3b1f0c75-07a4-4182-b310-5529fc5b54a6,22885a6e-1765-4ade-a35e-ae668bd07256,OBD_2023_I,1,2024-Q4,Acetyliertes Holz,1,Holz - acetyliert,EV+,EV+,0.0
1,0dec19c7-53db-4cb9-bbba-4d44d3da62a9,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,1,2024-Q4,Acetyliertes Holz,1,Holz - acetyliert,EV+,EV+,0.0
2,bd6d6d89-b76d-4002-a217-afffbb8aa308,448d1096-2017-4901-a560-f652a83c737e,OBD_2020_II,2,2024-Q4,Aluminium Profil,2,Alu unbeschichtet od. eloxiert (Sz. Knetleg zu...,RC+,CL+,0.75
3,fdc99ab8-d843-44ec-a66c-92367d244321,22885a6e-1765-4ade-a35e-ae668bd07256,OBD_2023_I,2,2024-Q4,Aluminium Profil,2,Alu unbeschichtet od. eloxiert (Sz. Knetleg zu...,RC+,CL+,0.75
4,3feca796-791b-46d3-8160-95ef243ffb9d,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,2,2024-Q4,Aluminium Profil,2,Alu unbeschichtet od. eloxiert (Sz. Knetleg zu...,RC+,CL+,0.75
5,a4c1c27c-53a0-4027-83f6-88c52c758bb1,448d1096-2017-4901-a560-f652a83c737e,OBD_2020_II,3,2024-Q4,Aluminiumblech,2,Alu unbeschichtet od. eloxiert (Sz. Knetleg zu...,RC+,CL+,0.75


### Data Cleaning

In [7]:
# identifying the columns with missing values
missing_values = tbau_df.isnull().sum()

In [8]:
missing_values

oekobaudatProcessUuid          87
oekobaudatDatastockUuid        84
oekobaudatDatastockName        84
productId                       0
tBaustoffVersion                0
productName                     0
eolCategoryId                   0
eolCategoryName                 0
eolScenarioUnbuiltReal          0
eolScenarioUnbuiltPotential     0
technologyFactor                0
dtype: int64

We have 87 products of tbaustoff that are not in the OKD.
Let's see if we can impute them by trying to find the product name in OKD

In [9]:
missing_name = tbau_df[tbau_df["oekobaudatProcessUuid"].isnull()]
missing_name.head()

Unnamed: 0,oekobaudatProcessUuid,oekobaudatDatastockUuid,oekobaudatDatastockName,productId,tBaustoffVersion,productName,eolCategoryId,eolCategoryName,eolScenarioUnbuiltReal,eolScenarioUnbuiltPotential,technologyFactor
16,,,,5,2024-Q4,Asphalttragschicht - Hochbau,4,Asphalt (Hochbau),Dep+,CL+,0.75
26,,,,7,2024-Q4,"Balkenschichtholz Nadelholz - unbeschichtet, n...",6,"Holz - massiv, naturbelassen",RC+,CL+,0.75
30,,,,39,2024-Q4,Brettsperrholz,6,"Holz - massiv, naturbelassen",RC+,CL+,0.75
31,,,,157,2024-Q4,Konstruktionsvollholz - naturbelassen,6,"Holz - massiv, naturbelassen",RC+,CL+,0.75
56,,,,9,2024-Q4,Beton - Carbonfasern / Textilbeton,8,Beton - Carbonfasern,Dep-,Dep-,0.0


In [10]:
# looking for the missing values in the OBD dataframe
missing_obd = obd_df[obd_df["Name (en)"] == "Konstruktionsvollholz - naturbelassen"]
missing_obd

Unnamed: 0,UUID,Version,Name (de),Name (en),Kategorie (original),Kategorie (en),Konformität,Laenderkennung,Typ,Referenzjahr,...,HTPnc (A2),IRP (A2),SOP (A2),ODP (A2),POCP (A2),ADPF (A2),ADPE (A2),WDP (A2),source_year,impact_standard


In [11]:
#check duplicated values in tbau_df
duplicated_rows = tbau_df[tbau_df.duplicated()]
print("duplicated rows: ",len(duplicated_rows))

duplicated rows:  0


### Unique Values

In [12]:
obd_uuids = obd_df['UUID'].nunique()
print (obd_uuids, "unique processes in the OBD data")


3863 unique processes in the OBD data


In [13]:
obd_unique_names = obd_df['Name (en)'].nunique()
print(obd_unique_names," unique materials in the OBD data set")


2495  unique materials in the OBD data set


In [14]:
# unique products id 
tbau_df["productId"].nunique()

345

We find different processuuid for same products id. This is due to the fact that the uuid of a product changes in each version/year of the OBD (at least in Tbaustoff)

In [15]:
# example for productid 2 (Aluminium Profil) of tbaustoff   
aluminum_2020 = obd_df[obd_df["UUID"] == "bd6d6d89-b76d-4002-a217-afffbb8aa308"]
aluminum_2020[["Name (en)","source_year","Modul","AP (A2)", "GWPtotal (A2)", "AP", "GWP"]]

Unnamed: 0,Name (en),source_year,Modul,AP (A2),GWPtotal (A2),AP,GWP
24562,Aluminium section,2020,D,,,-0.025543,-7.889704
24563,Aluminium section,2020,C2,,,5e-06,0.002553
24564,Aluminium section,2020,C1,,,0.0,0.0
24565,Aluminium section,2020,A1-A3,,,0.032528,10.677035


In [16]:
aluminum_2023 = obd_df[obd_df["UUID"] == "fdc99ab8-d843-44ec-a66c-92367d244321"]
aluminum_2023[["Name (en)","source_year","Modul","AP (A2)", "GWPtotal (A2)","AP", "GWP"]]

Unnamed: 0,Name (en),source_year,Modul,AP (A2),GWPtotal (A2),AP,GWP
15297,Aluminium section,2023,A1-A3,0.035951,10.697766,,
15298,Aluminium section,2023,C1,0.0,0.0,,
15299,Aluminium section,2023,C2,1e-05,0.003254,,
15300,Aluminium section,2023,D,-0.025415,-7.276583,,


In [17]:
aluminum_2024 = obd_df[obd_df["UUID"] == "3feca796-791b-46d3-8160-95ef243ffb9d"]
aluminum_2024[["Name (en)","source_year","Modul","AP (A2)", "GWPtotal (A2)","AP", "GWP"]]

Unnamed: 0,Name (en),source_year,Modul,AP (A2),GWPtotal (A2),AP,GWP
2254,Aluminium section,2024,A1-A3,0.040735,10.896927,,
2255,Aluminium section,2024,C2,5e-06,0.00426,,
2256,Aluminium section,2024,C3,0.0,0.0,,
2257,Aluminium section,2024,D,-0.029156,-7.51152,,


same product id linked to 3 different UUID

Are the metrics different in different years?
OKD 2020 was working with E 15804 +A1, so is missing all the information of the +A2 columns
There's not much difference between oko 2023 and 2024, at least in the metrics examined, but they are not exactly the same.
Missing values of 2023 or 204 could definitely be imputed with the metrics of the other year. Could have the same approach for year 2020 and previous datasets. 

The total number of actually unique processes in oko is less than expected, given that we can find a same product name with different uuid (depending on the source year). A rough estimate would be to divide the total number of uuids by 3, but not necessarily all processes will be present in all years. 

SUMMARY:

- 345 tbaustoff materials matching with 728 processes of OBD (of which many are same processes but in different version/year). 
- 2619 number of processes in OBD (after removing duplicates for different versions), so missing 1891 to fully match with tbs

In the OBD there are :
- 3863 unique processes (2619 if we consider that some of them are represented with different uuids depending of the version of the dataset)
- 2495  unique materials 

In [18]:
# determining actual unique UUIDS. find rows with same name and modul

# group by name and modul
grouped_df = obd_df.groupby(['Name (en)', 'Modul']).size().reset_index(name='count')

duplicates = grouped_df[grouped_df['count'] > 1]
duplicates

Unnamed: 0,Name (en),Modul,count
0,1.2.04Expanded clay sand,A1-A3,2
72,3- and 5-layer solid wood panel (German average),A1,2
73,3- and 5-layer solid wood panel (German average),A1-A3,2
74,3- and 5-layer solid wood panel (German average),A2,2
75,3- and 5-layer solid wood panel (German average),A3,2
...,...,...,...
18087,voestalpine rails,D,2
18088,wall elements,A1-A3,2
18089,wall elements,C1,2
18090,wall elements,C2,2


these processes are the actual repeated ones. 

In [19]:
obd_df[obd_df["Name (en)"] == "1.2.04Expanded clay sand"]

Unnamed: 0,UUID,Version,Name (de),Name (en),Kategorie (original),Kategorie (en),Konformität,Laenderkennung,Typ,Referenzjahr,...,HTPnc (A2),IRP (A2),SOP (A2),ODP (A2),POCP (A2),ADPF (A2),ADPE (A2),WDP (A2),source_year,impact_standard
1495,8ac5659c-5918-41bb-966d-91efdd6c4e50,20.24.070,Blähton Sand,1.2.04Expanded clay sand,'Mineralische Baustoffe' / 'Zuschläge' / 'Bläh...,'Mineral building products' / 'Concrete aggreg...,'EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,1.87e-12,0.001075,4.812913,2.124977e-08,0.02396,2024,A2
15973,008280d2-302f-4735-b845-1b3af6f14151,20.23.050,Blähton Sand,1.2.04Expanded clay sand,'Mineralische Baustoffe' / 'Zuschläge' / 'Bläh...,'Mineral building products' / 'Concrete aggreg...,'EN 15804+A2',DE,generic dataset,2022.0,...,,,,1.5955e-12,0.001071,4.911426,1.548417e-08,0.022202,2023,A2


they both correspond to the same material and process (modul A3), yet they have different UUID (because of the source_year/version). The env metrics are almost the same, but enough to drop them as duplicates? In the next step I will ran a trial to see how it would look like

In [20]:
# Sort by source year descending to keep the latest entry in case of duplicates
obd_df_sorted = obd_df.sort_values(by="source_year", ascending=False)

# Drop duplicates keeping the latest version (based on UUID, Version, Modul)
deduped_df = obd_df_sorted.drop_duplicates(subset=["Name (en)", "Modul"], keep="first") 

# Reset index and show result
deduped_df.reset_index(drop=True, inplace=True)

In [21]:
deduped_df["UUID"].nunique()

2619

In [22]:
deduped_df["Name (en)"].nunique()

2495

In [23]:
deduped_df[deduped_df["Name (en)"] == "Aluminium section"]

Unnamed: 0,UUID,Version,Name (de),Name (en),Kategorie (original),Kategorie (en),Konformität,Laenderkennung,Typ,Referenzjahr,...,HTPnc (A2),IRP (A2),SOP (A2),ODP (A2),POCP (A2),ADPF (A2),ADPE (A2),WDP (A2),source_year,impact_standard
2203,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,1.3e-15,5e-06,0.054593,7.05599e-10,3e-05,2024,A2
2204,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,7.55018e-11,0.027316,131.913039,1.091803e-06,1.610595,2024,A2
2233,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,-4.68803e-11,-0.018711,-87.27409,-6.219902e-07,-1.211407,2024,A2
2234,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,0.0,0.0,0.0,0.0,0.0,2024,A2
14177,fdc99ab8-d843-44ec-a66c-92367d244321,20.23.050,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2',DE,generic dataset,2022.0,...,,,,0.0,0.0,0.0,0.0,0.0,2023,A2


Went from 12 rows to 5, one for each process (instead of nr of processes x number of versions)

In [24]:
deduped_df[deduped_df["Name (en)"] == "Aluminium section"]["Modul"]

2203        C2
2204     A1-A3
2233         D
2234        C3
14177       C1
Name: Modul, dtype: object

In [25]:
obd_df[obd_df["UUID"] == "27f20dc1-5529-4194-8a06-1ae5b7ba6a51"]["Szenario"]

11559              energetisch
11581                      NaN
11582                      NaN
11583                      NaN
11584                      NaN
11585    Entsorgung Verpackung
11586                      NaN
11587                      NaN
Name: Szenario, dtype: object

In [26]:
obd_df[obd_df["Name (en)"] == "Aluminium section"]

Unnamed: 0,UUID,Version,Name (de),Name (en),Kategorie (original),Kategorie (en),Konformität,Laenderkennung,Typ,Referenzjahr,...,HTPnc (A2),IRP (A2),SOP (A2),ODP (A2),POCP (A2),ADPF (A2),ADPE (A2),WDP (A2),source_year,impact_standard
2254,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,7.55018e-11,0.027316,131.913039,1.091803e-06,1.610595,2024,A2
2255,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,1.3e-15,5e-06,0.054593,7.05599e-10,3e-05,2024,A2
2256,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,0.0,0.0,0.0,0.0,0.0,2024,A2
2257,3feca796-791b-46d3-8160-95ef243ffb9d,20.24.070,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2 (EF 3.1)',DE,generic dataset,2023.0,...,,,,-4.68803e-11,-0.018711,-87.27409,-6.219902e-07,-1.211407,2024,A2
15297,fdc99ab8-d843-44ec-a66c-92367d244321,20.23.050,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2',DE,generic dataset,2022.0,...,,,,2.06477e-11,0.019711,146.197295,4.722078e-07,0.525069,2023,A2
15298,fdc99ab8-d843-44ec-a66c-92367d244321,20.23.050,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2',DE,generic dataset,2022.0,...,,,,0.0,0.0,0.0,0.0,0.0,2023,A2
15299,fdc99ab8-d843-44ec-a66c-92367d244321,20.23.050,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2',DE,generic dataset,2022.0,...,,,,5e-16,9e-06,0.043828,2.289147e-10,1.7e-05,2023,A2
15300,fdc99ab8-d843-44ec-a66c-92367d244321,20.23.050,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','EN 15804+A2',DE,generic dataset,2022.0,...,,,,-4.1757e-12,-0.012814,-97.733143,-2.537972e-07,-0.448489,2023,A2
24562,bd6d6d89-b76d-4002-a217-afffbb8aa308,20.19.120,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','DIN EN 15804',DE,generic dataset,2018.0,...,,,,,,,,,2020,A1
24563,bd6d6d89-b76d-4002-a217-afffbb8aa308,20.19.120,Aluminium Profil,Aluminium section,'Metalle' / 'Aluminium' / 'Aluminiumprofil','Metals' / 'Aluminium' / 'Aluminium profiles','DIN EN 15804',DE,generic dataset,2018.0,...,,,,,,,,,2020,A1


In [27]:
# rows where name is missing
missing_name = obd_df[(obd_df["Name (en)"].isna()) & (obd_df["Name (de)"].isna())]
print ("Number of rows with missing names in english and german:", len(missing_name))

Number of rows with missing names in english and german: 0


The missing rows for names are just for one of the language columns, so they all have names in one language at least

In [28]:
# unique values in the 'OBD' column
tbau_df["oekobaudatProcessUuid"].nunique()

728

In [29]:
len(tbau_df["oekobaudatProcessUuid"])


815

815 - 728  = 87 
Each uuid is unique (not like in obd). The remaining are missing values
Note that data entries in `ÖKOBAUDAT` are referred to as *Processes*, not materials, as life cycle assessment is conducted at the process level. Since in the tbaustoff dataset we are talking of *ProcessUuid* in connection with `ÖKOBAUDAT`, it means that the data is related to a speficic material combined with assessments through its entire life cycle.

In [30]:
# looking at the missing values
tbau_df[tbau_df["oekobaudatProcessUuid"].isnull()]

Unnamed: 0,oekobaudatProcessUuid,oekobaudatDatastockUuid,oekobaudatDatastockName,productId,tBaustoffVersion,productName,eolCategoryId,eolCategoryName,eolScenarioUnbuiltReal,eolScenarioUnbuiltPotential,technologyFactor
16,,,,5,2024-Q4,Asphalttragschicht - Hochbau,4,Asphalt (Hochbau),Dep+,CL+,0.75
26,,,,7,2024-Q4,"Balkenschichtholz Nadelholz - unbeschichtet, n...",6,"Holz - massiv, naturbelassen",RC+,CL+,0.75
30,,,,39,2024-Q4,Brettsperrholz,6,"Holz - massiv, naturbelassen",RC+,CL+,0.75
31,,,,157,2024-Q4,Konstruktionsvollholz - naturbelassen,6,"Holz - massiv, naturbelassen",RC+,CL+,0.75
56,,,,9,2024-Q4,Beton - Carbonfasern / Textilbeton,8,Beton - Carbonfasern,Dep-,Dep-,0.00
...,...,...,...,...,...,...,...,...,...,...,...
770,,,,286,2024-Q4,"Splitt 2/8, dauerelastisch gebunden (Latex, so...",124,SchÃ¼ttung organ./min. gebunden,Dep-,Dep-,0.00
771,,22885a6e-1765-4ade-a35e-ae668bd07256,OBD_2023_I,288,2024-Q4,SplittschÃ¼ttung mineral. geb.,125,SchÃ¼ttung min./min. gebunden,RC-,RC-,1.00
772,,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,288,2024-Q4,SplittschÃ¼ttung mineral. geb.,125,SchÃ¼ttung min./min. gebunden,RC-,RC-,1.00
773,,448d1096-2017-4901-a560-f652a83c737e,OBD_2020_II,288,2024-Q4,SplittschÃ¼ttung mineral. geb.,125,SchÃ¼ttung min./min. gebunden,RC-,RC-,1.00


In [31]:
tbau_df[tbau_df["productName"]=="Asphalttragschicht - Hochbau"]

Unnamed: 0,oekobaudatProcessUuid,oekobaudatDatastockUuid,oekobaudatDatastockName,productId,tBaustoffVersion,productName,eolCategoryId,eolCategoryName,eolScenarioUnbuiltReal,eolScenarioUnbuiltPotential,technologyFactor
16,,,,5,2024-Q4,Asphalttragschicht - Hochbau,4,Asphalt (Hochbau),Dep+,CL+,0.75


In [32]:
# Removing duplicates of the tbau_df dataframe
# Sort by source year descending to keep the latest entry in case of duplicates
tbs_sorted = tbau_df.sort_values(by="oekobaudatDatastockName", ascending=False)

# Drop duplicates keeping the latest version (based on UUID, Version, Modul)
tbs_deduped = tbs_sorted.drop_duplicates(subset=["productName"], keep="first") 

# Reset index and show result
tbs_deduped.reset_index(drop=True, inplace=True)

In [33]:
tbs_deduped

Unnamed: 0,oekobaudatProcessUuid,oekobaudatDatastockUuid,oekobaudatDatastockName,productId,tBaustoffVersion,productName,eolCategoryId,eolCategoryName,eolScenarioUnbuiltReal,eolScenarioUnbuiltPotential,technologyFactor
0,26353b00-6cd3-426d-903b-9fc5b1670398,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,345,2024-Q4,Zinkbleche,139,Zink,CL+,CL+,1.00
1,5f091578-7d83-46de-bfba-754087398afe,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,42,2024-Q4,CR Profil (Chloropren-Kautschuk),24,KSTProfil (Halog),EB,EB,0.00
2,530ff9e2-0189-4783-9546-3bb8f64fbbeb,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,119,2024-Q4,Holz-Blendrahmen,29,Holzrahmen,EV+,EV+,0.00
3,5541250a-f8d8-4c67-9f24-47ab54686c30,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,165,2024-Q4,Kunstharzputz,83,organ. Putz,Dep-,Dep-,0.00
4,592ffe6e-4c21-4a24-ba67-273acbfca373,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,45,2024-Q4,Dachziegel / Ton-,26,Tondachziegel,RC+,RC+,1.00
...,...,...,...,...,...,...,...,...,...,...,...
333,,,,262,2024-Q4,Schaumglasgranulat SchÃ¼ttung,121,Mineralische DÃ¤mmschÃ¼ttung,SV,CL-,0.75
334,,,,265,2024-Q4,SchilfrohrdÃ¤mmmatte,123,Schilfrohr,SV,RC+,0.50
335,,,,266,2024-Q4,Schilfrohrmatte (PutztrÃ¤ger),123,Schilfrohr,SV,RC+,0.50
336,,,,286,2024-Q4,"Splitt 2/8, dauerelastisch gebunden (Latex, so...",124,SchÃ¼ttung organ./min. gebunden,Dep-,Dep-,0.00


# Pollutants Dataset

In [34]:
# detect files encoding for pollutant_combinations.csv
with open("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/pollutant_combinations.csv", "rb") as f:
    result = chardet.detect(f.read(100000))  # Read first 100,000 bytes
    print(result)
# importing pullutant_combinations.csv
pollutants_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/pollutant_combinations.csv", delimiter=",", encoding = result["encoding"], low_memory=False)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [35]:
pollutants_df.head()

Unnamed: 0,Baumaterial ohne Fremd-/Störstoffe,Fremd-/Störstoffbeschreibung,Störstoffklasse
0,Gussasphaltestrich,ohne Fremd-/Störstoffe,S0
1,Gussasphaltestrich,"Klebstoff-, Bodenbelagsreste, Trennfolien",S2
2,Gussasphaltestrich,Heizungsverteilrohre,S3
3,Beton,ohne Fremd-/Störstoffe,S0
4,Beton,Stahlbewehrung,S2


In [36]:
pollutants_df["Baumaterial ohne Fremd-/Störstoffe"].nunique()

66

In [37]:
pollutants_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147 entries, 0 to 146
Data columns (total 3 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Baumaterial ohne Fremd-/Störstoffe  147 non-null    object
 1   Fremd-/Störstoffbeschreibung        147 non-null    object
 2   Störstoffklasse                     147 non-null    object
dtypes: object(3)
memory usage: 3.6+ KB


In [38]:
# Merge pollutants_df with tbau_df on productName and Baumaterial ohne Fremd-/Störstoffe
merged_df = tbs_deduped.merge(
    pollutants_df,
    left_on="productName",
    right_on="Baumaterial ohne Fremd-/Störstoffe",
    how="inner"
)

# Display the merged dataframe
merged_df

Unnamed: 0,oekobaudatProcessUuid,oekobaudatDatastockUuid,oekobaudatDatastockName,productId,tBaustoffVersion,productName,eolCategoryId,eolCategoryName,eolScenarioUnbuiltReal,eolScenarioUnbuiltPotential,technologyFactor,Baumaterial ohne Fremd-/Störstoffe,Fremd-/Störstoffbeschreibung,Störstoffklasse
0,c7381cc7-53b7-427e-9c0b-c9edc4152602,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,108,2024-Q4,Gussasphaltestrich,59,Gussasphaltestrich,Dep+,CL+,0.75,Gussasphaltestrich,ohne Fremd-/Störstoffe,S0
1,c7381cc7-53b7-427e-9c0b-c9edc4152602,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,108,2024-Q4,Gussasphaltestrich,59,Gussasphaltestrich,Dep+,CL+,0.75,Gussasphaltestrich,"Klebstoff-, Bodenbelagsreste, Trennfolien",S2
2,c7381cc7-53b7-427e-9c0b-c9edc4152602,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,108,2024-Q4,Gussasphaltestrich,59,Gussasphaltestrich,Dep+,CL+,0.75,Gussasphaltestrich,Heizungsverteilrohre,S3
3,fdbe82c6-7073-4f8c-a540-c37a6bf16cd6,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,68,2024-Q4,Expandierter Kork,39,Kork,EV-,RC+,0.75,Expandierter Kork,"Putz, Klebespachtel, Klebstoff",S3
4,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,342,2024-Q4,Zementestrich,138,Estrich CT - Zementestrich,Dep+,RC-,0.25,Zementestrich,ohne Fremd-/Störstoffe,S0
5,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,342,2024-Q4,Zementestrich,138,Estrich CT - Zementestrich,Dep+,RC-,0.25,Zementestrich,"geringfügig verunreinigt (Klebstoffreste, Bela...",S2
6,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,342,2024-Q4,Zementestrich,138,Estrich CT - Zementestrich,Dep+,RC-,0.25,Zementestrich,Heizungsverteilrohre; verunreinigt mit Dämmsto...,S3
7,,,,182,2024-Q4,Lehmbauplatte,90,Lehmbauplatte,SV,CL+,0.75,Lehmbauplatte,"Lehmputz, Lehmfarbe",S1
8,,,,182,2024-Q4,Lehmbauplatte,90,Lehmbauplatte,SV,CL+,0.75,Lehmbauplatte,Verunreinigungen mit natürl. Materialien (Nat...,S2
9,,,,182,2024-Q4,Lehmbauplatte,90,Lehmbauplatte,SV,CL+,0.75,Lehmbauplatte,"Konvent. Wandfarbe; Zement, Kalk, Gips, Kunsts...",S3


## Fuzzy Match 66 Pollutant Materials to tBaustoff Materials


In [44]:
import pandas as pd
from rapidfuzz import process, fuzz


# Extract the relevant columns
tbs_materials = tbau_df["eolCategoryName"].dropna().unique()
pollutant_materials = pollutants_df["Baumaterial ohne Fremd-/Störstoffe"].dropna().unique()

# Perform fuzzy matching
matches = []
for pollutant_material in pollutant_materials:
    match, score, _ = process.extractOne(pollutant_material, tbs_materials, scorer=fuzz.token_sort_ratio)
    if score >= 60:
        matches.append((match, pollutant_material, score))

# Convert to DataFrame
matches_df = pd.DataFrame(matches, columns=["eolCategory", "pollutant_material", "similarity_score"])

# Join with pollutants_df to bring in pollutant descriptions
pollutants_with_desc = pollutants_df[["Baumaterial ohne Fremd-/Störstoffe", "Fremd-/Störstoffbeschreibung"]]
merged_matches = matches_df.merge(
    pollutants_with_desc,
    left_on="pollutant_material",
    right_on="Baumaterial ohne Fremd-/Störstoffe",
    how="left"
)

merged_matches



Unnamed: 0,eolCategory,pollutant_material,similarity_score,Baumaterial ohne Fremd-/Störstoffe,Fremd-/Störstoffbeschreibung
0,Gussasphaltestrich,Gussasphaltestrich,100.000000,Gussasphaltestrich,ohne Fremd-/Störstoffe
1,Gussasphaltestrich,Gussasphaltestrich,100.000000,Gussasphaltestrich,"Klebstoff-, Bodenbelagsreste, Trennfolien"
2,Gussasphaltestrich,Gussasphaltestrich,100.000000,Gussasphaltestrich,Heizungsverteilrohre
3,Beton - Carbonfasern,Beton mit Carbonfasern,85.714286,Beton mit Carbonfasern,ohne Fremd-/Störstoffe
4,Beton - Carbonfasern,Beton mit Carbonfasern,85.714286,Beton mit Carbonfasern,"feinkörn. Material (Putze, Mörtel); org. Verbi..."
...,...,...,...,...,...
85,Kunststein,"Naturstein, Kunststein",62.500000,"Naturstein, Kunststein",Kalkmörtel (leicht trennbar)
86,Kunststein,"Naturstein, Kunststein",62.500000,"Naturstein, Kunststein","Kalkzementmörtel, Imprägnierung"
87,Glasbaustein,Glasbaustein,100.000000,Glasbaustein,ohne Fremd-/Störstoffe
88,Glasbaustein,Glasbaustein,100.000000,Glasbaustein,Mörtel; Bewehrungsstahl im Mörtel


Summary 
Step | Action
1 | Find fuzzy matches between material names and eol category
2 | Use those as seeds with known pollutants
3 | Search for similar materials (by name + EOL category)
4 | Assign same pollutant label (bootstrapping)
5 | Collect into training dataset

In [45]:
# Save the merged DataFrame to a CSV file
merged_matches.to_csv("merged_pollutant_matches.csv")

# bootstrapping

In [46]:
# Rename the column to match the tbs_df column for merging
matched_df = merged_matches.rename(columns={"eolCategory": "eolCategoryName"})

# Get the unique mapping: eolCategoryName → pollutant
eol_to_pollutant = matched_df[["eolCategoryName", "Fremd-/Störstoffbeschreibung"]].dropna().drop_duplicates()

# Merge with the full dataset to assign pollutant labels
bootstrapped_df = tbau_df.merge(
    eol_to_pollutant,
    on="eolCategoryName",
    how="left"
)

# Keep only the labeled rows for model training
labeled_bootstrapped_df = bootstrapped_df[~bootstrapped_df["Fremd-/Störstoffbeschreibung"].isna()].copy()

labeled_bootstrapped_df

Unnamed: 0,oekobaudatProcessUuid,oekobaudatDatastockUuid,oekobaudatDatastockName,productId,tBaustoffVersion,productName,eolCategoryId,eolCategoryName,eolScenarioUnbuiltReal,eolScenarioUnbuiltPotential,technologyFactor,Fremd-/Störstoffbeschreibung
56,,,,9,2024-Q4,Beton - Carbonfasern / Textilbeton,8,Beton - Carbonfasern,Dep-,Dep-,0.00,ohne Fremd-/Störstoffe
57,,,,9,2024-Q4,Beton - Carbonfasern / Textilbeton,8,Beton - Carbonfasern,Dep-,Dep-,0.00,"feinkörn. Material (Putze, Mörtel); org. Verbi..."
58,,,,9,2024-Q4,Beton - Carbonfasern / Textilbeton,8,Beton - Carbonfasern,Dep-,Dep-,0.00,"org. V., Bitumen, Gips, Leichtanteile in größe..."
59,,,,9,2024-Q4,Beton - Carbonfasern / Textilbeton,8,Beton - Carbonfasern,Dep-,Dep-,0.00,Reaktionsharzbeschichtung (z.B. Kunstharzbelag)
93,e9ae96ee-ba8d-420d-9725-7c8abd06e082,448d1096-2017-4901-a560-f652a83c737e,OBD_2020_II,14,2024-Q4,Bewehrungsstahl,10,Stahl - Bewehrung,CL+,CL+,1.00,Beton (Stahlbeton)
...,...,...,...,...,...,...,...,...,...,...,...,...
1058,abc6d78d-f274-4400-8f31-fe8d4c2949c9,22885a6e-1765-4ade-a35e-ae668bd07256,OBD_2023_I,342,2024-Q4,Zementestrich,138,Estrich CT - Zementestrich,Dep+,RC-,0.25,"geringfügig verunreinigt (Klebstoffreste, Bela..."
1059,abc6d78d-f274-4400-8f31-fe8d4c2949c9,22885a6e-1765-4ade-a35e-ae668bd07256,OBD_2023_I,342,2024-Q4,Zementestrich,138,Estrich CT - Zementestrich,Dep+,RC-,0.25,Heizungsverteilrohre; verunreinigt mit Dämmsto...
1060,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,342,2024-Q4,Zementestrich,138,Estrich CT - Zementestrich,Dep+,RC-,0.25,ohne Fremd-/Störstoffe
1061,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,ca70a7e6-0ea4-4e90-a947-d44585783626,OBD_2024_I,342,2024-Q4,Zementestrich,138,Estrich CT - Zementestrich,Dep+,RC-,0.25,"geringfügig verunreinigt (Klebstoffreste, Bela..."
