In [1]:
# Importing packages

import pandas as pd
import numpy  as np
import seaborn as sns
import datetime 
import chardet


# Goal 1: Automating Pollutant Assessment 
Current Situation:
- To evaluate a material's reusability, users must manually input information about pollutants.
- These pollutants (adhesives, paints, fire retardants, etc.) affect how easily materials can be separated and reused.
- The manual nature of this process introduces inconsistency and inefficiency.



## Objective: Develop a system that automatically suggests strong default pollutant values based on the materials being assessed. 
This system should:
- Predict likely pollutants based on material combinations
- Provide appropriate pollutant classifications
- Reduce manual input while improving assessment accuracy

## Methodology and approach
- Machine learning models to predict pollutants based on material combinations
- Pattern recognition systems that identify common construction methods and associated pollutants
- Rule-based systems derived from expert knowledge
- Default value frameworks with adjustment mechanisms based on material properties

## Guiding Questions

1. How can we identify which materials are commonly used together in building components?
2. What properties in the datasets can help us determine the type of connections used between materials?
3. How might different connection types affect the end-of-life scenarios in the tBaustoff dataset?
4. Can we classify connections based on their environmental impact using the lifecycle assessment data?
5. What metrics could we use to define the "disturbance potential" of different connection types?
6. How can we develop a recommendation system for material connections that optimizes for circularity?

# Data Wrangling

## Data Extraction

In [7]:
# Sniff the text encoding of the 2024 OBD export from a leading byte sample
with open("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2024_I.csv", "rb") as obd_2024_file:
    encoding_sample = obd_2024_file.read(100000)  # only the first 100,000 bytes are inspected

# `result` holds chardet's report; its "encoding" entry is what the CSV loading cell passes to read_csv
result = chardet.detect(encoding_sample)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.7295400999999999, 'language': ''}


In [None]:
# Load the three OBD exports and the tBaustoff mapping into dataframes.
# All four files are parsed with the same options: ";" as separator and the encoding held in `result`.
# NOTE(review): that encoding was sniffed from OBD_2024_I.csv only — confirm it also fits the other three files.
csv_read_options = {"delimiter": ";", "encoding": result["encoding"], "low_memory": False}

oko_2024_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2024_I.csv", **csv_read_options)
oko_2023_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2023_I.csv", **csv_read_options)
oko_2020_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2020_II.csv", **csv_read_options)
tbau_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/tBaustoff_with_OBD_mapping.csv", **csv_read_options)


In [None]:
# Sniff the text encoding of pollutant_combinations.csv from a leading byte sample
with open("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/pollutant_combinations.csv", "rb") as pollutants_file:
    encoding_sample = pollutants_file.read(100000)  # only the first 100,000 bytes are inspected

# NOTE: this rebinds `result`, the same name the OBD/tBaustoff loading cell reads its encoding from;
# re-running that cell after this one would pick up this file's encoding instead.
result = chardet.detect(encoding_sample)
print(result)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [None]:
# Load pollutant_combinations.csv (comma-separated) using the encoding currently held in `result`
pollutants_df = pd.read_csv(
    "/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/pollutant_combinations.csv",
    sep=",",
    encoding=result["encoding"],
    low_memory=False,
)

In [None]:
# Assessing whether to merge the yearly dataframes or keep only the 2024 one

# Distinct, non-missing English material names in each dataset
materials_2020 = set(oko_2020_df["Name (en)"].dropna())
materials_2023 = set(oko_2023_df["Name (en)"].dropna())
materials_2024 = set(oko_2024_df["Name (en)"].dropna())

# Names that appear in an earlier dataset but are absent from 2024
unique_2020 = materials_2020.difference(materials_2024)
unique_2023 = materials_2023.difference(materials_2024)

# How many such names each earlier dataset contributes
unique_2020_count = len(unique_2020)
unique_2023_count = len(unique_2023)

unique_2020_count, unique_2023_count


(262, 546)

There are materials in previous datasets that are not present in the 2024 dataset:

- 2020 has 262 materials not found in 2024.
- 2023 has 546 materials not found in 2024.

This means that the 2024 dataset is not a full replacement — it doesn’t include all materials from earlier versions.

For maximum coverage (e.g. all materials ever documented), we should consider including previous years.

In [21]:
# Merging the yearly datasets

# Tag every row with the year of the dataset it came from (written onto the yearly frames themselves)
for year, yearly_df in ((2020, oko_2020_df), (2023, oko_2023_df), (2024, oko_2024_df)):
    yearly_df["source_year"] = year

# Stack the datasets in chronological order under a fresh 0..n-1 index
oko_combined_df = pd.concat((oko_2020_df, oko_2023_df, oko_2024_df), ignore_index=True)

In [22]:
# Sort by source year descending so the newest entry comes first in case of duplicates.
# kind="stable" is deliberate: sort_values defaults to quicksort, which does not preserve the
# relative order of rows that share a source_year. A stable sort keeps their original order, so
# the row retained by the later drop_duplicates(keep="first") is well defined and reproducible.
combined_df_sorted = oko_combined_df.sort_values(by="source_year", ascending=False, kind="stable")


In [23]:
# De-duplicate on (UUID, Version, Modul). The input is sorted newest-first, so keep="first"
# retains the latest version of each entry; the index is then renumbered from 0.
deduped_df = (
    combined_df_sorted
    .drop_duplicates(subset=["UUID", "Version", "Modul"], keep="first")
    .reset_index(drop=True)
)

In [None]:
# Display the de-duplicated, re-indexed frame
deduped_df

## Data Cleaning

In [24]:
# Count the missing values in every column
missing_values = deduped_df.isna().sum()

In [25]:
# Display the per-column missing-value counts
missing_values

UUID                        0
Version                     0
Name (de)                2886
Name (en)                2953
Kategorie (original)        0
                        ...  
WDP (A2)                 9405
Unnamed: 79             25665
source_year                 0
Stueckgewicht (kg)      25477
Unnamed: 80             25665
Length: 83, dtype: int64

In [27]:
# Share of missing values per column, expressed as a fraction (0.0-1.0), largest first
null_percentage = deduped_df.isna().mean().sort_values(ascending=False)

# Keep only the columns where more than 90% of the values are missing
high_null_cols = null_percentage.loc[null_percentage.gt(0.9)]
high_null_cols

URL des Vorgängers        1.000000
Unnamed: 80               1.000000
Unnamed: 79               1.000000
Stueckgewicht (kg)        0.992675
Ergiebigkeit (m2)         0.987064
Schuettdichte (kg/m3)     0.986090
Laengengewicht (kg/m)     0.941126
Version des Vorgängers    0.937502
dtype: float64

These columns have extremely low data coverage and are unlikely to offer analytical value without substantial data imputation or external sourcing. Dropping them will simplify the dataset and reduce noise without significant information loss.

Unnamed: 80, Unnamed: 79 and URL des Vorgängers are completely empty → should be dropped.


In [28]:
# Remove the columns with more than 90% missing values
columns_to_drop = [
    "Unnamed: 80",
    "Unnamed: 79",
    "URL des Vorgängers",
    "Stueckgewicht (kg)",
    "Ergiebigkeit (m2)",
    "Schuettdichte (kg/m3)",
    "Laengengewicht (kg/m)",
    "Version des Vorgängers",
]

# drop() returns a new frame; deduped_df itself is left untouched
cleaned_df = deduped_df.drop(columns_to_drop, axis="columns")

Missing values of Environmental Impact Indicators (EN 15804+A1 and +A2)

Understanding the EN 15804 Standard and Its Impact on LCA Calculations
The EN 15804 standard is the foundation for Environmental Product Declarations (EPDs) worldwide. In 2019, a significant update (EN 15804+A2) was released, becoming mandatory in July 2022. This update changes how environmental impacts are measured in EPDs.

What Changed?
While the environmental impact categories (like global warming and ozone depletion) are mostly the same, the units used to measure them have changed. This means that older EPDs (EN 15804+A1) cannot be directly compared or used with newer EPDs (EN 15804+A2) in LCA calculations.

How does this affect Building LCA?
Simply put, you can't mix and match old and new EPDs in your building LCA calculations.

Since the EN 15804+A1 indicators have a large share of missing values and the standard has been superseded, we will drop those columns entirely unless they contain values that are not covered by the +A2 indicators.



In [29]:
# Indicator column groups: A1 columns carry plain names, A2 columns carry the "(A2)" suffix
a1_columns = ["GWP", "ODP", "POCP", "AP", "EP", "ADPE", "ADPF"]
a2_columns = [
    "GWPtotal (A2)",
    "GWPbiogenic (A2)",
    "GWPfossil (A2)",
    "GWPluluc (A2)",
    "ODP (A2)",
    "POCP (A2)",
    "AP (A2)",
    "EPmarine (A2)",
    "EPfreshwater (A2)",
    "EPterrestrial (A2)",
    "ADPE (A2)",
    "ADPF (A2)",
]

# Number of filled (non-null) values per A1 indicator, most populated first
a1_non_null = cleaned_df[a1_columns].count().sort_values(ascending=False)

# Simple heuristic for "A1 without an A2 counterpart":
# at least one A1 indicator is filled while every A2 indicator in the same row is empty
has_any_a1 = cleaned_df[a1_columns].notna().any(axis=1)
all_a2_missing = cleaned_df[a2_columns].isna().all(axis=1)
rows_with_only_a1 = has_any_a1 & all_a2_missing

# Number of rows matching the heuristic
only_a1_count = rows_with_only_a1.sum()

a1_non_null, only_a1_count


(GWP     8294
 ODP     8281
 POCP    8281
 AP      8281
 ADPE    8281
 EP      8279
 ADPF    8277
 dtype: int64,
 np.int64(8294))

Interpretation: These A1 values are not duplicated or represented in the A2 columns — they are unique to older datasets.
Therefore, dropping them would result in data loss for ~8,300 records.

In [30]:
# Row-wise helper that labels which indicator group(s) a row has values for
def classify_impact_standard(row):
    """Return the impact-standard label for one row.

    Reads the notebook-level lists ``a1_columns`` and ``a2_columns`` and checks
    whether ``row`` holds at least one non-null value in each group.

    Parameters
    ----------
    row : label-indexed row (as passed by ``DataFrame.apply(..., axis=1)``)
        Must contain every label listed in ``a1_columns`` and ``a2_columns``.

    Returns
    -------
    str
        "A1"    - only A1 indicators are filled
        "A2"    - only A2 indicators are filled
        "mixed" - both groups have at least one filled value
        "none"  - neither group has any filled value
    """
    a1_present = bool(row[a1_columns].notnull().any())
    a2_present = bool(row[a2_columns].notnull().any())

    # Lookup keyed on (A1 present, A2 present)
    label_by_presence = {
        (True, False): "A1",
        (False, True): "A2",
        (True, True): "mixed",
        (False, False): "none",
    }
    return label_by_presence[(a1_present, a2_present)]

# Label every row by running the classifier once per row
cleaned_df["impact_standard"] = cleaned_df.apply(classify_impact_standard, axis="columns")

# Tally how many rows fall under each label
impact_counts = cleaned_df["impact_standard"].value_counts()

impact_counts


impact_standard
A2      16260
A1       8294
none     1111
Name: count, dtype: int64

## Data Enrichment

## Data Validation

# EDA

# Machine Learning