In [1]:
# Importing packages

import pandas as pd
import numpy  as np
import seaborn as sns
import datetime 
import chardet


# Goal 1: Automating Pollutant Assessment 
Current Situation: 
- To evaluate a material's reusability, users must manually input information about pollutants.
- These pollutants (adhesives, paints, fire retardants, etc.) affect how easily materials can be separated and reused.
- The manual nature of this process introduces inconsistency and inefficiency.



## Objective: Develop a system that automatically suggests strong default pollutant values based on the materials being assessed. 
This system should: 
- Predict likely pollutants based on material combinations
- Provide appropriate pollutant classifications 
- Reduce manual input while improving assessment accuracy 

## Methodology and approach
- Machine learning models to predict pollutants based on material combinations 
- Pattern recognition systems that identify common construction methods and associated 
pollutants 
- Rule-based systems derived from expert knowledge 
- Default value frameworks with adjustment mechanisms based on material properties

## Guiding Questions

1. How can we identify which materials are commonly used together in building components?
2. What properties in the datasets can help us determine the type of connections used between materials?
3. How might different connection types affect the end-of-life scenarios in the tBaustoff dataset?
4. Can we classify connections based on their environmental impact using the lifecycle assessment data?
5. What metrics could we use to define the "disturbance potential" of different connection types?
6. How can we develop a recommendation system for material connections that optimizes for circularity?

# Data Wrangling

## Data Extraction

In [7]:
# Guess the text encoding of the OBD 2024 export so the CSV loads can reuse it.
# Only the leading 100,000 bytes are sampled, so the guess reflects the start
# of the file rather than its full contents.
with open("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2024_I.csv", "rb") as obd_2024_file:
    head_bytes = obd_2024_file.read(100000)

# chardet reports its guess as a dict ('encoding', 'confidence', 'language').
result = chardet.detect(head_bytes)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.7295400999999999, 'language': ''}


In [None]:
# Load the three OBD exports (2024, 2023, 2020) and the tBaustoff/OBD mapping
# into DataFrames. Every file is parsed with the same settings: ';' as the
# field separator, the encoding held in `result`, and low_memory switched off.
# NOTE(review): `result` comes from sampling OBD_2024_I.csv only; confirm the
# other three files really share that encoding.
shared_read_options = {
    "delimiter": ";",
    "encoding": result["encoding"],
    "low_memory": False,
}

oko_2024_df = pd.read_csv(
    "/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2024_I.csv",
    **shared_read_options,
)
oko_2023_df = pd.read_csv(
    "/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2023_I.csv",
    **shared_read_options,
)
oko_2020_df = pd.read_csv(
    "/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/OBD_2020_II.csv",
    **shared_read_options,
)
tbau_df = pd.read_csv(
    "/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/tBaustoff_with_OBD_mapping.csv",
    **shared_read_options,
)


In [None]:
# Repeat the encoding guess for pollutant_combinations.csv, again sampling
# only the first 100,000 bytes of the file.
# NOTE(review): this rebinds `result`, which the OBD/tBaustoff load cell also
# reads; re-running that cell after this one would use this file's encoding.
with open("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/pollutant_combinations.csv", "rb") as pollutants_file:
    sample_bytes = pollutants_file.read(100000)

result = chardet.detect(sample_bytes)
print(result)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


In [None]:
# Load pollutant_combinations.csv (comma-separated) into a DataFrame, using
# the encoding currently stored in `result`.
pollutants_csv_path = "/Users/pablosoriano/Documents/Data Science/bbsr-challenge/csv/pollutant_combinations.csv"
pollutants_df = pd.read_csv(
    pollutants_csv_path,
    delimiter=",",
    encoding=result["encoding"],
    low_memory=False,
)

In [13]:
# Summarise the 2024 OBD frame: index range, per-column non-null counts and dtypes.
oko_2024_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14985 entries, 0 to 14984
Data columns (total 81 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   UUID                                           14985 non-null  object 
 1   Version                                        14985 non-null  object 
 2   Name (de)                                      12848 non-null  object 
 3   Name (en)                                      14945 non-null  object 
 4   Kategorie (original)                           14985 non-null  object 
 5   Kategorie (en)                                 14985 non-null  object 
 6   Konformität                                    14985 non-null  object 
 7   Laenderkennung                                 14982 non-null  object 
 8   Typ                                            14985 non-null  object 
 9   Referenzjahr                                   149

## Data Cleaning

In [14]:
# Count the missing entries in each column of the 2024 OBD frame; the
# resulting Series is shown as the cell output.
oko_2024_df.isna().sum()

UUID                        0
Version                     0
Name (de)                2137
Name (en)                  40
Kategorie (original)        0
                        ...  
POCP (A2)                 810
ADPF (A2)                 810
ADPE (A2)                 810
WDP (A2)                  810
Unnamed: 80             14985
Length: 81, dtype: int64

The columns `Unnamed: 80` and `URL des Vorgängers` are completely empty (every value is missing), so both should be dropped.


In [17]:
# Drop the two columns identified as containing only missing values.
# The result is reassigned (rather than mutated in place) and errors="ignore"
# is passed so that this cell is idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
# NOTE(review): errors="ignore" also silences a misspelled column name, so
# keep the labels below in sync with the actual header.
empty_columns = ["Unnamed: 80", "URL des Vorgängers"]
oko_2024_df = oko_2024_df.drop(columns=empty_columns, errors="ignore")

## Data Enrichment

## Data Validation

# EDA

# Machine Learning