In [1]:
import pandas as pd
import numpy  as np
from sklearn.cluster import KMeans
import seaborn as sns
import chardet
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
# importing csvs
materials_from_components = pd.read_csv('/Users/pablosoriano/Documents/Data Science/bbsr-challenge/all_uuid_materials_from_components.csv')
obd_with_pollutants = pd.read_csv("pollutant_labeled_obd_translated.csv", sep=";", low_memory=False)


In [3]:
# removing duplicates from obd_with_pollutants, keeping one for each module
obd_with_pollutants_sorted = obd_with_pollutants.sort_values(by=["UUID","Modul"], ascending=[True, True])
# removing duplicates from obd_with_pollutants, keeping one for each module
obd_with_pollutants_clean = obd_with_pollutants_sorted.drop_duplicates(subset=["UUID","Modul"], keep="first") # done in the previous step



# Role keyword mapping

In [4]:
# Combine relevant columns
obd_with_pollutants["combined_text"] = (
    obd_with_pollutants["Name (de)"].fillna("") + " " +
    obd_with_pollutants["Kategorie (original)"].fillna("") + " " +
    obd_with_pollutants["productName"].fillna("") + " " +
    obd_with_pollutants["eolCategoryName"].fillna("")
).str.lower()

# role mapping
role_keywords = {
    "adhesive": ["kleber", "klebstoff", "spachtel"],
    "sealant": ["abdichtung", "dicht", "fuge", "bitumen", "bitumenbahn", "epdm", "eva", "ecb",
                "pvc", "dachbahn", "unterspannbahn", "kunststoffbahn", "dampfbremse", "folie", "vlies"],
    "mortar": ["mörtel", "zement", "putz", "verputz", "fugenmörtel", "kalkzementputz", "leichtputz", "ausgleichsmasse",
               "ziegel", "planstein", "leichtbeton", "dachstein", "glasbaustein"],
    "coating": ["farbe", "beschichtung", "lack", "bodenbelag", "linoleum", "korklinoleum", "gussasphaltestrich", "pvc-bodenbelag"],
    "insulation": ["dämm", "wolle", "schaum", "isolierung"],
    "board": ["platte", "gipskarton", "holzfaser"],
    "aggregate": ["kies", "schotter", "sand", "zuschlag", "granulat", "blähton", "naturbims"],
    "metal": ["stahl", "metall", "blech"],
    "wood": ["holz", "sperrholz"]
}
def infer_role(text):
    if pd.isna(text):
        return None
    for role, keywords in role_keywords.items():
        if any(keyword in text for keyword in keywords):
            return role
    return None
# Initial role inference
obd_with_pollutants["material_role"] = obd_with_pollutants["combined_text"].apply(infer_role).fillna("other")

# Refine sealant into subroles
def refine_sealant_role(row):
    if row["material_role"] != "sealant":
        return row["material_role"]
    text = row["combined_text"]
    if any(x in text for x in ["dachbahn", "epdm", "bitumen", "ecb", "eva"]):
        return "roofing_sealant"
    elif any(x in text for x in ["dampfbremse", "vlies", "folie", "unterspannbahn"]):
        return "vapor_barrier"
    elif any(x in text for x in ["pvc", "bodenbelag", "belag"]):
        return "flooring_sealant"
    else:
        return "sealant"
    
 # Apply refinement
obd_with_pollutants["material_role"] = obd_with_pollutants.apply(refine_sealant_role, axis=1)

# # Final role distribution
# role_distribution = obd_with_pollutants["material_role"].value_counts().sort_values(ascending=False)
# role_distribution


## Creating a multi label one hot encoded format

In [5]:
# We group by the full material context and pivot the target labels

obd_with_pollutants["target_class"] = obd_with_pollutants["Störstoffklasse"]
# Group by material context and pivot pollutant class into columns

context_cols = ["UUID", "material_role", "eolCategoryName", "eolScenarioUnbuiltReal", "eolScenarioUnbuiltPotential", "technologyFactor"]
df_multi = obd_with_pollutants[context_cols + ["target_class"]].dropna()#.drop_duplicates() #REVIEW - stay or go?
df_multi["value"] = 1
df_pivot = df_multi.pivot_table(index=context_cols, columns="target_class", values="value", fill_value=0).reset_index()


In [6]:
# Prepare X and y
label_cols = [col for col in df_pivot.columns if col.startswith("S")]
X = df_pivot.drop(columns=label_cols)
y = df_pivot[label_cols]

# One-hot encode categorical features
X_encoded = pd.get_dummies(X, columns=["material_role", "eolCategoryName", "eolScenarioUnbuiltReal", "eolScenarioUnbuiltPotential"], drop_first=True)


## Training the model

In [7]:
# Drop UUID column before training
X_encoded = X_encoded.drop(columns=["UUID"])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train multi-label Random Forest
multi_rf = MultiOutputClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
multi_rf.fit(X_train, y_train)

# Predict and evaluate
y_pred = multi_rf.predict(X_test)
report_multi = classification_report(y_test, y_pred, target_names=y.columns, output_dict=True)


In [8]:
# Predict probabilities
y_proba = multi_rf.predict_proba(X_test)
proba_df = pd.DataFrame({
    class_name: probs[:, 1] for class_name, probs in zip(y.columns, y_proba)
})

In [9]:
proba_df.head()

Unnamed: 0,S0,S1,S2,S3,S4
0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0
2,0.95,0.0,0.975,0.025,0.005
3,0.815,0.26,0.72,0.95,0.075
4,0.97,0.0,0.235,0.395,0.01


In [10]:
# Apply custom thresholds per class to convert probabilities into binary predictions
custom_thresholds = {
    "S0": 0.5,
    "S1": 0.3,
    "S2": 0.3,
    "S3": 0.3,
    "S4": 0.2
}

# Apply thresholds
binary_predictions = pd.DataFrame({
    class_name: (proba_df[class_name] >= threshold).astype(int)
    for class_name, threshold in custom_thresholds.items()
})

# Evaluate the new thresholded predictions
from sklearn.metrics import classification_report

thresholded_report = classification_report(y_test, binary_predictions, target_names=y.columns, output_dict=True)
# display as dataframe
thresholded_report_df = pd.DataFrame(thresholded_report).transpose()

In [11]:
thresholded_report_df[['precision', 'recall', 'f1-score']]

Unnamed: 0,precision,recall,f1-score
S0,0.944444,1.0,0.971429
S1,1.0,1.0,1.0
S2,0.909091,0.909091,0.909091
S3,0.777778,1.0,0.875
S4,0.5,0.333333,0.4
micro avg,0.865385,0.9375,0.9
macro avg,0.826263,0.848485,0.831104
weighted avg,0.863426,0.9375,0.895089
samples avg,0.883333,0.941667,0.902857


## Predicting pollutant clasess for unlabeld materials of Tbaustof

In [12]:
tbs_df = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/tbs_deduped.csv", sep=";", quotechar='"')

# Continue with the feature processing pipeline
tbs_df["combined_text"] = (
    tbs_df["productName"].fillna("") + " " +
    tbs_df["eolCategoryName"].fillna("")
).str.lower()

# Infer role
tbs_df["material_role"] = tbs_df["combined_text"].apply(infer_role).fillna("other")
tbs_df["material_role"] = tbs_df.apply(refine_sealant_role, axis=1)


In [13]:
# Create modeling features
tbs_context = tbs_df[[
    "productName","oekobaudatProcessUuid", "material_role", "eolCategoryName",
    "eolScenarioUnbuiltReal", "eolScenarioUnbuiltPotential", "technologyFactor"
]].drop_duplicates()

# One-hot encode and align with trained model
tbs_encoded = pd.get_dummies(tbs_context.drop(columns=["productName","oekobaudatProcessUuid"]), drop_first=True)
tbs_encoded = tbs_encoded.reindex(columns=X_train.columns, fill_value=0)

# Predict probabilities
tbs_proba = multi_rf.predict_proba(tbs_encoded)
tbs_proba_df = pd.DataFrame({
    class_name: probs[:, 1] for class_name, probs in zip(["S0", "S1", "S2", "S3", "S4"], tbs_proba)
})

In [25]:
# Apply thresholds
tbs_predicted = pd.DataFrame({
    class_name: (tbs_proba_df[class_name] >= threshold).astype(int)
    for class_name, threshold in custom_thresholds.items()
})

# Combine with UUIDs
tbs_results = pd.concat([tbs_context.reset_index(drop=True)[["productName"]], tbs_predicted], axis=1)

tbs_results
## Pollutant Class Probabilities For TBaustoff
tbs_proba_df["productName"] = tbs_context["productName"].values
tbs_proba_df["UUID"] = tbs_context["oekobaudatProcessUuid"].values

# Reorder columns for clarity
columns_ordered = ["productName","UUID","S0", "S1", "S2", "S3", "S4"]
tbs_proba_df = tbs_proba_df[columns_ordered]

# to csv
tbs_proba_df.to_csv("tbs_proba_df.csv", index=False)
tbs_proba_df


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tbs_proba_df["productName"] = tbs_context["productName"].values


Unnamed: 0,productName,UUID,S0,S1,S2,S3,S4
0,Zinkbleche,26353b00-6cd3-426d-903b-9fc5b1670398,0.880,0.000,0.950,0.025,0.005
1,CR Profil (Chloropren-Kautschuk),5f091578-7d83-46de-bfba-754087398afe,0.605,0.050,0.285,0.305,0.335
2,Holz-Blendrahmen,530ff9e2-0189-4783-9546-3bb8f64fbbeb,0.945,0.460,0.625,0.260,0.490
3,Kunstharzputz,5541250a-f8d8-4c67-9f24-47ab54686c30,0.985,0.055,0.095,0.305,0.015
4,Dachziegel / Ton-,592ffe6e-4c21-4a24-ba67-273acbfca373,0.920,0.000,0.335,0.275,0.105
...,...,...,...,...,...,...,...
333,Schaumglasgranulat SchÃ¼ttung,,0.900,0.115,0.565,0.370,0.010
334,SchilfrohrdÃ¤mmmatte,,0.860,0.145,0.390,0.615,0.035
335,Schilfrohrmatte (PutztrÃ¤ger),,0.905,0.105,0.435,0.695,0.060
336,"Splitt 2/8, dauerelastisch gebunden (Latex, so...",,0.975,0.140,0.180,0.605,0.015


# Multi-Label Contaminant Prediction


## Extract and rank the most common contaminant terms

In [15]:

# Filter for meaningful descriptions
df_valid = obd_with_pollutants[
    obd_with_pollutants["Fremd-/Störstoffbeschreibung"].notna() &
    (obd_with_pollutants["Fremd-/Störstoffbeschreibung"].str.lower() != "ohne fremd-/störstoffe")
]

# Tokenize contaminant terms
def tokenize_contaminants(desc):
    return re.findall(r'[\w/]+', desc.lower())

df_valid["contaminant_tokens"] = df_valid["Fremd-/Störstoffbeschreibung"].apply(tokenize_contaminants)

# Flatten and count all tokens
all_tokens = [token for sublist in df_valid["contaminant_tokens"] for token in sublist]
contaminant_counts = Counter(all_tokens)

# Show the 30 most frequent contaminant terms
contaminant_counts.most_common(30)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid["contaminant_tokens"] = df_valid["Fremd-/Störstoffbeschreibung"].apply(tokenize_contaminants)


[('klebereste', 209),
 ('putze', 107),
 ('klebespachtel', 100),
 ('mit', 96),
 ('gipskarton', 85),
 ('verunreinigt', 65),
 ('kaschierung', 64),
 ('belagsreste', 57),
 ('klebstoff', 48),
 ('metallteile', 43),
 ('bitumenreste', 37),
 ('dämmstoff', 33),
 ('kunststoff/bitumen', 33),
 ('bahnen', 33),
 ('beschichtungen', 33),
 ('bitumenbahnen', 32),
 ('dampfdruckausgleichsschicht', 30),
 ('metalleinlage', 30),
 ('gipsspachtel', 29),
 ('wandfarbe', 29),
 ('geringf', 29),
 ('verunr', 29),
 ('kunststoffen', 29),
 ('dämmstoffen', 29),
 ('metallkasch', 29),
 ('massivbaustoffen', 29),
 ('klebstoffreste', 29),
 ('beschichtung', 29),
 ('geringfügig', 28),
 ('konv', 28)]

In [16]:
import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# -----------------------
# LOAD AND CLEAN DATA
# -----------------------
obd_with_pollutants = pd.read_csv("pollutant_labeled_obd_translated.csv", sep=";", quotechar='"')

# Filter valid pollutant description rows
df_valid = obd_with_pollutants[
    obd_with_pollutants["Fremd-/Störstoffbeschreibung"].notna() &
    (obd_with_pollutants["Fremd-/Störstoffbeschreibung"].str.lower() != "ohne fremd-/störstoffe")
].copy()

# -----------------------
# TOKENIZE CONTAMINANTS
# -----------------------
def tokenize_contaminants(desc):
    return re.findall(r'[\w/]+', desc.lower())

df_valid["contaminant_tokens"] = df_valid["Fremd-/Störstoffbeschreibung"].apply(tokenize_contaminants)

# Count most frequent tokens
all_tokens = [t for tokens in df_valid["contaminant_tokens"] for t in tokens]
top_terms = [term for term, _ in Counter(all_tokens).most_common(30)]

# Create binary label columns
for term in top_terms:
    df_valid[f"label_{term}"] = df_valid["contaminant_tokens"].apply(lambda tokens: int(term in tokens))

label_columns = [f"label_{term}" for term in top_terms]

# -----------------------
# MATERIAL ROLE MAPPING
# -----------------------
role_keywords = {
    "adhesive": ["kleber", "klebstoff", "spachtel"],
    "sealant": ["abdichtung", "dicht", "fuge", "bitumen", "bitumenbahn", "epdm", "eva", "ecb",
                "pvc", "dachbahn", "unterspannbahn", "kunststoffbahn", "dampfbremse", "folie", "vlies"],
    "mortar": ["mörtel", "zement", "putz", "verputz", "fugenmörtel", "kalkzementputz", "leichtputz", "ausgleichsmasse",
               "ziegel", "planstein", "leichtbeton", "dachstein", "glasbaustein"],
    "coating": ["farbe", "beschichtung", "lack", "bodenbelag", "linoleum", "korklinoleum", "gussasphaltestrich", "pvc-bodenbelag"],
    "insulation": ["dämm", "wolle", "schaum", "isolierung"],
    "board": ["platte", "gipskarton", "holzfaser"],
    "aggregate": ["kies", "schotter", "sand", "zuschlag", "granulat", "blähton", "naturbims"],
    "metal": ["stahl", "metall", "blech"],
    "wood": ["holz", "sperrholz"]
}

def infer_role(text):
    if pd.isna(text):
        return None
    for role, keywords in role_keywords.items():
        if any(keyword in text for keyword in keywords):
            return role
    return "other"

def refine_sealant_role(row):
    if row["material_role"] != "sealant":
        return row["material_role"]
    text = row["combined_text"]
    if any(x in text for x in ["dachbahn", "epdm", "bitumen", "ecb", "eva"]):
        return "roofing_sealant"
    elif any(x in text for x in ["dampfbremse", "vlies", "folie", "unterspannbahn"]):
        return "vapor_barrier"
    elif any(x in text for x in ["pvc", "bodenbelag", "belag"]):
        return "flooring_sealant"
    else:
        return "sealant"

# Apply role inference
df_valid["combined_text"] = (
    df_valid["Name (de)"].fillna("") + " " +
    df_valid["Kategorie (original)"].fillna("") + " " +
    df_valid["productName"].fillna("") + " " +
    df_valid["eolCategoryName"].fillna("")
).str.lower()

df_valid["material_role"] = df_valid["combined_text"].apply(infer_role).fillna("other")
df_valid["material_role"] = df_valid.apply(refine_sealant_role, axis=1)

# -----------------------
# PREPARE FEATURES
# -----------------------
# Use core context + pollutant class as input
df_model = df_valid[[
    "material_role", "eolCategoryName", "eolScenarioUnbuiltReal",
    "eolScenarioUnbuiltPotential", "technologyFactor"
]]


# One-hot encode all features
X = pd.get_dummies(df_model, drop_first=True)
y = df_valid[label_columns]

# -----------------------
# TRAINING
# -----------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultiOutputClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
model.fit(X_train, y_train)

# -----------------------
# EVALUATION
# -----------------------
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, target_names=y.columns)

print(report)


                                   precision    recall  f1-score   support

                 label_klebereste       1.00      1.00      1.00        44
                      label_putze       0.92      1.00      0.96        23
              label_klebespachtel       0.88      0.96      0.92        23
                        label_mit       1.00      0.63      0.77        19
                 label_gipskarton       1.00      1.00      1.00        19
               label_verunreinigt       0.00      0.00      0.00        13
                label_kaschierung       1.00      1.00      1.00        13
                label_belagsreste       0.00      0.00      0.00        12
                  label_klebstoff       0.40      0.18      0.25        11
                label_metallteile       1.00      0.20      0.33         5
               label_bitumenreste       0.00      0.00      0.00         5
                  label_dämmstoff       0.00      0.00      0.00         8
         label_kunststof

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Stacked contaminant prediction pipeline
# 1. Train pollutant class model (S0-S4)
# 2. Predict class probabilities
# 3. Train contaminant model using those probabilities

import pandas as pd
import re
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# ------------------
# LOAD & CLEAN DATA
# ------------------
obd_with_pollutants = pd.read_csv("pollutant_labeled_obd_translated.csv", sep=";", quotechar='"')
obd_with_pollutants = obd_with_pollutants.dropna(subset=["Fremd-/Störstoffbeschreibung"])  # drop missing labels

# --------------------------
# TOKENIZATION + LABEL CLEANING
# --------------------------

def tokenize(desc):
    return re.findall(r"[\w/]+", desc.lower())

def apply_label_cleaning(tokens):
    replacements = {
        # unify all glue-related terms under "klebstoff"
        "kleber": "klebstoff",
        "kleberreste": "klebstoff",
        "klebstoffreste": "klebstoff",
        "klebstoffe": "klebstoff",
        "klebereste": "klebstoff",

        # unify bitumen variants
        "bitumenreste": "bitumen",
        "bitumenbahnen": "bitumen",
        "bitumendickschicht": "bitumen",
        "kunststoff/bitumen": "bitumen",

        # coatings
        "beschichtungen": "beschichtung",
        "reaktionsharzbeschichtung": "beschichtung",
        "beschichtet": "beschichtung",

        # insulation
        "dämmstoffe": "dämmstoff",
        "dämmstoffreste": "dämmstoff",
        "dämmstoffen": "dämmstoff",

        # gypsum and plaster
        "gipsspachtel": "gips",
        "gipskarton": "gips",
        "gipsputz": "gips",
        "putze": "putz",

        # mortar
        "kalkmörtel": "mörtel",
        "kalkzementmörtel": "mörtel",

        # sealing
        "feuchteabdichtung": "abdichtung",
        "flüssigabdichtungen": "abdichtung",
        "abdichtungen": "abdichtung",

        # other
        "kunststoffen": "kunststoff",
        "bodenbelagsreste": "belagsreste",
        "klebespachtel": "klebstoff",
        "massivbaustoffen": "massivbaustoff",
        "stahlbewehrung": "bewehrung",
        "bewehrungsstahl": "bewehrung",
        "naturfarbe": "farbe"
    }
    return [replacements.get(t, t) for t in tokens]

# Apply tokenization and cleaning
obd_with_pollutants["tokens"] = obd_with_pollutants["Fremd-/Störstoffbeschreibung"]\
    .apply(tokenize).apply(apply_label_cleaning)

# --------------------------
# TERM SELECTION AND LABEL CREATION
# --------------------------

# Exclude stopwords and noise
skip = {
    "ohne", "mit", "verunr", "geringf", "fremd", "/störstoffe", "verunreinigt",
    "geringfügig", "konv", "in", "z", "b", "wdvs", "geringen", "mengen"
}

# Get cleaned tokens
all_tokens = [t for tokens in obd_with_pollutants["tokens"] for t in tokens if t not in skip]

# Choose top 15 (cleaned and consolidated)
top_terms = [term for term, _ in Counter(all_tokens).most_common(15)]

# Create binary labels for each of the top terms
for term in top_terms:
    obd_with_pollutants[f"label_{term}"] = obd_with_pollutants["tokens"].apply(lambda tokens: int(term in tokens))

# Final contaminant labels to use in y
contaminant_labels = [f"label_{term}" for term in top_terms]

# ----------------------
# ROLE INFERENCE + TEXT
# ----------------------
role_keywords = {
    "adhesive": ["kleber", "klebstoff", "spachtel"],
    "sealant": ["abdichtung", "dicht", "fuge", "bitumen", "bitumenbahn", "epdm", "eva", "ecb",
                 "pvc", "dachbahn", "unterspannbahn", "kunststoffbahn", "dampfbremse", "folie", "vlies"],
    "mortar": ["mörtel", "zement", "putz", "verputz", "fugenmörtel", "kalkzementputz", "leichtputz", "ausgleichsmasse",
               "ziegel", "planstein", "leichtbeton", "dachstein", "glasbaustein"],
    "coating": ["farbe", "beschichtung", "lack", "bodenbelag", "linoleum", "korklinoleum", "gussasphaltestrich", "pvc-bodenbelag"],
    "insulation": ["dämm", "wolle", "schaum", "isolierung"],
    "board": ["platte", "gipskarton", "holzfaser"],
    "aggregate": ["kies", "schotter", "sand", "zuschlag", "granulat", "blähton", "naturbims"],
    "metal": ["stahl", "metall", "blech"],
    "wood": ["holz", "sperrholz"]
}

def infer_role(text):
    for role, keywords in role_keywords.items():
        if any(k in text for k in keywords):
            return role
    return "other"

def refine_sealant(row):
    if row["material_role"] != "sealant": return row["material_role"]
    t = row["combined_text"]
    if any(x in t for x in ["dachbahn", "epdm", "bitumen", "ecb", "eva"]): return "roofing_sealant"
    if any(x in t for x in ["dampfbremse", "vlies", "folie", "unterspannbahn"]): return "vapor_barrier"
    if any(x in t for x in ["pvc", "bodenbelag", "belag"]): return "flooring_sealant"
    return "sealant"

# Combine fields
text = (obd_with_pollutants["Name (de)"].fillna("") + " " + obd_with_pollutants["Kategorie (original)"].fillna("") +
        " " + obd_with_pollutants["productName"].fillna("") + " " + obd_with_pollutants["eolCategoryName"].fillna("")).str.lower()
obd_with_pollutants["combined_text"] = text
obd_with_pollutants["material_role"] = obd_with_pollutants["combined_text"].apply(infer_role)
obd_with_pollutants["material_role"] = obd_with_pollutants.apply(refine_sealant, axis=1)

# --------------------------
# STAGE 1: POLLUTANT CLASS
# # --------------------------

df_class = pd.get_dummies(obd_with_pollutants["Störstoffklasse"])

X_context = obd_with_pollutants[["material_role", "eolCategoryName", "eolScenarioUnbuiltReal", "eolScenarioUnbuiltPotential", "technologyFactor"]]
X_class = pd.get_dummies(X_context, drop_first=True)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_class, df_class, test_size=0.2, random_state=42)
model_class = MultiOutputClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
model_class.fit(X_train1, y_train1)

# Predict class probs (for stage 2 input)
class_probs = model_class.predict_proba(X_class)
class_probs_df = pd.DataFrame({label: prob[:, 1] for label, prob in zip(df_class.columns, class_probs)}, index=obd_with_pollutants.index)

# TF-IDF on productName
tfidf = TfidfVectorizer(max_features=100, stop_words=None)
X_text = tfidf.fit_transform(obd_with_pollutants["productName"].fillna("").astype(str))
X_text_df = pd.DataFrame(X_text.toarray(), index=obd_with_pollutants.index, columns=tfidf.get_feature_names_out())

# Combine all features
X_combined_tfidf = pd.concat([X_context.reset_index(drop=True), class_probs_df.reset_index(drop=True), X_text_df.reset_index(drop=True)], axis=1)
X_final_tfidf = pd.get_dummies(X_combined_tfidf, drop_first=True)
y_final_tfidf = obd_with_pollutants[contaminant_labels]

# --------------------------
# STAGE 2: CONTAMINANT MODEL
# --------------------------
# Train contaminant model with TF-IDF-enhanced features
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_final_tfidf, y_final_tfidf, test_size=0.2, random_state=42)
model_tfidf = MultiOutputClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
model_tfidf.fit(X_train_tfidf, y_train_tfidf)

# Evaluate
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
report_tfidf = classification_report(y_test_tfidf, y_pred_tfidf, target_names=y_final_tfidf.columns, output_dict=True)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
# as dataframe
report_tfidf_df = pd.DataFrame(report_tfidf).transpose()

In [19]:
report_tfidf_df

Unnamed: 0,precision,recall,f1-score,support
label_klebstoff,0.333333,0.282051,0.305556,78.0
label_gips,0.32,0.275862,0.296296,29.0
label_putz,0.387097,0.428571,0.40678,28.0
label_bitumen,0.64,0.615385,0.627451,26.0
label_dämmstoff,0.294118,0.263158,0.277778,19.0
label_beschichtung,0.266667,0.235294,0.25,17.0
label_belagsreste,0.0,0.0,0.0,15.0
label_kaschierung,0.333333,0.3125,0.322581,16.0
label_metallteile,1.0,0.285714,0.444444,7.0
label_bahnen,0.3,0.3,0.3,10.0


In [20]:
# THRESHOLDED PROBABILITIES

# Predict raw probabilities
y_proba = model_tfidf.predict_proba(X_test_tfidf)

proba_df = pd.DataFrame({
    label: probs[:, 1] for label, probs in zip(y_final_tfidf.columns, y_proba)
}, index=X_test_tfidf.index)

# Threshold at 0.3 → likely present
likely_present = proba_df >= 0.3

# Threshold at 0.5 → very likely present
very_likely = proba_df >= 0.5

# Optional: show top N contaminants per row with scores
def rank_top_contaminants(row, threshold=0.25, top_n=3):
    sorted_labels = row[row >= threshold].sort_values(ascending=False)
    return ", ".join([f"{lbl.replace('label_', '')} ({prob:.2f})" for lbl, prob in sorted_labels.items()][:top_n])

proba_df["top_contaminants"] = proba_df.apply(rank_top_contaminants, axis=1)

# Attach productName and UUID from original dataset
proba_df["productName"] = obd_with_pollutants.loc[X_test_tfidf.index, "productName"].values
proba_df["UUID"] = obd_with_pollutants.loc[X_test_tfidf.index, "UUID"].values

# Example output:
proba_df.columns
proba_df[["productName", "UUID", "top_contaminants"]].head(10)

Unnamed: 0,productName,UUID,top_contaminants
708,PE/PP Vlies,91412f3f-6077-44d4-9c9d-95c543bcb419,
215,Gipskartonplatte (imprÃ¤gniert),deeb0bda-20fa-412a-b945-1a589638db21,"klebstoff (0.28), belagsreste (0.28)"
882,Unterspannbahn PUR auf PET-Vlies,ed734c48-a58f-4f92-a9f7-57631b500fdc,klebstoff (0.57)
88,Gipsfaserplatte,1b0a3488-9b02-4c98-b421-8c746d350f97,"klebstoff (0.50), belagsreste (0.50), wandfarb..."
842,PE/PP Vlies,6869f7c1-1b2b-4f30-afc9-823a0104f1d9,klebstoff (0.45)
139,Gipskartonplatte (Lochplatte),7b69e7b4-68dd-49a3-b2dc-ae608b66eece,"klebstoff (0.30), belagsreste (0.30)"
86,Gipsfaserplatte,1b0a3488-9b02-4c98-b421-8c746d350f97,"klebstoff (0.50), belagsreste (0.50), wandfarb..."
486,Drei-Schichtholzplatte,3489c0eb-415f-4dc8-8c2d-b276f6efebc6,"bitumen (1.00), dämmstoff (0.57), beschichtung..."
635,PE-HD mit PP-Vlies zur Abdichtung,d1ab548a-3626-4667-9690-baff659e10d5,klebstoff (0.45)
604,Unterspannbahn PP,498cd894-8ae6-458c-b874-9022cb148409,klebstoff (0.68)


# Predict Pollutants Based on Material Combinations


In [None]:
# Load component-material mapping
uuid_map = pd.read_csv("/Users/pablosoriano/Documents/Data Science/bbsr-challenge/all_uuid_materials_from_components.csv")

# Merge material UUIDs to component IDs
material_with_component = tbs_proba_df.merge(uuid_map[["uuid", "main_component_id"]],
                                         left_on="UUID", right_on="uuid", how="left")

# Compute component-level mean probabilities
component_probs = material_with_component.groupby("main_component_id")[["S0", "S1", "S2", "S3", "S4"]].mean().reset_index()

# Merge back to material-level
blended = material_with_component.merge(component_probs, on="main_component_id", suffixes=("_mat", "_comp"))

# Compute adjusted probabilities
for cls in ["S0", "S1", "S2", "S3", "S4"]:
    blended[f"{cls}_adjusted"] = 0.7 * blended[f"{cls}_mat"] + 0.3 * blended[f"{cls}_comp"]


In [None]:
proba_df

Unnamed: 0,productName,S0,S1,S2,S3,S4
0,Zinkbleche,0.880,0.000,0.950,0.025,0.005
1,CR Profil (Chloropren-Kautschuk),0.605,0.050,0.285,0.305,0.335
2,Holz-Blendrahmen,0.945,0.460,0.625,0.260,0.490
3,Kunstharzputz,0.985,0.055,0.095,0.305,0.015
4,Dachziegel / Ton-,0.920,0.000,0.335,0.275,0.105
...,...,...,...,...,...,...
333,Schaumglasgranulat SchÃ¼ttung,0.900,0.115,0.565,0.370,0.010
334,SchilfrohrdÃ¤mmmatte,0.860,0.145,0.390,0.615,0.035
335,Schilfrohrmatte (PutztrÃ¤ger),0.905,0.105,0.435,0.695,0.060
336,"Splitt 2/8, dauerelastisch gebunden (Latex, so...",0.975,0.140,0.180,0.605,0.015


In [28]:
blended["main_component_id"] = blended["main_component_id"].astype(int)


In [29]:
blended_pollutant_prediction = blended[["UUID", "main_component_id", "productName","S0_adjusted", "S1_adjusted", "S2_adjusted", "S3_adjusted", "S4_adjusted"]]
blended_pollutant_prediction

Unnamed: 0,UUID,main_component_id,productName,S0_adjusted,S1_adjusted,S2_adjusted,S3_adjusted,S4_adjusted
0,5541250a-f8d8-4c67-9f24-47ab54686c30,450081,Kunstharzputz,0.923875,0.061000,0.142250,0.327875,0.065250
1,5541250a-f8d8-4c67-9f24-47ab54686c30,450081,Kunstharzputz,0.923875,0.061000,0.142250,0.327875,0.065250
2,5541250a-f8d8-4c67-9f24-47ab54686c30,450529,Kunstharzputz,0.988000,0.044000,0.076000,0.444000,0.012000
3,5541250a-f8d8-4c67-9f24-47ab54686c30,450529,Kunstharzputz,0.988000,0.044000,0.076000,0.444000,0.012000
4,592ffe6e-4c21-4a24-ba67-273acbfca373,11293,Dachziegel / Ton-,0.938000,0.000000,0.259625,0.438125,0.081375
...,...,...,...,...,...,...,...,...
331,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,4644660,Zementestrich,0.819000,0.195000,0.663250,0.843000,0.061500
332,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,4644667,Zementestrich,0.856625,0.201500,0.558000,0.886250,0.058125
333,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,4644667,Zementestrich,0.856625,0.201500,0.558000,0.886250,0.058125
334,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,4644674,Zementestrich,0.825929,0.193357,0.643500,0.855500,0.060429


In [None]:
# remove duplicates main component id / uuid
blended["main_component_id"] = blended[["main_component_id", "UUID"]].drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  blended["main_component_id"] = blended[["main_component_id", "UUID"]].drop_duplicates(inplace=True)


In [30]:
# example of adjustment for one 
blended[["main_component_id","UUID", "productName","S1_mat", "S1_comp", "S1_adjusted"]]

Unnamed: 0,main_component_id,UUID,productName,S1_mat,S1_comp,S1_adjusted
0,450081,5541250a-f8d8-4c67-9f24-47ab54686c30,Kunstharzputz,0.055,0.075000,0.061000
1,450081,5541250a-f8d8-4c67-9f24-47ab54686c30,Kunstharzputz,0.055,0.075000,0.061000
2,450529,5541250a-f8d8-4c67-9f24-47ab54686c30,Kunstharzputz,0.055,0.018333,0.044000
3,450529,5541250a-f8d8-4c67-9f24-47ab54686c30,Kunstharzputz,0.055,0.018333,0.044000
4,11293,592ffe6e-4c21-4a24-ba67-273acbfca373,Dachziegel / Ton-,0.000,0.000000,0.000000
...,...,...,...,...,...,...
331,4644660,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,Zementestrich,0.260,0.043333,0.195000
332,4644667,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,Zementestrich,0.260,0.065000,0.201500
333,4644667,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,Zementestrich,0.260,0.065000,0.201500
334,4644674,86d919ee-8f30-4ca4-9b7e-717aecba6ac0,Zementestrich,0.260,0.037857,0.193357


In [None]:
## Same for individual pollutants

contaminant_df = pd.read_csv("contaminant_predictions.csv", sep=";", quotechar='"')
# Fix merge by aligning column names
contaminant_with_component = contaminant_df.merge(uuid_map.rename(columns={"uuid": "UUID"})[["UUID", "main_component_id"]],
                                                  on="UUID", how="left")

# Identify contaminant probability columns
contaminant_label_cols = [col for col in contaminant_with_component.columns if col.startswith("label_")]

# Compute component-level averages
component_contaminant_avg = contaminant_with_component.groupby("main_component_id")[contaminant_label_cols].mean().reset_index()

# Merge and blend
blended_contaminant = contaminant_with_component.merge(component_contaminant_avg, on="main_component_id", suffixes=("_mat", "_comp"))

for label in contaminant_label_cols:
    blended_contaminant[f"{label}_adjusted"] = (
        0.7 * blended_contaminant[f"{label}_mat"] + 0.3 * blended_contaminant[f"{label}_comp"]
    )

## Pollutant prediction for material combinations

In [None]:
# remove duplicates (uuid, main_component_id)
blended_contaminant["main_component_id"] = blended_contaminant["main_component_id"].astype(int)
blended_contaminant = blended_contaminant.drop_duplicates(subset=["UUID", "main_component_id"])

blended_contaminant[["UUID", "main_component_id", "productName"] + [f"{label}_adjusted" for label in contaminant_label_cols]]


Unnamed: 0,UUID,main_component_id,productName,label_klebstoff_adjusted,label_gips_adjusted,label_putz_adjusted,label_bitumen_adjusted,label_dämmstoff_adjusted,label_beschichtung_adjusted,label_belagsreste_adjusted,label_kaschierung_adjusted,label_metallteile_adjusted,label_bahnen_adjusted,label_kunststoff_adjusted,label_mörtel_adjusted,label_dampfdruckausgleichsschicht_adjusted,label_metalleinlage_adjusted,label_wandfarbe_adjusted
0,ed734c48-a58f-4f92-a9f7-57631b500fdc,450088,Unterspannbahn PUR auf PET-Vlies,0.487323,0.102948,0.102948,0.000000,0.000000,0.000000,0.000000,0.102948,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,ed734c48-a58f-4f92-a9f7-57631b500fdc,11256,Unterspannbahn PUR auf PET-Vlies,0.489067,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,ed734c48-a58f-4f92-a9f7-57631b500fdc,450166,Unterspannbahn PUR auf PET-Vlies,0.489067,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6,6869f7c1-1b2b-4f30-afc9-823a0104f1d9,558275,PE/PP Vlies,0.485009,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,6869f7c1-1b2b-4f30-afc9-823a0104f1d9,1845,PE/PP Vlies,0.485531,0.049023,0.049023,0.000000,0.000000,0.000000,0.000000,0.049023,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,ed391263-0e6d-43dd-ad3e-43607545f281,4644660,Korklinoleum FuÃbodenbelag,0.455559,0.030602,0.027067,0.077499,0.006690,0.006720,0.380312,0.024512,0.000875,0.003571,0.000632,0.005588,0.001237,0.001237,0.001133
476,ed391263-0e6d-43dd-ad3e-43607545f281,4644671,Korklinoleum FuÃbodenbelag,0.422043,0.000559,0.000357,0.013489,0.013042,0.012547,0.304907,0.000000,0.000000,0.012522,0.000135,0.000852,0.000000,0.000000,0.000083
478,ed391263-0e6d-43dd-ad3e-43607545f281,4644674,Korklinoleum FuÃbodenbelag,0.428190,0.048366,0.065632,0.024473,0.020495,0.002122,0.319456,0.046443,0.000276,0.001128,0.000200,0.020147,0.000391,0.000391,0.000358
494,3abc810c-ef3a-4ab4-b6d8-0217716e213e,4644643,Calciumsulfatestrich,0.139878,0.051205,0.045977,0.348470,0.336925,0.324128,0.027125,0.036767,0.000000,0.323498,0.003484,0.022005,0.000000,0.000000,0.002153
