# Übung 2.1 - Clustering (Beispiellösung)

Environment für dieses Notebook: `ads-ml-light` (*requirements-py3.11-ads-ml-light.txt*).

## Daten einlesen

In [1]:
from repml.datasets.trees_fra import read_trees_fra

data = read_trees_fra()

## Target Variable

In [2]:
tree_counts = data["gattung"].value_counts()
rare_trees = tree_counts[tree_counts < 100].index.to_list()

In [3]:
print("Es gibt", len(rare_trees), "seltene Gattungen.")
print(rare_trees[:10])

Es gibt 88 seltene Gattungen.
['Cedrus', 'Sambucus', 'Zelkova', 'Gymnocladus', 'Cercis', 'Sequoiadendron', 'Buxus', 'Tetradium', 'Cercidiphyllum', 'Cryptomeria']


In [4]:
data["gattung"] = data["gattung"].astype("object")
data.loc[data.query(f"gattung == @rare_trees").index, "gattung"] = "rare"
data["gattung"] = data["gattung"].astype("category")

## Trainings- und Testdaten

In [5]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data["gattung"]
)

In [6]:
num_features = ["kr_durchm", "baumhoehe", "st_durchm", "pflanzjahr"]
cat_features = ["gebiet", "baum_statu"]

y = ["gattung"]
X = [*num_features, *cat_features]

In [7]:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class CombinedFeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        self.feature_names_in_ = df.columns.tolist()
        # volume as a sphere
        # volume = (4 / 3) * np.pi * pow(df["kr_durchm"] / 2, 3)
        # volume as a cylinder
        volume = np.pi * pow(df["kr_durchm"] / 2, 2) * df["baumhoehe"]

        return np.c_[X, volume]

    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            input_features = self.feature_names_in_
        return np.array([*input_features, "cfa_volume"])

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

num_step = Pipeline(
    steps=[
        ("attr_add", CombinedFeatureAdder()),
        ("std_scaler", StandardScaler()),
    ]
)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

prep_steps = ColumnTransformer(
    transformers=[
        ("num", num_step, num_features),
        ("ohe", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), cat_features),
    ],
    remainder="drop",
)

In [10]:
prep_steps.fit_transform(X=train_data[X], y=train_data[y])

array([[-0.5195075 , -0.44539314, -0.12519284, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.71836031, -0.6082158 , -0.12519284, ...,  0.        ,
         0.        ,  1.        ],
       [-1.26222819, -1.4223291 , -1.20473142, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.71836031,  0.53154283, -0.03882975, ...,  0.        ,
         0.        ,  1.        ],
       [-0.5195075 , -1.09668378, -0.81609753, ...,  0.        ,
         0.        ,  1.        ],
       [-0.27193394, -0.93386112, -0.3842821 , ...,  0.        ,
         0.        ,  1.        ]])

## Clustering Step
Nach dem One-Hot-Encoding fügen wir das Clustering in einem separaten Colum Transformer hinzu.

In [11]:
from sklearn.cluster import KMeans

n_clusters = 7
cluster_step = ColumnTransformer(
    [("kmeans", KMeans(n_clusters=n_clusters, n_init="auto", random_state=42), [1, 2])],
    remainder="passthrough",
)

Da wir Clustering zur Reduktion der Dimension einbringen möchten, sind wir an dem Output von `KMeans.predict` und nicht von `KMeans.transform` interessiert. Das lösen wir über den ClusterFeature Custom Transformer, welcher mit np.argmin() die Clusterzugehörigkeit identifiziert. Zuerst ein Beispiel zu np.argmin():

In [12]:
import pandas as pd

rng = np.random.default_rng(123)
df_random = rng.random((10, 5))
df_random = pd.DataFrame(
    df_random, columns=["Cluster0", "Cluster1", "Cluster2", "Cluster3", "Cluster4"]
)
df_random

Unnamed: 0,Cluster0,Cluster1,Cluster2,Cluster3,Cluster4
0,0.682352,0.053821,0.22036,0.184372,0.175906
1,0.812095,0.923345,0.276574,0.819755,0.889893
2,0.51297,0.244965,0.824242,0.213763,0.741467
3,0.62994,0.927407,0.231908,0.799125,0.518165
4,0.231556,0.165904,0.497789,0.582725,0.184338
5,0.014895,0.471133,0.728243,0.9186,0.625534
6,0.917123,0.86469,0.218143,0.866127,0.730752
7,0.277865,0.797044,0.865222,0.299438,0.527042
8,0.071487,0.583238,0.237906,0.764964,0.173632
9,0.312742,0.014474,0.032552,0.496702,0.468313


In [13]:
np.array(df_random).argmin(axis=1)

array([1, 2, 3, 2, 1, 0, 2, 0, 0, 1])

In [14]:
class ClusterFeature(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.feature_names_in_ = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        cluster_label = X[:, : self.n_clusters - 1].argmin(axis=1)
        return np.c_[cluster_label, X[:, self.n_clusters :]]

    def get_feature_names_out(self, input_features=None):
        if input_features is None:
            input_features = self.feature_names_in_
        return np.array(["cluster_label", *input_features[self.n_clusters :]])

In [15]:
pipeline_check_2 = Pipeline(
    [
        ("preprocessing", prep_steps),
        ("cluster", cluster_step),
        ("c_feat", ClusterFeature(n_clusters=7)),
    ]
)

pipeline_check_2.fit_transform(X=train_data[X], y=train_data[y])

array([[ 3.        , -0.5195075 ,  0.21165808, ...,  0.        ,
         0.        ,  1.        ],
       [ 3.        ,  0.71836031, -0.37705167, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -1.26222819,  0.83961514, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.        ,  0.71836031,  0.01542149, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -0.5195075 ,  0.91810978, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.        , -0.27193394,  0.40789466, ...,  0.        ,
         0.        ,  1.        ]])

Um die Information mit dem Decision Tree nutzbar zu machen, führen wir auf dieses Ergebnis nochmals ein One-Hot-Encoding durch.

In [16]:
cluster_encoding = ColumnTransformer(
    transformers=[
        ("cohe", OneHotEncoder(sparse_output=False), [0]),
    ],
    remainder="passthrough",
)

In [17]:
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline(
    [
        ("preprocessing", prep_steps),
        ("cluster", cluster_step),
        ("c_feat", ClusterFeature(n_clusters=7)),
        ("c_encoding", cluster_encoding),
        ("model", DecisionTreeClassifier(random_state=42)),
    ]
)

In [18]:
pipeline.fit(X=train_data[X], y=train_data[y])

In [19]:
pipeline[:-1].get_feature_names_out()

array(['cohe__cluster_label_0.0', 'cohe__cluster_label_1.0',
       'cohe__cluster_label_2.0', 'cohe__cluster_label_3.0',
       'cohe__cluster_label_4.0', 'cohe__cluster_label_5.0',
       'remainder__remainder__num__kr_durchm',
       'remainder__remainder__num__pflanzjahr',
       'remainder__remainder__num__cfa_volume',
       'remainder__remainder__ohe__gebiet_25 Amt für Bau und Immobilien',
       'remainder__remainder__ohe__gebiet_40 Stadtschulamt',
       'remainder__remainder__ohe__gebiet_41 Kulturamt',
       'remainder__remainder__ohe__gebiet_48 Kita Frankfurt',
       'remainder__remainder__ohe__gebiet_51 Jugend- und Sozialamt',
       'remainder__remainder__ohe__gebiet_52 Sportamt',
       'remainder__remainder__ohe__gebiet_57 Kommunale Kinder-, Jugend- und Familienhilfe',
       'remainder__remainder__ohe__gebiet_67 Mietliegenschaften',
       'remainder__remainder__ohe__gebiet_Ausgleichsfläche',
       'remainder__remainder__ohe__gebiet_Friedhof',
       'remainder__rema

In [20]:
from sklearn.model_selection import cross_val_score

cross_val_score(estimator=pipeline, X=train_data[X], y=train_data[y], cv=5)

array([0.35104044, 0.35689046, 0.35316058, 0.35109349, 0.34842357])

In [21]:
pipeline.score(X=test_data[X], y=test_data[y])

0.3566918993623771