In [1]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# Load the penguin measurements; the CSV uses ';' rather than ',' as field separator.
df = pd.read_csv(filepath_or_buffer='penguins_simple.csv', delimiter=';')

In [2]:
# transform a numerical column
# strategy='quantile' places the bin edges at quantiles of the column, so the
# 5 bins hold roughly equal numbers of rows.
kbins = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
# Double brackets select a one-column DataFrame (2-D) instead of a 1-D Series.
columns = df[['Culmen Length (mm)']]
# Learn the bin edges and one-hot encode the column in a single call.
t = kbins.fit_transform(columns)
# encode='onehot' returns a SciPy sparse matrix. Use toarray() to get a plain
# ndarray; todense() would return an np.matrix, which NumPy no longer
# recommends and which recent scikit-learn versions refuse as estimator input.
t = t.toarray()
print(t.shape)

(333, 5)


In [3]:
# create nice labels
# bin_edges_[0] holds the edges for the first (here: only) discretized column;
# round to one decimal so the labels stay short.
edges = kbins.bin_edges_[0].round(1)
# Pair every edge with its right-hand neighbour: n+1 edges give n labels.
labels = [f"{lower} to {upper}" for lower, upper in zip(edges[:-1], edges[1:])]

In [4]:
# create a DataFrame
# One column per bin, named by its "low to high" label; a row has 1.0 in the
# bin its value falls into and 0.0 elsewhere.
df_bins = pd.DataFrame(data=t, columns=labels)
print(df_bins.head(n=5))


# BONUS: set the strategy parameter to 'uniform' and see how the edges change

   32.1 to 38.6  38.6 to 42.0  42.0 to 46.1  46.1 to 49.5  49.5 to 59.6
0           0.0           1.0           0.0           0.0           0.0
1           0.0           1.0           0.0           0.0           0.0
2           0.0           1.0           0.0           0.0           0.0
3           1.0           0.0           0.0           0.0           0.0
4           0.0           1.0           0.0           0.0           0.0
