# Learning how to add label columns to a DataFrame with `pd.cut()`, `np.select()`, and `Series.map()`

## Imports

In [1]:
import pandas as pd
import random
from datetime import datetime
import numpy as np

In [2]:
# Seed the stdlib RNG so the random mock data below is the same on every run.
random.seed(1)

In [4]:
# Render floats with a thousands separator and 3 decimal places.
pd.options.display.float_format = "{:,.3f}".format

## Mock DataFrame 1

In [5]:
# Number of random points generated for the mock data (and the datetime index).
nr_of_rand_points = 16

In [6]:
# One column of floats drawn with random.random(), sorted ascending.
df = pd.DataFrame(
    {"float": sorted(random.random() for _ in range(nr_of_rand_points))}
)

df

Unnamed: 0,float
0,0.002
1,0.028
2,0.094
3,0.134
4,0.255
5,0.433
6,0.445
7,0.449
8,0.495
9,0.652


## The old way

### Create an empty label column to populate

In [7]:
# Start with an empty-string placeholder; the masks below overwrite their rows.
df["label"] = ""

In [8]:
df

Unnamed: 0,float,label
0,0.002,
1,0.028,
2,0.094,
3,0.134,
4,0.255,
5,0.433,
6,0.445,
7,0.449,
8,0.495,
9,0.652,


### Mask 1

In [9]:
# Boolean mask: rows whose value is strictly below 0.2.
mask_1 = df["float"].lt(0.2)
mask_1

0      True
1      True
2      True
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
Name: float, dtype: bool

In [10]:
# Write the label only into the rows selected by mask_1.
df.loc[mask_1, "label"] = "bottom"

In [11]:
df

Unnamed: 0,float,label
0,0.002,bottom
1,0.028,bottom
2,0.094,bottom
3,0.134,bottom
4,0.255,
5,0.433,
6,0.445,
7,0.449,
8,0.495,
9,0.652,


### Mask 2

In [12]:
# Boolean mask: rows whose value is at or above 0.8.
mask_2 = df["float"].ge(0.8)
mask_2

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14     True
15     True
Name: float, dtype: bool

In [13]:
# Write the label only into the rows selected by mask_2.
df.loc[mask_2, "label"] = "top"

In [14]:
df

Unnamed: 0,float,label
0,0.002,bottom
1,0.028,bottom
2,0.094,bottom
3,0.134,bottom
4,0.255,
5,0.433,
6,0.445,
7,0.449,
8,0.495,
9,0.652,


### Mask 3

In [15]:
# Boolean mask for the half-open range [0.2, 0.8): lower bound included,
# upper bound excluded.
mask_3 = df["float"].between(0.2, 0.8, inclusive="left")
mask_3

0     False
1     False
2     False
3     False
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14    False
15    False
Name: float, dtype: bool

In [16]:
# Write the label only into the rows selected by mask_3.
df.loc[mask_3, "label"] = "middle"

In [17]:
df

Unnamed: 0,float,label
0,0.002,bottom
1,0.028,bottom
2,0.094,bottom
3,0.134,bottom
4,0.255,middle
5,0.433,middle
6,0.445,middle
7,0.449,middle
8,0.495,middle
9,0.652,middle


### Label column type

Labelling the column manually produces an `object` dtype column, which is how pandas stores the Python strings here.

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   float   16 non-null     float64
 1   label   16 non-null     object 
dtypes: float64(1), object(1)
memory usage: 388.0+ bytes


## The cut way 1

Good when the labels depend on a single column being split into contiguous ranges (bins).

In [19]:
# Bin edges for pd.cut: every bin needs a start and an end,
# so 4 edges define 3 consecutive bins.
bins = [0, 0.2, 0.8, 1]

In [20]:
# One label per bin, in bin order (bottom, middle, top).
labels = ["b", "m", "t"]

In [21]:
# right=False makes every bin left-closed, [start, end), which matches the
# masks used in "The old way" (< 0.2, >= 0.2 and < 0.8, >= 0.8). With the
# default right=True a value of exactly 0.2 or 0.8 would get a different label
# than the masks give it, and 0.0 would fall outside every bin (NaN).
df["label_cut"] = pd.cut(
    x=df["float"],
    bins=bins,
    labels=labels,
    right=False,
)

In [22]:
df

Unnamed: 0,float,label,label_cut
0,0.002,bottom,b
1,0.028,bottom,b
2,0.094,bottom,b
3,0.134,bottom,b
4,0.255,middle,m
5,0.433,middle,m
6,0.445,middle,m
7,0.449,middle,m
8,0.495,middle,m
9,0.652,middle,m


### Label column type

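The `cut` function returns the labels as a `category` dtype column. For a small set of repeated labels this is typically more memory-efficient and faster to work with than `object` strings, especially on larger data.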

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   float      16 non-null     float64 
 1   label      16 non-null     object  
 2   label_cut  16 non-null     category
dtypes: category(1), float64(1), object(1)
memory usage: 536.0+ bytes


## The cut way 2

Let's add a datetime index.

In [24]:
# Naive (no timezone) starting timestamp for the index: 2025-06-01 11:52:00.
start_date = datetime(2025, 6, 1, hour=11, minute=52, second=0)

start_date

datetime.datetime(2025, 6, 1, 11, 52)

In [None]:
# One timestamp per minute starting at start_date, one per mock data point.
timestamp_index = pd.date_range(start_date, periods=nr_of_rand_points, freq="min")

In [26]:
timestamp_index

DatetimeIndex(['2025-06-01 11:52:00', '2025-06-01 11:53:00',
               '2025-06-01 11:54:00', '2025-06-01 11:55:00',
               '2025-06-01 11:56:00', '2025-06-01 11:57:00',
               '2025-06-01 11:58:00', '2025-06-01 11:59:00',
               '2025-06-01 12:00:00', '2025-06-01 12:01:00',
               '2025-06-01 12:02:00', '2025-06-01 12:03:00',
               '2025-06-01 12:04:00', '2025-06-01 12:05:00',
               '2025-06-01 12:06:00', '2025-06-01 12:07:00'],
              dtype='datetime64[ns]', freq='min')

In [27]:
# Take the timestamp at the middle position of the index rather than a
# hard-coded 8, so it stays the mid point if the number of points changes.
mid_point = timestamp_index[len(timestamp_index) // 2]
mid_point

Timestamp('2025-06-01 12:00:00')

In [28]:
# Fresh mock frame: sorted random floats, indexed by the minute timestamps.
df = pd.DataFrame(
    data={"float": sorted(random.random() for _ in range(nr_of_rand_points))},
    index=timestamp_index,
)

df

Unnamed: 0,float
2025-06-01 11:52:00,0.025
2025-06-01 11:53:00,0.029
2025-06-01 11:54:00,0.031
2025-06-01 11:55:00,0.217
2025-06-01 11:56:00,0.222
2025-06-01 11:57:00,0.229
2025-06-01 11:58:00,0.231
2025-06-01 11:59:00,0.233
2025-06-01 12:00:00,0.381
2025-06-01 12:01:00,0.422


In [29]:
# Three edges, two bins: first timestamp -> mid point -> last timestamp.
bins_t = [timestamp_index[0], mid_point, timestamp_index[-1]]

In [30]:
bins_t

[Timestamp('2025-06-01 11:52:00'),
 Timestamp('2025-06-01 12:00:00'),
 Timestamp('2025-06-01 12:07:00')]

In [31]:
# One label per time bin, in bin order.
labels_t = ["morning", "noon"]

In [32]:
# pd.cut with its default settings: bins are right-closed, (start, end].
df["time_of_day"] = pd.cut(df.index, bins=bins_t, labels=labels_t)

In [33]:
df

Unnamed: 0,float,time_of_day
2025-06-01 11:52:00,0.025,
2025-06-01 11:53:00,0.029,morning
2025-06-01 11:54:00,0.031,morning
2025-06-01 11:55:00,0.217,morning
2025-06-01 11:56:00,0.222,morning
2025-06-01 11:57:00,0.229,morning
2025-06-01 11:58:00,0.231,morning
2025-06-01 11:59:00,0.233,morning
2025-06-01 12:00:00,0.381,morning
2025-06-01 12:01:00,0.422,noon


### Be inclusive of all values

As seen above, by default every bin is open on the left and closed on the right, i.e. `(start, end]`, so the very first timestamp is left without a label. 
Adding `include_lowest=True` closes the first bin on both sides: `[start, end]`.

In [34]:
# Same bins and labels, but the first bin also includes its left edge.
df["time_of_day_2"] = pd.cut(
    df.index, bins=bins_t, labels=labels_t, include_lowest=True
)

In [35]:
df

Unnamed: 0,float,time_of_day,time_of_day_2
2025-06-01 11:52:00,0.025,,morning
2025-06-01 11:53:00,0.029,morning,morning
2025-06-01 11:54:00,0.031,morning,morning
2025-06-01 11:55:00,0.217,morning,morning
2025-06-01 11:56:00,0.222,morning,morning
2025-06-01 11:57:00,0.229,morning,morning
2025-06-01 11:58:00,0.231,morning,morning
2025-06-01 11:59:00,0.233,morning,morning
2025-06-01 12:00:00,0.381,morning,morning
2025-06-01 12:01:00,0.422,noon,noon


### Change the inclusion from right to left

So each bin is `[start, end)`

In [36]:
# Same bins and labels, with left-closed bins: [start, end).
df["time_of_day_3"] = pd.cut(df.index, bins=bins_t, labels=labels_t, right=False)

In [37]:
df

Unnamed: 0,float,time_of_day,time_of_day_2,time_of_day_3
2025-06-01 11:52:00,0.025,,morning,morning
2025-06-01 11:53:00,0.029,morning,morning,morning
2025-06-01 11:54:00,0.031,morning,morning,morning
2025-06-01 11:55:00,0.217,morning,morning,morning
2025-06-01 11:56:00,0.222,morning,morning,morning
2025-06-01 11:57:00,0.229,morning,morning,morning
2025-06-01 11:58:00,0.231,morning,morning,morning
2025-06-01 11:59:00,0.233,morning,morning,morning
2025-06-01 12:00:00,0.381,morning,morning,noon
2025-06-01 12:01:00,0.422,noon,noon,noon


The last timestamp is no longer captured, because the final bin is now open on the right. To include it, the last bin edge must be greater than the last timestamp in the index.

In [38]:
# Push the final edge 1 ns past the last timestamp so that timestamp
# still falls inside the last left-closed bin.
bins_tt = [
    timestamp_index[0],
    mid_point,
    timestamp_index[-1] + pd.Timedelta(1, unit="ns"),
]

In [39]:
bins_tt

[Timestamp('2025-06-01 11:52:00'),
 Timestamp('2025-06-01 12:00:00'),
 Timestamp('2025-06-01 12:07:00.000000001')]

In [40]:
# Left-closed bins again, this time with the extended final edge.
df["time_of_day_4"] = pd.cut(df.index, bins=bins_tt, labels=labels_t, right=False)

In [41]:
df

Unnamed: 0,float,time_of_day,time_of_day_2,time_of_day_3,time_of_day_4
2025-06-01 11:52:00,0.025,,morning,morning,morning
2025-06-01 11:53:00,0.029,morning,morning,morning,morning
2025-06-01 11:54:00,0.031,morning,morning,morning,morning
2025-06-01 11:55:00,0.217,morning,morning,morning,morning
2025-06-01 11:56:00,0.222,morning,morning,morning,morning
2025-06-01 11:57:00,0.229,morning,morning,morning,morning
2025-06-01 11:58:00,0.231,morning,morning,morning,morning
2025-06-01 11:59:00,0.233,morning,morning,morning,morning
2025-06-01 12:00:00,0.381,morning,morning,noon,noon
2025-06-01 12:01:00,0.422,noon,noon,noon,noon


## The cut way 3 with a default value

In [42]:
# A single bin running from position 4 to position 9 of the index.
bins_ttt = [timestamp_index[4], timestamp_index[9]]

In [43]:
bins_ttt

[Timestamp('2025-06-01 11:56:00'), Timestamp('2025-06-01 12:01:00')]

In [44]:
# Timestamps outside the single bin come back as missing. "??" is registered
# as a category first, then used to fill those gaps as the default label.
df["time_of_day_5"] = (
    pd.cut(df.index, bins=bins_ttt, labels=["mid point"], right=False)
    .add_categories("??")
    .fillna("??")
)

In [45]:
df

Unnamed: 0,float,time_of_day,time_of_day_2,time_of_day_3,time_of_day_4,time_of_day_5
2025-06-01 11:52:00,0.025,,morning,morning,morning,??
2025-06-01 11:53:00,0.029,morning,morning,morning,morning,??
2025-06-01 11:54:00,0.031,morning,morning,morning,morning,??
2025-06-01 11:55:00,0.217,morning,morning,morning,morning,??
2025-06-01 11:56:00,0.222,morning,morning,morning,morning,mid point
2025-06-01 11:57:00,0.229,morning,morning,morning,morning,mid point
2025-06-01 11:58:00,0.231,morning,morning,morning,morning,mid point
2025-06-01 11:59:00,0.233,morning,morning,morning,morning,mid point
2025-06-01 12:00:00,0.381,morning,morning,noon,noon,mid point
2025-06-01 12:01:00,0.422,noon,noon,noon,noon,??


## The select way

In [46]:
# New mock frame for the np.select example: sorted random floats, default index.
df = pd.DataFrame(
    {"float": sorted(random.random() for _ in range(nr_of_rand_points))}
)

df

Unnamed: 0,float
0,0.021
1,0.121
2,0.186
3,0.219
4,0.29
5,0.333
6,0.422
7,0.46
8,0.556
9,0.642


### Recreate first example with the masks

In [47]:
# The same three rules as the masks in "The old way", in the same order:
# below 0.2, at or above 0.8, and the half-open range [0.2, 0.8) in between.
conditions = [
    df["float"].lt(0.2),
    df["float"].ge(0.8),
    df["float"].between(0.2, 0.8, inclusive="left"),
]

In [48]:
conditions

[0      True
 1      True
 2      True
 3     False
 4     False
 5     False
 6     False
 7     False
 8     False
 9     False
 10    False
 11    False
 12    False
 13    False
 14    False
 15    False
 Name: float, dtype: bool,
 0     False
 1     False
 2     False
 3     False
 4     False
 5     False
 6     False
 7     False
 8     False
 9     False
 10    False
 11    False
 12     True
 13     True
 14     True
 15     True
 Name: float, dtype: bool,
 0     False
 1     False
 2     False
 3      True
 4      True
 5      True
 6      True
 7      True
 8      True
 9      True
 10     True
 11     True
 12    False
 13    False
 14    False
 15    False
 Name: float, dtype: bool]

In [49]:
# Choices for np.select, matched by position to the entries of `conditions`.
labels = ["less than 0.2", "more than 0.8", "middle"]
labels

['less than 0.2', 'more than 0.8', 'middle']

In [50]:
# Each row takes the label of the first condition that is True for it;
# rows matching no condition get the "??" default.
df["labels_select"] = np.select(conditions, labels, default="??")

In [51]:
df

Unnamed: 0,float,labels_select
0,0.021,less than 0.2
1,0.121,less than 0.2
2,0.186,less than 0.2
3,0.219,middle
4,0.29,middle
5,0.333,middle
6,0.422,middle
7,0.46,middle
8,0.556,middle
9,0.642,middle


### Label column type

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   float          16 non-null     float64
 1   labels_select  16 non-null     object 
dtypes: float64(1), object(1)
memory usage: 388.0+ bytes


## Mapping

For one-to-one mapping of values to labels.

### Mock DataFrame 2

In [53]:
# Sorted random integers drawn from [-20, 20). The row count comes from
# nr_of_rand_points, like the other mock frames, instead of a literal 16.
df = pd.DataFrame(
    {
        "int": sorted(
            [random.randrange(-20, 20) for _ in range(nr_of_rand_points)]
        )
    },
)

df

Unnamed: 0,int
0,-18
1,-9
2,-8
3,-5
4,-2
5,-1
6,3
7,5
8,5
9,6


### Example 1

In [54]:
# 1 for strictly positive values, 0 for everything else (zero included).
df["mapped_1"] = df["int"].map(lambda value: int(value > 0))

In [55]:
df

Unnamed: 0,int,mapped_1
0,-18,0
1,-9,0
2,-8,0
3,-5,0
4,-2,0
5,-1,0
6,3,1
7,5,1
8,5,1
9,6,1


### Example 2

Mapping using a dictionary.

In [56]:
# Lookup from the 0/1 flag in "mapped_1" to a text label.
# NOTE(review): the flag 0 also covers a value of exactly zero, so zero ends
# up labelled "negative" — confirm that is intended.
mapped_vals = {
    0: "negative",
    1: "positive",
}

In [57]:
# Translate each flag through the dictionary; a flag that is not a key of
# the dictionary would become NaN.
df["mapped_2"] = df["mapped_1"].map(mapped_vals)

In [58]:
df

Unnamed: 0,int,mapped_1,mapped_2
0,-18,0,negative
1,-9,0,negative
2,-8,0,negative
3,-5,0,negative
4,-2,0,negative
5,-1,0,negative
6,3,1,positive
7,5,1,positive
8,5,1,positive
9,6,1,positive


### Label column type

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   int       16 non-null     int64 
 1   mapped_1  16 non-null     int64 
 2   mapped_2  16 non-null     object
dtypes: int64(2), object(1)
memory usage: 516.0+ bytes
