# Work with categorical variables in Pandas

## Imports

In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype

# For mocking data
import random
from string import ascii_lowercase
import numpy as np

## f: mock_df

Function that mocks the data.

In [2]:
def mock_df(nr_of_samples: int = 100, seed=0):
    random.seed(seed)

    df = pd.DataFrame.from_dict(
        {
            "age": [random.randrange(18, 65) for _ in range(nr_of_samples)],
            "coding_exp": [random.choice(["beginner", "intermediate", "pro"]) for _ in range(nr_of_samples)],
            "title": [random.choice(["peasant", "bishop", "merchant", "knight", "king"]) for _ in range(nr_of_samples)],
            "fav_myth_creature": [random.choice(["Big Foot", "Dragon", "Phoenix", "Unicorn", "Vampire", "Werewolf"]) for _ in range(nr_of_samples)],
            "fav_3_letters": ["".join(random.choices(ascii_lowercase, k=3)) for _ in range(nr_of_samples)],
            "secret_group": [random.randrange(1, 5) for _ in range(nr_of_samples)],
        },
        orient="columns",
    )

    return df

## The Data

In [3]:
df = mock_df()

In [4]:
df

Unnamed: 0,age,coding_exp,title,fav_myth_creature,fav_3_letters,secret_group
0,42,pro,peasant,Big Foot,kcx,3
1,44,intermediate,king,Werewolf,mvz,2
2,20,intermediate,knight,Big Foot,ims,4
3,34,beginner,peasant,Werewolf,lht,4
4,50,beginner,king,Vampire,xxq,1
...,...,...,...,...,...,...
95,29,pro,knight,Big Foot,tlg,4
96,30,intermediate,knight,Phoenix,lyd,4
97,29,intermediate,merchant,Unicorn,mqm,2
98,20,intermediate,peasant,Phoenix,fas,4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [6]:
df.dtypes

age                   int64
coding_exp           object
title                object
fav_myth_creature    object
fav_3_letters        object
secret_group          int64
dtype: object

## Check number of unique values in each column

In [7]:
for col in df.select_dtypes(include="object"):
    print(f"{col} - nr of unique entries: {df[col].nunique()} out of {len(df)} ({100* df[col].nunique() / len(df): .1f}%)")

coding_exp - nr of unique entries: 3 out of 100 ( 3.0%)
title - nr of unique entries: 5 out of 100 ( 5.0%)
fav_myth_creature - nr of unique entries: 6 out of 100 ( 6.0%)
fav_3_letters - nr of unique entries: 99 out of 100 ( 99.0%)


## Convert an individual column to categorical

In [8]:
df["coding_exp"] = df["coding_exp"].astype("category")

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                100 non-null    int64   
 1   coding_exp         100 non-null    category
 2   title              100 non-null    object  
 3   fav_myth_creature  100 non-null    object  
 4   fav_3_letters      100 non-null    object  
 5   secret_group       100 non-null    int64   
dtypes: category(1), int64(2), object(3)
memory usage: 4.3+ KB


In [10]:
df["coding_exp"]

0              pro
1     intermediate
2     intermediate
3         beginner
4         beginner
          ...     
95             pro
96    intermediate
97    intermediate
98    intermediate
99        beginner
Name: coding_exp, Length: 100, dtype: category
Categories (3, object): ['beginner', 'intermediate', 'pro']

In [11]:
# Convert remaining columns one by one

In [12]:
df["title"] = df["fav_myth_creature"].astype("category")
df["fav_myth_creature"] = df["fav_myth_creature"].astype("category")

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                100 non-null    int64   
 1   coding_exp         100 non-null    category
 2   title              100 non-null    category
 3   fav_myth_creature  100 non-null    category
 4   fav_3_letters      100 non-null    object  
 5   secret_group       100 non-null    int64   
dtypes: category(3), int64(2), object(1)
memory usage: 3.3+ KB


## Convert all columns at once

Based on our previous exploration we know it doesn't make sense for this dataset specifically. But if it makes sense for your dataset, this is how to do it.

In [14]:
# Let's recreate the df
df = mock_df()

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [16]:
df = df.astype("category")

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                100 non-null    category
 1   coding_exp         100 non-null    category
 2   title              100 non-null    category
 3   fav_myth_creature  100 non-null    category
 4   fav_3_letters      100 non-null    category
 5   secret_group       100 non-null    category
dtypes: category(6)
memory usage: 5.7 KB


## Explore categorical variables

In [18]:
df = mock_df()

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


### Convert one column to categorical

In [20]:
df["fav_myth_creature"] = df["fav_myth_creature"].astype("category")

In [21]:
df.dtypes

age                     int64
coding_exp             object
title                  object
fav_myth_creature    category
fav_3_letters          object
secret_group            int64
dtype: object

In [22]:
df["fav_myth_creature"]

0     Big Foot
1     Werewolf
2     Big Foot
3     Werewolf
4      Vampire
        ...   
95    Big Foot
96     Phoenix
97     Unicorn
98     Phoenix
99     Phoenix
Name: fav_myth_creature, Length: 100, dtype: category
Categories (6, object): ['Big Foot', 'Dragon', 'Phoenix', 'Unicorn', 'Vampire', 'Werewolf']

### Read categories

`df[column].cat.<method>`

In [23]:
# These are alphabetical
df["fav_myth_creature"].cat.categories

Index(['Big Foot', 'Dragon', 'Phoenix', 'Unicorn', 'Vampire', 'Werewolf'], dtype='object')

### Read codes

In [24]:
df["fav_myth_creature"].cat.codes

0     0
1     5
2     0
3     5
4     4
     ..
95    0
96    2
97    3
98    2
99    2
Length: 100, dtype: int8

## Categories: unordered vs ordered

In [25]:
df = mock_df()

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


### Unordered

In [27]:
# By default it's unordered category
df["coding_exp"] = df["coding_exp"].astype("category")

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                100 non-null    int64   
 1   coding_exp         100 non-null    category
 2   title              100 non-null    object  
 3   fav_myth_creature  100 non-null    object  
 4   fav_3_letters      100 non-null    object  
 5   secret_group       100 non-null    int64   
dtypes: category(1), int64(2), object(3)
memory usage: 4.3+ KB


In [29]:
df["coding_exp"]

0              pro
1     intermediate
2     intermediate
3         beginner
4         beginner
          ...     
95             pro
96    intermediate
97    intermediate
98    intermediate
99        beginner
Name: coding_exp, Length: 100, dtype: category
Categories (3, object): ['beginner', 'intermediate', 'pro']

In [30]:
# Some of the descriptive stat functions won't work with unordered category

df["coding_exp"].min()

TypeError: Categorical is not ordered for operation min
you can use .as_ordered() to change the Categorical to an ordered one


In [31]:
df["coding_exp"].max()

TypeError: Categorical is not ordered for operation max
you can use .as_ordered() to change the Categorical to an ordered one


In [32]:
df["coding_exp"] > "beginner"

TypeError: Unordered Categoricals can only compare equality or not

In [33]:
df["coding_exp"].cat.ordered

False

### Converting to ordered - way 1

In [34]:
# The order is OK
df["coding_exp"].cat.categories

Index(['beginner', 'intermediate', 'pro'], dtype='object')

In [35]:
df["coding_exp"] = df["coding_exp"].cat.as_ordered()

In [36]:
df["coding_exp"]

0              pro
1     intermediate
2     intermediate
3         beginner
4         beginner
          ...     
95             pro
96    intermediate
97    intermediate
98    intermediate
99        beginner
Name: coding_exp, Length: 100, dtype: category
Categories (3, object): ['beginner' < 'intermediate' < 'pro']

In [37]:
# Give the lowest category
df["coding_exp"].min()

'beginner'

In [38]:
# Give the highest category
df["coding_exp"].max()

'pro'

In [39]:
df["coding_exp"] > "beginner"

0      True
1      True
2      True
3     False
4     False
      ...  
95     True
96     True
97     True
98     True
99    False
Name: coding_exp, Length: 100, dtype: bool

### Converting to ordered - way 2 (new cat type)

In [40]:
df["title"]= df["title"].astype("category")

In [41]:
# alphabetical order
df["title"].cat.categories

Index(['bishop', 'king', 'knight', 'merchant', 'peasant'], dtype='object')

In [42]:
new_cat = CategoricalDtype(
    categories=["peasant", "merchant", "knight", "bishop", "king",],
    ordered=True
)

In [43]:
df["title"] = df["title"].astype(new_cat)

In [44]:
# specified order
df["title"].cat.categories

Index(['peasant', 'merchant', 'knight', 'bishop', 'king'], dtype='object')

In [45]:
df["title"]

0      peasant
1         king
2       knight
3      peasant
4         king
        ...   
95      knight
96      knight
97    merchant
98     peasant
99      bishop
Name: title, Length: 100, dtype: category
Categories (5, object): ['peasant' < 'merchant' < 'knight' < 'bishop' < 'king']

### Converting to ordered - way 3 (reorder)

In [46]:
df = mock_df()

In [47]:
df["title"]= df["title"].astype("category")

In [48]:
# alphabetical order
df["title"].cat.categories

Index(['bishop', 'king', 'knight', 'merchant', 'peasant'], dtype='object')

In [49]:
# Specify a new order
new_order = ['peasant', 'merchant', 'knight', 'bishop', 'king']

In [50]:
df["title"] = df["title"].cat.reorder_categories(new_order, ordered=True)

In [51]:
df["title"]

0      peasant
1         king
2       knight
3      peasant
4         king
        ...   
95      knight
96      knight
97    merchant
98     peasant
99      bishop
Name: title, Length: 100, dtype: category
Categories (5, object): ['peasant' < 'merchant' < 'knight' < 'bishop' < 'king']

## Create a new categorical variable.

We can create new categorical columns, not just convert the existing ones.

In [52]:
df = mock_df()

In [53]:
df["secret_group"].unique()

array([3, 2, 4, 1])

We need to remember codes need to start at 0.

So:

1 - code 0  
2 - code 1 etc.

In [54]:
df["secret_group_name"] = pd.Categorical.from_codes(
    codes=df["secret_group"]-1,
    categories=["ones", "twos", "threes", "fours"],
    ordered=True,
)

In [55]:
df

Unnamed: 0,age,coding_exp,title,fav_myth_creature,fav_3_letters,secret_group,secret_group_name
0,42,pro,peasant,Big Foot,kcx,3,threes
1,44,intermediate,king,Werewolf,mvz,2,twos
2,20,intermediate,knight,Big Foot,ims,4,fours
3,34,beginner,peasant,Werewolf,lht,4,fours
4,50,beginner,king,Vampire,xxq,1,ones
...,...,...,...,...,...,...,...
95,29,pro,knight,Big Foot,tlg,4,fours
96,30,intermediate,knight,Phoenix,lyd,4,fours
97,29,intermediate,merchant,Unicorn,mqm,2,twos
98,20,intermediate,peasant,Phoenix,fas,4,fours


In [56]:
df["secret_group_name"]

0     threes
1       twos
2      fours
3      fours
4       ones
       ...  
95     fours
96     fours
97      twos
98     fours
99      twos
Name: secret_group_name, Length: 100, dtype: category
Categories (4, object): ['ones' < 'twos' < 'threes' < 'fours']

## Dealing with missing values

In [57]:
df = mock_df()

In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [59]:
df["middle_agers"] = pd.cut(
    x=df["age"],
    bins=[40, 50],
    labels=["middle agers"]
)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                100 non-null    int64   
 1   coding_exp         100 non-null    object  
 2   title              100 non-null    object  
 3   fav_myth_creature  100 non-null    object  
 4   fav_3_letters      100 non-null    object  
 5   secret_group       100 non-null    int64   
 6   middle_agers       18 non-null     category
dtypes: category(1), int64(2), object(4)
memory usage: 5.0+ KB


In [61]:
df

Unnamed: 0,age,coding_exp,title,fav_myth_creature,fav_3_letters,secret_group,middle_agers
0,42,pro,peasant,Big Foot,kcx,3,middle agers
1,44,intermediate,king,Werewolf,mvz,2,middle agers
2,20,intermediate,knight,Big Foot,ims,4,
3,34,beginner,peasant,Werewolf,lht,4,
4,50,beginner,king,Vampire,xxq,1,middle agers
...,...,...,...,...,...,...,...
95,29,pro,knight,Big Foot,tlg,4,
96,30,intermediate,knight,Phoenix,lyd,4,
97,29,intermediate,merchant,Unicorn,mqm,2,
98,20,intermediate,peasant,Phoenix,fas,4,


In [62]:
df["middle_agers"]

0     middle agers
1     middle agers
2              NaN
3              NaN
4     middle agers
          ...     
95             NaN
96             NaN
97             NaN
98             NaN
99             NaN
Name: middle_agers, Length: 100, dtype: category
Categories (1, object): ['middle agers']

In [63]:
df["middle_agers"].cat.codes

0     0
1     0
2    -1
3    -1
4     0
     ..
95   -1
96   -1
97   -1
98   -1
99   -1
Length: 100, dtype: int8

In [64]:
# This doesn't work! Only valid categories are allowed and currently we have one
df["middle_ages"].fillna("yh?")

KeyError: 'middle_ages'

In [65]:
df["middle_agers"].cat.categories

Index(['middle agers'], dtype='object')

In [66]:
df["middle_agers"].cat.ordered

True

In [67]:
# This makes the category column ordered
df["middle_agers"].cat.add_categories("yh?")

0     middle agers
1     middle agers
2              NaN
3              NaN
4     middle agers
          ...     
95             NaN
96             NaN
97             NaN
98             NaN
99             NaN
Name: middle_agers, Length: 100, dtype: category
Categories (2, object): ['middle agers' < 'yh?']

In [68]:
df["middle_agers"].cat.add_categories("yh?").cat.as_unordered()

0     middle agers
1     middle agers
2              NaN
3              NaN
4     middle agers
          ...     
95             NaN
96             NaN
97             NaN
98             NaN
99             NaN
Name: middle_agers, Length: 100, dtype: category
Categories (2, object): ['middle agers', 'yh?']

In [69]:
# All in one go

df["middle_agers"] = (
    df["middle_agers"]
    .cat.add_categories("yh?")
    .cat.as_unordered()
    .fillna("yh?")
)

In [70]:
df["middle_agers"]

0     middle agers
1     middle agers
2              yh?
3              yh?
4     middle agers
          ...     
95             yh?
96             yh?
97             yh?
98             yh?
99             yh?
Name: middle_agers, Length: 100, dtype: category
Categories (2, object): ['middle agers', 'yh?']

In [71]:
df["middle_agers"].value_counts(dropna=False)

middle_agers
yh?             82
middle agers    18
Name: count, dtype: int64

## Using ordered categories with `groupby` and filtering

In [72]:
df = mock_df()

In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [74]:
# As column as string
grouped = df[["age", "title"]].groupby(by="title").mean()

In [75]:
# Alphabetical order, OK, but perhaps not great
grouped

Unnamed: 0_level_0,age
title,Unnamed: 1_level_1
bishop,37.842105
king,42.789474
knight,41.833333
merchant,45.611111
peasant,40.45


In [76]:
# Swap "title" column to ordered categorical
new_order = ['peasant', 'merchant', 'knight', 'bishop', 'king']

In [77]:
df["title"] = (
    df["title"]
    .astype("category")
    .cat.reorder_categories(new_order, ordered=True)
)

In [78]:
# groupby once again
grouped = df[["age", "title"]].groupby(by="title", observed=False).mean()

In [79]:
grouped

Unnamed: 0_level_0,age
title,Unnamed: 1_level_1
peasant,40.45
merchant,45.611111
knight,41.833333
bishop,37.842105
king,42.789474


### Add more categories

In [80]:
df["title"]=df["title"].cat.add_categories("god")

In [81]:
grouped = df[["age", "title"]].groupby(by="title", observed=False).mean()

In [82]:
grouped

Unnamed: 0_level_0,age
title,Unnamed: 1_level_1
peasant,40.45
merchant,45.611111
knight,41.833333
bishop,37.842105
king,42.789474
god,


In [83]:
grouped = df[["age", "title"]].groupby(by="title", observed=True).mean()

In [84]:
grouped

Unnamed: 0_level_0,age
title,Unnamed: 1_level_1
peasant,40.45
merchant,45.611111
knight,41.833333
bishop,37.842105
king,42.789474


### Filtering

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                100 non-null    int64   
 1   coding_exp         100 non-null    object  
 2   title              100 non-null    category
 3   fav_myth_creature  100 non-null    object  
 4   fav_3_letters      100 non-null    object  
 5   secret_group       100 non-null    int64   
dtypes: category(1), int64(2), object(3)
memory usage: 4.3+ KB


In [86]:
df.title

0      peasant
1         king
2       knight
3      peasant
4         king
        ...   
95      knight
96      knight
97    merchant
98     peasant
99      bishop
Name: title, Length: 100, dtype: category
Categories (6, object): ['peasant' < 'merchant' < 'knight' < 'bishop' < 'king' < 'god']

In [87]:
mask = df["title"] > "merchant"

In [88]:
only_noblemen = df[mask].copy()

In [89]:
only_noblemen

Unnamed: 0,age,coding_exp,title,fav_myth_creature,fav_3_letters,secret_group
1,44,intermediate,king,Werewolf,mvz,2
2,20,intermediate,knight,Big Foot,ims,4
4,50,beginner,king,Vampire,xxq,1
6,43,beginner,knight,Big Foot,bct,2
7,37,beginner,bishop,Dragon,bak,4
...,...,...,...,...,...,...
91,38,intermediate,king,Phoenix,ifh,3
93,33,beginner,bishop,Vampire,hor,2
95,29,pro,knight,Big Foot,tlg,4
96,30,intermediate,knight,Phoenix,lyd,4


In [90]:
only_noblemen["title"].unique()

['king', 'knight', 'bishop']
Categories (6, object): ['peasant' < 'merchant' < 'knight' < 'bishop' < 'king' < 'god']

In [91]:
only_noblemen["title"].value_counts(dropna=False)

title
knight      24
bishop      19
king        19
peasant      0
merchant     0
god          0
Name: count, dtype: int64

In [92]:
# Remove unwanted categories
only_noblemen["title"] = only_noblemen["title"].cat.remove_unused_categories()

In [93]:
only_noblemen["title"]

1       king
2     knight
4       king
6     knight
7     bishop
       ...  
91      king
93    bishop
95    knight
96    knight
99    bishop
Name: title, Length: 62, dtype: category
Categories (3, object): ['knight' < 'bishop' < 'king']

In [94]:
only_noblemen["title"].unique()

['king', 'knight', 'bishop']
Categories (3, object): ['knight' < 'bishop' < 'king']

## Memory and speed comparison


In [95]:
# df = mock_df(nr_of_samples=20_000_000)
df = mock_df(nr_of_samples=20)

In [96]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                20 non-null     int64 
 1   coding_exp         20 non-null     object
 2   title              20 non-null     object
 3   fav_myth_creature  20 non-null     object
 4   fav_3_letters      20 non-null     object
 5   secret_group       20 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 1.1+ KB


In [97]:
df[["age", "title", "coding_exp"]].groupby(["coding_exp", "title"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age
coding_exp,title,Unnamed: 2_level_1
beginner,bishop,1
beginner,king,1
beginner,merchant,2
beginner,peasant,1
intermediate,bishop,2
intermediate,king,1
intermediate,merchant,1
intermediate,peasant,3
pro,bishop,2
pro,king,2


In [98]:
df["title"] = df["title"].astype("category")
df["coding_exp"] = df["coding_exp"].astype("category")

In [99]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                20 non-null     int64   
 1   coding_exp         20 non-null     category
 2   title              20 non-null     category
 3   fav_myth_creature  20 non-null     object  
 4   fav_3_letters      20 non-null     object  
 5   secret_group       20 non-null     int64   
dtypes: category(2), int64(2), object(2)
memory usage: 1.1+ KB


In [100]:
df[["age", "title", "coding_exp"]].groupby(["coding_exp", "title"], observed=False).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age
coding_exp,title,Unnamed: 2_level_1
beginner,bishop,1
beginner,king,1
beginner,knight,0
beginner,merchant,2
beginner,peasant,1
intermediate,bishop,2
intermediate,king,1
intermediate,knight,0
intermediate,merchant,1
intermediate,peasant,3


## Tips

### Read from csv + specify types

### Select columns for convertions

In [101]:
df = mock_df()

In [102]:
df.select_dtypes(include="object").columns

Index(['coding_exp', 'title', 'fav_myth_creature', 'fav_3_letters'], dtype='object')

In [103]:
df.select_dtypes(include="object").columns.drop("fav_3_letters")

Index(['coding_exp', 'title', 'fav_myth_creature'], dtype='object')

In [104]:
cols_to_convert = df.select_dtypes(include="object").columns.drop("fav_3_letters")

In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [106]:
df[cols_to_convert] = df[cols_to_convert].astype("category")

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   age                100 non-null    int64   
 1   coding_exp         100 non-null    category
 2   title              100 non-null    category
 3   fav_myth_creature  100 non-null    category
 4   fav_3_letters      100 non-null    object  
 5   secret_group       100 non-null    int64   
dtypes: category(3), int64(2), object(1)
memory usage: 3.3+ KB


### Mapping

In [108]:
# Example DataFrame with NaN values
data = {
    'Category': ['A', 'B', np.nan, 'C', 'B', 'D', 'E', np.nan, 'C'],
    'Value': [10, 20, 30, 40, 50, 60, 70, 80, 90]
}

df = pd.DataFrame(data)

In [109]:
df

Unnamed: 0,Category,Value
0,A,10
1,B,20
2,,30
3,C,40
4,B,50
5,D,60
6,E,70
7,,80
8,C,90


In [110]:
# Include NaN as a group and get first occurrence
first_occurrences = df.groupby('Category', dropna=False, as_index=False).first()

print(first_occurrences)

  Category  Value
0        A     10
1        B     20
2        C     40
3        D     60
4        E     70
5      NaN     30


In [111]:
first_occurrences["Category"].astype("category").cat.codes

0    0
1    1
2    2
3    3
4    4
5   -1
dtype: int8

In [112]:
first_occurrences["Category"].astype("category").cat.codes

0    0
1    1
2    2
3    3
4    4
5   -1
dtype: int8

In [113]:
first_occurrences["Category"].astype("category").cat.categories

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

### Int can be categorical too!

Strings make most sense to be used as categorical but they aren't the only ones that can be converted!

In [114]:
df = mock_df()

In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                100 non-null    int64 
 1   coding_exp         100 non-null    object
 2   title              100 non-null    object
 3   fav_myth_creature  100 non-null    object
 4   fav_3_letters      100 non-null    object
 5   secret_group       100 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 4.8+ KB


In [116]:
df["secret_group"].unique()

array([3, 2, 4, 1])

In [117]:
df["secret_group"] = df["secret_group"].astype("category")

In [118]:
df["secret_group"]

0     3
1     2
2     4
3     4
4     1
     ..
95    4
96    4
97    2
98    4
99    2
Name: secret_group, Length: 100, dtype: category
Categories (4, int64): [1, 2, 3, 4]