# Categorical Data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Toy data: 10 users (5 'M' followed by 5 'F'), each with an integer score in 1..5.
# The scores are a frozen sample instead of a fresh np.random.randint draw so that
# Restart & Run All reproduces the outputs shown below. The sample deliberately
# contains no score of 2: a later cell demonstrates add_categories(2), which raises
# ValueError if 2 is already one of the categories.
df = pd.DataFrame(data={
    'user_id': np.arange(1, 11),
    'gender': ['M'] * 5 + ['F'] * 5,
    'score': [5, 4, 4, 1, 3, 3, 3, 3, 4, 3],
})

In [None]:
# Display the whole frame; it only has 10 rows, so no .head() is needed.
df

Unnamed: 0,user_id,gender,score
0,1,M,5
1,2,M,4
2,3,M,4
3,4,M,1
4,5,M,3
5,6,F,3
6,7,F,3
7,8,F,3
8,9,F,4
9,10,F,3


In [None]:
# Summarize dtypes and non-null counts; 'gender' is still a plain object column here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user_id  10 non-null     int64 
 1   gender   10 non-null     object
 2   score    10 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 372.0+ bytes


In [None]:
# Count rows per gender value (5 'M' and 5 'F' by construction).
df.gender.value_counts()

Unnamed: 0_level_0,count
gender,Unnamed: 1_level_1
M,5
F,5


In [None]:
# Frequency of each score; score values that never occur do not appear in the result.
df.score.value_counts()

Unnamed: 0_level_0,count
score,Unnamed: 1_level_1
3,5
4,3
5,1
1,1


In [None]:
# Convert the string-valued 'gender' column to the pandas categorical dtype.
df['gender'] = df['gender'].astype('category')

In [None]:
# Re-check the dtypes: 'gender' should now be reported as category.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   user_id  10 non-null     int64   
 1   gender   10 non-null     category
 2   score    10 non-null     int64   
dtypes: category(1), int64(2)
memory usage: 426.0 bytes


In [None]:
# The .cat accessor of a categorical column exposes its category metadata;
# .categories is the Index of distinct category labels.
# General pattern: DataFrame.column_name.cat.categories
df.gender.cat.categories

Index(['F', 'M'], dtype='object')

In [None]:
# Treat the integer 'score' column as categorical as well.
df['score'] = df['score'].astype('category')

In [None]:
# Both 'gender' and 'score' should now be listed with the category dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   user_id  10 non-null     int64   
 1   gender   10 non-null     category
 2   score    10 non-null     category
dtypes: category(2), int64(1)
memory usage: 560.0 bytes


In [None]:
# Categories were inferred from the data, so only scores that actually occur are listed.
df.score.cat.categories

Index([1, 3, 4, 5], dtype='int64')

In [None]:
# Register the score value 2 as a valid category even if no row currently uses it.
# add_categories raises ValueError when the category already exists, so the call is
# guarded: the cell stays safe to re-run and safe when 2 is already a category.
if 2 not in df.score.cat.categories:
    df.score = df.score.cat.add_categories(2)

In [None]:
# A category added with add_categories is appended to the end of the category list,
# not inserted in sorted order.
df.score.cat.categories

Index([1, 3, 4, 5, 2], dtype='int64')

# 연속형 변수에서 파생된 카테고리 변수 만들기

In [None]:
# New toy frame for the binning examples: 10 ids with an integer age in 0..99.
# The ages are a frozen sample instead of a fresh np.random.randint draw so that
# Restart & Run All reproduces the outputs shown below.
# Note: this rebinds `df`, replacing the frame used in the previous section.
df = pd.DataFrame(data={
    'id': np.arange(1, 11),
    'age': [85, 77, 14, 38, 13, 26, 27, 37, 77, 64],
})

In [None]:
# Display the id/age frame (10 rows).
df

Unnamed: 0,id,age
0,1,85
1,2,77
2,3,14
3,4,38
4,5,13
5,6,26
6,7,27
7,8,37
8,9,77
9,10,64


In [None]:
# Bin ages into left-closed 10-year intervals: [0, 10), [10, 20), ..., [90, 100).
# The stop value is 101 so that 100 is included as the final bin edge. With
# np.arange(0, 100, 10) the edges end at 90, and any age in 90..99 falls outside
# every bin and silently becomes NaN.
df['age_range'] = pd.cut(x=df.age, bins=np.arange(0, 101, 10), right=False)

In [None]:
# Show the frame with the new 'age_range' interval column.
df

Unnamed: 0,id,age,age_range
0,1,85,"[80, 90)"
1,2,77,"[70, 80)"
2,3,14,"[10, 20)"
3,4,38,"[30, 40)"
4,5,13,"[10, 20)"
5,6,26,"[20, 30)"
6,7,27,"[20, 30)"
7,8,37,"[30, 40)"
8,9,77,"[70, 80)"
9,10,64,"[60, 70)"


In [None]:
# The column produced by pd.cut is reported with the category dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   id         10 non-null     int64   
 1   age        10 non-null     int64   
 2   age_range  10 non-null     category
dtypes: category(1), int64(2)
memory usage: 746.0 bytes


In [None]:
# The categories are the left-closed intervals defined by the bin edges (right=False).
df.age_range.cat.categories

IntervalIndex([ [0, 10), [10, 20), [20, 30), [30, 40), [40, 50), [50, 60),
               [60, 70), [70, 80), [80, 90)],
              dtype='interval[int64, left]')

In [None]:
# Map ages onto four named groups using left-closed bins:
#   [0, 19) -> minor, [19, 40) -> young adult, [40, 60) -> middle-aged, [60, 100) -> senior.
# `labels` must contain one entry per bin (number of edges minus one).
df['age_cat'] = pd.cut(
    df['age'],
    bins=[0, 19, 40, 60, 100],
    labels=['미성년', '청년', '중년', '노년'],
    right=False,
)

In [None]:
# Show the frame with both the interval bins and the labelled age groups.
df

Unnamed: 0,id,age,age_range,age_cat
0,1,85,"[80, 90)",노년
1,2,77,"[70, 80)",노년
2,3,14,"[10, 20)",미성년
3,4,38,"[30, 40)",청년
4,5,13,"[10, 20)",미성년
5,6,26,"[20, 30)",청년
6,7,27,"[20, 30)",청년
7,8,37,"[30, 40)",청년
8,9,77,"[70, 80)",노년
9,10,64,"[60, 70)",노년


In [None]:
# The categories follow the order of the `labels` list passed to pd.cut.
df.age_cat.cat.categories

Index(['미성년', '청년', '중년', '노년'], dtype='object')