In [1]:
import pandas as pd

In [2]:
print(pd.__version__)

2.1.2


In [3]:
df = pd.read_csv('data/src/sample_pandas_normal.csv', index_col=0)

In [4]:
df['sex'] = ['female', float('nan'), 'male', 'male', 'female', 'male']
df['rank'] = [2, 1, 1, 0, 2, 0]

In [5]:
print(df)

         age state  point     sex  rank
name                                   
Alice     24    NY     64  female     2
Bob       42    CA     92     NaN     1
Charlie   18    CA     70    male     1
Dave      68    TX     70    male     0
Ellen     24    CA     88  female     2
Frank     30    NY     57    male     0


In [6]:
print(pd.get_dummies(df['sex']))

         female   male
name                  
Alice      True  False
Bob       False  False
Charlie   False   True
Dave      False   True
Ellen      True  False
Frank     False   True


In [7]:
print(pd.get_dummies(['female', float('nan'), 'male', 'male', 'female', 'male']))

   female   male
0    True  False
1   False  False
2   False   True
3   False   True
4    True  False
5   False   True


In [8]:
print(pd.get_dummies(df))

         age  point  rank  state_CA  state_NY  state_TX  sex_female  sex_male
name                                                                         
Alice     24     64     2     False      True     False        True     False
Bob       42     92     1      True     False     False       False     False
Charlie   18     70     1      True     False     False       False      True
Dave      68     70     0     False     False      True       False      True
Ellen     24     88     2      True     False     False        True     False
Frank     30     57     0     False      True     False       False      True


In [9]:
print(pd.get_dummies(df, dtype=int))

         age  point  rank  state_CA  state_NY  state_TX  sex_female  sex_male
name                                                                         
Alice     24     64     2         0         1         0           1         0
Bob       42     92     1         1         0         0           0         0
Charlie   18     70     1         1         0         0           0         1
Dave      68     70     0         0         0         1           0         1
Ellen     24     88     2         1         0         0           1         0
Frank     30     57     0         0         1         0           0         1


In [10]:
print(pd.get_dummies(df, drop_first=True))

         age  point  rank  state_NY  state_TX  sex_male
name                                                   
Alice     24     64     2      True     False     False
Bob       42     92     1     False     False     False
Charlie   18     70     1     False     False      True
Dave      68     70     0     False      True      True
Ellen     24     88     2     False     False     False
Frank     30     57     0      True     False      True


In [11]:
print(pd.get_dummies(df, drop_first=True, dummy_na=True))

         age  point  rank  state_NY  state_TX  state_nan  sex_male  sex_nan
name                                                                       
Alice     24     64     2      True     False      False     False    False
Bob       42     92     1     False     False      False     False     True
Charlie   18     70     1     False     False      False      True    False
Dave      68     70     0     False      True      False      True    False
Ellen     24     88     2     False     False      False     False    False
Frank     30     57     0      True     False      False      True    False


In [12]:
print(pd.get_dummies(df, prefix='', prefix_sep=''))

         age  point  rank     CA     NY     TX  female   male
name                                                         
Alice     24     64     2  False   True  False    True  False
Bob       42     92     1   True  False  False   False  False
Charlie   18     70     1   True  False  False   False   True
Dave      68     70     0  False  False   True   False   True
Ellen     24     88     2   True  False  False    True  False
Frank     30     57     0  False   True  False   False   True


In [13]:
print(pd.get_dummies(df, prefix=['ST', 'sex'], prefix_sep='-'))

         age  point  rank  ST-CA  ST-NY  ST-TX  sex-female  sex-male
name                                                                
Alice     24     64     2  False   True  False        True     False
Bob       42     92     1   True  False  False       False     False
Charlie   18     70     1   True  False  False       False      True
Dave      68     70     0  False  False   True       False      True
Ellen     24     88     2   True  False  False        True     False
Frank     30     57     0  False   True  False       False      True


In [14]:
print(pd.get_dummies(df, prefix={'state': 'ST', 'sex': 'sex'}, prefix_sep='-'))

         age  point  rank  ST-CA  ST-NY  ST-TX  sex-female  sex-male
name                                                                
Alice     24     64     2  False   True  False        True     False
Bob       42     92     1   True  False  False       False     False
Charlie   18     70     1   True  False  False       False      True
Dave      68     70     0  False  False   True       False      True
Ellen     24     88     2   True  False  False        True     False
Frank     30     57     0  False   True  False       False      True


In [15]:
print(pd.get_dummies(df, columns=['sex', 'rank']))

         age state  point  sex_female  sex_male  rank_0  rank_1  rank_2
name                                                                   
Alice     24    NY     64        True     False   False   False    True
Bob       42    CA     92       False     False   False    True   False
Charlie   18    CA     70       False      True   False    True   False
Dave      68    TX     70       False      True    True   False   False
Ellen     24    CA     88        True     False   False   False    True
Frank     30    NY     57       False      True    True   False   False


In [16]:
df = pd.read_csv('data/src/sample_pandas_normal.csv', index_col=0)
df_A, df_B = df[:3].copy(), df[3:].copy()

In [17]:
print(df_A)

         age state  point
name                     
Alice     24    NY     64
Bob       42    CA     92
Charlie   18    CA     70


In [18]:
print(df_B)

       age state  point
name                   
Dave    68    TX     70
Ellen   24    CA     88
Frank   30    NY     57


In [19]:
print(pd.get_dummies(df_A))

         age  point  state_CA  state_NY
name                                   
Alice     24     64     False      True
Bob       42     92      True     False
Charlie   18     70      True     False


In [20]:
print(pd.get_dummies(df_B))

       age  point  state_CA  state_NY  state_TX
name                                           
Dave    68     70     False     False      True
Ellen   24     88      True     False     False
Frank   30     57     False      True     False


In [21]:
categories = set(df_A['state'].tolist() + df_B['state'].tolist())
print(categories)

{'NY', 'TX', 'CA'}


In [22]:
df_A['state'] = pd.Categorical(df_A['state'], categories)
df_B['state'] = pd.Categorical(df_B['state'], categories)

In [23]:
print(df_A['state'].dtypes)

category


In [24]:
print(pd.get_dummies(df_A))

         age  point  state_NY  state_TX  state_CA
name                                             
Alice     24     64      True     False     False
Bob       42     92     False     False      True
Charlie   18     70     False     False      True


In [25]:
print(pd.get_dummies(df_B))

       age  point  state_NY  state_TX  state_CA
name                                           
Dave    68     70     False      True     False
Ellen   24     88     False     False      True
Frank   30     57      True     False     False


In [26]:
categories = ['CA', 'NY']

In [27]:
df_A['state'] = pd.Categorical(df_A['state'], categories)
df_B['state'] = pd.Categorical(df_B['state'], categories)

In [28]:
print(df_A)

         age state  point
name                     
Alice     24    NY     64
Bob       42    CA     92
Charlie   18    CA     70


In [29]:
print(df_B)

       age state  point
name                   
Dave    68   NaN     70
Ellen   24    CA     88
Frank   30    NY     57


In [30]:
print(pd.get_dummies(df_A))

         age  point  state_CA  state_NY
name                                   
Alice     24     64     False      True
Bob       42     92      True     False
Charlie   18     70      True     False


In [31]:
print(pd.get_dummies(df_B))

       age  point  state_CA  state_NY
name                                 
Dave    68     70     False     False
Ellen   24     88      True     False
Frank   30     57     False      True
