# Working with Categorical Data in Python

## Introduction to Categorical Data

In [2]:
import pandas as pd

In [3]:
# Load adult.csv (path is relative to the current working directory) into a DataFrame
adult = pd.read_csv('adult.csv')

In [4]:
# Preview the first rows to check the column names and sample values
adult.head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Summary statistics for the "Above/Below 50k" column
income_summary = adult['Above/Below 50k'].describe()
print(income_summary)

count      32561
unique         2
top        <=50K
freq       24720
Name: Above/Below 50k, dtype: object


In [6]:
# Frequency table: number of rows per "Above/Below 50k" value
income_counts = adult['Above/Below 50k'].value_counts()
print(income_counts)

 <=50K    24720
 >50K      7841
Name: Above/Below 50k, dtype: int64


In [7]:
# Relative frequency table: share of rows per "Above/Below 50k" value
income_shares = adult['Above/Below 50k'].value_counts(normalize=True)
print(income_shares)

 <=50K    0.75919
 >50K     0.24081
Name: Above/Below 50k, dtype: float64


### Setting dtypes and saving memory

In [9]:
# Frequency table of the Occupation column (displayed as the cell's result)
adult['Occupation'].value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3770
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: Occupation, dtype: int64

In [10]:
# Pull the Occupation column out as a plain Python list
list_of_occupations = adult['Occupation'].tolist()

In [11]:
# Build a Series from the list without specifying a dtype (pandas infers it)
series1 = pd.Series(data=list_of_occupations)

In [12]:
# Report the dtype and the size in bytes of series1
series1_dtype, series1_nbytes = series1.dtype, series1.nbytes
print("series1 data type:", series1_dtype)
print("series1 number of bytes:", series1_nbytes)

series1 data type: object
series1 number of bytes: 260488


In [13]:
# Build the same Series again, this time stored with the "category" dtype
series2 = pd.Series(data=list_of_occupations, dtype="category")

In [14]:
# Report the dtype and the size in bytes of series2 for comparison with series1
series2_dtype, series2_nbytes = series2.dtype, series2.nbytes
print("series2 data type:", series2_dtype)
print("series2 number of bytes:", series2_nbytes)

series2 data type: category
series2 number of bytes: 32681


### Creating a categorical pandas Series

In [40]:
# Load the medal values as a flat list of strings.
# The variable must be called `medals_won`: that is the name the pd.Categorical
# call in the following cell reads (the previous `medal_won` spelling left it undefined).
# .iloc[:, 0] selects the first column so the result is a flat list instead of the
# list of one-element row lists that DataFrame.values.tolist() produces.
# NOTE(review): assumes medals_won.txt has a header row and one medal per line - confirm.
medals_won = pd.read_csv('medals_won.txt').iloc[:, 0].tolist()

In [41]:
# Build an ordered Categorical; listing the categories from lowest to highest
# tells pandas that Bronze < Silver < Gold
medal_order = ['Bronze', 'Silver', 'Gold']
medals = pd.Categorical(medals_won, categories=medal_order, ordered=True)
print(medals)

['Silver', 'Silver', 'Bronze', 'Silver', 'Gold', ..., 'Gold', 'Bronze', 'Silver', 'Gold', 'Gold']
Length: 2828
Categories (3, object): ['Bronze' < 'Silver' < 'Gold']


### Setting dtype when reading data


In [43]:
# Preview the data again before choosing which columns to load as categories
adult.head()

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [42]:
# Show the dtype pandas inferred for each column when no dtype was specified
print(adult.dtypes)

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object


In [44]:
# Columns to load as "category"; every listed column maps to the same dtype string
adult_dtypes = dict.fromkeys(
    ["Workclass", "Education", "Relationship", "Above/Below 50k"],
    "category",
)

In [45]:
# Re-read the file, casting the columns named in adult_dtypes while loading
adult2 = pd.read_csv("adult.csv", dtype=adult_dtypes)
print(adult2.dtypes)

Age                   int64
Workclass          category
fnlgwt                int64
Education          category
Education Num         int64
Marital Status       object
Occupation           object
Relationship       category
Race                 object
Sex                  object
Capital Gain          int64
Capital Loss          int64
Hours/Week            int64
Country              object
Above/Below 50k    category
dtype: object


### Setting up a .groupby() statement

In [46]:
# Group the rows by every (Sex, Above/Below 50k) combination
gb = adult.groupby(by=['Sex', 'Above/Below 50k'])

# Print out how many rows are in each created group
print(gb.size())

# Print out the mean of each group for all numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where mean() no longer silently
# drops the string columns (Workclass, Education, ...) and raises a TypeError instead.
print(gb.mean(numeric_only=True))

Sex      Above/Below 50k
 Female   <=50K              9592
          >50K               1179
 Male     <=50K             15128
          >50K               6662
dtype: int64
                               Age         fnlgwt  Education Num  \
Sex     Above/Below 50k                                            
 Female  <=50K           36.210801  185999.381359       9.820475   
         >50K            42.125530  183687.406277      11.787108   
 Male    <=50K           37.147012  193093.609268       9.452142   
         >50K            44.625788  188769.101321      11.580606   

                         Capital Gain  Capital Loss  Hours/Week  
Sex     Above/Below 50k                                          
 Female  <=50K             121.986134     47.364470   35.916701  
         >50K             4200.389313    173.648855   40.426633  
 Male    <=50K             165.723823     56.806782   40.693879  
         >50K             3971.765836    198.780396   46.366106  


### Using pandas functions effectively

In [47]:
# Grouping columns chosen by the user
user_list = ['Education', 'Above/Below 50k']

# Group on those columns
gb = adult.groupby(by=user_list)

# Select "Hours/Week" before aggregating so the mean is computed for that
# single column only instead of for every column in the frame
mean_hours = gb['Hours/Week'].mean()
print(mean_hours)

Education      Above/Below 50k
 10th           <=50K             36.574053
                >50K              43.774194
 11th           <=50K             33.322870
                >50K              45.133333
 12th           <=50K             35.035000
                >50K              44.818182
 1st-4th        <=50K             37.864198
                >50K              48.833333
 5th-6th        <=50K             38.539432
                >50K              46.000000
 7th-8th        <=50K             38.830033
                >50K              47.500000
 9th            <=50K             37.667351
                >50K              44.851852
 Assoc-acdm     <=50K             39.264339
                >50K              44.256604
 Assoc-voc      <=50K             40.817826
                >50K              43.853186
 Bachelors      <=50K             40.586152
                >50K              45.475462
 Doctorate      <=50K             45.429907
                >50K              47.513072
 

## Categorical pandas Series


In [9]:
# Load ShelterDogs.csv (path is relative to the current working directory) into a DataFrame
dogs = pd.read_csv('ShelterDogs.csv')

In [10]:
# Frequency table of keep_in; dropna=False keeps the missing values as their own row
print(dogs["keep_in"].value_counts(dropna=False))

both flat and garden    1224
NaN                     1021
garden                   510
flat                     182
Name: keep_in, dtype: int64


In [11]:
# Convert keep_in to the category dtype so the .cat accessor can be used on it
dogs["keep_in"] = dogs["keep_in"].astype('category')

In [12]:
# Register two additional categories on keep_in; no row is changed by this,
# the labels simply become valid values for the column
new_categories = ["Unknown History", "Open Yard (Countryside)"]
dogs["keep_in"] = dogs["keep_in"].cat.add_categories(
    new_categories=new_categories
)

In [13]:
# Frequency table again: categories that no row uses are listed with a count of 0
print(dogs["keep_in"].value_counts(dropna=False))

both flat and garden       1224
NaN                        1021
garden                      510
flat                        182
Unknown History               0
Open Yard (Countryside)       0
Name: keep_in, dtype: int64


### Removing categories

In [18]:
# Convert likes_children to the category dtype so the .cat accessor can be used on it
dogs["likes_children"] = dogs["likes_children"].astype("category")

In [22]:
# Register "maybe" as an additional allowed category for likes_children
extra_categories = ["maybe"]
dogs["likes_children"] = dogs["likes_children"].cat.add_categories(
    new_categories=extra_categories
)

In [23]:
# Recode every "maybe" answer as "no"
answered_maybe = dogs["likes_children"] == "maybe"
dogs.loc[answered_maybe, "likes_children"] = "no"

In [24]:
# Print out categories: "maybe" is still registered as a category after the recode
print(dogs["likes_children"].cat.categories)

Index(['no', 'yes', 'maybe'], dtype='object')


In [25]:
# Print the frequency table (a category that no row uses appears with a count of 0)
print(dogs["likes_children"].value_counts())

yes      1172
no         47
maybe       0
Name: likes_children, dtype: int64


In [26]:
# Drop the unused "maybe" category, then confirm it is gone from the frequency table
unused_categories = ["maybe"]
dogs["likes_children"] = dogs["likes_children"].cat.remove_categories(
    removals=unused_categories
)
print(dogs["likes_children"].value_counts())

yes    1172
no       47
Name: likes_children, dtype: int64


In [27]:
# List the categories that remain after the removal
print(dogs["likes_children"].cat.categories)

Index(['no', 'yes'], dtype='object')


### Renaming categories

In [28]:
# Mapping from old category label to new category label
my_changes = {"Maybe?": "Maybe"}

likes_children = dogs["likes_children"]

# First apply the dictionary rename ...
likes_children = likes_children.cat.rename_categories(my_changes)

# ... then uppercase every category label by passing str.upper as the renaming function
likes_children = likes_children.cat.rename_categories(str.upper)

dogs["likes_children"] = likes_children

# Show the resulting categories
print(dogs["likes_children"].cat.categories)

Index(['NO', 'YES'], dtype='object')


### Collapsing categories

In [29]:
# Coat labels to fold into the "medium" group
update_coats = {"wirehaired": "medium", "medium-long": "medium"}

# Build coat_collapsed in one chain: apply the mapping, then store as categorical
dogs["coat_collapsed"] = (
    dogs["coat"]
    .replace(update_coats)
    .astype('category')
)

# Frequency table of the collapsed coat groups
print(dogs["coat_collapsed"].value_counts())

short     1972
medium     785
long       180
Name: coat_collapsed, dtype: int64


### Reordering categories in a Series

In [32]:
# Convert size to the category dtype so its categories can be reordered
dogs["size"] = dogs["size"].astype("category")

In [33]:
# Print out the current categories of the size variable (the order before any reordering)
print(dogs["size"].cat.categories)

Index(['large', 'medium', 'small'], dtype='object')


In [34]:
# Put the size categories in the order small, medium, large
size_order = ["small", "medium", "large"]
dogs["size"] = dogs["size"].cat.reorder_categories(new_categories=size_order)

In [35]:
# Same reordering, additionally marking the categorical as ordered (ordinal)
size_order = ["small", "medium", "large"]
dogs["size"] = dogs["size"].cat.reorder_categories(
    new_categories=size_order, ordered=True
)

In [36]:
# Reorder the categories, specifying the Series is ordinal, and overwrite the
# original column by assigning the result back.
# The inplace=True argument of reorder_categories is deprecated (it triggers a
# warning) and was removed in pandas 2.0, so assignment is the supported way
# to update the column.
dogs["size"] = dogs["size"].cat.reorder_categories(
  new_categories=["small", "medium", "large"],
  ordered=True
)

  res = method(*args, **kwargs)


### Using .groupby() after reordering

In [37]:
# Previous code: make size an ordered categorical (small < medium < large).
# The result is assigned back because the inplace=True argument of
# reorder_categories is deprecated and was removed in pandas 2.0.
dogs["size"] = dogs["size"].cat.reorder_categories(
  new_categories=["small", "medium", "large"],
  ordered=True
)

# Grouping on a categorical column: observed=False is stated explicitly so every
# size category is kept as a group and newer pandas versions do not warn about
# the changing default.

# How many Male/Female dogs are available of each size?
print(dogs.groupby("size", observed=False)["sex"].value_counts())

# Do larger dogs need more room to roam?
print(dogs.groupby("size", observed=False)["keep_in"].value_counts(normalize=True))

size    sex   
small   male       260
        female     214
medium  male      1090
        female     854
large   male       331
        female     188
Name: sex, dtype: int64
size                           
small   both flat and garden       0.702065
        flat                       0.235988
        garden                     0.061947
        Unknown History            0.000000
        Open Yard (Countryside)    0.000000
medium  both flat and garden       0.657568
        garden                     0.262200
        flat                       0.080232
        Unknown History            0.000000
        Open Yard (Countryside)    0.000000
large   both flat and garden       0.519022
        garden                     0.467391
        flat                       0.013587
        Unknown History            0.000000
        Open Yard (Countryside)    0.000000
Name: keep_in, dtype: float64


  res = method(*args, **kwargs)


### Cleaning variables

In [38]:
# Map the known misspelling onto the canonical label
replace_map = {"Malez": "male"}

# Clean the sex column in one chain:
#   1. apply the misspelling map
#   2. strip surrounding whitespace (str.strip trims both ends)
#   3. lowercase every response
#   4. store the result as a categorical Series
dogs["sex"] = (
    dogs["sex"]
    .replace(replace_map)
    .str.strip()
    .str.lower()
    .astype('category')
)

print(dogs["sex"].value_counts())

male      1681
female    1256
Name: sex, dtype: int64


### Accessing and filtering data

In [40]:
# Preview the first rows of dogs, including the coat_collapsed column added earlier
dogs.head()

Unnamed: 0,ID,name,age,sex,breed,date_found,adoptable_from,posted,color,coat,size,neutered,housebroken,likes_people,likes_children,get_along_males,get_along_females,get_along_cats,keep_in,coat_collapsed
0,23807,Gida,0.25,female,Unknown Mix,12/10/19,12/11/19,12/11/19,red,short,small,no,,,,,,,,short
1,533,Frida És Ricsi,0.17,female,Unknown Mix,12/1/19,12/1/19,12/9/19,black and white,short,small,no,,yes,YES,yes,yes,yes,,short
2,23793,,4.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,saddle back,short,medium,no,,,,,,,,short
3,23795,,1.0,male,Unknown Mix,12/8/19,12/23/19,12/8/19,yellow-brown,medium,medium,no,,,,,,,,medium
4,23806,Amy,2.0,female,French Bulldog Mix,12/10/19,12/11/19,12/11/19,black,short,small,no,,,,,,,,short


In [42]:
# Convert coat to the category dtype before filtering on it
dogs['coat'] = dogs['coat'].astype('category')

In [43]:
# Look up the coat value of the dog whose ID is 23807
is_target_dog = dogs['ID'] == 23807
print(dogs.loc[is_target_dog, "coat"])

0    short
Name: coat, dtype: category
Categories (4, object): ['long', 'medium', 'short', 'wirehaired']


In [44]:
# Sex breakdown among the dogs whose coat is "long"
has_long_coat = dogs['coat'] == 'long'
print(dogs.loc[has_long_coat, "sex"].value_counts())

male      124
female     56
Name: sex, dtype: int64


In [45]:
# Average age of the dogs whose breed is exactly "English Cocker Spaniel"
is_cocker_spaniel = dogs['breed'] == "English Cocker Spaniel"
print(dogs.loc[is_cocker_spaniel, "age"].mean())

8.186153846153847


In [46]:
# Count the number of dogs that have "English" in their breed name.
# regex=False does a literal substring match; na=False treats a missing breed as
# "no match" so the mask stays purely boolean (a mask that contains NaN cannot be
# used to index the DataFrame).
has_english = dogs["breed"].str.contains('English', regex=False, na=False)
print(dogs[has_english].shape[0])

35
