In [1]:
import pandas as pd

In [2]:
adult = pd.read_csv('./data/adult.csv')
adult.head(2)

Unnamed: 0,Age,Workclass,fnlgwt,Education,Education Num,Marital Status,Occupation,Relationship,Race,Sex,Capital Gain,Capital Loss,Hours/Week,Country,Above/Below 50k
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [3]:
# Explore the Above/Below 50k variable
print(adult['Above/Below 50k'].describe())

count      32561
unique         2
top        <=50K
freq       24720
Name: Above/Below 50k, dtype: object


In [4]:
# Print a frequency table of "Above/Below 50k"
print(adult["Above/Below 50k"].value_counts())

 <=50K    24720
 >50K      7841
Name: Above/Below 50k, dtype: int64


In [5]:
# Print relative frequency values
print(adult["Above/Below 50k"].value_counts(normalize=True))

 <=50K    0.75919
 >50K     0.24081
Name: Above/Below 50k, dtype: float64


In [6]:
# Check the dtypes
print(adult.dtypes)

# Create a dictionary with column names as keys and "category" as values
adult_dtypes = {
   "Workclass": "category",
   "Education": "category",
   "Relationship": "category",
   "Above/Below 50k": "category" 
}

# Read in the CSV using the dtypes parameter
adult2 = pd.read_csv(
  "./data/adult.csv",
  dtype=adult_dtypes
)
print(adult2.dtypes)

Age                 int64
Workclass          object
fnlgwt              int64
Education          object
Education Num       int64
Marital Status     object
Occupation         object
Relationship       object
Race               object
Sex                object
Capital Gain        int64
Capital Loss        int64
Hours/Week          int64
Country            object
Above/Below 50k    object
dtype: object
Age                   int64
Workclass          category
fnlgwt                int64
Education          category
Education Num         int64
Marital Status       object
Occupation           object
Relationship       category
Race                 object
Sex                  object
Capital Gain          int64
Capital Loss          int64
Hours/Week            int64
Country              object
Above/Below 50k    category
dtype: object


##### Using the categorical dtype is a great way to save memory and boost performance. Specifying the dtypes will lower the amount of memory required to load our dataset. This is quite helpful when dealing with large datasets!

### Adding categories

In [7]:
dogs = pd.read_csv('./data/ShelterDogs.csv')

In [8]:
# Check frequency counts while also printing the NaN count
print(dogs["keep_in"].value_counts(dropna=False))

# Switch to a categorical variable
dogs["keep_in"] = dogs["keep_in"].astype("category")

# Add new categories
new_categories = ["Unknown History", "Open Yard (Countryside)"]
dogs["keep_in"] = dogs["keep_in"].cat.add_categories(new_categories)

# Check frequency counts one more time
print(dogs["keep_in"].value_counts(dropna=False))

both flat and garden    1224
NaN                     1021
garden                   510
flat                     182
Name: keep_in, dtype: int64
both flat and garden       1224
NaN                        1021
garden                      510
flat                        182
Open Yard (Countryside)       0
Unknown History               0
Name: keep_in, dtype: int64


### Removing categories

In [9]:
# dogs.likes_children

In [10]:
# Set "maybe" to be "no"
dogs['likes_children'] = dogs['likes_children'].astype('category')
dogs["likes_children"] = dogs["likes_children"].cat.add_categories(["maybe"])  # add category first

dogs.loc[dogs["likes_children"] == 'NaN', "likes_children"] = "maybe"
# dogs['likes_children'] = dogs['likes_children'].astype('category')

# Print out categories
print(dogs["likes_children"].cat.categories)

# Print the frequency table
print(dogs["likes_children"].value_counts())

# Remove the `"maybe" category
dogs["likes_children"] = dogs["likes_children"].cat.remove_categories(["maybe"])
print(dogs["likes_children"].value_counts())

# Print the categories one more time
print(dogs["likes_children"].cat.categories)

Index(['no', 'yes', 'maybe'], dtype='object')
yes      1172
no         47
maybe       0
Name: likes_children, dtype: int64
yes    1172
no       47
Name: likes_children, dtype: int64
Index(['no', 'yes'], dtype='object')


### Renaming categories

In [11]:
# Create the my_changes dictionary
my_changes = {'Maybe?':'Maybe'}

# Rename the categories listed in the my_changes dictionary
dogs["likes_children"] = dogs["likes_children"].cat.rename_categories(my_changes)

# Use a lambda function to convert all categories to uppercase using upper()
dogs["likes_children"] =  dogs["likes_children"].cat.rename_categories(lambda c: c.upper())

# Print the list of categories
print(dogs["likes_children"].cat.categories)

Index(['NO', 'YES'], dtype='object')
