Categorical
- Finite number of groups
- These categories are usually fixed or known
- Known as qualitative data

    Ordinal
    ° Categorical variables that have a natural order

    Nominal
    ° Categorical variables that cannot be placed into a natural order

Numerical
- Known as quantitative data
- Expressed using a numerical value
- Is usually a measurement 


In [None]:
# course introduction
import pandas as pd

# Load the data in this cell so it runs on a fresh kernel; otherwise `adult`
# only exists if a later cell has already been executed.
adult = pd.read_csv('data/adult.csv')

# Column names, dtypes and non-null counts (info() prints directly).
adult.info()
# Summary of a single column, then the share of rows per value.
# Jupyter renders only the value of the final expression in the cell.
adult['Marital Status'].describe()
adult['Marital Status'].value_counts(normalize=True)


In [None]:
# Categorical data in pandas
import pandas as pd

# Read the dataset and inspect the dtype pandas assigned to every column.
adult = pd.read_csv('data/adult.csv')
adult.dtypes

# set dtype as categorical
# Pull the column out, convert it, and write the converted column back.
marital_status = adult['Marital Status']
adult['Marital Status'] = marital_status.astype('category')


In [3]:
# creating a categorical Series
import pandas as pd

my_data = ['A', 'A', 'C', 'B', 'C', 'A']

# Unordered categorical Series: the categories are inferred from the values.
my_series1 = pd.Series(my_data, dtype=pd.CategoricalDtype())
print(my_series1)

# Ordered categorical with an explicit ranking C < B < A.
# Note that pd.Categorical returns a Categorical array, not a Series.
ranked_dtype = pd.CategoricalDtype(categories=['C', 'B', 'A'], ordered=True)
my_series2 = pd.Categorical(my_data, dtype=ranked_dtype)
print(my_series2)

0    A
1    A
2    C
3    B
4    C
5    A
dtype: category
Categories (3, object): ['A', 'B', 'C']
['A', 'A', 'C', 'B', 'C', 'A']
Categories (3, object): ['C' < 'B' < 'A']


In [None]:
# Why do we use categorical: memory
adult = pd.read_csv('data/adult.csv')

# Size of the column exactly as loaded, captured before the conversion.
bytes_before = adult['Marital Status'].nbytes

adult['Marital Status'] = adult['Marital Status'].astype('category')
bytes_after = adult['Marital Status'].nbytes

# Print both figures: a bare expression in the middle of a cell is discarded
# by Jupyter, so the "before" value would otherwise never be shown.
print(f'as loaded: {bytes_before} bytes')
print(f'category:  {bytes_after} bytes')

In [None]:
# Grouping data by category in pandas
adult = pd.read_csv('data/adult.csv')

# Manual approach: one filtered frame per income bracket.
# The literals include a leading space, presumably matching the raw CSV
# values - verify against the data.
adult1 = adult[adult['Above/Below 50k'] == ' <=50K']
adult2 = adult[adult['Above/Below 50k'] == ' >50K']

# replaced by
groupby_object = adult.groupby(by=['Above/Below 50k'])

# specifying columns
# Several columns must be selected with a list (double brackets); a bare
# 'a', 'b' inside single brackets is a tuple key and raises an error.
# Selecting before aggregating (first form) sums only the needed columns.
adult.groupby(by=['Above/Below 50k'])[['Age', 'Education Num']].sum()
adult.groupby(by=['Above/Below 50k']).sum()[['Age', 'Education Num']]

# Groupby multiple columns
# size() counts the rows in every (income bracket, marital status) pair.
adult.groupby(by=['Above/Below 50k', 'Marital Status']).size()


In [None]:
# setting category variables
# a dog's coat
dogs['coat'] = dogs['coat'].astype('category')
dogs['coat'].value_counts(dropna=False)

# set category
dogs['coat']=dogs['coat'].cat.set_categories(new_categories=['short', 'medium', 'long'], ordered=True)

# add category
dogs['likes_people']=dogs['likes_people'].astype('category')
dogs['likes_people']=dogs['likes_people'].cat.add_categories(new_categories=['did not check', 'could not tell'])

# check category
dogs['likes_people'].cat.categories

# remove category
dogs['coat']=dogs['coat'].astype('category')
dogs['coat']=dogs['coat'].cat.remove_categories(removals=['wirehaired'])
dogs['coat'].cat.categories


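`rename_categories` method:
- Series.cat.rename_categories(new_categories=dict), where the dict maps each existing category name to its new name
Example (with `my_changes` being such a dict):
dogs['breed'] = dogs['breed'].cat.rename_categories(my_changes)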

In [None]:
# Updating categories
# Mapping of an existing category label to its replacement label.
my_changes = {
    'Unknown Mix': 'Unknown',
}

In [None]:
# renaming categories with a function
def _title_case(label):
    """Return the category label converted with its own ``title()`` method."""
    return label.title()


# rename_categories calls the function once per existing category label.
dogs['sex'] = dogs['sex'].cat.rename_categories(_title_case)
dogs['sex'].cat.categories

In [None]:
# Map each two-tone "black and ..." label to plain 'black'.
update_colors = dict.fromkeys(
    ['black and brown', 'black and tan', 'black and white'],
    'black',
)
# The result is stored in a new column, so the original 'color' column is kept.
dogs['main_color'] = dogs['color'].replace(to_replace=update_colors)


In [None]:
# reordering categories
# reorder_categories requires exactly the labels that already exist, only in a
# different order, so each one must be spelled as it appears in the data
# ('wirehaired', as used when that category is removed elsewhere in these notes).
# NOTE(review): assumes 'coat' currently holds all four labels - confirm.
coat_order = ['short', 'medium', 'wirehaired', 'long']
dogs['coat'] = dogs['coat'].cat.reorder_categories(
    new_categories=coat_order,
    ordered=True,
)

# using inplace
# The `inplace=True` argument of the categorical accessor methods was removed
# in pandas 2.0; assigning the result back, as above, is the only supported form.

Cleaning and accessing data
- Identifying issues
    - Series.cat.categories 
    - Series.value_counts()

In [None]:
# cleaning and accessing data
dogs['get_along_cats'].value_counts()

# misspelled words: wrong label -> corrected label
replace_map = {'Noo': 'No'}

# Apply all three fixes in one chain and assign the result back. Calling
# replace(..., inplace=True) on a selected column is a chained in-place update
# and is not guaranteed to write through to `dogs`.
dogs['get_along_cats'] = (
    dogs['get_along_cats']
    .str.strip()            # fixing issues: whitespace
    .str.title()            # capitalization issues
    .replace(replace_map)   # misspelled words
)


In [None]:
# Searching for a string
# regex=False treats 'Shepherd' as a literal substring rather than a pattern.
dogs['breed'].str.contains('Shepherd', regex=False)

# access series values based on category
# Rows where get_along_cats is 'Yes', restricted to the 'size' column;
# sort=False leaves the counts unsorted by frequency.
dogs.loc[dogs['get_along_cats'] == 'Yes', 'size'].value_counts(sort=False)