In [19]:
import pandas as pd
import numpy as np

# Concepts

## categorical

Categoricals are a pandas data type.

Real-world data often includes text columns whose values are repetitive. Features such as gender, country, and codes typically repeat the same small set of values. These are examples of categorical data.

Categorical variables can take on only a limited, and usually fixed, number of possible values. Besides having a fixed set of values, categorical data may have an order, but numerical operations cannot be performed on it.

### Usages

The categorical dtype can be specified when you build pandas objects:

In [27]:
# dtype="category" makes pandas store the values with the categorical dtype;
# the distinct labels become the categories.
s = pd.Series(data=["a", "b", "c", "a"], dtype="category")
print(s)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]


Using the standard pandas Categorical constructor

`pandas.Categorical(values, categories, ordered)`

In [28]:
# No categories are given, so they are inferred from the distinct values.
cat = pd.Categorical(["a", "b", "c"] * 2)
print(cat)

[a, b, c, a, b, c]
Categories (3, object): [a, b, c]


In [31]:
# Only 'c', 'b' and 'a' are declared as categories (in that order), so the
# value 'd' is not a valid category and shows up as NaN.
cat = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'd'], categories=['c', 'b', 'a'])
print(cat)

[a, b, c, a, b, c, NaN]
Categories (3, object): [c, b, a]


In [32]:
# ordered=True gives the categories a rank that follows the order in which
# they are listed (c < b < a). 'd' is not a declared category, so it becomes NaN.
cat = pd.Categorical(
    ['a', 'b', 'c', 'a', 'b', 'c', 'd'],
    categories=['c', 'b', 'a'],
    ordered=True,
)
print(cat)

[a, b, c, a, b, c, NaN]
Categories (3, object): [c < b < a]


# Grouping Data

## groupby

#### Splits a DataFrame into multiple DataFrames based on a groupby condition

In [13]:
# Toy roster used by the groupby examples below: one row per member.
members = pd.DataFrame(
    [
        ("Rubble", "Yellow", "Land", 5),
        ("Marshal", "Red", "Land", 12),
        ("Zuma", "Orange", "Water", 8),
        ("Sky", "Pink", "Air", 4),
        ("Chase", "Blue", "Air", 10),
    ],
    columns=["Name", "Color", "Expertise", "Age"],
)

In [14]:
# Bare last expression: the notebook renders the whole (5-row) frame as a table.
members

Unnamed: 0,Name,Color,Expertise,Age
0,Rubble,Yellow,Land,5
1,Marshal,Red,Land,12
2,Zuma,Orange,Water,8
3,Sky,Pink,Air,4
4,Chase,Blue,Air,10


In [15]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs; each group is
# printed as key, then its rows, then a separator line.
for expertise, df in members.groupby('Expertise'):
    print(expertise, df, "------------------------------", sep="\n")

Air
    Name Color Expertise  Age
3    Sky  Pink       Air    4
4  Chase  Blue       Air   10
------------------------------
Land
      Name   Color Expertise  Age
0   Rubble  Yellow      Land    5
1  Marshal     Red      Land   12
------------------------------
Water
   Name   Color Expertise  Age
2  Zuma  Orange     Water    8
------------------------------


#### It can also be used to run summary functions on each group of a DataFrame

In [16]:
# Mean age within each expertise group; the column is selected with brackets
# rather than attribute access.
members.groupby('Expertise')['Age'].mean()

Expertise
Air      7.0
Land     8.5
Water    8.0
Name: Age, dtype: float64

## cut

**TODO:** describe `pd.cut`, which bins values into discrete intervals (see the example below).

In [23]:
# Split the observed value range into 3 bins and label every value with the
# interval it falls into.
pd.cut(np.array([1, 7, 5, 4, 6, 3]), bins=3)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]