# Data Manipulation with Pandas - Chapter 2
# Aggregating DataFrames
## Material provided by DataCamp
### Adapted by Mariana Emerenciano

In [1]:
import pandas as pd

# Seven example dogs, stored column-wise: each keyword is a column name and
# each list holds that column's values in row order.
# Dates of birth are plain "YYYY-MM-DD" strings, not datetime objects.
data = dict(
    name=["Bella", "Charlie", "Lucy", "Cooper", "Max", "Stella", "Bernie"],
    breed=["Labrador", "Poodle", "Chow Chow", "Schnauzer", "Labrador", "Chihuahua", "St. Bernard"],
    color=["Brown", "Black", "Brown", "Gray", "Black", "Tan", "White"],
    height_cm=[56, 43, 46, 49, 59, 18, 77],
    weight_kg=[24, 24, 24, 17, 29, 2, 74],
    date_of_birth=["2013-07-01", "2016-09-16", "2014-08-25", "2011-12-11", "2017-01-20", "2015-04-20", "2018-02-27"],
)

# Build the frame used by every later cell; rows get the default integer index.
dogs = pd.DataFrame(data=data)

print(dogs)

      name        breed  color  height_cm  weight_kg date_of_birth
0    Bella     Labrador  Brown         56         24    2013-07-01
1  Charlie       Poodle  Black         43         24    2016-09-16
2     Lucy    Chow Chow  Brown         46         24    2014-08-25
3   Cooper    Schnauzer   Gray         49         17    2011-12-11
4      Max     Labrador  Black         59         29    2017-01-20
5   Stella    Chihuahua    Tan         18          2    2015-04-20
6   Bernie  St. Bernard  White         77         74    2018-02-27


# Summary Statistics

In [2]:
# Arithmetic mean of the seven heights (their total, 348, divided by 7).
dogs["height_cm"].mean()

np.float64(49.714285714285715)

In [3]:
# Median: the middle value once the seven heights are sorted.
dogs["height_cm"].median()

np.float64(49.0)

In [4]:
# Every height occurs exactly once, so no single value is most frequent:
# mode() returns all seven values (in ascending order) as tied modes.
dogs["height_cm"].mode()

0    18
1    43
2    46
3    49
4    56
5    59
6    77
Name: height_cm, dtype: int64

In [5]:
# Smallest height in the column.
dogs["height_cm"].min()

np.int64(18)

In [6]:
# Largest height in the column.
dogs["height_cm"].max()

np.int64(77)

In [7]:
# Variance of the heights. The displayed value corresponds to the sample
# variance (squared deviations divided by n - 1 = 6), not the population form.
dogs["height_cm"].var()

np.float64(322.5714285714286)

In [8]:
# Standard deviation of the heights: the square root of the variance.
dogs["height_cm"].std()

np.float64(17.960273621841864)

In [9]:
# Total of all seven heights.
dogs["height_cm"].sum()

np.int64(348)

In [10]:
# quantile() with no argument uses 0.5, i.e. the median; the level is written
# out here so the reader does not have to know the default.
dogs["height_cm"].quantile(0.5)

np.float64(49.0)

## Summarizing Dates

In [11]:
# Oldest dog: the earliest date of birth.
# The column holds "YYYY-MM-DD" strings, so min() compares them as text;
# with this zero-padded year-month-day layout, text order matches date order.
dogs["date_of_birth"].min()

'2011-12-11'

In [12]:
# Youngest dog: the latest date of birth.
# The column holds "YYYY-MM-DD" strings, so max() compares them as text;
# with this zero-padded year-month-day layout, text order matches date order.
dogs["date_of_birth"].max()

'2018-02-27'

## The .agg() method

In [13]:
def pct30(column):
    """Return the 30th percentile of ``column``.

    The work is delegated to ``column.quantile``, so any object exposing
    that method (for example a pandas Series) can be passed in.
    """
    thirtieth_percentile = column.quantile(0.3)
    return thirtieth_percentile

In [14]:
# agg() with one custom function on one column yields a single value:
# the 30th percentile of the weights.
dogs["weight_kg"].agg(pct30)

np.float64(22.599999999999998)

In [15]:
# Summaries on multiple columns: pct30 is applied to each selected column
# separately, so the result has one value per column.
dogs[["weight_kg", "height_cm"]].agg(pct30)

weight_kg    22.6
height_cm    45.4
dtype: float64

In [16]:
# Multiple summaries: a second percentile helper, so that more than one
# function can be handed to agg() at once.
def pct40(column):
    """Return the 40th percentile of ``column``.

    The work is delegated to ``column.quantile``, so any object exposing
    that method (for example a pandas Series) can be passed in.
    """
    fortieth_percentile = column.quantile(0.4)
    return fortieth_percentile

In [17]:
# Passing a list of functions returns one entry per function, each labelled
# with that function's name (pct30, pct40).
dogs["weight_kg"].agg([pct30, pct40])

pct30    22.6
pct40    24.0
Name: weight_kg, dtype: float64

## Cumulative Statistics

In [18]:
# The raw weights, for comparison with the cumulative results in the cells below.
dogs["weight_kg"]

0    24
1    24
2    24
3    17
4    29
5     2
6    74
Name: weight_kg, dtype: int64

In [19]:
# Running total: each entry is the sum of all weights up to and including that row.
dogs["weight_kg"].cumsum()

0     24
1     48
2     72
3     89
4    118
5    120
6    194
Name: weight_kg, dtype: int64

In [20]:
# Running maximum: the largest weight seen so far at each row.
dogs["weight_kg"].cummax()

0    24
1    24
2    24
3    24
4    29
5    29
6    74
Name: weight_kg, dtype: int64

In [21]:
# Running minimum: the smallest weight seen so far at each row.
dogs["weight_kg"].cummin()

0    24
1    24
2    24
3    17
4    17
5     2
6     2
Name: weight_kg, dtype: int64

In [22]:
# Running product: each entry multiplies together all weights up to and
# including that row, so the values grow very quickly.
dogs["weight_kg"].cumprod()

0            24
1           576
2         13824
3        235008
4       6815232
5      13630464
6    1008654336
Name: weight_kg, dtype: int64