# Pandas `groupby`

In [None]:
from pandas import DataFrame
from pandas import Series
import pandas as pd
from numpy.random import randint
from numpy.random import randn
import numpy as np

## Simple Example

### Create the Data

In [None]:
# Seed NumPy's global generator so that Restart & Run All reproduces the
# same random frame (randint below draws from that global generator).
np.random.seed(42)

# Number of rows to simulate; raise it for a larger sample at the cost of
# a longer run time.
N_ROWS = 5000

# data1 / data2: random integers in [1, 10].
# key: one of the ten letters 'a'..'j', chosen by a random index in [0, 9].
df = DataFrame({
    'data1': randint(1, 11, N_ROWS),
    'data2': randint(1, 11, N_ROWS),
    'key': Series(randint(0, 10, N_ROWS)).map(lambda x: 'abcdefghij'[x]).values,
})
print(df.head())
print(df.tail())

- This works better with `50000000` rows, except that it takes a few seconds to run

### `groupby` on `key`

In [None]:
# Group the rows of df by the values of its 'key' column.
group_by_object = df.groupby(by=df['key'])
# Printing the groupby object shows its repr, not the grouped data.
print(group_by_object)

In [None]:
# Summary statistics computed separately for each group.
group_summary = group_by_object.describe()
print(group_summary)

In [None]:
# Mean of each column within each group.
group_means = group_by_object.mean()
print(group_means)

## Iterating Over Groups

In [None]:
# Iterating a groupby object yields (group label, sub-frame) pairs.
for group_key, group_rows in group_by_object:
    # One print call emitting the label, the group's row count and the
    # group's rows, with the same spacing and line breaks as three
    # separate prints would produce.
    print(group_key, ' \n ', len(group_rows), '\n\n ', group_rows, sep='')

## Column Selection

- The two methods below produce identical results

In [None]:
# Method 1: group the whole frame by 'key', then pick the 'data1' column.
grouped_by_key = df.groupby('key')
group_by_object_data1 = grouped_by_key['data1']
print(group_by_object_data1.mean())

In [None]:
# Method 2: pick the 'data1' Series first, then group it by the 'key' Series.
data1_column = df['data1']
data1_group_by_object = data1_column.groupby(df['key'])
print(data1_group_by_object.mean())

- In `df['data1'].groupby(df['key'])`, the `df['key']` argument is needed because `df['data1']` is a Series with no `key` column of its own, so the matching Series must be supplied for grouping

## Grouping with `Series` and `dict`

### With a `list`

In [None]:
# 5 x 6 frame of random integers in [1, 10]; the row labels deliberately
# have different lengths.
row_labels = ['ab', 'cd', 'e', 'f', 'qr']
column_labels = list('ABCDEF')
df2 = DataFrame(
    randint(1, 11, (len(row_labels), len(column_labels))),
    index=row_labels,
    columns=column_labels,
)
print(df2)

In [None]:
# Map each column label of df2 to a colour; insertion order follows A..F.
column_colours = ['red', 'blue', 'blue', 'red', 'red', 'red']
d1 = dict(zip('ABCDEF', column_colours))
values1 = d1.values()
print(values1)

In [None]:
# Group the COLUMNS of df2 by colour, using one label per column (in
# column order).  groupby(..., axis=1) is deprecated since pandas 2.1, so
# the frame is transposed, its rows are grouped, and the aggregated result
# is transposed back.  Note that df2_groupby_list therefore groups df2.T.
column_colours = list(d1.values())
df2_groupby_list = df2.T.groupby(column_colours)
print(df2_groupby_list.mean().T)

### With `dict`

In [None]:
# Group the COLUMNS of df2 through the column-label -> colour mapping d1.
# groupby(..., axis=1) is deprecated since pandas 2.1, so the frame is
# transposed (column labels become the index that d1 is applied to), its
# rows are grouped, and the aggregated result is transposed back.
# Note that df2_groupby_dict therefore groups df2.T.
df2_groupby_dict = df2.T.groupby(d1)
print(df2_groupby_dict.mean().T)

## Grouping by Functions

- Any function passed as a group key will be called once per index value,
  with the return values being used as the group names

In [None]:
# Group the ROWS by the length of each index label: len is called once per
# index value and its return value names the group.  Rows are the default
# grouping direction, so the `axis` keyword (deprecated since pandas 2.1)
# is omitted.
df2_groupby_len = df2.groupby(len)
print(df2_groupby_len.sum())

## Data Aggregation
- Aggregation (`agg`) accepts functions that reduce a one-dimensional array to a scalar value

### Setup

In [None]:
# Load the tips data from a CSV file next to the notebook's working directory.
tips_csv = './tips.csv'
df_tips = pd.read_csv(tips_csv)
df_tips.head()

In [None]:
# Tip as a fraction of the total bill, stored as a new 'tip_pct' column.
df_tips['tip_pct'] = df_tips['tip'].div(df_tips['total_bill'])
print(df_tips.head())

### Single Aggregation

In [None]:
# Two-level grouping: 'sex' is the outer key and 'smoker' the inner key.
group_keys = ['sex', 'smoker']
grouped = df_tips.groupby(group_keys)
print(grouped)

In [None]:
# Narrow the grouped frame down to the single 'tip_pct' column.
pct_column = 'tip_pct'
grouped_pct = grouped[pct_column]
print(grouped_pct)

In [None]:
# Mean of tip_pct for every (sex, smoker) group; the function is given by
# name inside a list.
pct_means = grouped_pct.agg(['mean'])
print(pct_means)

- `agg` is an alias for `aggregate`. Use the alias.
- **NOTICE:** `sex` is the primary key and `smoker` is the secondary key

### List of Functions


In [None]:
# Two built-in statistics given by name, plus an anonymous function that
# computes the max-minus-min spread of each group.
stat_functions = ['mean', 'std', lambda values: values.max() - values.min()]
grouped_pct_stats1 = grouped_pct.agg(stat_functions)
print(grouped_pct_stats1)

- The `<lambda>` column name is not very descriptive

In [None]:
# (output column name, function) pairs label each result column explicitly.
named_stats = [
    ('mean', 'mean'),
    ('std', 'std'),
    ('peak_to_peak', lambda values: values.max() - values.min()),
]
grouped_pct_stats2 = grouped_pct.agg(named_stats)
print(grouped_pct_stats2)

- **Or** you could do it the old-fashioned way (probably simpler)
- The above technique is useful when you do not like the column names

In [None]:
def top_to_bottom(column):
    """Return the spread of ``column``: its maximum minus its minimum."""
    highest = column.max()
    lowest = column.min()
    return highest - lowest

In [None]:
# Two statistics given by name, plus the top_to_bottom function object itself.
stats_with_named_function = ['mean', 'std', top_to_bottom]
grouped_pct_stats3 = grouped_pct.agg(stats_with_named_function)
print(grouped_pct_stats3)

- **NOTICE:** My function was passed in not as a string but as the actual identifier of the function

### Multiple Columns with Multiple Functions

In [None]:
# Select several columns from the grouped frame with a LIST of names.
# Selecting with a bare tuple (grouped['a', 'b']) was deprecated in
# pandas 1.0 and raises on pandas 2.0+.
selected_columns = ['tip_pct', 'total_bill']
# Apply count / mean / max to each selected column within each group.
tip_and_bill = grouped[selected_columns].agg(['count', 'mean', 'max'])
print(tip_and_bill)

In [None]:
# Display the type of the aggregated result as the cell's output.
type(tip_and_bill)

- Pulling just one "column" from the DataFrame

In [None]:
# Index with a one-element list to pull out the 'tip_pct' statistics.
tip_pct_stats = tip_and_bill[['tip_pct']]
print(tip_pct_stats)

- To get a single column's statistics without the outer column-name level, index with the bare name rather than a list: `tip_and_bill['tip_pct']`

### Extended Example

In [None]:
# Ten-row frame: two integer columns in [1, 10] and two label columns.
# The four random draws happen in the same order as the columns appear.
data1_values = randint(1, 11, 10)
data2_values = randint(1, 11, 10)
# Random 0/1 indices translated to 'a'/'b' and to 'one'/'two'.
key1_labels = Series(randint(0, 2, 10)).map({0: 'a', 1: 'b'}).values
key2_labels = Series(randint(0, 2, 10)).map({0: 'one', 1: 'two'}).values
df2 = DataFrame({
    'data1': data1_values,
    'data2': data2_values,
    'key1': key1_labels,
    'key2': key2_labels,
})
print(df2)

In [None]:
# Per-key1 mean of the numeric columns, with each column renamed to
# 'key1_mean_<column>'.  numeric_only=True leaves out the string column
# key2; on pandas 2.0+ mean() no longer drops non-numeric columns silently
# and raises TypeError without it.
key1_means = (
    df2.groupby('key1')
    .mean(numeric_only=True)
    .add_prefix('key1_mean_')
)
print(key1_means)

- `key2` is not shown because the `mean` function only applies to numeric columns

In [None]:
# Same per-key1 means, this time naming the aggregation through agg.
# numeric_only=True is forwarded to mean so that the string column key2 is
# left out; on pandas 2.0+ the call raises TypeError without it.
key1_means2 = (
    df2.groupby('key1')
    .agg('mean', numeric_only=True)
    .add_prefix('key1_mean_')
)
print(key1_means2)

In [None]:
# Attach the key1 group means to every row (df2.key1 matched against the
# index of key1_means), keep only the key and its two means, then collapse
# repeated rows.
merged_with_means = df2.merge(key1_means, left_on='key1', right_index=True)
mean_columns = ['key1', 'key1_mean_data1', 'key1_mean_data2']
df2_k1_means = merged_with_means[mean_columns].drop_duplicates()
print(df2_k1_means)

# End of Notebook