## Pandas - Data Aggregation and Group Operations
 - Split, apply, and combine

<img src="http://people.bu.edu/kalathur/figs/split-apply-combine.png" width="600"/>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pd.options.display.max_rows = 10

In [None]:
np.random.seed(12345)

In [None]:
section_ids = np.repeat(['A','B','C','D'], [10,20,30,40])
print(section_ids)

In [None]:
quiz1_scores = np.random.randint(50, 80, 100)
print(quiz1_scores)

In [None]:
quiz2_scores = np.random.randint(50, 90, 100)
print(quiz2_scores)

In [None]:
section_data = pd.DataFrame({'Section': section_ids, 
                             'Quiz1': quiz1_scores,
                             'Quiz2': quiz2_scores
                            })
section_data.head()

In [None]:
section_data.tail()

In [None]:
section_data.groupby('Section')

In [None]:
section_data.groupby('Section').size()

In [None]:
for (section, group) in section_data.groupby('Section'):
    print("{0:5s} shape={1}, type={2}".format(section, group.shape, type(group)))

In [None]:
section_data.describe().round(2)

In [None]:
type(section_data.describe())

In [None]:
section_data.describe().unstack().round(2)

In [None]:
type(section_data.describe().unstack())

In [None]:
section_data.groupby('Section').mean().round(2)

In [None]:
section_data.groupby('Section').Quiz1.mean().round(2)

In [None]:
section_data.groupby('Section')['Quiz1'].mean().round(2)

In [None]:
section_data.groupby('Section').max()

In [None]:
section_data.groupby('Section').min()

In [None]:
section_data.groupby('Section').count()

In [None]:
section_data.groupby('Section').size()

In [None]:
section_data.groupby('Section').first()

In [None]:
section_data.groupby('Section').last()

In [None]:
section_data.groupby('Section').nth(1)

In [None]:
section_data.groupby('Section')['Quiz1'].describe().round(2)

In [None]:
section_data.groupby('Section').aggregate(
    ['count', 'min', 'max', 'mean', 'median']).round(2)

In [None]:
section_data.groupby('Section').aggregate(
    [np.size, np.min, np.max, np.mean, np.median]).round(2)

In [None]:
section_data.groupby('Section')['Quiz1'].aggregate(
    ['count', 'mean', 'median']).round(2)

In [None]:
data = section_data.groupby('Section').aggregate(
    ['count', 'mean', 'median']).round(2)

data

In [None]:
data.plot(kind='bar');

In [None]:
data_u = data.unstack()

In [None]:
data_u

In [None]:
data_u.plot(kind='bar');

#### Students' gender data for 4 sections

In [None]:
a = np.tile(['F', 'M'], 4)
a

In [None]:
# Female and male students for 4 sections

b = np.repeat(a, [10, 0, 5, 15, 20, 10, 15, 25])
b

In [None]:
gender_data = pd.DataFrame({'Section': section_ids, 
                             'Gender' : b,
                             'Quiz1': quiz1_scores,
                             'Quiz2': quiz2_scores
                            })
gender_data

In [None]:
grouped_data = gender_data.groupby(['Section', 'Gender'])

grouped_data.size()

In [None]:
grouped_data.describe().round(2)

In [None]:
grouped_data.agg(
    ['count', 'min', 'max', 'mean', 'median']).round(2)

In [None]:
data = grouped_data.aggregate(['min','mean','max']).round(2)
data

In [None]:
data.plot(kind='bar');

In [None]:
data.index

In [None]:
data.columns

In [None]:
data.xs('F', level='Gender')

In [None]:
data.xs('F', level='Gender').plot(kind='bar');

In [None]:
idx = pd.IndexSlice

In [None]:
data.loc[idx[:,'F'],idx[:,['min','max']]]

In [None]:
data.loc[idx[:,'F'],idx[:,['min','max']]].plot(kind='bar');

In [None]:
data

In [None]:
data_u = data.unstack()
data_u

In [None]:
data_u.plot(kind='bar');

In [None]:
data_u.columns

In [None]:
data_u.loc[:, idx[:,['min','max'],'F']]

In [None]:
data_u.loc[:, idx[:,['min','max'],'F']].plot(kind='bar');

### Selecting a Group

In [None]:
gender_data

In [None]:
grouped_data = gender_data.groupby(['Section', 'Gender'])
grouped_data

In [None]:
grouped_data.get_group(('B','F'))

In [None]:
grouped_data['Quiz1']

In [None]:
grouped_data['Quiz1'].describe()

### Grouping with Index levels and columns

In [None]:
np.random.seed(123)
data = [['Alice', 'Alice', 'Bob', 'Bob', 'Charlie', 'Charlie', 'Dave', 'Dave'],
        ['cs1', 'cs2', 'cs1', 'cs2', 'cs1', 'cs2', 'cs1', 'cs2']]

df = pd.DataFrame(np.random.randint(60,80,(8, 4)), index = data,
                 columns = ['Quiz1', 'Quiz2', 'Quiz3', 'Quiz4'])
df.index.names = ['Student', 'Class']

df

In [None]:
print(df.groupby([pd.Grouper(level=1), 'Quiz1']).mean())

In [None]:
print(df.groupby([pd.Grouper(level='Class'), 'Quiz2']).mean())

In [None]:
print(df.groupby(['Class', 'Quiz2']).mean())

### Filtering
 - The argument to filter() must be a function or lambda that will take a group and return True or False to determine whether rows belonging to that group should be included in the output

In [None]:
gender_data.groupby(['Section', 'Gender']).size()

In [None]:
df1 = gender_data.groupby('Section').filter(
    lambda x: (x['Gender'] == 'F').sum() > 10)
df1

In [None]:
df1.groupby(['Section', 'Gender']).size()

In [None]:
gender_data.groupby(['Section', 'Gender']).mean().round(2)

In [None]:
df2 = gender_data.groupby(['Section', 'Gender']).filter(
    lambda x: x['Quiz2'].mean() > 70)
df2

In [None]:
df2.groupby(['Section', 'Gender']).size()

### Transformation


In [None]:
gender_data

In [None]:
gender_data.groupby('Section').describe().round(2)

In [None]:
def foo(x):
    """Return x shifted so that its mean is zero (x minus its own mean)."""
    center = x.mean()
    return x - center

# Center only the numeric quiz columns within each section. The string
# 'Gender' column has no mean, so including it raises TypeError on
# pandas >= 2.0 (older versions silently dropped it from the result).
df3 = gender_data.groupby('Section')[['Quiz1', 'Quiz2']].transform(foo)
df3

In [None]:
# Per-section centering written as an inline lambda. Restricted to the
# numeric quiz columns because the string 'Gender' column cannot be averaged.
gender_data.groupby('Section')[['Quiz1', 'Quiz2']].transform(
    lambda x: x - x.mean()
)

### Apply

In [None]:
def diff_from_mean(x):
    """Return a copy of a group with Quiz1 and Quiz2 centered on their means.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; must contain 'Quiz1' and 'Quiz2' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``x`` in which each
        quiz column has had its own mean subtracted.
    """
    # Build a new frame instead of assigning into x: groupby.apply does not
    # support functions that mutate the group they are handed.
    return x.assign(
        Quiz1=x['Quiz1'] - x['Quiz1'].mean(),
        Quiz2=x['Quiz2'] - x['Quiz2'].mean(),
    )

In [None]:
# group_keys=False keeps the original row index on the result. With the
# pandas >= 2.0 default (group_keys=True) 'Section' would be added as an
# index level while also remaining a column, which makes a later
# df4.groupby('Section') ambiguous.
df4 = gender_data.groupby('Section', group_keys=False).apply(diff_from_mean)
df4

In [None]:
np.allclose(df3['Quiz1'], df4['Quiz1'])

In [None]:
# Sum only the centered quiz columns (each should be ~0 per section).
# 'sum' is passed by name: handing np.sum to agg is deprecated, and summing
# the string 'Gender' column would just concatenate its values.
df4.groupby('Section')[['Quiz1', 'Quiz2']].agg('sum').round(2)

In [None]:
# Compare only the numeric quiz sums against 0; np.allclose cannot handle
# the non-numeric 'Gender' column.
np.allclose(df4.groupby('Section')[['Quiz1', 'Quiz2']].agg('sum'), 0)

## Tips Dataset

In [None]:
tips = sns.load_dataset('tips')
tips

In [None]:
tips.groupby('day').size()

In [None]:
# Average total bill by day

tips.groupby('day')['total_bill'].aggregate('mean').round(2)

In [None]:
tips.groupby('day')['total_bill'].mean().round(2)

In [None]:
tips1 = tips.groupby('day').filter(
    lambda x : x['total_bill'].mean() > 20)

In [None]:
tips1.groupby('day').size()

In [None]:
tips1['size'].mean().round(2)

In [None]:
tips1.groupby('day')['size'].mean().round(2)

In [None]:
tips2 = tips.groupby('day').filter(
    lambda x : x['total_bill'].mean() <= 20)

In [None]:
tips2.groupby('day').size()

In [None]:
tips2['size'].mean()

In [None]:
tips2.groupby('day')['size'].mean().round(2)

In [None]:
tips.groupby('day')['total_bill'].transform(
    lambda x : x.mean().round(2)
).value_counts()

In [None]:
tips['scaled_bill'] = tips.groupby('day')['total_bill'].transform(
    lambda x : (x/x.mean()).round(2)
)
tips

In [None]:
tips['scaled_bill'] = tips.groupby('day')['total_bill'].transform(
    lambda x : ((x - x.mean())/x.std()).round(2)
)
tips

In [None]:
for day, group in tips.groupby('day'):
    print('\nDay is ' + str(day))
    print(group[['total_bill', 'day', 'scaled_bill']])

In [None]:
tips.groupby('day').nth(10).boxplot();

In [None]:
tips.groupby('day').first()

In [None]:
tips['tip_pct'] = (100* tips['tip'] / tips['total_bill']).round(2)
tips

In [None]:
# Select several columns from a GroupBy with a list (double brackets);
# a bare tuple of keys is no longer accepted as of pandas 2.0.
tips.groupby(['day', 'sex'])[['total_bill', 'tip_pct']].mean()

In [None]:
# Return the group keys as regular columns instead of as the row index

tips.groupby(['day', 'sex'], as_index=False)[['total_bill', 'tip_pct']].mean()

In [None]:
def top(df, n=5, column='tip_pct'):
    """Return the first n rows of df after sorting by column, descending."""
    ranked = df.sort_values(by=column, ascending=False)
    return ranked[:n]

In [None]:
top(tips)

In [None]:
tips.groupby('day').apply(top)

In [None]:
tips.groupby('day').apply(top, n=2)

In [None]:
tips.groupby('day').apply(top, n=2, column='total_bill')

In [None]:
tips.groupby(['day', 'sex']).apply(top, n=2)

In [None]:
# Suppress group keys

tips.groupby(['day', 'sex'], group_keys=False).apply(top, n=2)

### Quantile and Bucket Analysis

In [None]:
gender_data

In [None]:
# pd.cut splits the range of Quiz1 into 4 equal-width bins, so despite the
# variable name these are not true quartiles: the bins may hold very
# different numbers of rows.
quartiles = pd.cut(gender_data['Quiz1'], 4)
quartiles

In [None]:
def get_stats(group):
    """Summarize a group as a dict holding its min, max, count and mean."""
    summary = {}
    # Each key doubles as the name of the reduction method to call.
    for stat in ('min', 'max', 'count', 'mean'):
        summary[stat] = getattr(group, stat)()
    return summary

In [None]:
gender_data['Quiz1'].groupby(quartiles).apply(get_stats).unstack()

In [None]:
gender_data['Quiz2'].groupby(quartiles).apply(get_stats).unstack()

In [None]:
quartiles = pd.qcut(gender_data['Quiz1'], 4)
quartiles

In [None]:
gender_data['Quiz1'].groupby(quartiles).apply(get_stats).unstack()

In [None]:
gender_data['Quiz2'].groupby(quartiles).apply(get_stats).unstack()

### Filling missing values with Group-Specific values

In [None]:
df = section_data[['Quiz1', 'Quiz2']].loc[::2].reindex(np.arange(100))
df['Section'] = section_data['Section']
df.reindex(['Section', 'Quiz1', 'Quiz2'], axis=1)

In [None]:
df.groupby('Section').mean().round(2)

In [None]:
# Fill each section's missing quiz scores with that section's rounded mean.
# numeric_only=True skips the string 'Section' column, whose mean is
# undefined (TypeError on pandas >= 2.0). group_keys=False keeps the
# original row index instead of prepending the section label to it.
df2 = df.groupby('Section', group_keys=False).apply(
    lambda x: x.fillna(x.mean(numeric_only=True).round(2)))
df2