# Pandas Documentation on Group By

In this notebook, you will work through the Pandas documentation on "group by".

## Imports

In [1]:
import numpy as np
import pandas as pd

## Pandas group by: split-apply-combine

In this notebook, you are going to learn how to use Pandas by typing the code from the Pandas documentation into this notebook.

* Go to the Pandas Documentation for [Group By](http://pandas.pydata.org/pandas-docs/stable/groupby.html#group-by-split-apply-combine).
* Type all of the code from that section of the documentation into this notebook and get it working.
* **To learn this API well, you must type the code rather than copying and pasting it**.
* Create a new cell in this section for each `In[]` prompt in the documentation.
* Ignore the cells in the **Grading** section below.
* No Markdown comments are needed.
* Skip the following sub-sections:
  - Grouping with a Grouper specification
  - Plotting
  - Examples

In [2]:
# Example frame for the first group-by cells: two label columns (A, B)
# and two columns of standard-normal noise (C is drawn before D).
df = pd.DataFrame({
    'A': 'foo bar foo bar foo bar foo foo'.split(),
    'B': 'one one two three two two one three'.split(),
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [3]:
df

Unnamed: 0,A,B,C,D
0,foo,one,0.49553,-0.490011
1,bar,one,1.4754,-2.189469
2,foo,two,-1.916738,-1.469815
3,bar,three,0.410655,1.611494
4,foo,two,-3.341792,0.837025
5,bar,two,-0.701573,-0.468766
6,foo,one,-0.52554,0.72709
7,foo,three,1.574792,-0.912449


In [4]:
grouped = df.groupby('A')

In [5]:
grouped = df.groupby(['A', 'B'])

In [6]:
def get_letter_type(letter):
    """Label ``letter`` as ``'vowel'`` or ``'consonant'``.

    The lower-cased input is tested with a substring check against
    ``'aeiou'``, so any contiguous slice of that string (including the
    empty string) is reported as ``'vowel'``; everything else is
    ``'consonant'``.
    """
    return 'vowel' if letter.lower() in 'aeiou' else 'consonant'

grouped = df.groupby(get_letter_type, axis=1)

In [7]:
   lst = [1, 2, 3, 1, 2, 3]

In [8]:
   s = pd.Series([1, 2, 3, 10, 20, 30], lst)

In [9]:
   grouped = s.groupby(level=0)

In [10]:
   grouped.first()

1    1
2    2
3    3
dtype: int64

In [11]:
   grouped.last()

1    10
2    20
3    30
dtype: int64

In [12]:
   grouped.sum()

1    11
2    22
3    33
dtype: int64

In [13]:
df2 = pd.DataFrame({'X' : ['B', 'B', 'A', 'A'], 'Y' : [1, 2, 3, 4]})
   

In [14]:
df2.groupby(['X']).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
A,7
B,3


In [15]:
   df2.groupby(['X'], sort=False).sum()

Unnamed: 0_level_0,Y
X,Unnamed: 1_level_1
B,3
A,7


In [16]:
   df3 = pd.DataFrame({'X' : ['A', 'B', 'A', 'B'], 'Y' : [1, 4, 3, 2]})

In [17]:
   df3.groupby(['X']).get_group('A')

Unnamed: 0,X,Y
0,A,1
2,A,3


In [18]:
   df3.groupby(['X']).get_group('B')

Unnamed: 0,X,Y
1,B,4
3,B,2


In [19]:
df.groupby('A').groups

{'bar': [1, 3, 5], 'foo': [0, 2, 4, 6, 7]}

In [20]:
   df.groupby(get_letter_type, axis=1).groups

{'consonant': ['B', 'C', 'D'], 'vowel': ['A']}

In [21]:
grouped = df.groupby(['A', 'B'])

In [22]:
   grouped.groups

{('bar', 'one'): [1],
 ('bar', 'three'): [3],
 ('bar', 'two'): [5],
 ('foo', 'one'): [0, 6],
 ('foo', 'three'): [7],
 ('foo', 'two'): [2, 4]}

In [23]:
   len(grouped)

6

In [24]:
n = 10

In [25]:
   weight = np.random.normal(166, 20, size=n)

In [26]:
   height = np.random.normal(60, 10, size=n)

In [27]:
   time = pd.date_range('1/1/2000', periods=n)

In [28]:
   gender = np.random.choice(['male', 'female'], size=n)

In [29]:
   df = pd.DataFrame({'height': height, 'weight': weight,
                      'gender': gender}, index=time)

In [30]:
df

Unnamed: 0,gender,height,weight
2000-01-01,male,48.956185,190.435807
2000-01-02,female,65.938857,146.702389
2000-01-03,male,76.34415,145.138824
2000-01-04,male,62.856481,174.781601
2000-01-05,male,75.164436,179.198644
2000-01-06,male,58.518745,142.077526
2000-01-07,female,52.818346,158.630697
2000-01-08,female,57.577466,165.213205
2000-01-09,male,57.314443,165.839201
2000-01-10,female,74.153177,154.47249


In [31]:
   gb = df.groupby('gender')

In [32]:
# Rebuild the foo/bar example frame (the previous `df` was replaced by the
# height/weight frame). C is drawn before D, both from a standard normal.
df = pd.DataFrame(
    {
        'A': ['foo', 'bar'] * 3 + ['foo', 'foo'],
        'B': ['one', 'one', 'two', 'three'] + ['two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

In [33]:
arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
             ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]

In [34]:
   index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])

In [35]:
   s = pd.Series(np.random.randn(8), index=index)

In [36]:
   s

first  second
bar    one      -1.716741
       two      -0.644268
baz    one      -0.365728
       two       0.095335
foo    one       0.439217
       two      -0.118856
qux    one      -0.542351
       two      -1.375542
dtype: float64

In [37]:
grouped = s.groupby(level=0)

In [38]:
   grouped.sum()

first
bar   -2.361009
baz   -0.270393
foo    0.320360
qux   -1.917893
dtype: float64

In [39]:
s.groupby(level='second').sum()

second
one   -2.185603
two   -2.043331
dtype: float64

In [40]:
# The `level=` argument of Series.sum was deprecated and later removed from
# pandas; grouping on the index level gives the same per-level totals and
# works on every pandas version.
s.groupby(level='second').sum()


second
one   -2.185603
two   -2.043331
dtype: float64

In [41]:
s

first  second
bar    one      -1.716741
       two      -0.644268
baz    one      -0.365728
       two       0.095335
foo    one       0.439217
       two      -0.118856
qux    one      -0.542351
       two      -1.375542
dtype: float64

In [42]:
   s.groupby(level=['first', 'second']).sum()

first  second
bar    one      -1.716741
       two      -0.644268
baz    one      -0.365728
       two       0.095335
foo    one       0.439217
       two      -0.118856
qux    one      -0.542351
       two      -1.375542
dtype: float64

In [43]:
grouped = df.groupby(['A'])

In [44]:
   grouped_C = grouped['C']

In [45]:
   grouped_D = grouped['D']

In [46]:
df['C'].groupby(df['A'])

<pandas.core.groupby.SeriesGroupBy object at 0x7f5ce0896080>

In [47]:
grouped.get_group('bar')

Unnamed: 0,A,B,C,D
1,bar,one,-1.65948,-1.036755
3,bar,three,-2.396333,-1.000801
5,bar,two,0.42818,0.109777


In [48]:
df.groupby(['A', 'B']).get_group(('bar', 'one'))

Unnamed: 0,A,B,C,D
1,bar,one,-1.65948,-1.036755


In [49]:
grouped = df.groupby('A')

In [50]:
   grouped.aggregate(np.sum)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-3.627633,-1.927779
foo,-1.049413,3.351506


In [51]:
   grouped = df.groupby(['A', 'B'])

In [52]:
   grouped.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.65948,-1.036755
bar,three,-2.396333,-1.000801
bar,two,0.42818,0.109777
foo,one,-0.832389,1.808046
foo,three,0.623852,1.498247
foo,two,-0.840877,0.045212


In [53]:
grouped = df.groupby(['A', 'B'], as_index=False)

In [54]:
   grouped.aggregate(np.sum)

Unnamed: 0,A,B,C,D
0,bar,one,-1.65948,-1.036755
1,bar,three,-2.396333,-1.000801
2,bar,two,0.42818,0.109777
3,foo,one,-0.832389,1.808046
4,foo,three,0.623852,1.498247
5,foo,two,-0.840877,0.045212


In [55]:
   df.groupby('A', as_index=False).sum()

Unnamed: 0,A,C,D
0,bar,-3.627633,-1.927779
1,foo,-1.049413,3.351506


In [56]:
df.groupby(['A', 'B']).sum().reset_index()

Unnamed: 0,A,B,C,D
0,bar,one,-1.65948,-1.036755
1,bar,three,-2.396333,-1.000801
2,bar,two,0.42818,0.109777
3,foo,one,-0.832389,1.808046
4,foo,three,0.623852,1.498247
5,foo,two,-0.840877,0.045212


In [57]:
grouped.size()

A    B    
bar  one      1
     three    1
     two      1
foo  one      2
     three    1
     two      2
dtype: int64

In [58]:
grouped.describe()

Unnamed: 0,Unnamed: 1,C,D
0,count,1.0,1.0
0,mean,-1.65948,-1.036755
0,std,,
0,min,-1.65948,-1.036755
0,25%,-1.65948,-1.036755
0,50%,-1.65948,-1.036755
0,75%,-1.65948,-1.036755
0,max,-1.65948,-1.036755
1,count,1.0,1.0
1,mean,-2.396333,-1.000801


In [59]:
grouped = df.groupby('A')

In [60]:
   grouped['C'].agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,sum,mean,std
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,-3.627633,-1.209211,1.465102
foo,-1.049413,-0.209883,0.632059


In [61]:
# Passing a dict to SeriesGroupBy.agg to rename the outputs was deprecated
# and later removed from pandas, and its column order depended on dict
# ordering. Aggregate with a list, then rename explicitly: the result has the
# columns result1 (sum) and result2 (mean), in that order.
grouped['D'].agg(['sum', 'mean']).rename(columns={'sum': 'result1',
                                                  'mean': 'result2'})

Unnamed: 0_level_0,result2,result1
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.642593,-1.927779
foo,0.670301,3.351506


In [62]:
grouped.agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,C,C,C,D,D,D
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
A,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
bar,-3.627633,-1.209211,1.465102,-1.927779,-0.642593,0.65182
foo,-1.049413,-0.209883,0.632059,3.351506,0.670301,1.00991


In [63]:
grouped.agg({'C' : np.sum,'D' : lambda x: np.std(x, ddof=1)})

Unnamed: 0_level_0,D,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.65182,-3.627633
foo,1.00991,-1.049413


In [64]:
grouped.agg({'C' : 'sum', 'D' : 'std'})

Unnamed: 0_level_0,D,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,0.65182,-3.627633
foo,1.00991,-1.049413


In [65]:
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-3.627633,-1.927779
foo,-1.049413,3.351506


In [66]:
   df.groupby(['A', 'B']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.65948,-1.036755
bar,three,-2.396333,-1.000801
bar,two,0.42818,0.109777
foo,one,-0.416195,0.904023
foo,three,0.623852,1.498247
foo,two,-0.420438,0.022606


In [67]:
index = pd.date_range('10/1/1999', periods=1100)

In [68]:
   ts = pd.Series(np.random.normal(0.5, 2, 1100), index)

In [2]:
   ts = ts.rolling(window=100,min_periods=100).mean().dropna()

NameError: name 'ts' is not defined

In [3]:
   ts.head()

NameError: name 'ts' is not defined

In [4]:
   ts.tail()

NameError: name 'ts' is not defined

In [5]:
   key = lambda x: x.year

In [6]:
   zscore = lambda x: (x - x.mean()) / x.std()

In [7]:
   transformed = ts.groupby(key).transform(zscore)

NameError: name 'ts' is not defined

In [8]:
   grouped = ts.groupby(key)

NameError: name 'ts' is not defined

In [None]:
   grouped.mean()

In [None]:
   grouped.std()

In [None]:
   grouped_trans = transformed.groupby(key)

In [None]:
   grouped_trans.mean()

In [None]:
   grouped_trans.std()

In [None]:
compare = pd.DataFrame({'Original': ts, 'Transformed': transformed})

In [None]:
compare.plot()

In [None]:
    # `data_df` is never created anywhere in this notebook, so displaying it
    # (and every later cell that uses it) raises NameError. Build a
    # 1000 x 3 frame of standard-normal values with roughly 10% of the entries
    # set to NaN; 1000 rows matches the length of the `key` array built below.
    data_df = pd.DataFrame(
        np.where(np.random.rand(1000, 3) < 0.1,
                 np.nan,
                 np.random.randn(1000, 3)),
        columns=['A', 'B', 'C'])
    data_df

In [None]:
   countries = np.array(['US', 'UK', 'GR', 'JP'])

In [None]:
   key = countries[np.random.randint(0, 4, 1000)]

In [None]:
   grouped = data_df.groupby(key)

In [None]:
   grouped.count()

In [None]:
   f = lambda x: x.fillna(x.mean())

In [None]:
   transformed = grouped.transform(f)

In [None]:
   grouped_trans = transformed.groupby(key)

In [None]:
   grouped.mean() # original group means

In [None]:
   grouped_trans.mean() # transformation did not change group means

In [None]:
   grouped.count() # original has some missing data points

In [None]:
   grouped_trans.count() # counts after transformation

In [None]:
   grouped_trans.size() # Verify non-NA count equals group size

In [None]:
grouped.ffill()

In [None]:
   sf = pd.Series([1, 1, 2, 3, 3, 3])

In [None]:
   sf.groupby(sf).filter(lambda x: x.sum() > 2)

In [None]:
   dff = pd.DataFrame({'A': np.arange(8), 'B': list('aabbbbcc')})

In [None]:
   dff.groupby('B').filter(lambda x: len(x) > 2)

In [None]:
dff.groupby('B').filter(lambda x: len(x) > 2, dropna=False)

In [None]:
dff['C'] = np.arange(8)

In [None]:
   dff.groupby('B').filter(lambda x: len(x['C']) > 2)

In [None]:
dff.groupby('B').head(2)

In [None]:
grouped = df.groupby('A')

In [None]:
   grouped.agg(lambda x: x.std())

In [None]:
grouped.std()

In [None]:
tsdf = pd.DataFrame(np.random.randn(1000, 3),
                       index=pd.date_range('1/1/2000', periods=1000),
                       columns=['A', 'B', 'C'])

In [None]:
   # Blank out every other row by position. `.ix` has been removed from
   # pandas; `.iloc` is the explicit positional indexer and does the same thing.
   tsdf.iloc[::2] = np.nan

In [None]:
   grouped = tsdf.groupby(lambda x: x.year)

In [None]:
   # Forward-fill within each group. `fillna(method='pad')` is deprecated in
   # recent pandas releases; `ffill()` is the direct equivalent.
   grouped.ffill()

In [None]:
s = pd.Series([9, 8, 7, 5, 19, 1, 4.2, 3.3])

In [None]:
   g = pd.Series(list('abababab'))

In [None]:
   gb = s.groupby(g)

In [None]:
   gb.nlargest(3)

In [None]:
   gb.nsmallest(3)

In [None]:
df

In [None]:
   grouped = df.groupby('A')

In [None]:
   grouped['C'].apply(lambda x: x.describe())

In [None]:
grouped = df.groupby('A')['C']

In [None]:
def f(group):
    """Return a frame pairing each value of ``group`` with its demeaned value.

    The whole definition lives in one cell: a ``def`` header and its body
    cannot be split across cells, because each cell is compiled separately.

    Parameters
    ----------
    group : object supporting ``.mean()`` and subtraction (a Series here)

    Returns
    -------
    pd.DataFrame with columns ``'original'`` (the input) and ``'demeaned'``
    (the input minus its mean).
    """
    return pd.DataFrame({'original' : group,
                         'demeaned' : group - group.mean()})

In [None]:
grouped.apply(f)

In [None]:
def f(x):
    """Return a two-element Series holding ``x`` and ``x**2``.

    The definition is kept in a single cell (a ``def`` header and its body
    cannot be split across cells), and the squared entry is labelled
    ``'x^2'`` instead of the mistyped ``'x^s'``.
    """
    return pd.Series([x, x**2], index=['x', 'x^2'])

In [None]:
    s = pd.Series(np.random.rand(5))

In [None]:
    s

In [None]:
    s.apply(f)

In [None]:
d = pd.DataFrame({"a":["x", "y"], "b":[1,2]})

In [None]:
        def identity(df):
            """Print ``df`` and return it unchanged.

            Defined in a single cell, since a ``def`` header and its body
            cannot be split across cells. ``print(df)`` is used instead of
            the Python 2 ``print df`` statement, which is a SyntaxError on
            Python 3; with one argument the call form prints the same text
            on Python 2.
            """
            print(df)
            return df

In [None]:
        d.groupby("a").apply(identity)

In [None]:
df

In [None]:
df.groupby('A').std()

In [None]:
   data = pd.Series(np.random.randn(100))

In [None]:
   factor = pd.qcut(data, [0, .25, .5, .75, 1.])

In [None]:
   data.groupby(factor).mean()

In [None]:
import datetime

In [None]:
   df = pd.DataFrame({
            'Branch' : 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1,3,5,1,8,1,9,3],
            'Date' : [
                datetime.datetime(2013,1,1,13,0),
                datetime.datetime(2013,1,1,13,5),
                datetime.datetime(2013,10,1,20,0),
                datetime.datetime(2013,10,2,10,0),
                datetime.datetime(2013,10,1,20,0),
                datetime.datetime(2013,10,2,10,0),
                datetime.datetime(2013,12,2,12,0),
                datetime.datetime(2013,12,2,14,0),
                ]
            })

In [None]:
   df

In [None]:
df.groupby([pd.Grouper(freq='1M',key='Date'),'Buyer']).sum()

In [None]:
df = df.set_index('Date')

In [None]:
   df['Date'] = df.index + pd.offsets.MonthEnd(2)

In [None]:
   df.groupby([pd.Grouper(freq='6M',key='Date'),'Buyer']).sum()

In [None]:
   df.groupby([pd.Grouper(freq='6M',level='Date'),'Buyer']).sum()

In [None]:
df = pd.DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])

In [None]:
   df

In [None]:
   g = df.groupby('A')

In [None]:
   g.head(1)

In [None]:
   g.tail(1)

In [None]:
df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])

In [None]:
   g = df.groupby('A')

In [None]:
   g.nth(0)

In [None]:
   g.nth(-1)

In [None]:
   g.nth(1)

In [None]:
   g.nth(0, dropna='any')

In [None]:
   g.first()

In [None]:
   # nth(-1) is the same as g.last()

In [None]:
   g.nth(-1, dropna='any')  # NaNs denote group exhausted when using dropna

In [None]:
   g.last()

In [None]:
   g.B.nth(0, dropna=True)

In [None]:
df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])

In [None]:
   g = df.groupby('A',as_index=False)

In [None]:
   g.nth(0)

In [None]:
   g.nth(-1)

In [None]:
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')

In [None]:
   df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])

In [None]:
   # get the first, 4th, and last date index for each month

In [None]:
   # Pass the two grouping arrays as a list: newer pandas treats a tuple as a
   # single key rather than as a collection of keys, whereas a list is always
   # read as "group by each of these".
   df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])

In [None]:
df = pd.DataFrame(list('aaabba'), columns=['A'])

In [None]:
   df

In [None]:
   df.groupby('A').cumcount()

In [None]:
   df.groupby('A').cumcount(ascending=False)  # kwarg only

In [None]:
np.random.seed(1234)

In [None]:
   df = pd.DataFrame(np.random.randn(50, 2))

In [None]:
   df['g'] = np.random.choice(['A', 'B'], size=50)

In [None]:
   df.loc[df['g'] == 'B', 1] += 3

In [None]:
df.groupby('g').boxplot()

In [None]:
df = pd.DataFrame({'a':[1,0,0], 'b':[0,1,0], 'c':[1,0,0], 'd':[2,3,4]})

In [None]:
   df

In [None]:
   df.groupby(df.sum(), axis=1).sum()

In [None]:
df = pd.DataFrame({
            'a':  [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2],
            'b':  [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1],
            'c':  [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
            'd':  [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1],
            })

In [None]:
   def compute_metrics(x):
       """Summarise one group as a Series named ``'metrics'``.

       Defined in a single cell, since a ``def`` header and its body cannot
       be split across cells.

       Returns a Series with ``'b_sum'`` (the sum of ``x['b']``) and
       ``'c_mean'`` (the mean of ``x['c']``).
       """
       result = {'b_sum': x['b'].sum(), 'c_mean': x['c'].mean()}
       return pd.Series(result, name='metrics')

In [None]:
   result = df.groupby('a').apply(compute_metrics)

In [None]:
   result

In [None]:
   result.stack()

## Grading

YOUR ANSWER HERE