In [1]:
#some generic imports
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
planets = sns.load_dataset('planets')        

In [4]:
planets.sample(5)

Unnamed: 0,method,number,orbital_period,mass,distance,year
410,Radial Velocity,1,4.113775,0.45,28.98,2006
955,Transit,1,3.553945,,492.0,2007
612,Radial Velocity,1,3724.7,1.45,48.43,2011
207,Radial Velocity,1,62.218,0.229,11.11,2003
283,Radial Velocity,1,2819.654,9.17,54.71,2002


Ways to use **`df.groupby`** - 
```python
df.groupby(['list', 'of', 'grouping', 'columns'])
df.groupby('single_column') # when grouping by a single column
```
Also - 
```python
df.groupby(['grouping', 'columns']).agg({'agg_cols1':['list', 'of', 'functions'],'agg_cols2':['other', 'functions']})

df.groupby(['grouping', 'columns'])[['aggregating', 'columns']].agg([aggregating, functions])

df.groupby(['grouping', 'columns'])[['aggregating', 'columns']].aggregating_method()

#If aggregating columns not specified, then the aggregating method will be applied to all the non-grouping columns -
df.groupby(['grouping', 'columns']).aggregating_method()

```

The groupby object has four methods that accept a function (or functions) to perform a
calculation on each group. These four methods are `agg`, `filter`, `transform`, and `apply`.
Each of the first three of these methods has a very specific output that the function must
return. `agg` must return a scalar value, `filter` must return a boolean, and `transform`
must return a Series with the same length as the passed group. The `apply` method,
however, may return a scalar value, a Series, or even a DataFrame of any shape, therefore
making it very flexible. It is also called only once per group, which contrasts with
`transform` and `agg` that get called once for each non-grouping column. The `apply`
method's ability to return a single object when operating on multiple columns at the same
time makes the calculation in some recipes possible.

---
## Aggregation

**`.agg`** method is same as **`.aggregate`** method

**API signature**

`DataFrameGroupBy.agg(func, *args, **kwargs)`

Note that all the ways of aggregation mentioned below compute the same result (the dictionary form returns a DataFrame instead of a Series) -

 - `.agg('mean')`
 - `.aggregate('mean')`
 - `.aggregate(np.mean)`
 - `.mean()`
 - `.agg({'column': 'function'})`

See below for examples.

In [5]:
planets.groupby('method')['orbital_period'].agg('mean')

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [5]:
planets.groupby('method')['orbital_period'].aggregate('mean')

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [6]:
planets.groupby('method')['orbital_period'].mean()

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [7]:
planets.groupby('method')['orbital_period'].agg(np.mean)

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [8]:
planets.groupby('method').agg({'orbital_period': 'mean'}) #returns a dataframe

Unnamed: 0_level_0,orbital_period
method,Unnamed: 1_level_1
Astrometry,631.18
Eclipse Timing Variations,4751.644444
Imaging,118247.7375
Microlensing,3153.571429
Orbital Brightness Modulation,0.709307
Pulsar Timing,7343.021201
Pulsation Timing Variations,1170.0
Radial Velocity,823.35468
Transit,21.102073
Transit Timing Variations,79.7835


In [9]:
planets.groupby('method').agg({'orbital_period':'mean', 'number':'max'}) #different agg functions for different columns

Unnamed: 0_level_0,orbital_period,number
method,Unnamed: 1_level_1,Unnamed: 2_level_1
Astrometry,631.18,1
Eclipse Timing Variations,4751.644444,2
Imaging,118247.7375,4
Microlensing,3153.571429,2
Orbital Brightness Modulation,0.709307,2
Pulsar Timing,7343.021201,3
Pulsation Timing Variations,1170.0,1
Radial Velocity,823.35468,6
Transit,21.102073,7
Transit Timing Variations,79.7835,3


In [10]:
# Several columns are selected with a list (double brackets); indexing a GroupBy
# with a bare tuple of column names is deprecated and rejected by current pandas.
planets.groupby('method')[['orbital_period', 'number']].agg([np.mean, np.max]) #multiple agg functions for multiple columns

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,orbital_period,orbital_period,number,number
Unnamed: 0_level_1,mean,amax,mean,amax
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Astrometry,631.18,1016.0,1.0,1
Eclipse Timing Variations,4751.644444,10220.0,1.666667,2
Imaging,118247.7375,730000.0,1.315789,4
Microlensing,3153.571429,5100.0,1.173913,2
Orbital Brightness Modulation,0.709307,1.544929,1.666667,2
Pulsar Timing,7343.021201,36525.0,2.2,3
Pulsation Timing Variations,1170.0,1170.0,1.0,1
Radial Velocity,823.35468,17337.5,1.721519,6
Transit,21.102073,331.60059,1.95466,7
Transit Timing Variations,79.7835,160.0,2.25,3


The following table summarizes some other built-in Pandas aggregations:

| Aggregation              | Description                     |
|--------------------------|---------------------------------|
| ``count()``              | Total number of items           |
| ``first()``, ``last()``  | First and last item             |
| ``mean()``, ``median()`` | Mean and median                 |
| ``min()``, ``max()``     | Minimum and maximum             |
| ``std()``, ``var()``     | Standard deviation and variance |
| ``mad()``                | Mean absolute deviation         |
| ``prod()``               | Product of all items            |
| ``sum()``                | Sum of all items                |

These are all methods of ``DataFrame`` and ``Series`` objects.



In [11]:
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))

Astrometry                     shape=(2, 6)
Eclipse Timing Variations      shape=(9, 6)
Imaging                        shape=(38, 6)
Microlensing                   shape=(23, 6)
Orbital Brightness Modulation  shape=(3, 6)
Pulsar Timing                  shape=(5, 6)
Pulsation Timing Variations    shape=(1, 6)
Radial Velocity                shape=(553, 6)
Transit                        shape=(397, 6)
Transit Timing Variations      shape=(4, 6)


In [12]:
a = planets.groupby('method')
list(a)

[('Astrometry',
           method  number  orbital_period  mass  distance  year
  113  Astrometry       1          246.36   NaN     20.77  2013
  537  Astrometry       1         1016.00   NaN     14.98  2010),
 ('Eclipse Timing Variations',
                         method  number  orbital_period  mass  distance  year
  32  Eclipse Timing Variations       1        10220.00  6.05       NaN  2009
  37  Eclipse Timing Variations       2         5767.00   NaN    130.72  2008
  38  Eclipse Timing Variations       2         3321.00   NaN    130.72  2008
  39  Eclipse Timing Variations       2         5573.55   NaN    500.00  2010
  40  Eclipse Timing Variations       2         2883.50   NaN    500.00  2010
  41  Eclipse Timing Variations       1         2900.00   NaN       NaN  2011
  42  Eclipse Timing Variations       1         4343.50  4.20       NaN  2012
  43  Eclipse Timing Variations       2         5840.00   NaN       NaN  2011
  44  Eclipse Timing Variations       2         1916.25  

In [13]:
planets.groupby('method')['year'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,2011.5,2.12132,2010.0,2010.75,2011.5,2012.25,2013.0
Eclipse Timing Variations,9.0,2010.0,1.414214,2008.0,2009.0,2010.0,2011.0,2012.0
Imaging,38.0,2009.131579,2.781901,2004.0,2008.0,2009.0,2011.0,2013.0
Microlensing,23.0,2009.782609,2.859697,2004.0,2008.0,2010.0,2012.0,2013.0
Orbital Brightness Modulation,3.0,2011.666667,1.154701,2011.0,2011.0,2011.0,2012.0,2013.0
Pulsar Timing,5.0,1998.4,8.38451,1992.0,1992.0,1994.0,2003.0,2011.0
Pulsation Timing Variations,1.0,2007.0,,2007.0,2007.0,2007.0,2007.0,2007.0
Radial Velocity,553.0,2007.518987,4.249052,1989.0,2005.0,2009.0,2011.0,2014.0
Transit,397.0,2011.236776,2.077867,2002.0,2010.0,2012.0,2013.0,2014.0
Transit Timing Variations,4.0,2012.5,1.290994,2011.0,2011.75,2012.5,2013.25,2014.0


In [13]:
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)})
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


##### Specifying the split key

So far, we grouped the data based on values of a given column. But we can be more flexible.

In [14]:
L = [0, 1, 0, 1, 2, 0]
df.groupby(L).sum()

#It is to be noted that length of L must be equal to that of DataFrame.

Unnamed: 0,data1,data2
0,7,17
1,4,3
2,4,7


In [15]:
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df2.groupby(mapping).sum()

Unnamed: 0,data1,data2
consonant,12,19
vowel,3,8


In [16]:
df2.groupby(str.lower).mean()

Unnamed: 0,data1,data2
a,1.5,4.0
b,2.5,3.5
c,3.5,6.0


#### Removing the MultiIndex after grouping

Inevitably, when using `groupby`, you will likely create a MultiIndex in the columns or rows
or both. DataFrames with MultiIndexes are more difficult to navigate and occasionally have
confusing column names as well.

In [17]:
# Employees is written out literally (the values shown in the output below) instead of
# being drawn from the shared `rng`, so re-running this cell always rebuilds the same
# frame regardless of how many numbers the generator has already produced.
data = pd.DataFrame({'Dept': ['Accounting', 'HR', 'Project', 'Project', 'HR', 'Accounting'],
                   'Location': ['Delhi','Mumbai', 'Mumbai', 'Delhi', 'Banglore','Banglore'],
                   'Employees': [3, 5, 2, 4, 7, 6], 'Clients': [2,4,3,5,3,2]})
data

Unnamed: 0,Dept,Location,Employees,Clients
0,Accounting,Delhi,3,2
1,HR,Mumbai,5,4
2,Project,Mumbai,2,3
3,Project,Delhi,4,5
4,HR,Banglore,7,3
5,Accounting,Banglore,6,2


In [18]:
# Aggregations are named by string so pandas dispatches to its own grouped min/max
# (passing the Python builtins is deprecated); the resulting column labels
# ('min', 'max') are the same as before.
grouped = data.groupby(['Dept','Location']).agg({'Employees':['sum', 'mean'], 'Clients':['min', 'max']}).astype(int)
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Employees,Employees,Clients,Clients
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
Dept,Location,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Accounting,Banglore,6,6,2,2
Accounting,Delhi,3,3,2,2
HR,Banglore,7,7,3,3
HR,Mumbai,5,5,4,4
Project,Delhi,4,4,5,5
Project,Mumbai,2,2,3,3


In [19]:
level0 = grouped.columns.get_level_values(0)
level0

Index(['Employees', 'Employees', 'Clients', 'Clients'], dtype='object')

In [20]:
level1 = grouped.columns.get_level_values(1)
level1

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [21]:
grouped.columns = level0 + '_' + level1

In [22]:
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Employees_sum,Employees_mean,Clients_min,Clients_max
Dept,Location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Accounting,Banglore,6,6,2,2
Accounting,Delhi,3,3,2,2
HR,Banglore,7,7,3,3
HR,Mumbai,5,5,4,4
Project,Delhi,4,4,5,5
Project,Mumbai,2,2,3,3


In [23]:
grouped.reset_index()

Unnamed: 0,Dept,Location,Employees_sum,Employees_mean,Clients_min,Clients_max
0,Accounting,Banglore,6,6,2,2
1,Accounting,Delhi,3,3,2,2
2,HR,Banglore,7,7,3,3
3,HR,Mumbai,5,5,4,4
4,Project,Delhi,4,4,5,5
5,Project,Mumbai,2,2,3,3


When using the `agg` method to perform an aggregation on multiple columns, pandas
creates an index object with two levels. The aggregating columns become the top level and
the aggregating functions become the bottom level. Pandas displays MultiIndex levels
differently than single-level columns. Except for the innermost levels, repeated index
values do not get displayed on the screen. You can inspect the `grouped` DataFrame above (before its columns were flattened) to
verify this. For instance, the `Employees` column shows up only once but it refers to both of the
first two columns.

The innermost MultiIndex level is the one closest to the data. This would
be the bottom-most column level and the right-most index level.

#### Preventing grouping columns from becoming the index

In the `groupby` method, set `as_index` to `False`.

In [24]:
# as_index=False keeps 'Dept' and 'Location' as ordinary columns; min/max are given
# as string aliases because passing the Python builtins to agg is deprecated.
data.groupby(['Dept','Location'], as_index = False).agg({'Employees':['sum', 'mean'], 'Clients':['min', 'max']})


Unnamed: 0_level_0,Dept,Location,Employees,Employees,Clients,Clients
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,min,max
0,Accounting,Banglore,6,6,2,2
1,Accounting,Delhi,3,3,2,2
2,HR,Banglore,7,7,3,3
3,HR,Mumbai,5,5,4,4
4,Project,Delhi,4,4,5,5
5,Project,Mumbai,2,2,3,3


#### Customized aggregating function

In [25]:
data

Unnamed: 0,Dept,Location,Employees,Clients
0,Accounting,Delhi,3,2
1,HR,Mumbai,5,4
2,Project,Mumbai,2,3
3,Project,Delhi,4,5
4,HR,Banglore,7,3
5,Accounting,Banglore,6,2


In [26]:
def norm(s):
    """Return the largest min-max scaled value of the Series `s`.

    Every value is shifted by the minimum of `s` and divided by its range
    (max - min); the maximum of those scaled values is returned as a scalar.
    """
    lowest = s.min()
    spread = s.max() - lowest
    scaled = (s - lowest) / spread
    return scaled.max()

data.groupby('Dept')['Employees'].agg(norm)

Dept
Accounting    1
HR            1
Project       1
Name: Employees, dtype: int64

Notice that this custom function `norm` accepts a single parameter, `s`. In the cell above, the function name is passed to the `agg` method without being called. Nowhere is the parameter `s` explicitly
passed to `norm`. Instead, pandas implicitly passes the `Employees` column of each group as a Series to `norm`.
The `norm` function is called once for each group. As `s` is a Series, all normal `Series` methods are available.

In [27]:
# A custom callable can be mixed with built-in aggregations; the built-ins are named
# by string because passing the Python builtins min/max to agg is deprecated.
data.groupby('Dept').agg({'Employees': [norm, 'min'], 'Clients': 'max'})

Unnamed: 0_level_0,Employees,Employees,Clients
Unnamed: 0_level_1,norm,min,max
Dept,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Accounting,1,3,2
HR,1,5,4
Project,1,2,5


Notice the name of column with `norm` values. We can change the column name as shown below -

In [28]:
norm.__name__

'norm'

In [29]:
# agg labels the output column of a callable with the function's __name__, so
# changing that attribute changes the column label in the result.
norm.__name__ = 'Normalized'
# Built-in aggregations are given as strings; passing the builtins min/max is deprecated.
data.groupby('Dept').agg({'Employees': [norm, 'min'], 'Clients': 'max'})


Unnamed: 0_level_0,Employees,Employees,Clients
Unnamed: 0_level_1,Normalized,min,max
Dept,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Accounting,1,3,2
HR,1,5,4
Project,1,2,5


#### Customized aggregating function with arguments

In [30]:
#mean of n largest number
def max_n_mean(s,n):
    """Return the mean of the `n` largest values in the Series `s`.

    Parameters
    ----------
    s : pandas.Series
        Numeric values to summarise.
    n : int
        How many of the largest values to average.

    Returns
    -------
    float
        Mean of the top `n` values (of all values when `s` holds fewer than `n`).
    """
    # nlargest selects by value, so the result does not depend on how a plain
    # `[:n]` slice would be interpreted for the Series' index type, and no full
    # sort of the group is needed.
    return s.nlargest(n).mean()

s = pd.Series([2,4,3,5,6])
max_n_mean(s,3)

5.0

In [31]:
data.groupby('Dept')['Clients'].agg(max_n_mean,2) 

Dept
Accounting    2.0
HR            3.5
Project       4.0
Name: Clients, dtype: float64

See how the aggregating function `max_n_mean`, along with its parameter value, has been supplied to the `agg` method.

#### Examining the groupby object

In [32]:
%pprint
group = data.groupby('Dept')
[attr for attr in dir(group) if '_' not in attr]

Pretty printing has been turned OFF


['Clients', 'Dept', 'Employees', 'Location', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']

In [33]:
group.ngroups

3

In [34]:
group.size()

Dept
Accounting    2
HR            2
Project       2
dtype: int64

In [35]:
list(group.groups.keys())

['Accounting', 'HR', 'Project']

#### See below for a nice trick

In [36]:
group.get_group('HR')

Unnamed: 0,Dept,Location,Employees,Clients
1,HR,Mumbai,5,4
4,HR,Banglore,7,3


In [37]:
data

Unnamed: 0,Dept,Location,Employees,Clients
0,Accounting,Delhi,3,2
1,HR,Mumbai,5,4
2,Project,Mumbai,2,3
3,Project,Delhi,4,5
4,HR,Banglore,7,3
5,Accounting,Banglore,6,2


In [38]:
from IPython.display import display

# Iterating a GroupBy yields (key, sub-DataFrame) pairs. The loop variable is named
# `frame` rather than `group`: reusing `group` would rebind the GroupBy object to the
# last sub-DataFrame and break any later (or repeated) use of `group`.
for name, frame in group:
    print(name)
    display(frame)

Accounting


Unnamed: 0,Dept,Location,Employees,Clients
0,Accounting,Delhi,3,2
5,Accounting,Banglore,6,2


HR


Unnamed: 0,Dept,Location,Employees,Clients
1,HR,Mumbai,5,4
4,HR,Banglore,7,3


Project


Unnamed: 0,Dept,Location,Employees,Clients
2,Project,Mumbai,2,3
3,Project,Delhi,4,5


---

## Filtering

It is possible to mark entire groups of data as either `True` or `False` before filtering out the `False` groups. To do this, we first form groups with the `groupby` method and then apply the `filter` method. The `filter` method
accepts a function that must return either `True` or `False` to indicate whether a group is kept or not.

**Method signature**

`DataFrameGroupBy.filter(func, dropna=True, *args, **kwargs)`

func - function to apply to each subframe. Should return boolean

dropna - drop groups that do not pass the filter; `True` by default. If `False`, groups that evaluate to `False` are kept but filled with NaNs.

##### Basic examples

In [39]:
df = pd.DataFrame({'A':['foo','bar']*3, 'B': range(1,7), 'C':[2,5,8,1,2,9]})
df

Unnamed: 0,A,B,C
0,foo,1,2
1,bar,2,5
2,foo,3,8
3,bar,4,1
4,foo,5,2
5,bar,6,9


In [40]:
df.groupby('A').filter(lambda x: x['B'].mean() > 3)

Unnamed: 0,A,B,C
1,bar,2,5
3,bar,4,1
5,bar,6,9


A very important aspect to filter is that it passes the entire DataFrame for that particular
group to the user-defined function and returns a single boolean for each group.

## Transformation

In [41]:
df

Unnamed: 0,A,B,C
0,foo,1,2
1,bar,2,5
2,foo,3,8
3,bar,4,1
4,foo,5,2
5,bar,6,9


In [42]:
df.groupby('A')['B'].transform(lambda x: x-x.mean()) #x is a sub-group

0   -2
1   -2
2    0
3    0
4    2
5    2
Name: B, dtype: int64

In [43]:
df.groupby('A').transform(lambda x: x-x.mean()) #x is a sub-group

Unnamed: 0,B,C
0,-2.0,-2.0
1,-2.0,0.0
2,0.0,4.0
3,0.0,-4.0
4,2.0,-2.0
5,2.0,4.0


## Apply

The function passed to `apply` must take a dataframe as its first argument and return a dataframe, a series or a scalar. `apply` will then take care of combining the results back together into a single dataframe or series. 

In [44]:
df = pd.DataFrame({'A': 'a a b'.split(), 'B': range(1,4), 'C':[4,6,5]})
df

Unnamed: 0,A,B,C
0,a,1,4
1,a,2,6
2,b,3,5


Below `apply` returns a dataframe. `apply` combines the result for each group together into a new dataframe.

In [45]:
g = df.groupby('A')

# Each group's sub-frame is divided by its own column sums, so the values become
# shares of the group total.
# NOTE(review): recent pandas versions may also hand the string column 'A' to the
# lambda, where the division would fail — confirm against the installed version and
# select g[['B', 'C']] first if so.
g.apply(lambda x: x/x.sum())

Unnamed: 0,B,C
0,0.333333,0.4
1,0.666667,0.6
2,1.0,1.0


Below `apply` returns a series. `apply` combines the result for each group together into a new dataframe.

In [46]:
g.apply(lambda x: x.max()-x.min())

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,2
b,0,0


Below `apply` returns a scalar. `apply` combines the result for each group together into a new series, including setting the index as appropriate.

In [47]:
g.apply(lambda x: x.C.max() - x.B.min())

A
a    5
b    2
dtype: int64

In [48]:
df

Unnamed: 0,A,B,C
0,a,1,4
1,a,2,6
2,b,3,5


In [49]:
df

Unnamed: 0,A,B,C
0,a,1,4
1,a,2,6
2,b,3,5


In [50]:
def norm_by_data2(x):
    """Scale column 'B' of one group by that group's total of column 'C'.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group; must contain the columns 'B' and 'C'.

    Returns
    -------
    pandas.DataFrame
        A new frame in which 'B' is replaced by B / sum(C). The frame passed in
        is left unchanged.
    """
    # assign() builds a copy, so the group object handed over by groupby.apply is
    # not mutated in place (pandas documents mutating inside a UDF as unsupported).
    return x.assign(B=x['B'] / x['C'].sum())

df.groupby('A').apply(norm_by_data2)

Unnamed: 0,A,B,C
0,a,0.1,4
1,a,0.2,6
2,b,0.6,5


### Grouping by continuous variables
When grouping in pandas, you typically use columns with discrete repeating values. If
there are no repeated values, then grouping would be pointless as there would only be one
row per group. Continuous numeric columns typically have few repeated values and are
generally not used to form groups. However, if we can transform columns with continuous
values into a discrete column by placing each value into a bin, rounding them, or using
some other mapping, then grouping with them makes sense.

In [51]:
np.random.seed(123)
a = np.random.randint(1,40,100)
a

array([ 3, 29, 35, 39, 18, 20, 23, 34, 33, 10, 33, 33, 26, 20, 15, 37, 33,
       17,  5,  4,  3, 21,  3, 21,  8, 36, 29, 39, 34, 22, 31, 28, 35, 34,
       13,  4,  6,  1, 12, 35, 11, 23, 14, 19, 37, 16, 28, 31,  7, 27, 17,
        7, 15, 12,  8,  2, 38, 26, 21, 13, 19, 18,  2, 28, 23,  4,  4, 12,
       22, 26, 35,  4, 12,  4, 31,  7, 10, 24, 15, 39, 20,  7, 13, 28, 39,
       18, 11, 36, 36,  2, 17,  6, 23, 16, 26,  1, 36, 30,  2, 20])

In [52]:
foo = pd.DataFrame({'A': a, 'B': np.random.randn(100)})
foo.head()

Unnamed: 0,A,B
0,3,0.975424
1,29,-0.376174
2,35,-0.780835
3,39,1.541836
4,18,-0.593199


In [53]:
bins = [0,10,20,30,40]
cuts =pd.cut(foo['A'], bins = bins)
cuts.head()

0     (0, 10]
1    (20, 30]
2    (30, 40]
3    (30, 40]
4    (10, 20]
Name: A, dtype: category
Categories (4, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40]]

In [54]:
cuts.value_counts()

(10, 20]    27
(0, 10]     26
(30, 40]    25
(20, 30]    22
Name: A, dtype: int64

In [55]:
foo.groupby(cuts).count()

Unnamed: 0_level_0,A,B
A,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 10]",26,26
"(10, 20]",27,27
"(20, 30]",22,22
"(30, 40]",25,25


In [56]:
foo.groupby(cuts)['B'].quantile(q = [.25,.5,.75])

A             
(0, 10]   0.25   -0.976957
          0.50   -0.141458
          0.75    0.903343
(10, 20]  0.25   -0.546554
          0.50    0.495855
          0.75    0.899025
(20, 30]  0.25   -0.487680
          0.50    0.157491
          0.75    1.124305
(30, 40]  0.25   -0.780835
          0.50   -0.206624
          0.75    0.517492
Name: B, dtype: float64

In [57]:
data

Unnamed: 0,Dept,Location,Employees,Clients
0,Accounting,Delhi,3,2
1,HR,Mumbai,5,4
2,Project,Mumbai,2,3
3,Project,Delhi,4,5
4,HR,Banglore,7,3
5,Accounting,Banglore,6,2


In [58]:
df

Unnamed: 0,A,B,C
0,a,1,4
1,a,2,6
2,b,3,5


In [59]:
df.groupby('A').size()

A
a    2
b    1
dtype: int64

In [62]:
data.groupby('Dept').size()

Dept
Accounting    2
HR            2
Project       2
dtype: int64