In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy frame: the labels A, B, C appear twice each, next to a 0..5 counter.
df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data': range(6),
    },
    columns=['key', 'data'],
)

In [3]:
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [4]:
# Split the rows by 'key', then total the 'data' column inside each group.
df.groupby('key').sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [5]:
# On its own, groupby() returns a DataFrameGroupBy object rather than a table of results.
df.groupby('key')

<pandas.core.groupby.DataFrameGroupBy object at 0x0000000008570A90>

In [6]:
# Keep the grouped object so that several aggregations can be run on it.
gb = df.groupby('key')

In [8]:
gb.max()   # Largest value of each column within each group (A, B and C)

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,4
C,5


In [9]:
# Number of entries in each column, counted per group.
gb.count()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,2
B,2
C,2


In [10]:
# Random generator with a fixed seed (0) so the values drawn from it are repeatable.
rng = np.random.RandomState(0)

In [11]:
rng

<mtrand.RandomState at 0x43393a8>

In [12]:
# Same keys as before, now with two value columns: data1 is the 0..5 counter
# and data2 is six integers drawn from [0, 10) by the seeded `rng`.
df = pd.DataFrame(
    {
        'key': ['A', 'B', 'C', 'A', 'B', 'C'],
        'data1': range(6),
        'data2': rng.randint(0, 10, 6),
    },
    columns=['key', 'data1', 'data2'],
)

In [13]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [14]:
# Regroup: the earlier `gb` was built from the previous, single-column frame.
gb = df.groupby('key')

In [15]:
gb

<pandas.core.groupby.DataFrameGroupBy object at 0x00000000085D0518>

In [16]:
# `median` is not a Python builtin (unlike `min` and `max`), so evaluating the
# list below raises NameError before pandas is ever called.  The error is
# caught and printed so the notebook still runs from top to bottom.
try:
    gb.aggregate([min, max, median])
except NameError as err:
    print("NameError: {}".format(err))

NameError: name 'median' is not defined

In [17]:
# aggregate() reduces every group to a single value per (column, function) pair.
# The reductions are passed by name so pandas uses its own groupby
# implementations; the resulting column labels are still min / max / median.
gb.aggregate(['min', 'max', 'median'])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,max,median,min,max,median
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,3,1.5,3,5,4.0
B,1,4,2.5,0,7,3.5
C,2,5,3.5,3,9,6.0


In [18]:
gb.aggregate({'data1':'min', 'data2':'max'})  # Per group: the minimum of data1 and the maximum of data2

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,7
C,2,9


In [19]:
# Standard deviation of each numeric column, computed separately for each group.
gb.std()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.12132,1.414214
B,2.12132,4.949747
C,2.12132,4.242641


# Filtering 


# x is the DataFrame holding the rows of one group, passed to this function
# filter takes a single function argument, which must return True or False for each group
# When it returns True the group's rows are retained, otherwise they are dropped

In [20]:
# Keep only the groups whose data2 standard deviation exceeds 4;
# every row of a group that fails the test is dropped from the result.
gb.filter(lambda x: x['data2'].std() > 4)

Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


# Transformation

In [21]:
## transform returns a result with one row per input row, not one row per group

gb.transform(lambda x : x + x.mean())  ## Each value plus the mean of its own group, e.g. rows 0 and 3 use the mean of group A

Unnamed: 0,data1,data2
0,1.5,9.0
1,3.5,3.5
2,5.5,9.0
3,4.5,7.0
4,6.5,10.5
5,8.5,15.0


In [22]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


# Apply
        

In [24]:
def f(x):
    """Shift a group's ``data1`` column by that group's ``data2`` total.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of one group, as handed over by ``groupby(...).apply``.
        Must contain ``data1`` and ``data2`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``x`` in which every ``data1``
        value has been increased by ``x['data2'].sum()``.  ``x`` itself is
        left unchanged.
    """
    # assign() builds a new frame instead of writing into the object pandas
    # passed in; mutating that object inside apply() is not supported.
    return x.assign(data1=x['data1'] + x['data2'].sum())

# Call f once per group and combine the per-group results into one frame.
gb.apply(f)

Unnamed: 0,key,data1,data2
0,A,8,5
1,B,8,0
2,C,14,3
3,A,11,3
4,B,11,7
5,C,17,9


In [25]:
def f(d):
    """Add a ``new`` column: ``data1`` plus the group's ``data2`` total.

    Parameters
    ----------
    d : pandas.DataFrame
        The rows of one group, as handed over by ``groupby(...).apply``.
        Must contain ``data1`` and ``data2`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns of ``d`` followed by ``new``, where
        ``new = d['data1'] + d['data2'].sum()``.  ``d`` itself is left
        unchanged.
    """
    # assign() returns a new frame, so the group object supplied by pandas
    # is never modified in place.
    return d.assign(new=d['data1'] + d['data2'].sum())
    
# Call f once per group and combine the per-group results into one frame.
gb.apply(f)

Unnamed: 0,key,data1,data2,new
0,A,0,5,8
1,B,1,0,8
2,C,2,3,14
3,A,3,3,11
4,B,4,7,11
5,C,5,9,17


# Specifying The split Key

In [26]:
# The 'key' column as a standalone Series, to be used as a grouping key below.
d = df['key']

In [27]:
d

0    A
1    B
2    C
3    A
4    B
5    C
Name: key, dtype: object

In [28]:
# Group by a Series instead of a column name.  d[:4] only carries labels for
# rows 0-3, so rows 4 and 5 belong to no group and are left out:
# A = rows 0 and 3, B = row 1, C = row 2.  The numeric columns are selected
# explicitly so that the string column 'key' is never summed along with them.
df.groupby(d[:4])[['data1', 'data2']].sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3,8
B,1,0
C,2,3


In [29]:
df

Unnamed: 0,key,data1,data2
0,A,0,5
1,B,1,0
2,C,2,3
3,A,3,3
4,B,4,7
5,C,5,9


In [30]:
# Move 'key' from a column into the index so rows can be grouped by their index label.
df2 = df.set_index('key')

In [31]:
df2

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9


In [32]:
# Index label -> group name; labels that share a name end up in the same group.
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}

In [33]:
mapping

{'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}

In [34]:
# A dict passed to groupby maps index labels to groups: rows labelled B and C
# are summed together as 'consonant', rows labelled A as 'vowel'.
df2.groupby(mapping).sum()

Unnamed: 0,data1,data2
consonant,12,19
vowel,3,8


In [35]:
df2

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,5
B,1,0
C,2,3
A,3,3
B,4,7
C,5,9
