In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

from numpy.random import randn
from numpy.random import randint

# Summary

- Group by on dataframes
- Group by on dict and dataframes
- Aggregation
- Splitting applying and combining
- Cross Tabulation

# Development

### Group by on dataframes

In [2]:
# Build a small demo frame: two string key columns (k1, k2) and two numeric
# columns filled with standard-normal draws. The draws are unseeded, so the
# numbers differ on every run.
dframe = DataFrame({'k1': ['X', 'X', 'Y', 'Y', 'Z'],
                    'k2': ['alpha', 'beta', 'alpha', 'beta', 'alpha'],
                    'dataset1': np.random.randn(5),
                    'dataset2': np.random.randn(5)})

# Show
print(dframe)

# GROUP BY on ONE column: group the dataset1 Series by the values of k1
group1 = dframe['dataset1'].groupby(dframe['k1'])
print(group1.mean())

# Variant (disabled): select only dataset2 after grouping by both keys
# dataset2_group = dframe.groupby(['k1','k2'])[['dataset2']]
# dataset2_group.mean()

# GROUP BY ONE key
# k2 holds strings and is not a grouping key here, so it has to be excluded
# from the mean explicitly; without numeric_only=True recent pandas versions
# raise a TypeError instead of silently dropping the column.
print(dframe.groupby('k1').mean(numeric_only=True))
print(dframe.groupby(['k1']).size())

# GROUP BY MORE keys: both string columns are keys, only numeric columns remain
print(dframe.groupby(['k1', 'k2']).mean())
print(dframe.groupby(['k1', 'k2']).size())


  k1     k2  dataset1  dataset2
0  X  alpha  1.001349 -1.940200
1  X   beta  0.407667  0.749409
2  Y  alpha -0.756223 -0.677726
3  Y   beta -0.899028 -0.342027
4  Z  alpha -0.130698  0.564096
k1
X    0.704508
Y   -0.827626
Z   -0.130698
Name: dataset1, dtype: float64
    dataset1  dataset2
k1                    
X   0.704508 -0.595396
Y  -0.827626 -0.509876
Z  -0.130698  0.564096
k1
X    2
Y    2
Z    1
dtype: int64
          dataset1  dataset2
k1 k2                       
X  alpha  1.001349 -1.940200
   beta   0.407667  0.749409
Y  alpha -0.756223 -0.677726
   beta  -0.899028 -0.342027
Z  alpha -0.130698  0.564096
k1  k2   
X   alpha    1
    beta     1
Y   alpha    1
    beta     1
Z   alpha    1
dtype: int64


In [9]:
# A GroupBy object can be looped over directly: every step yields the group
# key together with the sub-frame holding the rows for that key.
groups_by_k1 = dframe.groupby('k1')

for label, members in groups_by_k1:
    print("This is the %s group" % label)
    print(members)
    print('\n')

This is the X group
  k1     k2  dataset1  dataset2
0  X  alpha -0.703745 -0.343500
1  X   beta  0.082883 -1.734687


This is the Y group
  k1     k2  dataset1  dataset2
2  Y  alpha   0.87620 -1.255499
3  Y   beta   0.35628  0.266643


This is the Z group
  k1     k2  dataset1  dataset2
4  Z  alpha  0.023806 -0.481867




In [4]:
# Handy trick: turn the (key, sub-frame) pairs produced by the groupby into a
# plain dictionary so each piece can be looked up by its key.
group_dict = {key: frame for key, frame in dframe.groupby('k1')}

print(group_dict)

# Show the pieces stored under the 'X' and 'Y' keys
print(group_dict['X'])
print(group_dict['Y'])

{'X':   k1     k2  dataset1  dataset2
0  X  alpha  1.001349 -1.940200
1  X   beta  0.407667  0.749409, 'Y':   k1     k2  dataset1  dataset2
2  Y  alpha -0.756223 -0.677726
3  Y   beta -0.899028 -0.342027, 'Z':   k1     k2  dataset1  dataset2
4  Z  alpha -0.130698  0.564096}
  k1     k2  dataset1  dataset2
0  X  alpha  1.001349 -1.940200
1  X   beta  0.407667  0.749409
  k1     k2  dataset1  dataset2
2  Y  alpha -0.756223 -0.677726
3  Y   beta -0.899028 -0.342027


### Group by on dict and dataframes

In [10]:
# Build a 4x4 frame of the integers 0..15, indexed by animal name.
animals = DataFrame(np.arange(16).reshape(4, 4),
                   columns=['W', 'X', 'Y', 'Z'],
                   index=['Dog', 'Cat', 'Bird', 'Mouse'])

# Now let's add some NaN values.
# NaN cannot be stored in an integer column. Writing it straight into the
# int columns relies on silent upcasting, which newer pandas versions
# deprecate, so the two affected columns are cast to float explicitly first.
nan_cols = ['W', 'Y']
animals = animals.astype({col: float for col in nan_cols})
animals.loc['Cat', nan_cols] = np.nan

# Show
animals

Unnamed: 0,W,X,Y,Z
Dog,0.0,1,2.0,3
Cat,,5,,7
Bird,8.0,9,10.0,11
Mouse,12.0,13,14.0,15


In [9]:
# Disabled experiment: overwrite two cells by position with a string.
# Row 1 is 'Cat'; columns 0 and 2 are 'W' and 'Y'.
# NOTE(review): left commented out, presumably so W and Y stay numeric for the
# sum()/count() aggregations further down - confirm before re-enabling.
# animals.iloc[1,[0,2]] = 'samurai'
# animals

Unnamed: 0,W,X,Y,Z
Dog,0,1,2,3
Cat,samurai,5,samurai,7
Bird,8,9,10,11
Mouse,12,13,14,15


In [43]:
# Disabled experiment: set Cat's W and Y cells to the number 1000 by label.
# animals.loc['Cat',['W','Y']] = 1000

In [13]:
# Group the COLUMNS of `animals` by a label looked up in a dictionary, then
# sum / count the values that fall under each label ('bad' / 'good') per animal.

# Dictionary mapping each column name to a behavior label
behavior_map = {'W': 'good', 'X': 'bad', 'Y': 'good', 'Z': 'bad'}

# Grouping columns via groupby(..., axis=1) is deprecated in recent pandas.
# The supported equivalent is to transpose so the columns become the index,
# group that index with the mapping, and transpose each result back.
animal_col = animals.T.groupby(behavior_map)

# Show the sum and the non-NaN count according to the mapping
print(animal_col.sum().T)
print(animal_col.count().T)
# For example [Dog][good] = [Dog][W] + [Dog][Y]


        bad  good
Dog     4.0   2.0
Cat    12.0   0.0
Bird   20.0  18.0
Mouse  28.0  26.0
       bad  good
Dog      2     2
Cat      2     0
Bird     2     2
Mouse    2     2


In [15]:
# A function passed as the grouping key is called on every index label; here
# rows whose animal names have the same number of characters are summed together.
name_length_totals = animals.groupby(by=len).sum()
name_length_totals

Unnamed: 0,W,X,Y,Z
3,0.0,6,2.0,10
4,8.0,9,10.0,11
5,12.0,13,14.0,15


In [None]:
# Set-up for grouping on the levels of a hierarchical (MultiIndex) column index.

# Two-level column index: outer level 'City', inner level 'sub_value'
city_level = ['NY', 'NY', 'NY', 'SF', 'SF']
sub_value_level = [1, 2, 3, 1, 2]
hier_col = pd.MultiIndex.from_arrays([city_level, sub_value_level],
                                     names=['City', 'sub_value'])

# 5x5 frame of the integers 0..24, scaled by 100 so the values are easier to read
dframe_hr = DataFrame(np.arange(25).reshape(5, 5) * 100, columns=hier_col)

# Show
dframe_hr

### Aggregation

### Splitting applying and combining

### Cross Tabulation

# SUMMARY - RECAP

- Group by on dataframes
- Group by on dict and dataframes
- Aggregation
- Splitting applying and combining
- Cross Tabulation