# Split &bull; Apply &bull; Combine 
 - First the dataframe is split into groups based upon one or more keys. This split can be along either axis (axis=0:rows, or axis=1:columns).
 - Then a function is applied to each group producing a new value
 - Finally the results of these operations are combined into a result object


In [33]:
import numpy as np 
import pandas as pd 

# Load the US census dataset from CSV
df = pd.read_csv('../resources/week-3/datasets/census.csv')

# Keep only the rows whose SUMLEV value is greater than 40
above_40 = df['SUMLEV'] > 40
df = df[above_40]
# Show the column labels of the filtered frame
df.columns


Index(['SUMLEV', 'REGION', 'DIVISION', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME',
       'CENSUS2010POP', 'ESTIMATESBASE2010', 'POPESTIMATE2010',
       'POPESTIMATE2011', 'POPESTIMATE2012', 'POPESTIMATE2013',
       'POPESTIMATE2014', 'POPESTIMATE2015', 'NPOPCHG_2010', 'NPOPCHG_2011',
       'NPOPCHG_2012', 'NPOPCHG_2013', 'NPOPCHG_2014', 'NPOPCHG_2015',
       'BIRTHS2010', 'BIRTHS2011', 'BIRTHS2012', 'BIRTHS2013', 'BIRTHS2014',
       'BIRTHS2015', 'DEATHS2010', 'DEATHS2011', 'DEATHS2012', 'DEATHS2013',
       'DEATHS2014', 'DEATHS2015', 'NATURALINC2010', 'NATURALINC2011',
       'NATURALINC2012', 'NATURALINC2013', 'NATURALINC2014', 'NATURALINC2015',
       'INTERNATIONALMIG2010', 'INTERNATIONALMIG2011', 'INTERNATIONALMIG2012',
       'INTERNATIONALMIG2013', 'INTERNATIONALMIG2014', 'INTERNATIONALMIG2015',
       'DOMESTICMIG2010', 'DOMESTICMIG2011', 'DOMESTICMIG2012',
       'DOMESTICMIG2013', 'DOMESTICMIG2014', 'DOMESTICMIG2015', 'NETMIG2010',
       'NETMIG2011', 'NETMIG2012', 'NETMI

to get the average estimated population (POPESTIMATE2010) by state(STNAME), first:
### Split

In [34]:
# Split: bucket the 2010 population estimates by state name.
state_names = df['STNAME']
grouped = df['POPESTIMATE2010'].groupby(state_names)
grouped
# The split stage ends here. Nothing has been computed yet; the GroupBy
# object simply holds everything needed to apply a transformation to each
# of the groups.

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7d59aac403a0>

after the groups have been defined and split, you can
### Apply Transformation & Combine

In [35]:
# Apply + combine: take the mean of each state's group; the per-group
# results come back as a single Series indexed by STNAME.
grouped.mean()

STNAME
Alabama                  71420.313433
Alaska                   24621.413793
Arizona                 427213.866667
Arkansas                 38965.253333
California              643691.017241
Colorado                 78878.968750
Connecticut             447464.625000
Delaware                299930.333333
District of Columbia    605126.000000
Florida                 281341.641791
Georgia                  61090.905660
Hawaii                  272796.000000
Idaho                    35704.227273
Illinois                125894.598039
Indiana                  70549.891304
Iowa                     30815.090909
Kansas                   27226.895238
Kentucky                 36232.808333
Louisiana                71014.859375
Maine                    82980.937500
Maryland                241183.708333
Massachusetts           468931.142857
Michigan                119004.445783
Minnesota                61044.862069
Mississippi              36223.365854
Missouri                 52139.582609
Monta

In [36]:
# groupby accepts several keys at once. Passing a list of Series produces a
# result with a hierarchical (multi-level) index whose levels follow the
# order of the list; put df['REGION'] first to group by region, then state.
group_keys = [df['STNAME'], df['REGION']]
means = df['BIRTHS2010'].groupby(group_keys).mean()
means

STNAME                REGION
Alabama               3          212.328358
Alaska                4           99.724138
Arizona               4         1395.800000
Arkansas              3          125.853333
California            4         2126.275862
Colorado              4          262.359375
Connecticut           1         1186.125000
Delaware              3          949.000000
District of Columbia  3         2243.000000
Florida               3          759.283582
Georgia               3          204.333333
Hawaii                4          929.200000
Idaho                 4          135.704545
Illinois              2          410.156863
Indiana               2          227.836957
Iowa                  2          100.676768
Kansas                2           95.161905
Kentucky              3          114.091667
Louisiana             3          227.703125
Maine                 1          215.187500
Maryland              3          769.208333
Massachusetts         1         1331.928571
Mic

In [37]:
# The inner index level (REGION) can be pivoted into columns with unstack();
# state/region combinations that do not occur show up as missing values.
means.unstack()


REGION,1,2,3,4
STNAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Alabama,,,212.328358,
Alaska,,,,99.724138
Arizona,,,,1395.8
Arkansas,,,125.853333,
California,,,,2126.275862
Colorado,,,,262.359375
Connecticut,1186.125,,,
Delaware,,,949.0,
District of Columbia,,,2243.0,
Florida,,,759.283582,


In [38]:
# A GroupBy object is iterable: every iteration yields a 2-tuple of
# (group key, dataframe holding the rows of that group).

# 1) split: one sub-frame per state name
for state, state_frame in df.groupby('STNAME'):
    # 2) apply: average the 2010 census population over this state's rows
    mean_pop = np.average(state_frame['CENSUS2010POP'])
    print(f'state: {state} avg pop: {mean_pop!s}')

state: Alabama avg pop: 71339.34328358209
state: Alaska avg pop: 24490.724137931036
state: Arizona avg pop: 426134.4666666667
state: Arkansas avg pop: 38878.90666666667
state: California avg pop: 642309.5862068966
state: Colorado avg pop: 78581.1875
state: Connecticut avg pop: 446762.125
state: Delaware avg pop: 299311.3333333333
state: District of Columbia avg pop: 601723.0
state: Florida avg pop: 280616.5671641791
state: Georgia avg pop: 60928.63522012578
state: Hawaii avg pop: 272060.2
state: Idaho avg pop: 35626.86363636364
state: Illinois avg pop: 125790.50980392157
state: Indiana avg pop: 70476.10869565218
state: Iowa avg pop: 30771.262626262625
state: Kansas avg pop: 27172.55238095238
state: Kentucky avg pop: 36161.39166666667
state: Louisiana avg pop: 70833.9375
state: Maine avg pop: 83022.5625
state: Maryland avg pop: 240564.66666666666
state: Massachusetts avg pop: 467687.78571428574
state: Michigan avg pop: 119080.0
state: Minnesota avg pop: 60964.65517241379
state: Mississi

In [39]:
# You can also group by a function, which might be useful for processing a
# large number of records in batches. For example, the function defined
# below assigns each row to one of 3 batches.
# STNAME is made the index first, since groupby passes index labels to the
# grouping function.
df=df.set_index('STNAME')

# this is the batching function
def set_batch_number(item):
    """Return the batch number (0, 1 or 2) for a label based on its first character.

    Labels whose first character sorts before 'M' go to batch 0, those
    before 'Q' go to batch 1, and everything else goes to batch 2.
    """
    initial = item[0]
    # Walk the upper bounds in ascending order; the first bound that the
    # initial falls below determines the batch.
    for batch, upper_bound in enumerate(('M', 'Q')):
        if initial < upper_bound:
            return batch
    return 2

# Group with the batching function. No column is named in the groupby call,
# so the function is evaluated against the index (STNAME) to pick each
# row's group.
for batch, batch_frame in df.groupby(set_batch_number):
    print(f'group{batch!s} has {len(batch_frame)!s} records to process')


group0 has 1177 records to process
group1 has 1134 records to process
group2 has 831 records to process


In [40]:
# Airbnb example: grouping by two columns, cancellation policy and review
# score value.
df = pd.read_csv('../resources/week-3/datasets/listings.csv')
# Only the last expression of a cell is displayed, so just the column index
# is shown here (a bare df.head() placed before it would be evaluated and
# its result silently discarded).
df.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       'notes', 'transit', 'access', 'interaction', 'house_rules',
       'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
       'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'street',
       'neighbourhood', 'neighbourhood_cleansed',
       'neighbourhood_group_cleansed', 'city', 'state', 'zipcode', 'market',
       'smart_location', 'country_code', 'country', 'latitude', 'longitude',
       'is_location_exact', 'property_type', 'room_type', 'accommodates',
       'bathrooms',

In [41]:
# One way to group on both columns is to promote them to a multi-index and
# then tell groupby which index levels (0 and 1) to group on.
index_cols = ['cancellation_policy', 'review_scores_value']
df = df.set_index(index_cols)
for key, _ in df.groupby(level=(0, 1)):
    # each key is a (cancellation_policy, review_scores_value) tuple
    print(key)

('flexible', 2.0)
('flexible', 4.0)
('flexible', 5.0)
('flexible', 6.0)
('flexible', 7.0)
('flexible', 8.0)
('flexible', 9.0)
('flexible', 10.0)
('moderate', 2.0)
('moderate', 4.0)
('moderate', 6.0)
('moderate', 7.0)
('moderate', 8.0)
('moderate', 9.0)
('moderate', 10.0)
('strict', 2.0)
('strict', 3.0)
('strict', 4.0)
('strict', 5.0)
('strict', 6.0)
('strict', 7.0)
('strict', 8.0)
('strict', 9.0)
('strict', 10.0)
('super_strict_30', 6.0)
('super_strict_30', 7.0)
('super_strict_30', 8.0)
('super_strict_30', 9.0)
('super_strict_30', 10.0)


In [42]:
# we can further take this and apply a grouping function to segment a portion of the dataframe that we are interested in, for example, those with values of 10
def grouping_fun(item):
    """Collapse a (cancellation_policy, review_scores_value) key into two buckets.

    The policy is kept as-is; the score is replaced by the label "10.0"
    when it equals 10.0 and by "not 10.0" otherwise.
    """
    policy, score = item[0], item[1]
    bucket = "10.0" if score == 10.0 else "not 10.0"
    return (policy, bucket)
# Only the group keys are of interest here, so the sub-frames are ignored.
for key, _ in df.groupby(grouping_fun):
    print(key)

('flexible', '10.0')
('flexible', 'not 10.0')
('moderate', '10.0')
('moderate', 'not 10.0')
('strict', '10.0')
('strict', 'not 10.0')
('super_strict_30', '10.0')
('super_strict_30', 'not 10.0')


In [43]:
# Inspect the first rows; the two grouping keys are still the row index here.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_communication,review_scores_location,requires_license,license,jurisdiction_names,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
cancellation_policy,review_scores_value,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
moderate,,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,,f,,,f,f,f,1,
moderate,9.0,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,Charming and quiet room in a second floor 1910...,none,"The room is in Roslindale, a diverse and prima...",...,10.0,9.0,f,,,t,f,f,1,1.3
moderate,10.0,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",none,The LOCATION: Roslindale is a safe and diverse...,...,10.0,9.0,f,,,f,t,f,1,0.47
moderate,10.0,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,10.0,10.0,f,,,f,f,f,1,1.0
flexible,10.0,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...","My comfy, clean and relaxing home is one block...",none,"I love the proximity to downtown, the neighbor...",...,10.0,9.0,f,,,f,f,f,1,2.25


## Aggregation, transformation, filtration of group data:
### aggregation:
apply a function to the group of columns that we are interested in

In [46]:
# Chris warns to always remember to reset the index to a known state
df = df.reset_index()

# Group by cancellation policy and compute the average review score.
# The built-in 'mean' aggregation skips NaN values, so missing scores do not
# distort the result. Passing the string (rather than np.nanmean) gives the
# same numbers and avoids the FutureWarning that newer pandas versions raise
# when a NumPy reduction is handed to agg.
df.groupby('cancellation_policy').agg({'review_scores_value': 'mean'})
# NOTE(review): an earlier note here said dictionary arguments to agg are
# deprecated; the plain {column: function} form used above appears to remain
# supported (only the nested renaming form was deprecated) - confirm against
# the installed pandas version.


Unnamed: 0_level_0,review_scores_value
cancellation_policy,Unnamed: 1_level_1
flexible,9.237421
moderate,9.307398
strict,9.081441
super_strict_30,8.537313


In [47]:
# Several columns, and several operations per column, can be aggregated in one
# call: each dict key is a column, each value a function or tuple of functions.
# NOTE(review): recent pandas versions reportedly warn when NumPy reductions
# are passed to agg; switching to the strings 'mean'/'std' would change the
# column labels shown in the output (nanmean/nanstd) - confirm before changing.
df.groupby('cancellation_policy').agg({'review_scores_value':(np.nanmean, np.nanstd),
                                       'reviews_per_month': np.nanmean})

Unnamed: 0_level_0,review_scores_value,review_scores_value,reviews_per_month
Unnamed: 0_level_1,nanmean,nanstd,nanmean
cancellation_policy,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
flexible,9.237421,1.096271,1.82921
moderate,9.307398,0.859859,2.391922
strict,9.081441,1.040531,1.873467
super_strict_30,8.537313,0.840785,0.340143


## transformation
transformation broadcasts a function you specify over a grouped dataframe

In [51]:
# The grouping key plus the column whose per-group mean we want.
cols = ['cancellation_policy', 'review_scores_value']

# transform() returns one value per original row (the mean of that row's
# group), so the result keeps the same index as df. The string 'mean' selects
# the NaN-skipping group mean and avoids the FutureWarning that newer pandas
# versions raise when np.nanmean is passed to transform.
transform_df = df[cols].groupby('cancellation_policy').transform('mean')
transform_df.head()


Unnamed: 0,review_scores_value
0,9.307398
1,9.307398
2,9.307398
3,9.307398
4,9.237421


In [53]:
# Rename the transformed column in place so that it does not clash with the
# original review_scores_value column when the frames are combined.
transform_df.rename(columns={'review_scores_value': 'mean_review_score'},
                    inplace=True)

# Both frames share the same row index, so merge them index-to-index.
df = df.merge(transform_df, right_index=True, left_index=True)

# Scroll right in the output: mean_review_score is now the last column.
df.head()

Unnamed: 0,index,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,...,review_scores_location,requires_license,license,jurisdiction_names,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_score
0,0,moderate,,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,...,,f,,,f,f,f,1,,9.307398
1,1,moderate,9.0,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,...,9.0,f,,,t,f,f,1,1.3,9.307398
2,2,moderate,10.0,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",...,9.0,f,,,f,t,f,1,0.47,9.307398
3,3,moderate,10.0,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,...,10.0,f,,,f,f,f,1,1.0,9.307398
4,4,flexible,10.0,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...",...,9.0,f,,,f,f,f,1,2.25,9.237421


## filtering
the filter function takes a function, applies it to each group's dataframe, and expects that function to return True or False to indicate whether the group should be included in the result

In [56]:
# For example: keep only the cancellation-policy groups whose mean review
# score (ignoring missing values) is above 9.2.
by_policy = df.groupby('cancellation_policy')
well_rated = by_policy.filter(
    lambda grp: np.nanmean(grp['review_scores_value']) > 9.2
)
well_rated.head()

Unnamed: 0,index,cancellation_policy,review_scores_value,id,listing_url,scrape_id,last_scraped,name,summary,space,...,review_scores_location,requires_license,license,jurisdiction_names,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month,mean_review_score
0,0,moderate,,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,...,,f,,,f,f,f,1,,9.307398
1,1,moderate,9.0,3075044,https://www.airbnb.com/rooms/3075044,20160906204935,2016-09-07,Charming room in pet friendly apt,Charming and quiet room in a second floor 1910...,Small but cozy and quite room with a full size...,...,9.0,f,,,t,f,f,1,1.3,9.307398
2,2,moderate,10.0,6976,https://www.airbnb.com/rooms/6976,20160906204935,2016-09-07,Mexican Folk Art Haven in Boston,"Come stay with a friendly, middle-aged guy in ...","Come stay with a friendly, middle-aged guy in ...",...,9.0,f,,,f,t,f,1,0.47,9.307398
3,3,moderate,10.0,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,...,10.0,f,,,f,f,f,1,1.0,9.307398
4,4,flexible,10.0,7651065,https://www.airbnb.com/rooms/7651065,20160906204935,2016-09-07,Come Home to Boston,"My comfy, clean and relaxing home is one block...","Clean, attractive, private room, one block fro...",...,9.0,f,,,f,f,f,1,2.25,9.237421


# Apply Function
allows you to apply an arbitrary function on each group and stitch the result back into a single dataframe where the index is preserved

In [57]:
# Re-read the listings so we start from an unmodified dataframe.
df = pd.read_csv('../resources/week-3/datasets/listings.csv')

# Keep only the two columns this example works with.
columns_of_interest = ['cancellation_policy', 'review_scores_value']
df = df[columns_of_interest]
df.head()


Unnamed: 0,cancellation_policy,review_scores_value
0,moderate,
1,moderate,9.0
2,moderate,10.0
3,moderate,10.0
4,flexible,10.0


In [60]:
# apply lets you combine a lot of the operations into one place
def calc_mean_review_scores(group):
    """Add each row's absolute deviation from its group's mean review score.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of one group; must contain a 'review_scores_value' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows plus a 'review_scores_mean' column
        holding ``abs(group mean - review_scores_value)``. Despite its name,
        the column stores the distance from the mean, not the mean itself;
        rows with a missing score get NaN.
    """
    scores = group['review_scores_value']
    # nanmean ignores missing scores when computing the group average
    avg = np.nanmean(scores)
    # assign() builds a new frame instead of writing into the group object
    # handed over by groupby, so the caller's data is never modified in place
    return group.assign(review_scores_mean=np.abs(avg - scores))

# Apply the function to every cancellation-policy group and show the first
# rows of the stitched-together result.
# NOTE(review): how apply() labels the result index depends on the pandas
# version (group_keys handling) - confirm the output still matches on upgrade.
df.groupby('cancellation_policy').apply(calc_mean_review_scores).head()

Unnamed: 0,cancellation_policy,review_scores_value,review_scores_mean
0,moderate,,
1,moderate,9.0,0.307398
2,moderate,10.0,0.692602
3,moderate,10.0,0.692602
4,flexible,10.0,0.762579
