In [1]:
import pandas as pd 
# Load the weather observations from a CSV file given by relative path.
df = pd.read_csv('weather_by_cities.csv')
# A bare last expression makes the notebook render the frame as a table.
df

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,1/2/2017,new york,36,7,Sunny
2,1/3/2017,new york,28,12,Snow
3,1/4/2017,new york,33,7,Sunny
4,1/1/2017,mumbai,90,5,Sunny
5,1/2/2017,mumbai,85,12,Fog
6,1/3/2017,mumbai,87,15,Fog
7,1/4/2017,mumbai,92,5,Rain
8,1/1/2017,paris,45,20,Sunny
9,1/2/2017,paris,50,13,Cloudy


In [4]:
# Average temperature over the Paris rows: one label-based lookup selects the
# matching rows and the 'temperature' column together.
paris_mean_temp = df.loc[df['city'] == 'paris', 'temperature'].mean()
print(paris_mean_temp)

47.75


**What was the maximum temperature in each of these 3 cities?**

In [8]:
# Maximum temperature per city, computed by hand with one boolean filter per
# city. The variables are named "*_max_temp" because they hold maxima; calling
# them "*_mean_temp" would be misleading and would also overwrite the Paris
# mean stored in paris_mean_temp by an earlier cell.
new_york_max_temp = df.temperature[df['city'] == 'new york'].max()
mumbai_max_temp = df.temperature[df['city'] == 'mumbai'].max()
paris_max_temp = df.temperature[df['city'] == 'paris'].max()
print('NYC',new_york_max_temp,'/ Mumbai',mumbai_max_temp,'/ Paris',paris_max_temp)


NYC 36 / Mumbai 92 / Paris 54


**We can get the same answer more easily by grouping the rows by city**

In [9]:
# Split the rows into one group per distinct value of the 'city' column.
group = df.groupby('city')
# Displaying the groupby object shows only its repr, not the grouped rows.
group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8e20bcef40>

**Conceptually, the groupby object holds one sub-DataFrame per city, as pictured below**

<img src="group_by_cities.png">

In [14]:
# Iterating over a groupby object yields (group key, sub-DataFrame) pairs,
# here one pair per city.
for city, data in group:
    print('city:', city,'\n',data,'\n')

city: mumbai 
         day    city  temperature  windspeed  event
4  1/1/2017  mumbai           90          5  Sunny
5  1/2/2017  mumbai           85         12    Fog
6  1/3/2017  mumbai           87         15    Fog
7  1/4/2017  mumbai           92          5   Rain 

city: new york 
         day      city  temperature  windspeed  event
0  1/1/2017  new york           32          6   Rain
1  1/2/2017  new york           36          7  Sunny
2  1/3/2017  new york           28         12   Snow
3  1/4/2017  new york           33          7  Sunny 

city: paris 
          day   city  temperature  windspeed   event
8   1/1/2017  paris           45         20   Sunny
9   1/2/2017  paris           50         13  Cloudy
10  1/3/2017  paris           54          8  Cloudy
11  1/4/2017  paris           42         10  Cloudy 



In [15]:
# Pull out the sub-DataFrame belonging to a single group key.
new_york = group.get_group('new york')
new_york

Unnamed: 0,day,city,temperature,windspeed,event
0,1/1/2017,new york,32,6,Rain
1,1/2/2017,new york,36,7,Sunny
2,1/3/2017,new york,28,12,Snow
3,1/4/2017,new york,33,7,Sunny


In [16]:
# Column-wise maximum within each city. Every column is reduced independently,
# so for the text columns (day, event) the result is the largest string and
# need not come from the same row as the maximum temperature.
group.max()

Unnamed: 0_level_0,day,temperature,windspeed,event
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mumbai,1/4/2017,92,15,Sunny
new york,1/4/2017,36,12,Sunny
paris,1/4/2017,54,20,Sunny


In [18]:
# Mean of each numeric column within each city. numeric_only=True restricts the
# aggregation to numeric columns: the text columns (day, event) have no
# meaningful mean, and pandas 2.0+ raises a TypeError on them instead of
# silently dropping them as older versions did.
group.mean(numeric_only=True)

Unnamed: 0_level_0,temperature,windspeed
city,Unnamed: 1_level_1,Unnamed: 2_level_1
mumbai,88.5,9.25
new york,32.25,8.0
paris,47.75,12.75


**This method of splitting your dataset into smaller groups and then applying an operation
(such as min or max) to each group to get an aggregate result is called Split-Apply-Combine. It is illustrated in the diagram below**

<img src="split_apply_combine.png">

In [22]:
# Select one column from the groupby, then aggregate: this gives the maximum
# temperature for every city in a single expression, instead of writing one
# boolean filter per city.
group.temperature.max()

city
mumbai      92
new york    36
paris       54
Name: temperature, dtype: int64

In [23]:
# Summary statistics (count, mean, std, min, quartiles, max) per city for each
# numeric column.
group.describe()

Unnamed: 0_level_0,temperature,temperature,temperature,temperature,temperature,temperature,temperature,temperature,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed,windspeed
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
city,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
mumbai,4.0,88.5,3.109126,85.0,86.5,88.5,90.5,92.0,4.0,9.25,5.057997,5.0,5.0,8.5,12.75,15.0
new york,4.0,32.25,3.304038,28.0,31.0,32.5,33.75,36.0,4.0,8.0,2.708013,6.0,6.75,7.0,8.25,12.0
paris,4.0,47.75,5.315073,42.0,44.25,47.5,51.0,54.0,4.0,12.75,5.251984,8.0,9.5,11.5,14.75,20.0


In [24]:
# Number of rows in each group, returned as one value per city.
group.size()

city
mumbai      4
new york    4
paris       4
dtype: int64

In [25]:
# Count of values in each column, reported separately for every group.
group.count()

Unnamed: 0_level_0,day,temperature,windspeed,event
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mumbai,4,4,4,4
new york,4,4,4,4
paris,4,4,4,4


### Grouping Using Custom Functions 

In [26]:
def grouper(df, idx, col):
    """Return the bucket label for the value stored at ``df[col].loc[idx]``.

    The value maps to '2-5' when it lies in the closed range [2, 5], to
    '6-10' when it lies in the closed range [6, 10], and to 'others' in
    every remaining case (including values that fall between 5 and 6).
    """
    value = df[col].loc[idx]
    # Ranges are checked in order; the first one containing the value wins.
    for low, high, label in ((2, 5, '2-5'), (6, 10, '6-10')):
        if low <= value <= high:
            return label
    return 'others'

In [27]:
# Group by a computed label instead of a column: the callable receives an
# index label of df, and grouper turns that row's windspeed into its bucket.
group = df.groupby(lambda row_label: grouper(df, row_label, 'windspeed'))
group

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f8e1008d730>

In [28]:
# Rows whose windspeed fell outside both the '2-5' and '6-10' buckets.
group.get_group('others')

Unnamed: 0,day,city,temperature,windspeed,event
2,1/3/2017,new york,28,12,Snow
5,1/2/2017,mumbai,85,12,Fog
6,1/3/2017,mumbai,87,15,Fog
8,1/1/2017,paris,45,20,Sunny
9,1/2/2017,paris,50,13,Cloudy
