### `groupby()`

**groupby()** helps us examine data on a per-category basis. Pandas creates a _lazy_ GroupBy object when we call the `groupby()` function; nothing is computed until an aggregate function (such as `sum()`, `mean()` or `count()`) is called on this **groupby** object.

In [1]:
import pandas as pd
import numpy as np

In [62]:
# load the mpg dataset from the sibling datasets folder
df = pd.read_csv('../datasets/mpg.csv')

In [63]:
# independent copy of the frame as loaded, taken before any column of df is modified below
df_copy = df.copy()

In [64]:
# (number of rows, number of columns)
df.shape

(398, 9)

In [65]:
# peek at 6 random rows; a fixed random_state makes the same rows come back
# on every Restart & Run All instead of a different sample each time
df.sample(6, random_state=42)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
165,20.0,8,262.0,110,3221,13.5,75,1,chevrolet monza 2+2
333,32.7,6,168.0,132,2910,11.4,80,3,datsun 280-zx
134,16.0,6,258.0,110,3632,18.0,74,1,amc matador
246,32.8,4,78.0,52,1985,19.4,78,3,mazda glc deluxe
114,26.0,4,98.0,90,2265,15.5,73,2,fiat 124 sport coupe
243,21.5,3,80.0,110,2720,13.5,77,3,mazda rx-4


In [66]:
# groupby is mostly applied to categorical-like columns such as cylinders, model_year, origin;
# list the distinct values of one such column
df['cylinders'].unique()

array([8, 4, 6, 3, 5])

In [67]:
# number of rows for each distinct cylinders value (these are the group sizes groupby will produce)
df['cylinders'].value_counts()

cylinders
4    204
8    103
6     84
3      4
5      3
Name: count, dtype: int64

#### `groupby` based on single column

In [68]:
# applying group by
df.groupby(by='cylinders') # returns a lazy groupby object that does not perform any operation until we ask for an aggregation

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x168f50b10>

In [69]:
# non-null row count of every column within each cylinders group
df.groupby('cylinders').count()

Unnamed: 0_level_0,mpg,displacement,horsepower,weight,acceleration,model_year,origin,name
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3,4,4,4,4,4,4,4,4
4,204,204,204,204,204,204,204,204
5,3,3,3,3,3,3,3,3
6,84,84,84,84,84,84,84,84
8,103,103,103,103,103,103,103,103


In [70]:
# per-group average of the numeric columns only
df.groupby('cylinders').mean(numeric_only=True)

Unnamed: 0_level_0,mpg,displacement,weight,acceleration,model_year,origin
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,20.55,72.5,2398.5,13.25,75.5,3.0
4,29.286765,109.796569,2308.127451,16.601471,77.073529,1.985294
5,27.366667,145.0,3103.333333,18.633333,79.0,2.0
6,19.985714,218.142857,3198.22619,16.263095,75.928571,1.190476
8,14.963107,345.009709,4114.718447,12.95534,73.902913,1.0


In [71]:
# per-group total of the numeric columns only
df.groupby('cylinders').sum(numeric_only=True)

Unnamed: 0_level_0,mpg,displacement,weight,acceleration,model_year,origin
cylinders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3,82.2,290.0,9594,53.0,302,12
4,5974.5,22398.5,470858,3386.7,15723,405
5,82.1,435.0,9310,55.9,237,6
6,1678.8,18324.0,268651,1366.1,6378,100
8,1541.2,35536.0,423816,1334.4,7612,103


In [72]:
# number of rows for each origin code
df['origin'].value_counts()

origin
1    249
3     79
2     70
Name: count, dtype: int64

In [73]:
# totals of the numeric columns for each origin code
df.groupby('origin').sum(numeric_only=True)

Unnamed: 0_level_0,mpg,cylinders,displacement,weight,acceleration,model_year
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,5000.8,1556,61229.5,837121,3743.4,18827
2,1952.4,291,7640.0,169631,1175.1,5307
3,2405.6,324,8114.0,175477,1277.6,6118


In [74]:
# average mpg per origin; selecting the column on the groupby object first
# aggregates only 'mpg' instead of averaging every numeric column and then
# throwing all but one away
df.groupby('origin')['mpg'].mean()

origin
1    20.083534
2    27.891429
3    30.450633
Name: mpg, dtype: float64

In [75]:
# totals of the numeric columns for each model year
df.groupby('model_year').sum(numeric_only=True)

Unnamed: 0_level_0,mpg,cylinders,displacement,weight,acceleration,origin
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
70,513.0,196,8161.0,97811,375.5,38
71,595.0,156,5873.0,83872,424.0,40
72,524.0,163,6114.5,90656,423.5,43
73,684.0,255,10275.0,136761,572.5,55
74,613.0,142,4637.0,77704,437.5,45
75,608.0,168,6166.0,95304,481.5,44
76,733.5,192,6725.0,104677,542.0,50
77,654.5,153,5359.0,83926,432.2,44
78,866.2,193,6401.0,103025,569.0,58
79,727.7,169,5994.0,88605,458.6,37


In [76]:
# column dtypes and non-null counts; used here to spot columns that were not parsed as numbers
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB


In [77]:
# inspect the raw horsepower values to find what prevents the column from being numeric
df['horsepower'].unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

In [82]:
# swap the '?' marker (and any NaN) for 0 so the column can be cast to an
# integer dtype afterwards; note that this 0 placeholder is treated as a real
# value by every later aggregation on horsepower
df['horsepower'] = df['horsepower'].replace('?', 0).replace(np.nan, 0)

In [83]:
# DataFrame.astype returns a new frame, so the result is assigned back to df
# (the whole dataframe), not to df['horsepower']
df = df.astype({'horsepower': int})

In [84]:
# re-check the dtypes after the horsepower cast
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int64  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 28.1+ KB


In [85]:
# inspect the distinct displacement values (the numeric conversion itself happens in the next cell)
df['displacement'].unique()

array([307. , 350. , 318. , 304. , 302. , 429. , 454. , 440. , 455. ,
       390. , 383. , 340. , 400. , 113. , 198. , 199. , 200. ,  97. ,
       110. , 107. , 104. , 121. , 360. , 140. ,  98. , 232. , 225. ,
       250. , 351. , 258. , 122. , 116. ,  79. ,  88. ,  71. ,  72. ,
        91. ,  97.5,  70. , 120. ,  96. , 108. , 155. ,  68. , 114. ,
       156. ,  76. ,  83. ,  90. , 231. , 262. , 134. , 119. , 171. ,
       115. , 101. , 305. ,  85. , 130. , 168. , 111. , 260. , 151. ,
       146. ,  80. ,  78. , 105. , 131. , 163. ,  89. , 267. ,  86. ,
       183. , 141. , 173. , 135. ,  81. , 100. , 145. , 112. , 181. ,
       144. ])

In [89]:
# pd.to_numeric returns a Series, so assign it to df['displacement'] and not to the dataframe df.
# displacement was already reported as float64 by df.info() above, so its dtype is unchanged by this call.
df['displacement'] = pd.to_numeric(df['displacement'])

In [90]:
# confirm the dtypes after the displacement conversion
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    int64  
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model_year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   name          398 non-null    object 
dtypes: float64(3), int64(5), object(1)
memory usage: 28.1+ KB


#### `groupby` based on multiple columns

In [91]:
# peek at 4 random rows; random_state is fixed so a re-run shows the same rows
df.sample(4, random_state=42)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
241,22.0,6,146.0,97,2815,14.5,77,3,datsun 810
66,17.0,8,304.0,150,3672,11.5,72,1,amc ambassador sst
358,31.6,4,120.0,74,2635,18.3,81,3,mazda 626
117,29.0,4,68.0,49,1867,19.5,73,2,fiat 128


In [92]:
# grouping on a list of columns is lazy as well: only a GroupBy object is returned
df.groupby(['cylinders', 'origin'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1698f6310>

In [94]:
# the result is indexed by a two-level MultiIndex (cylinders, origin)
df.groupby(['cylinders', 'origin']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,model_year
cylinders,origin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,3,82.2,290.0,397,9594,53.0,302
4,1,2004.5,8948.5,5586,175476,1189.9,5618
4,2,1789.9,6566.0,4777,146791,1053.5,4757
4,3,2180.1,6884.0,5215,148591,1143.3,5348
5,2,82.1,435.0,247,9310,55.9,237
6,1,1455.1,16745.0,7276,237829,1219.1,5597
6,2,80.4,639.0,454,13530,65.7,313
6,3,143.3,940.0,695,17292,81.3,468
8,1,1541.2,35536.0,16305,423816,1334.4,7612


In [95]:
# the MultiIndex of the aggregated frame
df.groupby(['cylinders', 'origin']).sum(numeric_only=True).index

MultiIndex([(3, 3),
            (4, 1),
            (4, 2),
            (4, 3),
            (5, 2),
            (6, 1),
            (6, 2),
            (6, 3),
            (8, 1)],
           names=['cylinders', 'origin'])

In [103]:
# a key for the outer level alone selects every origin row under cylinders == 4
df.groupby(['cylinders', 'origin']).sum(numeric_only=True).loc[4]

Unnamed: 0_level_0,mpg,displacement,horsepower,weight,acceleration,model_year
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2004.5,8948.5,5586,175476,1189.9,5618
2,1789.9,6566.0,4777,146791,1053.5,4757
3,2180.1,6884.0,5215,148591,1143.3,5348


In [106]:
# a (cylinders, origin) tuple selects one row of the aggregated frame as a Series
df.groupby(['cylinders', 'origin']).sum(numeric_only=True).loc[(4, 2)]

mpg               1789.9
displacement      6566.0
horsepower        4777.0
weight          146791.0
acceleration      1053.5
model_year        4757.0
Name: (4, 2), dtype: float64

In [96]:
# flip the result so that the (cylinders, origin) pairs become the columns
df.groupby(['cylinders', 'origin']).sum(numeric_only=True).T

cylinders,3,4,4,4,5,6,6,6,8
origin,3,1,2,3,2,1,2,3,1
mpg,82.2,2004.5,1789.9,2180.1,82.1,1455.1,80.4,143.3,1541.2
displacement,290.0,8948.5,6566.0,6884.0,435.0,16745.0,639.0,940.0,35536.0
horsepower,397.0,5586.0,4777.0,5215.0,247.0,7276.0,454.0,695.0,16305.0
weight,9594.0,175476.0,146791.0,148591.0,9310.0,237829.0,13530.0,17292.0,423816.0
acceleration,53.0,1189.9,1053.5,1143.3,55.9,1219.1,65.7,81.3,1334.4
model_year,302.0,5618.0,4757.0,5348.0,237.0,5597.0,313.0,468.0,7612.0


In [100]:
# summary statistics per group, transposed so each statistic is a row
df.groupby(['cylinders', 'origin']).describe().T

Unnamed: 0_level_0,cylinders,3,4,4,4,5,6,6,6,8
Unnamed: 0_level_1,origin,3,1,2,3,2,1,2,3,1
mpg,count,4.0,72.0,63.0,69.0,3.0,74.0,4.0,6.0,103.0
mpg,mean,20.55,27.840278,28.411111,31.595652,27.366667,19.663514,20.1,23.883333,14.963107
mpg,std,2.564501,4.54956,6.442503,5.435787,8.228204,3.374992,7.074367,4.951936,2.836284
mpg,min,18.0,19.0,18.0,20.0,20.3,15.0,16.2,19.0,9.0
mpg,25%,18.75,24.875,24.0,27.5,22.85,18.0,16.425,20.5,13.0
mpg,50%,20.25,27.0,27.0,32.0,25.4,19.0,16.75,23.1,14.0
mpg,75%,22.05,30.6,30.75,35.0,30.9,20.95,20.425,25.1,16.0
mpg,max,23.7,39.0,44.3,46.6,36.4,38.0,30.7,32.7,26.6
displacement,count,4.0,72.0,63.0,69.0,3.0,74.0,4.0,6.0,103.0
displacement,mean,72.5,124.284722,104.222222,99.768116,145.0,226.283784,159.75,156.666667,345.009709


#### `groupby` based on multilevel index

In [108]:
# peek at 3 random rows; random_state is fixed so a re-run shows the same rows
df.sample(3, random_state=42)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
68,13.0,8,350.0,155,4502,13.5,72,1,buick lesabre custom
92,13.0,8,351.0,158,4363,13.0,73,1,ford ltd
152,19.0,6,225.0,95,3264,16.0,75,1,plymouth valiant custom


In [109]:
# lazy GroupBy object keyed on (cylinders, model_year)
df.groupby(['cylinders', 'model_year'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x16997a290>

In [111]:
# numeric totals for every (cylinders, model_year) combination present in the data
df.groupby(['cylinders', 'model_year']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,origin
cylinders,model_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,72,19.0,70.0,97,2330,13.5,3
3,73,18.0,70.0,90,2124,13.5,3
3,77,21.5,80.0,110,2720,13.5,3
3,80,23.7,70.0,100,2420,12.5,3
4,70,177.0,749.0,614,16048,112.0,16
4,71,357.0,1324.0,931,26733,220.5,25
4,72,328.0,1561.5,1192,33357,241.0,27
4,73,250.0,1202.0,912,25719,188.5,22
4,74,417.0,1448.0,1110,32272,246.0,33
4,75,303.0,1378.0,1019,29871,190.0,26


In [112]:
# the (cylinders, model_year) MultiIndex of the aggregated frame
df.groupby(['cylinders', 'model_year']).sum(numeric_only=True).index

MultiIndex([(3, 72),
            (3, 73),
            (3, 77),
            (3, 80),
            (4, 70),
            (4, 71),
            (4, 72),
            (4, 73),
            (4, 74),
            (4, 75),
            (4, 76),
            (4, 77),
            (4, 78),
            (4, 79),
            (4, 80),
            (4, 81),
            (4, 82),
            (5, 78),
            (5, 79),
            (5, 80),
            (6, 70),
            (6, 71),
            (6, 73),
            (6, 74),
            (6, 75),
            (6, 76),
            (6, 77),
            (6, 78),
            (6, 79),
            (6, 80),
            (6, 81),
            (6, 82),
            (8, 70),
            (8, 71),
            (8, 72),
            (8, 73),
            (8, 74),
            (8, 75),
            (8, 76),
            (8, 77),
            (8, 78),
            (8, 79),
            (8, 81)],
           names=['cylinders', 'model_year'])

In [114]:
# totals for 6-cylinder cars of model year 82
df.groupby(['cylinders', 'model_year']).sum(numeric_only=True).loc[(6, 82)]

mpg               85.0
displacement     675.0
horsepower       307.0
weight          8795.0
acceleration      48.1
origin             3.0
Name: (6, 82), dtype: float64

In [115]:
# keep the aggregated frame under a name so the following cells can reuse it
year_cyl = df.groupby(['cylinders', 'model_year']).sum(numeric_only=True)

In [116]:
# display the stored aggregate
year_cyl

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,origin
cylinders,model_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,72,19.0,70.0,97,2330,13.5,3
3,73,18.0,70.0,90,2124,13.5,3
3,77,21.5,80.0,110,2720,13.5,3
3,80,23.7,70.0,100,2420,12.5,3
4,70,177.0,749.0,614,16048,112.0,16
4,71,357.0,1324.0,931,26733,220.5,25
4,72,328.0,1561.5,1192,33357,241.0,27
4,73,250.0,1202.0,912,25719,188.5,22
4,74,417.0,1448.0,1110,32272,246.0,33
4,75,303.0,1378.0,1019,29871,190.0,26


In [121]:
# xs() (cross-section) is another way to select rows from a multilevel index:
# here, every model_year row under cylinders == 3
year_cyl.xs(3)

Unnamed: 0_level_0,mpg,displacement,horsepower,weight,acceleration,origin
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
72,19.0,70.0,97,2330,13.5,3
73,18.0,70.0,90,2124,13.5,3
77,21.5,80.0,110,2720,13.5,3
80,23.7,70.0,100,2420,12.5,3


In [123]:
# same cross-section, with the index level being matched spelled out explicitly
year_cyl.xs(3, level='cylinders')

Unnamed: 0_level_0,mpg,displacement,horsepower,weight,acceleration,origin
model_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
72,19.0,70.0,97,2330,13.5,3
73,18.0,70.0,90,2124,13.5,3
77,21.5,80.0,110,2720,13.5,3
80,23.7,70.0,100,2420,12.5,3


In [128]:
# a list of outer-level keys keeps both index levels and returns rows in the order requested
year_cyl.loc[[4, 3], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,origin
cylinders,model_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,70,177.0,749.0,614,16048,112.0,16
4,71,357.0,1324.0,931,26733,220.5,25
4,72,328.0,1561.5,1192,33357,241.0,27
4,73,250.0,1202.0,912,25719,188.5,22
4,74,417.0,1448.0,1110,32272,246.0,33
4,75,303.0,1378.0,1019,29871,190.0,26
4,76,401.5,1595.0,1134,34599,253.0,28
4,77,407.5,1491.0,1103,30871,224.9,26
4,78,502.8,1906.0,1355,39045,276.8,36
4,79,378.3,1363.0,909,28291,191.9,19


In [132]:
# single row for 4-cylinder cars of model year 70
year_cyl.loc[(4, 70)]

mpg               177.0
displacement      749.0
horsepower        614.0
weight          16048.0
acceleration      112.0
origin             16.0
Name: (4, 70), dtype: float64

In [135]:
# rows whose cylinders value is 3 or 4
df.loc[df['cylinders'].isin([3, 4])]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
14,24.0,4,113.0,95,2372,15.0,70,3,toyota corona mark ii
18,27.0,4,97.0,88,2130,14.5,70,3,datsun pl510
19,26.0,4,97.0,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan
20,25.0,4,110.0,87,2672,17.5,70,2,peugeot 504
21,24.0,4,107.0,90,2430,14.5,70,2,audi 100 ls
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [140]:
# filter to 3- and 4-cylinder cars first, then aggregate per (cylinders, model_year)
cyl_three_four = (
    df.loc[df['cylinders'].isin([3, 4])]
    .groupby(['cylinders', 'model_year'])
    .sum(numeric_only=True)
)

In [143]:
# loc[] with a (cylinders, model_year) tuple returns a single aggregated row
cyl_three_four.loc[(3, 72)]

mpg               19.0
displacement      70.0
horsepower        97.0
weight          2330.0
acceleration      13.5
origin             3.0
Name: (3, 72), dtype: float64

In [145]:
# swaplevel() exchanges the two index levels, so model_year becomes the outer key;
# the row order itself is left as it was
year_cyl.swaplevel(0, 1)

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,origin
model_year,cylinders,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
72,3,19.0,70.0,97,2330,13.5,3
73,3,18.0,70.0,90,2124,13.5,3
77,3,21.5,80.0,110,2720,13.5,3
80,3,23.7,70.0,100,2420,12.5,3
70,4,177.0,749.0,614,16048,112.0,16
71,4,357.0,1324.0,931,26733,220.5,25
72,4,328.0,1561.5,1192,33357,241.0,27
73,4,250.0,1202.0,912,25719,188.5,22
74,4,417.0,1448.0,1110,32272,246.0,33
75,4,303.0,1378.0,1019,29871,190.0,26


In [147]:
# after the swap the lookup key is written as (model_year, cylinders)
year_cyl.swaplevel().loc[(72, 3)]

mpg               19.0
displacement      70.0
horsepower        97.0
weight          2330.0
acceleration      13.5
origin             3.0
Name: (72, 3), dtype: float64

In [150]:
# sort by the cylinders level, descending; sort_remaining defaults to True,
# so model_year is ordered descending within each cylinders value as well
year_cyl.sort_index(level='cylinders', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,origin
cylinders,model_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8,81,26.6,350.0,105,3725,19.0,1
8,79,186.3,3214.0,1319,38629,154.0,10
8,78,114.3,1805.0,813,21380,79.6,6
8,77,128.0,2686.0,1219,33420,109.3,8
8,76,132.0,2916.0,1317,36582,119.0,9
8,75,94.0,1983.0,852,24653,79.0,6
8,74,71.0,1576.0,730,22192,73.5,5
8,73,264.0,7305.0,3400,85581,245.0,20
8,72,177.0,4483.0,2076,54969,169.0,13
8,71,94.0,2602.0,1168,31764,85.5,7


In [151]:
# name both levels explicitly; a single ascending=False applies to each of them
year_cyl.sort_index(level=['cylinders', 'model_year'], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,origin
cylinders,model_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8,81,26.6,350.0,105,3725,19.0,1
8,79,186.3,3214.0,1319,38629,154.0,10
8,78,114.3,1805.0,813,21380,79.6,6
8,77,128.0,2686.0,1219,33420,109.3,8
8,76,132.0,2916.0,1317,36582,119.0,9
8,75,94.0,1983.0,852,24653,79.0,6
8,74,71.0,1576.0,730,22192,73.5,5
8,73,264.0,7305.0,3400,85581,245.0,20
8,72,177.0,4483.0,2076,54969,169.0,13
8,71,94.0,2602.0,1168,31764,85.5,7


In [152]:
# one ascending flag per level: cylinders descending, model_year ascending within each cylinders value
year_cyl.sort_index(level=['cylinders', 'model_year'], ascending=[False, True])

Unnamed: 0_level_0,Unnamed: 1_level_0,mpg,displacement,horsepower,weight,acceleration,origin
cylinders,model_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8,70,254.0,6616.0,3306,70921,201.5,18
8,71,94.0,2602.0,1168,31764,85.5,7
8,72,177.0,4483.0,2076,54969,169.0,13
8,73,264.0,7305.0,3400,85581,245.0,20
8,74,71.0,1576.0,730,22192,73.5,5
8,75,94.0,1983.0,852,24653,79.0,6
8,76,132.0,2916.0,1317,36582,119.0,9
8,77,128.0,2686.0,1219,33420,109.3,8
8,78,114.3,1805.0,813,21380,79.6,6
8,79,186.3,3214.0,1319,38629,154.0,10


#### Applying different aggregate function on different columns using `agg()`

In [157]:
# drop the text column, then compute mean and std of every remaining column
df.drop(columns='name').agg(['mean', 'std'])

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mean,23.514573,5.454774,193.425879,102.894472,2970.424623,15.56809,76.01005,1.572864
std,7.815984,1.701004,104.269838,40.269544,846.841774,2.757689,3.697627,0.802055


In [159]:
# a dict maps each column to the aggregate it should receive
df.agg({
    'cylinders': 'mean',
    'horsepower': 'max',
})

cylinders       5.454774
horsepower    230.000000
dtype: float64

In [160]:
# a column may receive a list of aggregates; cells for an aggregate that was
# not requested for a column come back as NaN
df.agg({
    'cylinders': ['mean', 'max'],
    'horsepower': 'max',
})

Unnamed: 0,cylinders,horsepower
mean,5.454774,
max,8.0,230.0
