# 第9章：透過分組來進行聚合、過濾和轉換

## 9.1 進行簡單的分組及聚合運算

In [1]:
import pandas as pd
import numpy as np
# Limit how much of every DataFrame is rendered: at most 4 columns and 10 rows.
# The fully qualified 'display.*' names are used because the short names
# 'max_columns' / 'max_rows' also match the 'styler.render.*' options in newer
# pandas releases, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 4, 'display.max_rows', 10)

# Load the flights dataset from CSV and preview its first rows.
flights = pd.read_csv('data/flights.csv')
flights.head()

Unnamed: 0,MONTH,DAY,...,DIVERTED,CANCELLED
0,1,1,...,0,0
1,1,1,...,0,0
2,1,1,...,0,0
3,1,1,...,0,0
4,1,1,...,0,0


In [2]:
# Mean ARR_DELAY per AIRLINE using the dict form of agg
# ({column: aggregation}); this form returns a DataFrame indexed by AIRLINE.
(flights
     .groupby('AIRLINE')
     .agg({'ARR_DELAY':'mean'})
)

Unnamed: 0_level_0,ARR_DELAY
AIRLINE,Unnamed: 1_level_1
AA,5.542661
AS,-0.833333
B6,8.692593
DL,0.339691
EV,7.034580
...,...
OO,7.593463
UA,7.765755
US,1.681105
VX,5.348884


In [3]:
# Same aggregation, but the column is selected first and the aggregation is
# named by a string; this form returns a Series rather than a DataFrame.
(flights
     .groupby('AIRLINE')
     ['ARR_DELAY']
     .agg('mean')
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
        ...   
OO    7.593463
UA    7.765755
US    1.681105
VX    5.348884
WN    6.397353
Name: ARR_DELAY, Length: 14, dtype: float64

In [4]:
# Same result again, this time passing a function object (np.mean) to agg
# instead of the string 'mean'.
# NOTE(review): recent pandas releases appear to emit a FutureWarning when a
# NumPy reducer is passed to a groupby agg -- confirm against the pandas
# version in use.
(flights
     .groupby('AIRLINE')
     ['ARR_DELAY']
     .agg(np.mean)
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
        ...   
OO    7.593463
UA    7.765755
US    1.681105
VX    5.348884
WN    6.397353
Name: ARR_DELAY, Length: 14, dtype: float64

In [5]:
# Shortest spelling: call the aggregation method directly on the grouped
# column instead of going through agg.
(flights
     .groupby('AIRLINE')
     ['ARR_DELAY']
     .mean()
)

AIRLINE
AA    5.542661
AS   -0.833333
B6    8.692593
DL    0.339691
EV    7.034580
        ...   
OO    7.593463
UA    7.765755
US    1.681105
VX    5.348884
WN    6.397353
Name: ARR_DELAY, Length: 14, dtype: float64

In [6]:
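# Left commented out on purpose: np.sqrt returns one value per element rather
# than one value per group, so it is not an aggregating function and agg is
# expected to reject it with an error. Uncomment to see the failure.
# (flights
#    .groupby('AIRLINE')
#    ['ARR_DELAY']
#    .agg(np.sqrt)
# )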

## 9.2 對多個欄位執行分組及聚合運算

In [7]:
# Group by two columns and total the CANCELLED flag for every
# (AIRLINE, WEEKDAY) pair; the resulting Series has a two-level index.
(flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    ['CANCELLED'] 
    .agg('sum')
)

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
                    ..
WN       3          18
         4          10
         5           7
         6          10
         7           7
Name: CANCELLED, Length: 98, dtype: int64

In [8]:
# Apply two aggregations to two columns for every (AIRLINE, WEEKDAY) group.
# The columns are selected with a list (double brackets): selecting several
# columns with a bare tuple of names, as in grouped['A', 'B'], was deprecated
# and is rejected by pandas 2.x.
(flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    [['CANCELLED', 'DIVERTED']]
    .agg(['sum', 'mean'])
)

  (flights


Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786
...,...,...,...,...,...
WN,3,18,0.014118,2,0.001569
WN,4,10,0.007911,4,0.003165
WN,5,7,0.005828,0,0.000000
WN,6,10,0.010132,3,0.003040


In [9]:
# Different aggregations per column: the dict maps each column to a list of
# aggregation names. The result has two-level (column, aggregation) columns.
(flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,...,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,...,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0,0.000000,...,96.387097,45.778495
ATL,ABQ,0,0.000000,...,170.500000,87.866667
ATL,ABY,0,0.000000,...,28.578947,6.590643
ATL,ACY,0,0.000000,...,91.333333,11.466667
ATL,AEX,0,0.000000,...,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,...,64.059322,11.338331
SFO,STL,0,0.000000,...,198.900000,101.042105
SFO,SUN,0,0.000000,...,78.000000,25.777778
SFO,TUS,0,0.000000,...,100.200000,35.221053


In [10]:
# Named aggregation: each keyword is the output column name and each NamedAgg
# says which source column to aggregate and how. The resulting columns are
# flat (single level), unlike the dict-of-lists form.
(flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg(sum_cancelled=pd.NamedAgg(column='CANCELLED', aggfunc='sum'),
         mean_cancelled=pd.NamedAgg(column='CANCELLED', aggfunc='mean'),
         size_cancelled=pd.NamedAgg(column='CANCELLED', aggfunc='size'),
         mean_air_time=pd.NamedAgg(column='AIR_TIME', aggfunc='mean'),
         var_air_time=pd.NamedAgg(column='AIR_TIME', aggfunc='var'))
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum_cancelled,mean_cancelled,...,mean_air_time,var_air_time
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.000000,...,96.387097,45.778495
ATL,ABQ,0,0.000000,...,170.500000,87.866667
ATL,ABY,0,0.000000,...,28.578947,6.590643
ATL,ACY,0,0.000000,...,91.333333,11.466667
ATL,AEX,0,0.000000,...,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,...,64.059322,11.338331
SFO,STL,0,0.000000,...,198.900000,101.042105
SFO,SUN,0,0.000000,...,78.000000,25.777778
SFO,TUS,0,0.000000,...,100.200000,35.221053


In [11]:
# Keep the dict-of-lists aggregation in `res` and inspect its columns, which
# form a MultiIndex of (source column, aggregation) pairs.
res = (flights.groupby(['ORG_AIR', 'DEST_AIR'])
              .agg({'CANCELLED':['sum', 'mean', 'size'],
                    'AIR_TIME':['mean', 'var']})
)
res.columns

MultiIndex([('CANCELLED',  'sum'),
            ('CANCELLED', 'mean'),
            ('CANCELLED', 'size'),
            ( 'AIR_TIME', 'mean'),
            ( 'AIR_TIME',  'var')],
           )

In [12]:
# to_flat_index turns the two-level column index into a one-level Index whose
# entries are (source column, aggregation) tuples.
res_flat_column = res.columns.to_flat_index()
res_flat_column

Index([ ('CANCELLED', 'sum'), ('CANCELLED', 'mean'), ('CANCELLED', 'size'),
        ('AIR_TIME', 'mean'),   ('AIR_TIME', 'var')],
      dtype='object')

In [13]:
# Join every (source column, aggregation) tuple with an underscore and install
# the resulting names as the columns of `res` (this modifies `res` in place).
res.columns = ['_'.join(x) for x in res_flat_column]
res

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_sum,CANCELLED_mean,...,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.000000,...,96.387097,45.778495
ATL,ABQ,0,0.000000,...,170.500000,87.866667
ATL,ABY,0,0.000000,...,28.578947,6.590643
ATL,ACY,0,0.000000,...,91.333333,11.466667
ATL,AEX,0,0.000000,...,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,...,64.059322,11.338331
SFO,STL,0,0.000000,...,198.900000,101.042105
SFO,SUN,0,0.000000,...,78.000000,25.777778
SFO,TUS,0,0.000000,...,100.200000,35.221053


In [14]:
def flatten_cols(df):
    """Return a copy of ``df`` whose column labels are flattened to one level.

    Each multi-level column label is joined with underscores, e.g.
    ``('CANCELLED', 'sum')`` becomes ``'CANCELLED_sum'``. Level values that
    are not strings are converted with ``str`` first. Labels that are not
    tuples (columns that are already flat) are kept unchanged instead of
    being split into characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to flatten. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the flattened column labels; suitable for use
        with ``DataFrame.pipe``.
    """
    def _flatten(label):
        # Only tuples come from a MultiIndex; anything else is already flat.
        if isinstance(label, tuple):
            return '_'.join(str(part) for part in label)
        return label

    flat = df.copy()
    flat.columns = [_flatten(label) for label in df.columns.to_flat_index()]
    return flat

# Same aggregation as before, with the column flattening done inside the
# method chain: pipe passes the aggregated frame to flatten_cols and continues
# with whatever that function returns.
res = (flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
    .pipe(flatten_cols)
)

res

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_sum,CANCELLED_mean,...,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,0,0.000000,...,96.387097,45.778495
ATL,ABQ,0,0.000000,...,170.500000,87.866667
ATL,ABY,0,0.000000,...,28.578947,6.590643
ATL,ACY,0,0.000000,...,91.333333,11.466667
ATL,AEX,0,0.000000,...,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,4,0.032787,...,64.059322,11.338331
SFO,STL,0,0.000000,...,198.900000,101.042105
SFO,SUN,0,0.000000,...,78.000000,25.777778
SFO,TUS,0,0.000000,...,100.200000,35.221053


In [15]:
# Grouping by a categorical key with observed=False produces a row for every
# combination of ORG_AIR category and DEST_AIR value, so combinations that
# never occur in the data appear as all-NaN rows.
# observed=False is spelled out so the demonstration does not depend on the
# pandas default, which has been announced to change.
res = (flights
    .assign(ORG_AIR=flights.ORG_AIR.astype('category'))
    .groupby(['ORG_AIR', 'DEST_AIR'], observed=False)
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)
res

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,...,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,...,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,0.0,0.0,...,96.387097,45.778495
ATL,ABI,,,...,,
ATL,ABQ,0.0,0.0,...,170.500000,87.866667
ATL,ABR,,,...,,
ATL,ABY,0.0,0.0,...,28.578947,6.590643
...,...,...,...,...,...,...
SFO,TYS,,,...,,
SFO,VLD,,,...,,
SFO,VPS,,,...,,
SFO,XNA,0.0,0.0,...,173.500000,0.500000


In [16]:
# observed=True restricts the result to key combinations that actually occur
# in the data, which removes the all-NaN rows seen in the previous cell.
res = (flights
    .assign(ORG_AIR=flights.ORG_AIR.astype('category'))
    .groupby(['ORG_AIR', 'DEST_AIR'], observed=True)
    .agg({'CANCELLED':['sum', 'mean', 'size'],
          'AIR_TIME':['mean', 'var']})
)
res

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,...,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,...,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
LAX,ABQ,1,0.018182,...,89.259259,29.403215
LAX,ANC,0,0.000000,...,307.428571,78.952381
LAX,ASE,1,0.038462,...,102.920000,102.243333
LAX,ATL,0,0.000000,...,224.201149,127.155837
LAX,AUS,0,0.000000,...,150.537500,57.897310
...,...,...,...,...,...,...
MSP,TTN,1,0.125000,...,124.428571,57.952381
MSP,TUL,0,0.000000,...,91.611111,63.075163
MSP,TUS,0,0.000000,...,176.000000,32.000000
MSP,TVC,0,0.000000,...,56.600000,10.300000


## 9.3 分組後刪除MultiIndex

In [17]:
# Re-read the flights data, then build a summary with a two-level row index
# (AIRLINE, WEEKDAY) and two-level columns (source column, aggregation).
# astype(int) converts every aggregated column to integers.
flights = pd.read_csv('data/flights.csv')
airline_info = (flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    .agg({'DIST':['sum', 'mean'],
          'ARR_DELAY':['min', 'max']}) 
    .astype(int)
)
airline_info

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST,DIST,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
...,...,...,...,...,...
WN,3,997213,782,-38,262
WN,4,1024854,810,-52,284
WN,5,981036,816,-44,244
WN,6,823946,834,-41,290


In [18]:
# Outer column level: the name of the column that was aggregated.
airline_info.columns.get_level_values(0)

Index(['DIST', 'DIST', 'ARR_DELAY', 'ARR_DELAY'], dtype='object')

In [19]:
# Inner column level: the name of the aggregation that was applied.
airline_info.columns.get_level_values(1)

Index(['sum', 'mean', 'min', 'max'], dtype='object')

In [20]:
# Both levels combined into a single Index of (column, aggregation) tuples.
airline_info.columns.to_flat_index()

Index([('DIST', 'sum'), ('DIST', 'mean'), ('ARR_DELAY', 'min'),
       ('ARR_DELAY', 'max')],
      dtype='object')

In [21]:
# Replace the two-level columns with single names of the form
# '<column>_<aggregation>'.
# NOTE(review): this assigns to airline_info in place and reads the current
# columns, so running the cell a second time would join the characters of the
# already-flattened names; re-run the cell that builds airline_info first.
airline_info.columns = ['_'.join(x) for x in
    airline_info.columns.to_flat_index()]

airline_info

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
AIRLINE,WEEKDAY,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
...,...,...,...,...,...
WN,3,997213,782,-38,262
WN,4,1024854,810,-52,284
WN,5,981036,816,-44,244
WN,6,823946,834,-41,290


In [22]:
# Move the AIRLINE and WEEKDAY index levels back into ordinary columns. The
# result is only displayed; airline_info itself is not reassigned.
airline_info.reset_index()

Unnamed: 0,AIRLINE,WEEKDAY,...,ARR_DELAY_min,ARR_DELAY_max
0,AA,1,...,-60,551
1,AA,2,...,-52,725
2,AA,3,...,-45,473
3,AA,4,...,-46,349
4,AA,5,...,-41,732
...,...,...,...,...,...
93,WN,3,...,-38,262
94,WN,4,...,-52,284
95,WN,5,...,-44,244
96,WN,6,...,-41,290


In [23]:
# The same flat table built in one chain: named aggregation gives single-level
# column names directly, and reset_index turns the group keys into columns.
(flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    .agg(dist_sum=pd.NamedAgg(column='DIST', aggfunc='sum'),
         dist_mean=pd.NamedAgg(column='DIST', aggfunc='mean'),
         arr_delay_min=pd.NamedAgg(column='ARR_DELAY', aggfunc='min'),
         arr_delay_max=pd.NamedAgg(column='ARR_DELAY', aggfunc='max'))
    .astype(int)
    .reset_index()
)

Unnamed: 0,AIRLINE,WEEKDAY,...,arr_delay_min,arr_delay_max
0,AA,1,...,-60,551
1,AA,2,...,-52,725
2,AA,3,...,-45,473
3,AA,4,...,-46,349
4,AA,5,...,-41,732
...,...,...,...,...,...
93,WN,3,...,-38,262
94,WN,4,...,-52,284
95,WN,5,...,-44,244
96,WN,6,...,-41,290


In [24]:
# as_index=False keeps the grouping key (AIRLINE) as a regular column instead
# of making it the index, so no reset_index is needed afterwards.
(flights
    .groupby(['AIRLINE'], as_index=False)
    ['DIST']
    .agg('mean')
    .round(0)
)

Unnamed: 0,AIRLINE,DIST
0,AA,1114.0
1,AS,1066.0
2,B6,1772.0
3,DL,866.0
4,EV,460.0
...,...,...
9,OO,511.0
10,UA,1231.0
11,US,1181.0
12,VX,1240.0


## 9.4 使用自訂的聚合函式來分組

In [25]:
# Load the college dataset, then compute the mean and standard deviation of
# UGDS for each STABBR group, rounded to whole numbers.
college = pd.read_csv('data/college.csv')
(college
    .groupby('STABBR')
    ['UGDS']
    .agg(['mean', 'std'])
    .round(0)
)

Unnamed: 0_level_0,mean,std
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,2493.0,4052.0
AL,2790.0,4658.0
AR,1644.0,3143.0
AS,1276.0,
AZ,4130.0,14894.0
...,...,...
VT,1513.0,2194.0
WA,2271.0,4124.0
WI,2655.0,4615.0
WV,1758.0,5957.0


In [26]:
def max_deviation(s):
    """Return the largest absolute standard score found in ``s``.

    Every value is expressed as its distance from the mean of ``s`` in units
    of the standard deviation of ``s``; the biggest such distance, ignoring
    sign, is returned.
    """
    centered = s - s.mean()
    z_scores = centered / s.std()
    return z_scores.abs().max()

In [27]:
# A user-defined function can be passed to agg like a built-in aggregation;
# it is called with the UGDS values of one STABBR group at a time.
(college
    .groupby('STABBR')
    ['UGDS']
    .agg(max_deviation)
    .round(1)
)

STABBR
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
     ... 
VT    3.8
WA    6.6
WI    5.8
WV    7.2
WY    2.8
Name: UGDS, Length: 59, dtype: float64

In [28]:
# Apply the custom aggregation to several columns at once.
# The columns are selected with a list (double brackets): selecting several
# columns with a bare tuple of names, as in grouped['A', 'B'], was deprecated
# and is rejected by pandas 2.x.
(college
    .groupby('STABBR')
    [['UGDS', 'SATVRMID', 'SATMTMID']]
    .agg(max_deviation)
    .round(1)
)

  (college


Unnamed: 0_level_0,UGDS,SATVRMID,SATMTMID
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2.6,,
AL,5.8,1.6,1.8
AR,6.3,2.2,2.3
AS,,,
AZ,9.9,1.9,1.4
...,...,...,...
VT,3.8,1.9,1.9
WA,6.6,2.2,2.0
WI,5.8,2.4,2.2
WV,7.2,1.7,2.1


In [29]:
# Custom functions and built-in aggregation names can be mixed in one list;
# the custom function's column is labelled with the function's name.
(college
    .groupby(['STABBR']) 
    ['UGDS'] 
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
)

Unnamed: 0_level_0,max_deviation,mean,std
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,2.6,2493.2,4051.7
AL,5.8,2789.9,4657.9
AR,6.3,1644.1,3142.8
AS,,1276.0,
AZ,9.9,4130.5,14893.6
...,...,...,...
VT,3.8,1512.6,2193.6
WA,6.6,2271.2,4123.7
WI,5.8,2655.5,4615.5
WV,7.2,1758.1,5957.2


In [30]:
# The column label used for a custom aggregation is the function's __name__.
max_deviation.__name__

'max_deviation'

In [31]:
# Overwrite the function's __name__ to control the column label it produces,
# then aggregate several columns for every (STABBR, RELAFFIL) group.
# The columns are selected with a list (double brackets): selecting several
# columns with a bare tuple of names, as in grouped['A', 'B'], was deprecated
# and is rejected by pandas 2.x.
max_deviation.__name__ = 'Max Deviation'
(college
    .groupby(['STABBR', 'RELAFFIL'])
    [['UGDS', 'SATVRMID', 'SATMTMID']]
    .agg([max_deviation, 'mean', 'std'])
    .round(1)
)

  (college


Unnamed: 0_level_0,Unnamed: 1_level_0,UGDS,UGDS,...,SATMTMID,SATMTMID
Unnamed: 0_level_1,Unnamed: 1_level_1,Max Deviation,mean,...,mean,std
STABBR,RELAFFIL,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AK,0,2.1,3508.9,...,,
AK,1,1.1,123.3,...,503.0,
AL,0,5.2,3248.8,...,515.8,56.7
AL,1,2.4,979.7,...,485.6,61.4
AR,0,5.8,1793.7,...,503.6,39.0
...,...,...,...,...,...,...
WI,0,5.3,2879.1,...,591.2,85.7
WI,1,3.4,1716.2,...,526.6,42.5
WV,0,6.9,1873.9,...,480.0,27.7
WV,1,1.3,716.4,...,484.8,17.7


## 9.5 可接收多個參數的自訂聚合函式

In [32]:
def pct_between_1_3k(s):
    """Return the percentage of values in ``s`` that lie between 1,000 and 3,000.

    Membership is decided by ``Series.between`` with its default settings; the
    mean of the resulting boolean mask is the fraction of matching values.
    """
    in_range = s.between(1_000, 3_000)
    fraction = in_range.mean()
    return fraction * 100

In [33]:
# Percentage of schools in each (STABBR, RELAFFIL) group whose UGDS value is
# between 1,000 and 3,000, using the fixed-range helper.
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg(pct_between_1_3k)
    .round(1)
)

STABBR  RELAFFIL
AK      0           14.3
        1            0.0
AL      0           23.6
        1           33.3
AR      0           27.9
                    ... 
WI      0           13.8
        1           36.0
WV      0           24.6
        1           37.5
WY      0           54.5
Name: UGDS, Length: 112, dtype: float64

In [34]:
def pct_between(s, low, high):
    """Return the percentage of values in ``s`` that lie between ``low`` and ``high``.

    Membership is decided by ``Series.between`` with its default settings; the
    mean of the resulting boolean mask is the fraction of matching values.
    """
    in_range = s.between(low, high)
    fraction = in_range.mean()
    return fraction * 100

In [35]:
# Extra positional arguments given to agg are forwarded to the aggregation
# function, here as low=1_000 and high=10_000.
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg(pct_between, 1_000, 10_000)
    .round(1)
)

STABBR  RELAFFIL
AK      0           42.9
        1            0.0
AL      0           45.8
        1           37.5
AR      0           39.7
                    ... 
WI      0           31.0
        1           44.0
WV      0           29.2
        1           37.5
WY      0           72.7
Name: UGDS, Length: 112, dtype: float64

In [36]:
# The same call with the bounds forwarded as keyword arguments, which makes
# their meaning explicit at the call site.
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg(pct_between, low=1_000, high=10_000)
    .round(1)
)

STABBR  RELAFFIL
AK      0           42.9
        1            0.0
AL      0           45.8
        1           37.5
AR      0           39.7
                    ... 
WI      0           31.0
        1           44.0
WV      0           29.2
        1           37.5
WY      0           72.7
Name: UGDS, Length: 112, dtype: float64

In [37]:
def between_n_m(n, m):
    """Build a one-argument aggregation that calls ``pct_between`` with fixed bounds.

    The returned function takes a Series and evaluates
    ``pct_between(series, n, m)``. Its ``__name__`` is set to
    ``'between_<n>_<m>'`` so the output column it produces gets a
    descriptive label.
    """
    def pct_in_range(values):
        return pct_between(values, n, m)

    pct_in_range.__name__ = f'between_{n}_{m}'
    return pct_in_range

# The function built by between_n_m already carries its bounds, so it can sit
# in a list next to built-in aggregation names; its column is labelled with
# the generated name.
(college
    .groupby(['STABBR', 'RELAFFIL'])
    ['UGDS'] 
    .agg([between_n_m(1_000, 10_000), 'max', 'mean'])
    .round(1)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,between_1000_10000,max,mean
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,0,42.9,12865.0,3508.9
AK,1,0.0,275.0,123.3
AL,0,45.8,29851.0,3248.8
AL,1,37.5,3033.0,979.7
AR,0,39.7,21405.0,1793.7
...,...,...,...,...
WI,0,31.0,29302.0,2879.1
WI,1,44.0,8212.0,1716.2
WV,0,29.2,44924.0,1873.9
WV,1,37.5,1375.0,716.4


## 9.6 深入了解groupby物件

In [38]:
# Re-read the college data and keep the groupby object itself in a variable
# so it can be inspected before any aggregation is applied.
college = pd.read_csv('data/college.csv')
grouped = college.groupby(['STABBR', 'RELAFFIL'])
type(grouped)

pandas.core.groupby.generic.DataFrameGroupBy

In [39]:
# List the public attributes of the groupby object, i.e. every name reported
# by dir() that does not start with an underscore.
print([attr for attr in dir(grouped) if not
    attr.startswith('_')])

['CITY', 'CURROPER', 'DISTANCEONLY', 'GRAD_DEBT_MDN_SUPP', 'HBCU', 'INSTNM', 'MD_EARN_WNE_P10', 'MENONLY', 'PCTFLOAN', 'PCTPELL', 'PPTUG_EF', 'RELAFFIL', 'SATMTMID', 'SATVRMID', 'STABBR', 'UG25ABV', 'UGDS', 'UGDS_2MOR', 'UGDS_AIAN', 'UGDS_ASIAN', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_NHPI', 'UGDS_NRA', 'UGDS_UNKN', 'UGDS_WHITE', 'WOMENONLY', 'agg', 'aggregate', 'all', 'any', 'apply', 'backfill', 'bfill', 'boxplot', 'corr', 'corrwith', 'count', 'cov', 'cumcount', 'cummax', 'cummin', 'cumprod', 'cumsum', 'describe', 'diff', 'dtypes', 'expanding', 'ffill', 'fillna', 'filter', 'first', 'get_group', 'groups', 'head', 'hist', 'idxmax', 'idxmin', 'indices', 'last', 'mad', 'max', 'mean', 'median', 'min', 'ndim', 'ngroup', 'ngroups', 'nth', 'nunique', 'ohlc', 'pad', 'pct_change', 'pipe', 'plot', 'prod', 'quantile', 'rank', 'resample', 'rolling', 'sem', 'shift', 'size', 'skew', 'std', 'sum', 'tail', 'take', 'transform', 'tshift', 'var']


In [40]:
# Number of distinct (STABBR, RELAFFIL) groups.
grouped.ngroups

112

In [41]:
# Collect the group keys into a list and show the first six; each key is a
# (STABBR, RELAFFIL) tuple because two columns are grouped.
groups = list(grouped.groups)
groups[:6]

[('AK', 0), ('AK', 1), ('AL', 0), ('AL', 1), ('AR', 0), ('AR', 1)]

In [42]:
# Retrieve the rows of one group as a DataFrame; the key is passed as a tuple
# matching the two grouping columns.
grouped.get_group(('FL', 1))

Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
712,The Baptist College of Florida,Graceville,...,30800,20052
713,Barry University,Miami,...,44100,28250
714,Gooding Institute of Nurse Anesthesia,Panama City,...,,PrivacySuppressed
715,Bethune-Cookman University,Daytona Beach,...,29400,36250
724,Johnson University Florida,Kissimmee,...,26300,20199
...,...,...,...,...,...
7486,Strayer University-Coral Springs Campus,Coral Springs,...,49200,36173.5
7487,Strayer University-Fort Lauderdale Campus,Fort Lauderdale,...,49200,36173.5
7488,Strayer University-Miramar Campus,Miramar,...,49200,36173.5
7489,Strayer University-Doral,Miami,...,49200,36173.5


In [43]:
# Iterating over a groupby object yields (name, group) pairs: the group key
# and the DataFrame holding that group's rows.
# display() is called explicitly because only the last expression of a cell
# is rendered automatically, which does not happen for values inside a loop.
# The loop visits every group, so this cell produces one table per group.
from IPython.display import display
for name, group in grouped:
    print(name)
    display(group.head(3))

('AK', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,University of Alaska Anchorage,Anchorage,...,42500,19449.5
62,University of Alaska Fairbanks,Fairbanks,...,36200,19355.0
63,University of Alaska Southeast,Juneau,...,37400,16875.0


('AK', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
61,Alaska Bible College,Palmer,...,,PrivacySuppressed
64,Alaska Pacific University,Anchorage,...,47000.0,23250
5417,Alaska Christian College,Soldotna,...,,PrivacySuppressed


('AL', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,...,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,...,39700,21941.5
3,University of Alabama in Huntsville,Huntsville,...,45500,24097.0


('AL', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2,Amridge University,Montgomery,...,40100,23370
10,Birmingham Southern College,Birmingham,...,44200,27000
12,Concordia College Alabama,Selma,...,19900,PrivacySuppressed


('AR', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
128,University of Arkansas at Little Rock,Little Rock,...,33900,21736
129,University of Arkansas for Medical Sciences,Little Rock,...,61400,12500
130,ABC Beauty College Inc,Arkadelphia,...,PrivacySuppressed,16500


('AR', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
131,Arkansas Baptist College,Little Rock,...,22000,38000.0
134,Lyon College,Batesville,...,38600,25000.0
144,Baptist Health College-Little Rock,Little Rock,...,43200,13393.5


('AS', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4138,American Samoa Community College,Pago Pago,...,19800,PrivacySuppressed


('AZ', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
69,Collins College,Phoenix,...,25700,47000
71,Empire Beauty School-Tucson,Tucson,...,18200,9833
72,Thunderbird School of Global Management,Glendale,...,118900,PrivacySuppressed


('AZ', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
68,Everest College-Phoenix,Phoenix,...,28600,9500
70,Empire Beauty School-Paradise Valley,Phoenix,...,17800,9588
73,American Indian College Inc,Phoenix,...,PrivacySuppressed,PrivacySuppressed


('CA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
192,Academy of Art University,San Francisco,...,36000.0,35093
193,ITT Technical Institute-Rancho Cordova,Rancho Cordova,...,38800.0,25827.5
194,Academy of Chinese Culture and Health Sciences,Oakland,...,,PrivacySuppressed


('CA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
200,American Baptist Seminary of the West,Berkeley,...,,PrivacySuppressed
210,Azusa Pacific University,Azusa,...,50000,22500
214,Bethesda University,Anaheim,...,PrivacySuppressed,PrivacySuppressed


('CO', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
565,The Salon Professional Academy-Grand Junction,Grand Junction,...,PrivacySuppressed,9570
566,Adams State University,Alamosa,...,32800,16255
567,Aims Community College,Greeley,...,31400,8773


('CO', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
575,Colorado Christian University,Lakewood,...,36900.0,25808
589,Prince Institute-Rocky Mountains,Westminster,...,33400.0,20992
592,Denver Seminary,Littleton,...,,PrivacySuppressed


('CT', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
629,Paul Mitchell the School-Danbury,Danbury,...,19000,10486
630,Asnuntuck Community College,Enfield,...,30900,5500
631,Branford Hall Career Institute-Branford Campus,Branford,...,27900,9800


('CT', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
628,Albertus Magnus College,New Haven,...,52100.0,27763.5
645,Fairfield University,Fairfield,...,68500.0,26852.5
652,Holy Apostles College and Seminary,Cromwell,...,,PrivacySuppressed


('DC', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
698,University of the District of Columbia,Washington,...,34800,22393.5
700,Gallaudet University,Washington,...,26000,17750.0
701,George Washington University,Washington,...,65400,25350.0


('DC', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
696,American University,Washington,...,55900.0,24589
697,Catholic University of America,Washington,...,53900.0,26000
699,Pontifical Faculty of the Immaculate Conceptio...,Washington,...,,PrivacySuppressed


('DE', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
685,Margaret H Rollins School of Nursing at Beebe ...,Lewes,...,PrivacySuppressed,PrivacySuppressed
686,Dawn Career Institute Inc,Wilmington,...,22400,9500
688,Delaware Technical Community College-Terry,Dover,...,30700,8000


('DE', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
687,Delaware Technical Community College-Owens,Georgetown,...,28800,6750
689,Delaware Technical Community College-Stanton/W...,Wilmington,...,34000,7508
694,Wesley College,Dover,...,41600,31000


('FL', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
709,Wyotech-Daytona,Ormond Beach,...,31800,11600
710,The Art Institute of Fort Lauderdale,Fort Lauderdale,...,28800,29983
711,Atlantic Technical College,Coconut Creek,...,31900,PrivacySuppressed


('FL', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
712,The Baptist College of Florida,Graceville,...,30800.0,20052
713,Barry University,Miami,...,44100.0,28250
714,Gooding Institute of Nurse Anesthesia,Panama City,...,,PrivacySuppressed


('FM', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4214,College of Micronesia-FSM,Pohnpei,...,15700,PrivacySuppressed


('GA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
860,Abraham Baldwin Agricultural College,Tifton,...,32000,15085.5
862,Interactive College of Technology-Chamblee,Chamblee,...,21100,7376.0
863,Interactive College of Technology-Morrow,Morrow,...,21100,7376.0


('GA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
785,Luther Rice University & Seminary,Lithonia,...,39400,29500
861,Agnes Scott College,Decatur,...,38800,27000
867,Andrew College,Cuthbert,...,27500,12875


('GU', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4139,Guam Community College,Mangilao,...,22000,PrivacySuppressed
4140,University of Guam,Mangilao,...,29900,PrivacySuppressed


('GU', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
5289,Pacific Islands University,Mangilao,...,PrivacySuppressed,PrivacySuppressed


('HI', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
951,University of Hawaii at Hilo,Hilo,...,33500,19197
952,University of Hawaii at Manoa,Honolulu,...,43000,19000
953,Hawaii Institute of Hair Design,Honolulu,...,17300,5868


('HI', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
949,Heald College-Honolulu,Honolulu,...,35000,11676
950,Chaminade University of Honolulu,Honolulu,...,38400,22000
3805,Brigham Young University-Hawaii,Laie,...,41500,8291


('IA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1247,Allen College,Waterloo,...,49100,17090.5
1248,AIB College of Business,Des Moines,...,37000,19732.5
1251,Capri College-Dubuque,Dubuque,...,19400,8477.0


('IA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1249,Briar Cliff University,Sioux City,...,38100,24000
1250,Buena Vista University,Storm Lake,...,38300,23877.5
1253,American College of Hairstyling-Cedar Rapids,Cedar Rapids,...,PrivacySuppressed,PrivacySuppressed


('ID', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
965,Carrington College-Boise,Boise,...,25000,9500
967,Boise State University,Boise,...,35600,23500
968,Eastern Idaho Technical College,Idaho Falls,...,26600,11375


('ID', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
966,Boise Bible College,Boise,...,25500,19596
977,Northwest Nazarene University,Nampa,...,35900,25500
979,Brigham Young University-Idaho,Rexburg,...,38800,11000


('IL', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
43,Prince Institute-Southeast,Elmhurst,...,PrivacySuppressed,20992
981,Adler University,Chicago,...,,PrivacySuppressed
982,Alvareitas College of Cosmetology-Edwardsville,Edwardsville,...,PrivacySuppressed,9911


('IL', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
986,Augustana College,Rock Island,...,47900.0,27000
992,Blackburn College,Carlinville,...,37100.0,26000
1004,Catholic Theological Union at Chicago,Chicago,...,,PrivacySuppressed


('IN', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1165,Apex Academy of Hair Design Inc,Anderson,...,PrivacySuppressed,PrivacySuppressed
1166,Ball State University,Muncie,...,38800,25000
1168,Butler University,Indianapolis,...,55000,27000


('IN', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
990,Bethany Theological Seminary,Richmond,...,,PrivacySuppressed
1163,Ancilla College,Donaldson,...,29400.0,17000
1164,Anderson University,Anderson,...,35600.0,27000


('KS', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1326,Allen County Community College,Iola,...,29100,6900
1328,Barton County Community College,Great Bend,...,32200,8976
1332,Brown Mackie College-Kansas City,Lenexa,...,25200,16000


('KS', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1327,Baker University,Baldwin City,...,48800,25250
1329,Benedictine College,Atchison,...,39600,26000
1330,Bethany College,Lindsborg,...,38100,27000


('KY', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1389,Alice Lloyd College,Pippa Passes,...,33500,16495
1390,Asbury University,Wilmore,...,33600,25250
1392,Ashland Community and Technical College,Ashland,...,23700,11780


('KY', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1391,Asbury Theological Seminary,Wilmore,...,42500,PrivacySuppressed
1394,Bellarmine University,Louisville,...,46600,25000
1398,Brescia University,Owensboro,...,37500,30500


('LA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1461,Central Louisiana Technical Community College,Alexandria,...,PrivacySuppressed,PrivacySuppressed
1462,American School of Business,Shreveport,...,19400,9500
1463,Ayers Career College,Shreveport,...,25100,9500


('LA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1470,Centenary College of Louisiana,Shreveport,...,40400,25000.0
1478,Dillard University,New Orleans,...,32800,35000.0
1492,Louisiana College,Pineville,...,39100,23743.5


('MA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1619,Hult International Business School,Cambridge,...,,PrivacySuppressed
1620,New England College of Business and Finance,Boston,...,,18450
1621,American International College,Springfield,...,38900.0,27000


('MA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1623,Andover Newton Theological School,Newton Centre,...,,PrivacySuppressed
1624,Anna Maria College,Paxton,...,41900.0,25361
1626,Assumption College,Worcester,...,53600.0,27000


('MD', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1556,Aaron's Academy of Beauty,Waldorf,...,PrivacySuppressed,PrivacySuppressed
1557,Aesthetics Institute of Cosmetology,Gaithersburg,...,PrivacySuppressed,6333
1558,Allegany College of Maryland,Cumberland,...,29300,14072


('MD', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1570,Washington Adventist University,Takoma Park,...,44500,27000
1587,Loyola University Maryland,Baltimore,...,63000,27000
1599,Mount St Mary's University,Emmitsburg,...,49900,25995


('ME', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1526,Kaplan University-Maine Campus,S Portland,...,33400,29493
1527,College of the Atlantic,Bar Harbor,...,26400,19000
1528,Bates College,Lewiston,...,51600,16297


('ME', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1535,Husson University,Bangor,...,36900,26250
1549,Saint Joseph's College of Maine,Standish,...,39100,27000
4515,New England School of Communications,Bangor,...,27400,27000


('MH', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4561,College of the Marshall Islands,Majuro,...,PrivacySuppressed,PrivacySuppressed


('MI', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1224,West Michigan College of Barbering and Beauty,Kalamazoo,...,14800,PrivacySuppressed
1755,Hillsdale Beauty College,Hillsdale,...,PrivacySuppressed,PrivacySuppressed
1756,Northwestern Technological Institute,Southfield,...,30200,9500


('MI', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1753,Adrian College,Adrian,...,37100,27000
1754,Albion College,Albion,...,44900,27000
1757,Alma College,Alma,...,43200,27000


('MN', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
550,Walden University,Minneapolis,...,59700,29125
1863,Academy College,Bloomington,...,38500,29069
1864,Alexandria Technical & Community College,Alexandria,...,35100,12000


('MN', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1868,Augsburg College,Minneapolis,...,45700,27000
1872,Bethany Lutheran College,Mankato,...,34200,25000
1873,Bethel University,Saint Paul,...,45000,24069


('MO', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1357,Concorde Career College-Kansas City,Kansas City,...,22100,9500.0
1999,ITT Technical Institute-Earth City,Earth City,...,38800,25827.5
2001,House of Heavilin Beauty College-Blue Springs,Blue Springs,...,11600,9088.5


('MO', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1996,Aquinas Institute of Theology,Saint Louis,...,,PrivacySuppressed
1997,Assemblies of God Theological Seminary,Springfield,...,PrivacySuppressed,22062
1998,Avila University,Kansas City,...,41100,26625


('MP', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4141,Northern Marianas College,Saipan,...,19600,PrivacySuppressed


('MS', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1956,Alcorn State University,Alcorn State,...,30400,28000
1959,Chris Beauty College,Gulfport,...,15300,PrivacySuppressed
1960,Coahoma Community College,Clarksdale,...,21100,PrivacySuppressed


('MS', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1957,Belhaven University,Jackson,...,36800,29656
1958,Blue Mountain College,Blue Mountain,...,29200,PrivacySuppressed
1963,Creations College of Cosmetology,Tupelo,...,17900,PrivacySuppressed


('MT', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2108,Academy of Cosmetology,Bozeman,...,PrivacySuppressed,PrivacySuppressed
2109,Blackfeet Community College,Browning,...,15600,PrivacySuppressed
2110,Butte Academy of Beauty Culture,Butte,...,PrivacySuppressed,9500


('MT', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2113,Carroll College,Helena,...,45500,27000
2121,University of Great Falls,Great Falls,...,30700,24000
2130,Rocky Mountain College,Billings,...,38900,25626


('NC', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2642,College of the Albemarle,Elizabeth City,...,22300,PrivacySuppressed
2643,The Art Institute of Charlotte,Charlotte,...,28800,25167
2644,South Piedmont Community College,Polkton,...,21700,PrivacySuppressed


('NC', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2647,Barton College,Wilson,...,36000,27000
2649,Belmont Abbey College,Belmont,...,36000,27000
2650,Bennett College,Greensboro,...,26900,37000


('ND', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2772,Rasmussen College-North Dakota,Fargo,...,30900,21163
2773,Bismarck State College,Bismarck,...,38400,11588
2774,Dickinson State University,Dickinson,...,38800,19500


('ND', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2778,University of Jamestown,Jamestown,...,39600,27000
2782,University of Mary,Bismarck,...,45100,22722
2792,Trinity Bible College,Ellendale,...,25500,27592


('NE', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2133,La'James International College,Fremont,...,15900,PrivacySuppressed
2134,Bellevue University,Bellevue,...,52600,17188
2136,Bryan College of Health Sciences,Lincoln,...,50900,24280.5


('NE', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2135,Clarkson College,Omaha,...,47000,26000
2140,Concordia University-Nebraska,Seward,...,36100,26000
2141,Creighton University,Omaha,...,57100,23250


('NH', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2183,Colby-Sawyer College,New London,...,38800,27000
2184,Continental Academie of Hair Design-Hudson,Hudson,...,23200,9075
2185,Daniel Webster College,Nashua,...,50500,26999


('NH', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2193,Northeast Catholic College,Warner,...,,PrivacySuppressed
2210,Rivier University,Nashua,...,41700.0,25500
2211,Saint Anselm College,Manchester,...,52800.0,27000


('NJ', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2215,Eastwick College-Hackensack,Hackensack,...,27300,12519
2216,Atlantic Cape Community College,Mays Landing,...,28100,10005
2217,Fortis Institute-Wayne,Wayne,...,30400,10305


('NJ', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2221,Bloomfield College,Bloomfield,...,36100,30500.0
2224,Caldwell University,Caldwell,...,44400,26040.0
2226,Centenary College,Hackettstown,...,41100,25437.5


('NM', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
114,Pima Medical Institute-Albuquerque,Albuquerque,...,28200,8708
2303,Olympian Academy of Cosmetology,Alamogordo,...,17200,11705
2304,Central New Mexico Community College,Albuquerque,...,29500,10000


('NM', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
7419,Computer Career Center-Las Cruces,Las Cruces,...,21300,14250


('NV', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2170,Academy of Hair Design-Las Vegas,Las Vegas,...,17200,9500.0
2171,Career College of Northern Nevada,Sparks,...,23800,14020.5
2172,College of Southern Nevada,Las Vegas,...,31700,10500.0


('NV', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
6439,Touro University Nevada,Henderson,...,,PrivacySuppressed
7352,Marinello School of Beauty-Henderson,Henderson,...,21200.0,9796.5


('NY', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
678,Tri-State College of Acupuncture,New York,...,PrivacySuppressed,PrivacySuppressed
2334,Vaughn College of Aeronautics and Technology,Flushing,...,48700,22625
2335,Adelphi University,Garden City,...,51300,25000


('NY', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2375,Canisius College,Buffalo,...,45700.0,25000.0
2382,Christ the King Seminary,East Aurora,...,,
2394,Concordia College-New York,Bronxville,...,43200.0,26000.0


('OH', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2796,ETI Technical College,Niles,...,22700,13964
2797,The Art Institute of Cincinnati-AIC College of...,Cincinnati,...,29700,PrivacySuppressed
2798,Miami-Jacobs Career College-Independence,Independence,...,26700,22940


('OH', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2803,Allegheny Wesleyan College,Salem,...,PrivacySuppressed,PrivacySuppressed
2808,Ashland University,Ashland,...,39000,27000
2812,Baldwin Wallace University,Berea,...,44900,27000


('OK', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3009,American Broadcasting School-Oklahoma City,Oklahoma City,...,27300,7023
3013,Broken Arrow Beauty College-Broken Arrow,Broken Arrow,...,16800,9259
3014,Pontotoc Technology Center,Ada,...,28500,PrivacySuppressed


('OK', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3010,Bacone College,Muskogee,...,29700,26350.0
3011,Oklahoma Wesleyan University,Bartlesville,...,46100,21276.5
3012,Southern Nazarene University,Bethany,...,45800,18750.0


('OR', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3069,Academy of Hair Design-Salem,Salem,...,14800,18519
3070,Abdill Career College Inc,Medford,...,PrivacySuppressed,9500
3071,Paul Mitchell the School-Portland,Portland,...,,10194


('OR', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3081,Concordia University-Portland,Portland,...,40400,25000
3086,New Hope Christian College-Eugene,Eugene,...,26400,24921
3087,George Fox University,Newberg,...,41700,22000


('PA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3136,Abington Memorial Hospital Dixon School of Nur...,Willow Grove,...,63300,15836.0
3137,Jolie Hair and Beauty Academy-Hazleton,Hazleton,...,PrivacySuppressed,8847.5
3138,Keystone Technical Institute,Harrisburg,...,24400,11677.5


('PA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3139,Bryn Athyn College of the New Church,Bryn Athyn,...,PrivacySuppressed,22294.5
3141,Albright College,Reading,...,45800,28750.0
3144,Allegheny College,Meadville,...,48400,29046.0


('PR', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4142,Institute of Beauty Careers,Arecibo,...,12000,PrivacySuppressed
4143,Educational Technical College-Recinto de Bayamon,Bayamon,...,14500,PrivacySuppressed
4144,American University of Puerto Rico,Bayamon,...,19300,3920


('PR', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4147,Universidad Adventista de las Antillas,Mayaguez,...,18900,13800
4149,Universidad Central de Bayamon,Bayamón,...,18500,8250
4154,Pontifical Catholic University of Puerto Rico-...,Arecibo,...,17900,13195


('PW', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4215,Palau Community College,Koror,...,24700,PrivacySuppressed


('RI', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3402,Brown University,Providence,...,59700,15500
3403,Bryant University,Smithfield,...,64500,27000
3404,Johnson & Wales University-Providence,Providence,...,35300,27000


('RI', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3401,Empire Beauty School-Providence,Providence,...,21000,9833
3408,Providence College,Providence,...,57700,27000
3414,Salve Regina University,Newport,...,49700,27000


('SC', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3417,Aiken Technical College,Graniteville,...,24500,9625
3420,Technical College of the Lowcountry,Beaufort,...,25300,7500
3422,Bob Jones University,Greenville,...,PrivacySuppressed,19000


('SC', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3418,Allen University,Columbia,...,21100,37676
3419,Charleston Southern University,Charleston,...,35700,27741
3421,Benedict College,Columbia,...,21400,44000


('SD', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3479,Black Hills Beauty College,Rapid City,...,16200,11790
3480,Black Hills State University,Spearfish,...,34400,25625
3481,Kilian Community College,Sioux Falls,...,23100,17125


('SD', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3478,Augustana University,Sioux Falls,...,41800,27000
3483,Dakota Wesleyan University,Mitchell,...,34500,27000
3486,Avera McKennan Hospital School of Radiologic T...,Sioux Falls,...,PrivacySuppressed,PrivacySuppressed


('TN', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
1205,ITT Technical Institute-Nashville,Nashville,...,38800,25827.5
3507,Arnolds Beauty School,Milan,...,16000,PrivacySuppressed
3508,Tennessee College of Applied Technology-Athens,Athens,...,26600,PrivacySuppressed


('TN', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3506,American Baptist College,Nashville,...,PrivacySuppressed,25000
3510,Baptist Memorial College of Health Sciences,Memphis,...,54100,30000
3511,Belmont University,Nashville,...,41800,22707


('TX', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3611,Alvin Community College,Alvin,...,34500,6750
3612,Amarillo College,Amarillo,...,31700,10950
3613,Angelina College,Lufkin,...,26900,PrivacySuppressed


('TX', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3610,Abilene Christian University,Abilene,...,40200,25985
3615,Arlington Baptist College,Arlington,...,34200,22905
3618,Austin College,Sherman,...,47800,26000


('UT', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3802,AmeriTech College-Provo,Provo,...,24700,24370
3803,Bridgerland Applied Technology College,Logan,...,24300,PrivacySuppressed
3806,Broadview University-West Jordan,West Jordan,...,25500,28458


('UT', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3804,Brigham Young University-Provo,Provo,...,57200,11000.0
3817,Latter-day Saints Business College,Salt Lake City,...,35100,5799.0
3818,Everest College-Salt Lake City,West Valley City,...,24400,10632.5


('VA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
704,Medtech Institute,Falls Church,...,26300,9236
3850,Bar Palma Beauty Careers Academy,Roanoke,...,16900,9731
3851,Advanced Technology Institute,Virginia Beach,...,38000,16279


('VA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3852,Averett University,Danville,...,42400,25000
3853,Bluefield College,Bluefield,...,40000,18873
3854,Bridgewater College,Bridgewater,...,40800,27000


('VI', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4216,University of the Virgin Islands,Charlotte Amalie,...,31800,15150


('VI', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
7404,University of the Virgin Islands-Albert A. Sheen,St. Croix,...,31800,15150


('VT', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3829,Bennington College,Bennington,...,24600,27000
3830,Burlington College,Burlington,...,26000,25000
3831,Castleton University,Castleton,...,34900,25000


('VT', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3835,Green Mountain College,Poultney,...,30100,25449
3843,Saint Michael's College,Colchester,...,46600,27400
3845,College of St Joseph,Rutland,...,34700,24127


('WA', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3943,Beauty Academy,Wenatchee,...,PrivacySuppressed,8718.5
3944,The Art Institute of Seattle,Seattle,...,34100,25937.5
3945,Evergreen Beauty and Barber College-Bellevue,Bellevue,...,PrivacySuppressed,7917.0


('WA', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
3967,Gonzaga University,Spokane,...,53000,25500.0
3981,Trinity Lutheran College,Everett,...,37100,25000.0
3985,Northwest University,Kirkland,...,37700,23724.5


('WI', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4063,Advanced Institute of Hair Design-Glendale,Glendale,...,24000,10314
4064,VICI Aveda Institute,Greenfield,...,24000,10314
4066,Madison Area Technical College,Madison,...,35000,14250


('WI', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4065,Alverno College,Milwaukee,...,37100,32606.5
4070,Cardinal Stritch University,Milwaukee,...,48500,27000.0
4071,Carroll University,Waukesha,...,41300,27000.0


('WV', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
2842,Scott College of Cosmetology,Wheeling,...,14800,9250
4019,B M Spurr School of Practical Nursing,Glen Dale,...,PrivacySuppressed,PrivacySuppressed
4020,Ben Franklin Career Center,Dunbar,...,20800,PrivacySuppressed


('WV', 1)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4016,Alderson Broaddus University,Philippi,...,46000,27000.0
4018,Appalachian Bible College,Mount Hope,...,28700,9300.0
4027,Davis & Elkins College,Elkins,...,35000,23840.5


('WY', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
4128,Casper College,Casper,...,34800,10764
4129,Central Wyoming College,Riverton,...,25200,8757
4130,Eastern Wyoming College,Torrington,...,25900,10000


In [44]:
# Peek at a single group: iterating the groupby object yields
# (group key, sub-DataFrame) pairs, and `break` stops after the first pair
# so only one group is printed and displayed.
for name, group in grouped:
    print(name)
    display(group)
    break

('AK', 0)


Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
60,University of Alaska Anchorage,Anchorage,...,42500,19449.5
62,University of Alaska Fairbanks,Fairbanks,...,36200,19355
63,University of Alaska Southeast,Juneau,...,37400,16875
65,AVTEC-Alaska's Institute of Technology,Seward,...,33500,PrivacySuppressed
66,Charter College-Anchorage,Anchorage,...,39200,13875
67,Alaska Career College,Anchorage,...,28700,8994
5171,Ilisagvik College,Barrow,...,24900,PrivacySuppressed


In [45]:
grouped.head(1)

Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,...,30300,33888
2,Amridge University,Montgomery,...,40100,23370
43,Prince Institute-Southeast,Elmhurst,...,PrivacySuppressed,20992
60,University of Alaska Anchorage,Anchorage,...,42500,19449.5
61,Alaska Bible College,Palmer,...,,PrivacySuppressed
...,...,...,...,...,...
4561,College of the Marshall Islands,Majuro,...,PrivacySuppressed,PrivacySuppressed
5289,Pacific Islands University,Mangilao,...,PrivacySuppressed,PrivacySuppressed
6439,Touro University Nevada,Henderson,...,,PrivacySuppressed
7404,University of the Virgin Islands-Albert A. Sheen,St. Croix,...,31800,15150


In [46]:
grouped.nth([1, -1])

Unnamed: 0_level_0,Unnamed: 1_level_0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
STABBR,RELAFFIL,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AK,0,University of Alaska Fairbanks,Fairbanks,...,36200,19355
AK,0,Ilisagvik College,Barrow,...,24900,PrivacySuppressed
AK,1,Alaska Pacific University,Anchorage,...,47000,23250
AK,1,Alaska Christian College,Soldotna,...,,PrivacySuppressed
AL,0,University of Alabama at Birmingham,Birmingham,...,39700,21941.5
...,...,...,...,...,...,...
WV,0,BridgeValley Community & Technical College,South Charleston,...,,9429.5
WV,1,Appalachian Bible College,Mount Hope,...,28700,9300
WV,1,West Virginia Business College-Nutter Fort,Nutter Fort,...,16700,19258
WY,0,Central Wyoming College,Riverton,...,25200,8757


## 9.7 過濾特定的組別

In [47]:
college = pd.read_csv('data/college.csv', index_col='INSTNM')
grouped = college.groupby('STABBR')
grouped.ngroups

59

In [48]:
college['STABBR'].nunique()

59

In [49]:
def check_minority(df, threshold):
    """Tell whether a group's weighted minority share exceeds ``threshold``.

    The minority share of each row is taken as ``1 - UGDS_WHITE`` and is
    weighted by the row's ``UGDS`` value, so rows with a larger ``UGDS``
    contribute more to the group total.
    (Assumes ``UGDS_WHITE`` is a fraction in [0, 1] -- TODO confirm.)

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``UGDS`` and ``UGDS_WHITE``.
    threshold : float
        Share that the weighted minority fraction has to exceed.

    Returns
    -------
    bool
        True when (sum of UGDS * (1 - UGDS_WHITE)) / (sum of UGDS)
        is strictly greater than ``threshold``.
    """
    enrollment = df['UGDS']
    minority_headcount = (enrollment * (1 - df['UGDS_WHITE'])).sum()
    return minority_headcount / enrollment.sum() > threshold

In [50]:
college_filtered = grouped.filter(check_minority, threshold=.5)
college_filtered

Unnamed: 0_level_0,CITY,STABBR,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Everest College-Phoenix,Phoenix,AZ,...,28600,9500
Collins College,Phoenix,AZ,...,25700,47000
Empire Beauty School-Paradise Valley,Phoenix,AZ,...,17800,9588
Empire Beauty School-Tucson,Tucson,AZ,...,18200,9833
Thunderbird School of Global Management,Glendale,AZ,...,118900,PrivacySuppressed
...,...,...,...,...,...
WestMed College - Merced,Merced,CA,...,,15623.5
Vantage College,El Paso,TX,...,,9500
SAE Institute of Technology San Francisco,Emeryville,CA,...,,9500
Bay Area Medical Academy - San Jose Satellite Location,San Jose,CA,...,,PrivacySuppressed


In [51]:
college.shape

(7535, 26)

In [52]:
college_filtered.shape

(3028, 26)

In [53]:
college_filtered['STABBR'].nunique()

20

In [54]:
college_filtered_20 = grouped.filter(check_minority, threshold=.2)
college_filtered_20.shape

(7461, 26)

In [55]:
college_filtered_20['STABBR'].nunique()

57

In [56]:
college_filtered_70 = grouped.filter(check_minority, threshold=.7)
college_filtered_70.shape

(957, 26)

In [57]:
college_filtered_70['STABBR'].nunique()

10

## 9.8 分組轉換特定欄位的資料

In [58]:
weight_loss = pd.read_csv('data/weight_loss.csv')
weight_loss.query('Month == "Jan"')

Unnamed: 0,Name,Month,Week,Weight
0,Bob,Jan,Week 1,291
1,Amy,Jan,Week 1,197
2,Bob,Jan,Week 2,288
3,Amy,Jan,Week 2,189
4,Bob,Jan,Week 3,283
5,Amy,Jan,Week 3,189
6,Bob,Jan,Week 4,283
7,Amy,Jan,Week 4,190


In [59]:
def percent_loss(s):
    """Express every value of ``s`` as a percent change from its first value.

    The first element is the baseline, so the first entry of the result is
    always 0 and later entries are negative when the value has dropped.

    Parameters
    ----------
    s : Series
        Values ordered so that the first element is the starting point.

    Returns
    -------
    Series
        ``(s - s.iloc[0]) / s.iloc[0] * 100`` with the same index as ``s``.
    """
    baseline = s.iloc[0]
    change = s - baseline
    return change / baseline * 100

In [60]:
(weight_loss
    .query('Name=="Bob" and Month=="Jan"')
    ['Weight']
    .pipe(percent_loss)
)

0    0.000000
2   -1.030928
4   -2.749141
6   -2.749141
Name: Weight, dtype: float64

In [61]:
(weight_loss
    .groupby(['Name', 'Month'])
    ['Weight'] 
    .transform(percent_loss)
)

0     0.000000
1     0.000000
2    -1.030928
3    -4.060914
4    -2.749141
        ...   
27   -3.529412
28   -3.065134
29   -3.529412
30   -4.214559
31   -5.294118
Name: Weight, Length: 32, dtype: float64

In [62]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Name=="Bob" and Month in ["Jan", "Feb"]')
)

Unnamed: 0,Name,Month,...,Weight,percent_loss
0,Bob,Jan,...,291,0.0
2,Bob,Jan,...,288,-1.0
4,Bob,Jan,...,283,-2.7
6,Bob,Jan,...,283,-2.7
8,Bob,Feb,...,283,0.0
10,Bob,Feb,...,275,-2.8
12,Bob,Feb,...,268,-5.3
14,Bob,Feb,...,268,-5.3


In [63]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
)

Unnamed: 0,Name,Month,...,Weight,percent_loss
6,Bob,Jan,...,283,-2.7
7,Amy,Jan,...,190,-3.6
14,Bob,Feb,...,268,-5.3
15,Amy,Feb,...,173,-8.9
22,Bob,Mar,...,261,-2.6
23,Amy,Mar,...,170,-1.7
30,Bob,Apr,...,250,-4.2
31,Amy,Apr,...,161,-5.3


In [64]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
)

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Apr,-5.3,-4.2
Feb,-8.9,-5.3
Jan,-3.6,-2.7
Mar,-1.7,-2.6


In [65]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
    .assign(winner=lambda df_:
            np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
)

Name,Amy,Bob,winner
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,-5.3,-4.2,Amy
Feb,-8.9,-5.3,Amy
Jan,-3.6,-2.7,Amy
Mar,-1.7,-2.6,Bob


In [66]:
# Build the Month x Name table of week-4 percent loss, add the monthly
# winner, and shade the lowest (i.e. largest loss) value in each row.
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
    .assign(winner=lambda df_:
            np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
    # Limit the highlight to the two numeric columns: the frame also holds
    # the string column `winner`, and a row-wise min over mixed str/float
    # values is not well defined (newer pandas no longer drops non-numeric
    # columns automatically when `subset` is omitted).
    .style.highlight_min(axis=1, color='lightgrey',
                         subset=['Amy', 'Bob'])
)

Name,Amy,Bob,winner
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Apr,-5.3,-4.2,Amy
Feb,-8.9,-5.3,Amy
Jan,-3.6,-2.7,Amy
Mar,-1.7,-2.6,Bob


In [67]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
    .assign(winner=lambda df_:
            np.where(df_.Amy < df_.Bob, 'Amy', 'Bob'))
    .winner
    .value_counts()
)

Amy    3
Bob    1
Name: winner, dtype: int64

In [68]:
(weight_loss
    .assign(percent_loss=(weight_loss
        .groupby(['Name', 'Month'])
        ['Weight'] 
        .transform(percent_loss)
        .round(1)),
            Month=pd.Categorical(weight_loss.Month,
                  categories=['Jan', 'Feb', 'Mar', 'Apr'],
                  ordered=True))
    .query('Week == "Week 4"')
    .pivot(index='Month', columns='Name',
           values='percent_loss')
)

Name,Amy,Bob
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,-3.6,-2.7
Feb,-8.9,-5.3
Mar,-1.7,-2.6
Apr,-5.3,-4.2


## 9.9 使用apply()計算加權平均數

In [69]:
college = pd.read_csv('data/college.csv')
subset = ['UGDS', 'SATMTMID', 'SATVRMID']
college2 = college.dropna(subset=subset)
college.shape

(7535, 27)

In [70]:
college2.shape

(1184, 27)

In [71]:
def weighted_math_average(df):
    """Return the ``UGDS``-weighted mean of ``SATMTMID``, truncated to int.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns ``UGDS`` (the weights) and ``SATMTMID``.

    Returns
    -------
    int
        sum(UGDS * SATMTMID) / sum(UGDS), truncated by ``int()``.
    """
    enrollment = df['UGDS']
    weighted_total = (enrollment * df['SATMTMID']).sum()
    return int(weighted_total / enrollment.sum())

In [72]:
college2.groupby('STABBR').apply(weighted_math_average) 

STABBR
AK    503
AL    536
AR    529
AZ    569
CA    564
     ... 
VT    566
WA    555
WI    593
WV    500
WY    540
Length: 53, dtype: int64

In [73]:
# Intentionally left commented out.
# NOTE(review): presumably this raises, because .agg() passes the function
# one column at a time, while weighted_math_average indexes its argument
# with df['UGDS'] / df['SATMTMID'] -- confirm before re-enabling.
# (college2
#     .groupby('STABBR')
#     .agg(weighted_math_average)
# )

In [74]:
# Intentionally left commented out.
# NOTE(review): after selecting ['SATMTMID'] each group is a single column,
# so the 'UGDS' weights that weighted_math_average looks up are presumably
# unavailable and the call fails -- confirm before re-enabling.
# (college2
#     .groupby('STABBR')
#     ['SATMTMID'] 
#     .agg(weighted_math_average)
# )

In [75]:
def weighted_average(df):
    """Summarise SAT math/verbal scores for the rows of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must provide ``UGDS`` (used as weights), ``SATMTMID`` and
        ``SATVRMID``.

    Returns
    -------
    Series
        Five entries, in this order: ``w_math_avg`` and ``w_verbal_avg``
        (``UGDS``-weighted means), ``math_avg`` and ``verbal_avg`` (plain
        means), and ``count`` (number of rows in ``df``).
    """
    enrollment = df['UGDS']
    total_enrollment = enrollment.sum()
    math_scores = df['SATMTMID']
    verbal_scores = df['SATVRMID']
    return pd.Series({
        'w_math_avg': (enrollment * math_scores).sum() / total_enrollment,
        'w_verbal_avg': (enrollment * verbal_scores).sum() / total_enrollment,
        'math_avg': math_scores.mean(),
        'verbal_avg': verbal_scores.mean(),
        'count': len(df),
    })

In [76]:
weighted_average(college2)

w_math_avg       559.408812
w_verbal_avg     542.989462
math_avg         530.958615
verbal_avg       522.775338
count           1184.000000
dtype: float64

In [77]:
(college2
    .groupby('STABBR')
    .apply(weighted_average)
    .astype(int)
)

Unnamed: 0_level_0,w_math_avg,w_verbal_avg,...,verbal_avg,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,503,555,...,555,1
AL,536,533,...,508,21
AR,529,504,...,491,16
AZ,569,557,...,538,6
CA,564,539,...,549,72
...,...,...,...,...,...
VT,566,564,...,527,8
WA,555,541,...,548,18
WI,593,556,...,516,14
WV,500,487,...,473,17


In [78]:
from scipy.stats import gmean, hmean
def calculate_means(df):
    """Tabulate four kinds of mean for the SAT math and verbal columns.

    Parameters
    ----------
    df : DataFrame
        Must provide ``SATMTMID``, ``SATVRMID`` and ``UGDS`` (the weights
        used for the weighted mean).

    Returns
    -------
    DataFrame
        Indexed by 'Arithmetic', 'Weighted', 'Geometric', 'Harmonic', with
        the columns ``SATMTMID``, ``SATVRMID`` and ``count`` (row count of
        ``df``, repeated on every line). Everything is cast to int, which
        truncates the fractional part of the means.
    """
    mean_kinds = ['Arithmetic', 'Weighted', 'Geometric', 'Harmonic']
    table = {}
    for col in ('SATMTMID', 'SATVRMID'):
        scores = df[col]
        # Keep the list order aligned with mean_kinds.
        table[col] = [
            scores.mean(),
            np.average(scores, weights=df['UGDS']),
            gmean(scores),
            hmean(scores),
        ]
    table['count'] = len(df)
    return pd.DataFrame(table, index=mean_kinds).astype(int)
(college2
    .groupby('STABBR')
    .apply(calculate_means)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,SATMTMID,SATVRMID,count
STABBR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AK,Arithmetic,503,555,1
AK,Weighted,503,555,1
AK,Geometric,503,555,1
AK,Harmonic,503,555,1
AL,Arithmetic,504,508,21
...,...,...,...,...
WV,Harmonic,480,472,17
WY,Arithmetic,540,535,1
WY,Weighted,540,535,1
WY,Geometric,540,534,1


## 9.10 以連續變化的數值進行分組

In [79]:
flights = pd.read_csv('data/flights.csv')
flights

Unnamed: 0,MONTH,DAY,...,DIVERTED,CANCELLED
0,1,1,...,0,0
1,1,1,...,0,0
2,1,1,...,0,0
3,1,1,...,0,0
4,1,1,...,0,0
...,...,...,...,...,...
58487,12,31,...,0,0
58488,12,31,...,0,0
58489,12,31,...,0,0
58490,12,31,...,0,0


In [80]:
# Bin edges for flight distance; the -inf / +inf end points make the first
# and last bins open-ended so every DIST value falls into some bin.
bins = [-np.inf, 200, 500, 1000, 2000, np.inf]
# pd.cut labels each flight with its interval; the intervals are closed on
# the right, e.g. (500.0, 1000.0].
cuts = pd.cut(flights['DIST'], bins=bins)
cuts

0         (500.0, 1000.0]
1        (1000.0, 2000.0]
2         (500.0, 1000.0]
3        (1000.0, 2000.0]
4        (1000.0, 2000.0]
               ...       
58487    (1000.0, 2000.0]
58488      (200.0, 500.0]
58489      (200.0, 500.0]
58490     (500.0, 1000.0]
58491     (500.0, 1000.0]
Name: DIST, Length: 58492, dtype: category
Categories (5, interval[float64]): [(-inf, 200.0] < (200.0, 500.0] < (500.0, 1000.0] < (1000.0, 2000.0] < (2000.0, inf]]

In [81]:
cuts.value_counts()

(500.0, 1000.0]     20659
(200.0, 500.0]      15874
(1000.0, 2000.0]    14186
(2000.0, inf]        4054
(-inf, 200.0]        3719
Name: DIST, dtype: int64

In [82]:
(flights
    .groupby(cuts)
    ['AIRLINE']
    .value_counts(normalize=True) 
    .round(3)
)

DIST           AIRLINE
(-inf, 200.0]  OO         0.326
               EV         0.289
               MQ         0.211
               DL         0.086
               AA         0.052
                          ...  
(2000.0, inf]  WN         0.046
               HA         0.028
               NK         0.019
               AS         0.012
               F9         0.004
Name: AIRLINE, Length: 57, dtype: float64

In [83]:
(flights
  .groupby(cuts)
  ['AIR_TIME']
  .quantile(q=[.25, .5, .75]) 
  .div(60)
  .round(2)
)

DIST                  
(-inf, 200.0]     0.25    0.43
                  0.50    0.50
                  0.75    0.57
(200.0, 500.0]    0.25    0.77
                  0.50    0.92
                          ... 
(1000.0, 2000.0]  0.50    2.93
                  0.75    3.40
(2000.0, inf]     0.25    4.30
                  0.50    4.70
                  0.75    5.03
Name: AIR_TIME, Length: 15, dtype: float64

## 9.11 案例演練：計算城市之間的航班總數

In [84]:
flights = pd.read_csv('data/flights.csv')
flights_ct = flights.groupby(['ORG_AIR', 'DEST_AIR']).size()
flights_ct

ORG_AIR  DEST_AIR
ATL      ABE          31
         ABQ          16
         ABY          19
         ACY           6
         AEX          40
                    ... 
SFO      SNA         122
         STL          20
         SUN          10
         TUS          20
         XNA           2
Length: 1130, dtype: int64

In [85]:
flights_ct.loc[[('ATL', 'IAH'), ('IAH', 'ATL')]]

ORG_AIR  DEST_AIR
ATL      IAH         121
IAH      ATL         148
dtype: int64

In [86]:
f_part3 = (flights  
  [['ORG_AIR', 'DEST_AIR']] 
  .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
)
f_part3

Unnamed: 0,0,1
0,LAX,SLC
1,DEN,IAD
2,DFW,VPS
3,DCA,DFW
4,LAX,MCI
...,...,...
58487,DFW,SFO
58488,LAS,SFO
58489,SBA,SFO
58490,ATL,MSP


In [87]:
rename_dict = {0:'AIR1', 1:'AIR2'}  
(flights    
  [['ORG_AIR', 'DEST_AIR']]
  .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
  .rename(columns=rename_dict)
  .groupby(['AIR1', 'AIR2'])
  .size()
)

AIR1  AIR2
ABE   ATL      31
      ORD      24
ABI   DFW      74
ABQ   ATL      16
      DEN      46
             ... 
SFO   SNA     122
      STL      20
      SUN      10
      TUS      20
      XNA       2
Length: 1085, dtype: int64

In [88]:
(flights     
  [['ORG_AIR', 'DEST_AIR']]
  .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
  .rename(columns=rename_dict)
  .groupby(['AIR1', 'AIR2'])
  .size()
  .loc[('ATL', 'IAH')]
)

269

In [89]:
# Intentionally left commented out: each origin/destination pair is sorted
# alphabetically before grouping, so 'ATL' always lands in AIR1 and the
# reversed key ('IAH', 'ATL') cannot appear in the index.
# NOTE(review): the lookup presumably raises KeyError -- confirm.
# (flights     
#   [['ORG_AIR', 'DEST_AIR']]
#   .apply(lambda ser:
#          ser.sort_values().reset_index(drop=True),
#          axis='columns')
#   .rename(columns=rename_dict)
#   .groupby(['AIR1', 'AIR2'])
#   .size()
#   .loc[('IAH', 'ATL')]
# )

In [90]:
# Vectorised alternative to the row-wise .apply(): np.sort orders the two
# airport codes within each row and returns a plain NumPy array, so the
# column labels are lost at this point.
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
data_sorted[:10]

array([['LAX', 'SLC'],
       ['DEN', 'IAD'],
       ['DFW', 'VPS'],
       ['DCA', 'DFW'],
       ['LAX', 'MCI'],
       ['IAH', 'SAN'],
       ['DFW', 'MSY'],
       ['PHX', 'SFO'],
       ['ORD', 'STL'],
       ['IAH', 'SJC']], dtype=object)

In [91]:
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])
flights_sort2.equals(f_part3.rename(columns={0:'AIR1', 1:'AIR2'}))

True

In [92]:
%%timeit
flights_sort = (flights   # doctest: +SKIP
    [['ORG_AIR', 'DEST_AIR']] 
   .apply(lambda ser:
         ser.sort_values().reset_index(drop=True),
         axis='columns')
)

50.8 s ± 4.86 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [93]:
%%timeit
data_sorted = np.sort(flights[['ORG_AIR', 'DEST_AIR']])
flights_sort2 = pd.DataFrame(data_sorted, columns=['AIR1', 'AIR2'])

13.2 ms ± 911 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 9.12 案例演練：尋找航班的連續準時記錄

In [94]:
s = pd.Series([0, 1, 1, 0, 1, 1, 1, 0])
s

0    0
1    1
2    1
3    0
4    1
5    1
6    1
7    0
dtype: int64

In [95]:
s1 = s.cumsum()
s1

0    0
1    1
2    2
3    2
4    3
5    4
6    5
7    5
dtype: int64

In [96]:
s.mul(s1)

0    0
1    1
2    2
3    0
4    3
5    4
6    5
7    0
dtype: int64

In [97]:
s.mul(s1).diff()

0    NaN
1    1.0
2    1.0
3   -2.0
4    3.0
5    1.0
6    1.0
7   -5.0
dtype: float64

In [98]:
(s.mul(s.cumsum())
  .diff()
  .where(lambda x: x < 0))

0    NaN
1    NaN
2    NaN
3   -2.0
4    NaN
5    NaN
6    NaN
7   -5.0
dtype: float64

In [99]:
(s.mul(s.cumsum())
  .diff()
  .where(lambda x: x < 0)
  .ffill())

0    NaN
1    NaN
2    NaN
3   -2.0
4   -2.0
5   -2.0
6   -2.0
7   -5.0
dtype: float64

In [100]:
(s.mul(s.cumsum())
  .diff()
  .where(lambda x: x < 0)
  .ffill()
  .add(s.cumsum(), fill_value=0))

0    0.0
1    1.0
2    2.0
3    0.0
4    1.0
5    2.0
6    3.0
7    0.0
dtype: float64

In [101]:
flights = pd.read_csv('data/flights.csv')
(flights.assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
        [['AIRLINE', 'ORG_AIR', 'ON_TIME']])

Unnamed: 0,AIRLINE,ORG_AIR,ON_TIME
0,WN,LAX,0
1,UA,DEN,1
2,MQ,DFW,0
3,AA,DFW,1
4,WN,LAX,0
...,...,...,...
58487,AA,SFO,1
58488,F9,LAS,1
58489,OO,SFO,1
58490,WN,MSP,0


In [102]:
def max_streak(s):
    """Return the length of the longest run of consecutive 1s in ``s``.

    Parameters
    ----------
    s : Series
        Sequence of 0/1 flags in the order the streak should be measured.

    Returns
    -------
    float
        Largest value of the per-position streak counter (a float because
        the intermediate steps introduce NaN).
    """
    running_total = s.cumsum()
    # Zero the running total wherever the flag is 0; the difference to the
    # previous position is then negative exactly where a streak just ended.
    step = (s * running_total).diff()
    # Keep only those negative drops and carry each one forward: it is the
    # amount to subtract from the running total until the next reset.
    carried_reset = step.where(step < 0).ffill()
    # Positions before the first reset have no offset (NaN -> 0).
    streak_length = carried_reset.add(running_total, fill_value=0)
    return streak_length.max()

In [103]:
(flights
    # ON_TIME is 1 when ARR_DELAY is below 15, otherwise 0.
    # NOTE(review): rows with a missing ARR_DELAY presumably become 0 too -- confirm.
    .assign(ON_TIME=flights['ARR_DELAY'].lt(15).astype(int))
    # Put the flights in chronological order first; max_streak depends on
    # the order of the rows inside each group.
    .sort_values(['MONTH', 'DAY', 'SCHED_DEP']) 
    .groupby(['AIRLINE', 'ORG_AIR'])
    ['ON_TIME'] 
    # Per airline/origin pair: on-time rate, number of flights, longest
    # on-time streak.
    .agg(['mean', 'size', max_streak])
    .round(2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,size,max_streak
AIRLINE,ORG_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,ATL,0.82,233,15
AA,DEN,0.74,219,17
AA,DFW,0.78,4006,64
AA,IAH,0.80,196,24
AA,LAS,0.79,374,29
...,...,...,...,...
WN,LAS,0.77,2031,39
WN,LAX,0.70,1135,23
WN,MSP,0.84,237,32
WN,PHX,0.77,1724,33
