In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import zipfile
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Data files are read from a `back_data` folder under the current working directory.
p1 = Path.cwd().joinpath('back_data')
flights = pd.read_csv(p1.joinpath('flights.csv'))
flights.head()

Unnamed: 0,MONTH,DAY,WEEKDAY,AIRLINE,ORG_AIR,DEST_AIR,SCHED_DEP,DEP_DELAY,AIR_TIME,DIST,SCHED_ARR,ARR_DELAY,DIVERTED,CANCELLED
0,1,1,4,WN,LAX,SLC,1625,58.0,94.0,590,1905,65.0,0,0
1,1,1,4,UA,DEN,IAD,823,7.0,154.0,1452,1333,-13.0,0,0
2,1,1,4,MQ,DFW,VPS,1305,36.0,85.0,641,1453,35.0,0,0
3,1,1,4,AA,DFW,DCA,1555,7.0,126.0,1192,1935,-7.0,0,0
4,1,1,4,WN,LAX,MCI,1720,48.0,166.0,1363,2225,39.0,0,0


In [3]:
# Average arrival delay for each airline.
flights.groupby('AIRLINE')['ARR_DELAY'].mean()

AIRLINE
AA     5.542661
AS    -0.833333
B6     8.692593
DL     0.339691
EV     7.034580
F9    13.630651
HA     4.972973
MQ     6.860591
NK    18.436070
OO     7.593463
UA     7.765755
US     1.681105
VX     5.348884
WN     6.397353
Name: ARR_DELAY, dtype: float64

In [4]:
# Number of cancelled flights for every (airline, weekday) pair.
flights.groupby(['AIRLINE', 'WEEKDAY'])['CANCELLED'].agg('sum')

AIRLINE  WEEKDAY
AA       1          41
         2           9
         3          16
         4          20
         5          18
                    ..
WN       3          18
         4          10
         5           7
         6          10
         7           7
Name: CANCELLED, Length: 98, dtype: int64

In [5]:
# Sum and mean of the CANCELLED and DIVERTED columns for every
# (airline, weekday) pair; the result has two-level column labels.
(
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    [['CANCELLED', 'DIVERTED']]
    .agg(['sum', 'mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,DIVERTED,DIVERTED
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,41,0.032106,6,0.004699
AA,2,9,0.007341,2,0.001631
AA,3,16,0.011949,2,0.001494
AA,4,20,0.015004,5,0.003751
AA,5,18,0.014151,1,0.000786
...,...,...,...,...,...
WN,3,18,0.014118,2,0.001569
WN,4,10,0.007911,4,0.003165
WN,5,7,0.005828,0,0.000000
WN,6,10,0.010132,3,0.003040


In [6]:
# Per-route (origin, destination) statistics: a different list of
# aggregations is applied to each column via a dict.
(
    flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({
        'CANCELLED': ['size', 'sum', 'mean'],
        'AIR_TIME': ['mean', 'var'],
    })
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED,CANCELLED,CANCELLED,AIR_TIME,AIR_TIME
Unnamed: 0_level_1,Unnamed: 1_level_1,size,sum,mean,mean,var
ORG_AIR,DEST_AIR,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
ATL,ABE,31,0,0.000000,96.387097,45.778495
ATL,ABQ,16,0,0.000000,170.500000,87.866667
ATL,ABY,19,0,0.000000,28.578947,6.590643
ATL,ACY,6,0,0.000000,91.333333,11.466667
ATL,AEX,40,0,0.000000,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,122,4,0.032787,64.059322,11.338331
SFO,STL,20,0,0.000000,198.900000,101.042105
SFO,SUN,10,0,0.000000,78.000000,25.777778
SFO,TUS,20,0,0.000000,100.200000,35.221053


In [12]:
# Same per-route aggregation as before, kept in `res` so its columns can be renamed.
res = flights.groupby(['ORG_AIR', 'DEST_AIR']).agg({
    'CANCELLED': ['size', 'sum', 'mean'],
    'AIR_TIME': ['mean', 'var'],
})
# Collapse each (column, aggregation) label pair into one "column_aggregation" string.
res.columns = list(map('_'.join, res.columns.to_flat_index()))
res

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_size,CANCELLED_sum,CANCELLED_mean,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,31,0,0.000000,96.387097,45.778495
ATL,ABQ,16,0,0.000000,170.500000,87.866667
ATL,ABY,19,0,0.000000,28.578947,6.590643
ATL,ACY,6,0,0.000000,91.333333,11.466667
ATL,AEX,40,0,0.000000,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,122,4,0.032787,64.059322,11.338331
SFO,STL,20,0,0.000000,198.900000,101.042105
SFO,SUN,10,0,0.000000,78.000000,25.777778
SFO,TUS,20,0,0.000000,100.200000,35.221053


In [27]:
def flatten_cols(df):
    """Return a copy of ``df`` with multi-level column labels joined by ``_``.

    Each column label tuple such as ``('DIST', 'sum')`` becomes the single
    string ``'DIST_sum'``. Level values are converted with ``str`` first, so
    non-string labels (e.g. integers) are supported.

    The input frame is never modified. This keeps the helper safe to use in
    ``.pipe`` chains on a named frame and makes the calling cell re-runnable.
    If ``df`` already has single-level columns it is returned (as a copy)
    with its labels unchanged, because joining a plain string label would
    interleave ``_`` between its characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be flattened.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and index, and flat string columns.
    """
    flat = df.copy()
    if not isinstance(df.columns, pd.MultiIndex):
        # Nothing to flatten; leave the existing labels alone.
        return flat
    flat.columns = [
        '_'.join(map(str, label)) for label in df.columns.to_flat_index()
    ]
    return flat

In [15]:
# Per-route aggregation with the two-level column labels flattened by
# `flatten_cols` (called directly on the aggregated frame).
flatten_cols(
    flights
    .groupby(['ORG_AIR', 'DEST_AIR'])
    .agg({
        'CANCELLED': ['size', 'sum', 'mean'],
        'AIR_TIME': ['mean', 'var'],
    })
)

Unnamed: 0_level_0,Unnamed: 1_level_0,CANCELLED_size,CANCELLED_sum,CANCELLED_mean,AIR_TIME_mean,AIR_TIME_var
ORG_AIR,DEST_AIR,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ATL,ABE,31,0,0.000000,96.387097,45.778495
ATL,ABQ,16,0,0.000000,170.500000,87.866667
ATL,ABY,19,0,0.000000,28.578947,6.590643
ATL,ACY,6,0,0.000000,91.333333,11.466667
ATL,AEX,40,0,0.000000,78.725000,47.332692
...,...,...,...,...,...,...
SFO,SNA,122,4,0.032787,64.059322,11.338331
SFO,STL,20,0,0.000000,198.900000,101.042105
SFO,SUN,10,0,0.000000,78.000000,25.777778
SFO,TUS,20,0,0.000000,100.200000,35.221053


In [28]:
# Distance totals/means and arrival-delay extremes per (airline, weekday);
# every aggregate is cast to integer afterwards.
air_info = (
    flights
    .groupby(['AIRLINE', 'WEEKDAY'])
    .agg({
        'DIST': ['sum', 'mean'],
        'ARR_DELAY': ['min', 'max'],
    })
    .astype('int')
)
air_info

Unnamed: 0_level_0,Unnamed: 1_level_0,DIST,DIST,ARR_DELAY,ARR_DELAY
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max
AIRLINE,WEEKDAY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AA,1,1455386,1139,-60,551
AA,2,1358256,1107,-52,725
AA,3,1496665,1117,-45,473
AA,4,1452394,1089,-46,349
AA,5,1427749,1122,-41,732
...,...,...,...,...,...
WN,3,997213,782,-38,262
WN,4,1024854,810,-52,284
WN,5,981036,816,-44,244
WN,6,823946,834,-41,290


In [29]:
# Flatten the column labels, then move AIRLINE / WEEKDAY out of the index
# into ordinary columns.
flatten_cols(air_info).reset_index()

Unnamed: 0,AIRLINE,WEEKDAY,DIST_sum,DIST_mean,ARR_DELAY_min,ARR_DELAY_max
0,AA,1,1455386,1139,-60,551
1,AA,2,1358256,1107,-52,725
2,AA,3,1496665,1117,-45,473
3,AA,4,1452394,1089,-46,349
4,AA,5,1427749,1122,-41,732
...,...,...,...,...,...,...
93,WN,3,997213,782,-38,262
94,WN,4,1024854,810,-52,284
95,WN,5,981036,816,-44,244
96,WN,6,823946,834,-41,290


In [30]:
# Load the college dataset from the same data folder (`p1`).
college = pd.read_csv(p1.joinpath('college.csv'))
college.head()

Unnamed: 0,INSTNM,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama A & M University,Normal,AL,1.0,0.0,0.0,0,424.0,420.0,0.0,...,0.0,0.0059,0.0138,0.0656,1,0.7356,0.8284,0.1049,30300,33888.0
1,University of Alabama at Birmingham,Birmingham,AL,0.0,0.0,0.0,0,570.0,565.0,0.0,...,0.0368,0.0179,0.01,0.2607,1,0.346,0.5214,0.2422,39700,21941.5
2,Amridge University,Montgomery,AL,0.0,0.0,0.0,1,,,1.0,...,0.0,0.0,0.2715,0.4536,1,0.6801,0.7795,0.854,40100,23370.0
3,University of Alabama in Huntsville,Huntsville,AL,0.0,0.0,0.0,0,595.0,590.0,0.0,...,0.0172,0.0332,0.035,0.2146,1,0.3072,0.4596,0.264,45500,24097.0
4,Alabama State University,Montgomery,AL,1.0,0.0,0.0,0,425.0,430.0,0.0,...,0.0098,0.0243,0.0137,0.0892,1,0.7347,0.7554,0.127,26600,33118.5


In [34]:
def max_dev(ser):
    """Return the largest absolute standard score found in ``ser``.

    Every value is centred on the series mean and scaled by the series
    standard deviation (``ser.std()``); the maximum absolute result is
    returned.
    """
    return ser.sub(ser.mean()).div(ser.std()).abs().max()


# Override the function's reported name.
max_dev.__name__ = 'Max Deviation'

In [36]:
# Largest absolute standard score of UGDS within each state,
# rounded to one decimal; only the first rows are shown.
college.groupby('STABBR')['UGDS'].agg(max_dev).round(1).head()

STABBR
AK    2.6
AL    5.8
AR    6.3
AS    NaN
AZ    9.9
Name: UGDS, dtype: float64

In [39]:
def pct_btw(ser, min, max):
    """Return the percentage of entries in ``ser`` lying between ``min`` and ``max``.

    Membership is decided by ``Series.between(min, max)``; the mean of the
    resulting boolean mask is scaled to a 0-100 percentage.
    """
    in_range = ser.between(min, max)
    share = in_range.mean()
    return share * 100

In [40]:
# Percentage of UGDS values between 1000 and 3000 for each
# (state, religious-affiliation flag) group, rounded to one decimal.
# The extra positional arguments to `agg` are forwarded to `pct_btw`.
college.groupby(['STABBR', 'RELAFFIL'])['UGDS'].agg(pct_btw, 1000, 3000).round(1)

STABBR  RELAFFIL
AK      0           14.3
        1            0.0
AL      0           23.6
        1           33.3
AR      0           27.9
                    ... 
WI      0           13.8
        1           36.0
WV      0           24.6
        1           37.5
WY      0           54.5
Name: UGDS, Length: 112, dtype: float64