In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

# Lesson 11: Pivots and Joins #

## Cross-classification: grouping by two columns

In [2]:
# Load the skyscraper data and replace the 'completed' year with an 'age'
# column measured against a fixed reference year.
REFERENCE_YEAR = 2021

sky_raw = Table.read_table('data/skyscrapers_v2.csv')
building_age = REFERENCE_YEAR - sky_raw.column('completed')
sky = sky_raw.with_column('age', building_age).drop('completed')
sky.show(3)

name,material,city,height,age
One World Trade Center,mixed/composite,New York City,541.3,7
Willis Tower,steel,Chicago,442.14,47
432 Park Avenue,concrete,New York City,425.5,6


In [3]:
# Count the buildings in every (material, city) pair, then list the ten
# largest groups first.
counts_by_material_city = sky.group(['material', 'city'])
counts_by_material_city.sort('count', descending=True).show(10)

material,city,count
steel,New York City,263
concrete,New York City,249
concrete,Chicago,244
steel,Chicago,79
concrete,Miami,78
steel,San Francisco,48
concrete,Honolulu,47
steel,Los Angeles,37
mixed/composite,Chicago,35
concrete,Las Vegas,33


In [4]:
# Average the remaining columns within each (material, city) pair.
# The 'name average' column is blank in the displayed result: 'name' holds
# text, so there is nothing meaningful to average there.
avg_by_material_city = sky.group(['material', 'city'], collect=np.average)
avg_by_material_city.show(10)

material,city,name average,height average,age average
concrete,Atlanta,,148.775,18.963
concrete,Austin,,127.714,14.5
concrete,Baltimore,,112.833,29.375
concrete,Boston,,105.765,38.75
concrete,Charlotte,,161.348,17.8
concrete,Chicago,,128.285,31.082
concrete,Cincinnati,,102.808,39.6
concrete,Cleveland,,116.755,39.5
concrete,Columbus,,77.825,50.75
concrete,Dallas,,125.19,18.8571


In [5]:
# Group on the (city, material) pair and average the remaining columns.
# Listing 'city' first makes it the leading column, so the rows come out
# ordered by city and then by material.
avg_by_city_material = sky.group(['city', 'material'], collect=np.average)
avg_by_city_material.show(10)

city,material,name average,height average,age average
Atlanta,concrete,,148.775,18.963
Atlanta,mixed/composite,,229.24,26.0
Atlanta,steel,,112.414,62.4286
Austin,concrete,,127.714,14.5
Austin,steel,,93.6,84.0
Baltimore,concrete,,112.833,29.375
Baltimore,steel,,110.853,62.5
Boston,concrete,,105.765,38.75
Boston,mixed/composite,,121.6,19.5
Boston,steel,,143.405,43.0769


In [6]:
# The 'material' column (position 1 in `sky`) as an array of labels.
sky.column('material')

array(['mixed/composite', 'steel', 'concrete', ..., 'concrete', 'concrete',
       'steel'],
      dtype='<U15')

In [7]:
# Distinct material labels in sorted order; these are the values that become
# column headers when pivoting on 'material'.
np.unique(sky.column('material'))

array(['concrete', 'mixed/composite', 'steel'],
      dtype='<U15')

In [8]:
# Cross-tabulate: one row per city, one column per material, each cell holding
# the number of buildings in that combination.
sky.pivot(columns='material', rows='city').show(10)

city,concrete,mixed/composite,steel
Atlanta,27,4,7
Austin,16,0,1
Baltimore,8,0,6
Boston,4,2,26
Charlotte,5,3,4
Chicago,244,35,79
Cincinnati,5,1,7
Cleveland,2,3,11
Columbus,4,0,9
Dallas,7,2,11


In [9]:
# Cross-tabulate the other way round: one row per material and one (very wide)
# column per city, each cell holding a building count.
sky.pivot(columns='city', rows='material').show(10)

material,Atlanta,Austin,Baltimore,Boston,Charlotte,Chicago,Cincinnati,Cleveland,Columbus,Dallas,Denver,Detroit,Honolulu,Houston,Indianapolis,Jersey City,Kansas City,Las Vegas,Los Angeles,Miami,Miami Beach,Milwaukee,Minneapolis,New York City,Philadelphia,Phoenix,Pittsburgh,Portland,Sacramento,Salt Lake City,San Diego,San Francisco,Seattle,St. Louis,Sunny Isles Beach
concrete,27,16,8,4,5,244,5,2,4,7,5,6,47,27,9,14,3,33,9,78,19,9,18,249,14,7,2,4,3,6,11,10,14,6,22
mixed/composite,4,0,0,2,3,35,1,3,0,2,2,0,0,7,0,0,0,6,3,4,0,0,9,19,2,4,2,1,1,0,1,3,9,1,0
steel,7,1,6,26,4,79,7,11,9,11,4,14,3,20,7,2,7,5,37,4,0,5,12,263,15,4,32,5,7,7,9,48,31,8,0


In [10]:
# Average building height for each city (rows) and material (columns).
# NOTE(review): combinations with no buildings are displayed as 0.0 (e.g.
# Austin has no mixed/composite buildings), so a 0.0 here is a placeholder,
# not a measured average.
avg_height_by_city = sky.pivot(
    columns='material',
    rows='city',
    values='height',
    collect=np.average,
)
avg_height_by_city.show(10)

city,concrete,mixed/composite,steel
Atlanta,148.775,229.24,112.414
Austin,127.714,0.0,93.6
Baltimore,112.833,0.0,110.853
Boston,105.765,121.6,143.405
Charlotte,161.348,196.01,127.013
Chicago,128.285,160.217,130.985
Cincinnati,102.808,202.69,120.991
Cleveland,116.755,166.363,136.856
Columbus,77.825,0.0,120.902
Dallas,125.19,260.3,178.783


In [11]:
# Height of the tallest building for each city (rows) and material (columns).
# Saved as `sky_p` because later cells add to it and sort it.
sky_p = sky.pivot(
    columns='material',
    rows='city',
    values='height',
    collect=max,
)
sky_p.show(10)

city,concrete,mixed/composite,steel
Atlanta,264.25,311.8,169.47
Austin,208.15,0.0,93.6
Baltimore,161.24,0.0,155.15
Boston,121.92,139.0,240.79
Charlotte,265.48,239.7,179.23
Chicago,423.22,306.94,442.14
Cincinnati,125.0,202.69,175.0
Cleveland,125.0,288.65,215.8
Columbus,79.25,0.0,169.3
Dallas,176.48,280.72,270.06


In [12]:
# Absolute gap between each city's tallest steel and tallest concrete building.
# NOTE(review): cities with no buildings of a material carry a 0.0 placeholder
# in `sky_p`, so their 'difference' is just the other material's height rather
# than a real comparison -- consider filtering those rows before ranking.
tallest_steel = sky_p.column('steel')
tallest_concrete = sky_p.column('concrete')
sky_p = sky_p.with_column('difference', abs(tallest_steel - tallest_concrete))
sky_p

city,concrete,mixed/composite,steel,difference
Atlanta,264.25,311.8,169.47,94.78
Austin,208.15,0.0,93.6,114.55
Baltimore,161.24,0.0,155.15,6.09001
Boston,121.92,139.0,240.79,118.87
Charlotte,265.48,239.7,179.23,86.25
Chicago,423.22,306.94,442.14,18.92
Cincinnati,125.0,202.69,175.0,50.0
Cleveland,125.0,288.65,215.8,90.8
Columbus,79.25,0.0,169.3,90.05
Dallas,176.48,280.72,270.06,93.58


In [13]:
# Rank cities from the largest steel-vs-concrete height gap to the smallest.
sky_p.sort('difference', descending=True)

city,concrete,mixed/composite,steel,difference
Sunny Isles Beach,196.0,0.0,0.0,196.0
Las Vegas,350.22,195.68,164.6,185.62
Miami Beach,170.39,0.0,0.0,170.39
Pittsburgh,89.3,172.0,256.34,167.04
Los Angeles,145.7,118.26,310.29,164.59
Philadelphia,157.89,296.73,288.04,130.15
Boston,121.92,139.0,240.79,118.87
Austin,208.15,0.0,93.6,114.55
Seattle,138.69,284.38,235.31,96.62
Atlanta,264.25,311.8,169.47,94.78


## Joins ##

In [14]:
# Menu items as (drink, cafe, price) rows.
menu_rows = [
    ['Chai Tea', 'Joe van Gogh', 4],
    ['Espresso', 'Monuts', 2],
    ['Latte', 'Monuts', 3],
    ['Espresso', 'Cloche', 2],
]

# Start from an empty three-column table, then append one row per menu item.
drinks = (
    Table()
    .with_columns('Drinks', make_array(), 'Cafe', make_array(), 'Price', make_array())
    .with_rows(menu_rows)
)
drinks

Drinks,Cafe,Price
Chai Tea,Joe van Gogh,4
Espresso,Monuts,2
Latte,Monuts,3
Espresso,Cloche,2


In [15]:
# Coupons on offer: a value per coupon and the cafe where it applies.
# A cafe may appear more than once (Joe van Gogh has two coupons).
coupon_values = make_array(10, 20, 5)
coupon_locations = make_array('Joe van Gogh', 'Monuts', 'Joe van Gogh')

discounts = Table().with_columns(
    'Coupon', coupon_values,
    'Location', coupon_locations,
)
discounts

Coupon,Location
10,Joe van Gogh
20,Monuts
5,Joe van Gogh


In [19]:
# Match each drink to the coupons valid at its cafe: drinks' 'Cafe' column is
# matched against discounts' 'Location' column.
# In the result, a drink is repeated once per matching coupon, and drinks whose
# cafe has no coupon (Cloche) do not appear.
combined = drinks.join('Cafe', discounts, other_label='Location')
combined

Cafe,Drinks,Price,Coupon
Joe van Gogh,Chai Tea,4,10
Joe van Gogh,Chai Tea,4,5
Monuts,Espresso,2,20
Monuts,Latte,3,20


In [20]:
# Treat each coupon as a percentage off: the fraction of the price still paid
# is 1 - coupon/100.
coupon_percent = combined.column('Coupon')
discounted_frac = 1 - coupon_percent / 100

# Display the joined table with the price after the coupon is applied.
price_after_coupon = combined.column('Price') * discounted_frac
combined.with_column('Discounted Price', price_after_coupon)

Cafe,Drinks,Price,Coupon,Discounted Price
Joe van Gogh,Chai Tea,4,10,3.6
Joe van Gogh,Chai Tea,4,5,3.8
Monuts,Espresso,2,20,1.6
Monuts,Latte,3,20,2.4


In [21]:
# Self-join on 'Cafe': every drink is paired with every drink sold at the same
# cafe (including itself). Columns from the second copy get a `_2` suffix.
drinks.join('Cafe', drinks, other_label='Cafe')

Cafe,Drinks,Price,Drinks_2,Price_2
Cloche,Espresso,2,Espresso,2
Joe van Gogh,Chai Tea,4,Chai Tea,4
Monuts,Espresso,2,Espresso,2
Monuts,Espresso,2,Latte,3
Monuts,Latte,3,Espresso,2
Monuts,Latte,3,Latte,3
