# SQL Refresher

In [7]:
import math
import os

import numpy as np
import pandas as pd
import psycopg2

In [8]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas converts all numeric values from postgres to float
# if every value in a column will fit in an integer, change the column to integer
#

def my_select_query_pandas(query, rollback_before_flag=True, rollback_after_flag=True):
    """Run a select query and return the rows in a pandas dataframe.

    The query is executed on the notebook-level ``connection`` object, which
    must already exist when this function is called.

    Float columns whose non-null values are all whole numbers are converted to
    the nullable ``Int64`` dtype, so missing values are preserved while the
    remaining values display as integers.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.
    rollback_before_flag : bool, default True
        When true, ``connection.rollback()`` is called before the query runs.
    rollback_after_flag : bool, default True
        When true, ``connection.rollback()`` is called after the query runs,
        including when the query raises.

    Returns
    -------
    pandas.DataFrame
        The query result with whole-number float columns converted to ``Int64``.

    Raises
    ------
    Exception
        Whatever ``pd.read_sql_query`` or ``connection.rollback()`` raises is
        propagated unchanged.
    """

    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # run the requested rollback even when the query fails, so the
        # connection is not left inside a failed transaction
        if rollback_after_flag:
            connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        present = df[column].dropna().to_numpy()

        # Convert only when every non-null value is finite, has no fractional
        # part, and is small enough to be held in a 64-bit integer.  Infinite
        # values (which made math.floor raise) and huge magnitudes stay float.
        whole_numbers = (
            np.isfinite(present).all()
            and (present == np.floor(present)).all()
            and (np.abs(present) < 2.0 ** 63).all()
        )

        if whole_numbers:
            df[column] = df[column].astype('Int64')

    return df
    

In [9]:
# Open the notebook-wide PostgreSQL connection used by my_select_query_pandas.
# Each setting falls back to the lab value but can be overridden with the
# standard libpq environment variable, so real credentials never need to be
# written into the notebook.
connection = psycopg2.connect(
    user = os.environ.get("PGUSER", "postgres"),
    password = os.environ.get("PGPASSWORD", "ucb"),
    host = os.environ.get("PGHOST", "postgres"),
    port = os.environ.get("PGPORT", "5432"),
    database = os.environ.get("PGDATABASE", "postgres")
)

In [10]:
# Create a cursor on the shared connection.
# NOTE(review): no cell shown here uses it (the select helper passes the
# connection itself to pandas); presumably it is for the later
# INSERT / UPDATE / DELETE labs - confirm.
cursor = connection.cursor()

# Lab: SQL - SELECT Clauses

In [12]:
# select * returns every column of every row in the states table
rollback_before_flag = True
rollback_after_flag = True

query = """

select * 
from states

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,state,state_name,latitude,longitude,population,area,density
0,PA,Pennsylvania,40.9479,-77.628,12790950,43851.7864,291.69
1,OH,Ohio,40.1758,-82.6609,11639989,40511.7523,287.32
2,TN,Tennessee,35.8113,-85.9375,6644470,41397.0517,160.51
3,MS,Mississippi,32.6082,-89.7987,2988710,45241.4545,66.06
4,CT,Connecticut,41.5278,-72.73,3581504,4791.4988,747.47
5,NM,New Mexico,34.3614,-106.0363,2089364,93194.3046,22.42
6,WV,West Virginia,38.919,-80.1741,1828786,23062.9882,79.3
7,KS,Kansas,38.4942,-98.3132,2908448,80002.7898,36.35
8,HI,Hawaii,20.637,-157.5532,1422019,5832.0637,243.83
9,IA,Iowa,41.9628,-93.3776,3133061,55067.824,56.89


In [13]:
# name the columns explicitly to return only state, state_name and population
rollback_before_flag = True
rollback_after_flag = True

query = """

select state, state_name, population
from states

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,state,state_name,population
0,PA,Pennsylvania,12790950
1,OH,Ohio,11639989
2,TN,Tennessee,6644470
3,MS,Mississippi,2988710
4,CT,Connecticut,3581504
5,NM,New Mexico,2089364
6,WV,West Virginia,1828786
7,KS,Kansas,2908448
8,HI,Hawaii,1422019
9,IA,Iowa,3133061


In [39]:
# derived column: area divided by 0.386102
# no alias is given, so the derived column is returned with the name ?column?
rollback_before_flag = True
rollback_after_flag = True

query = """

select  state, population, area, area / 0.386102
from states

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,state,population,area,?column?
0,PA,12790950,43851.7864,113575.65203
1,OH,11639989,40511.7523,104924.999871
2,TN,6644470,41397.0517,107217.915732
3,MS,2988710,45241.4545,117174.877364
4,CT,3581504,4791.4988,12409.930018
5,NM,2089364,93194.3046,241372.239978
6,WV,1828786,23062.9882,59732.889755
7,KS,2908448,80002.7898,207206.359459
8,HI,1422019,5832.0637,15104.981844
9,IA,3133061,55067.824,142625.067987


In [44]:
# build a "(latitude,longitude)" string with the || concatenation operator
# and use "as" to give the derived columns meaningful names
rollback_before_flag = True
rollback_after_flag = True

query = """

select  state,
        '(' || latitude || ',' || longitude || ')' as lat_lon_point,
        population, 
        area as area_square_miles, 
        (area / 0.386102) as area_square_km
from states

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,state,lat_lon_point,population,area_square_miles,area_square_km
0,PA,"(40.9479,-77.6280)",12790950,43851.7864,113575.65203
1,OH,"(40.1758,-82.6609)",11639989,40511.7523,104924.999871
2,TN,"(35.8113,-85.9375)",6644470,41397.0517,107217.915732
3,MS,"(32.6082,-89.7987)",2988710,45241.4545,117174.877364
4,CT,"(41.5278,-72.7300)",3581504,4791.4988,12409.930018
5,NM,"(34.3614,-106.0363)",2089364,93194.3046,241372.239978
6,WV,"(38.9190,-80.1741)",1828786,23062.9882,59732.889755
7,KS,"(38.4942,-98.3132)",2908448,80002.7898,207206.359459
8,HI,"(20.6370,-157.5532)",1422019,5832.0637,15104.981844
9,IA,"(41.9628,-93.3776)",3133061,55067.824,142625.067987


## You try it - select all the columns and all the rows from the zip_codes table

## You try it - select the following columns from the zip_codes table: zip, city, state, population, add a derived column for a 30% increase in population, and give it a meaningful alias

# Lab: SQL - ORDER BY Clauses

In [54]:
# order by with no direction keyword; the rows come back smallest area first
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by area

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Sand Creek,WI,2,0.0010,2000.00
1,Wilton,AL,5,0.0032,1562.50
2,Eminence,IN,5,0.0033,1515.15
3,Chandlers Valley,PA,3,0.0033,909.09
4,Scuddy,KY,5,0.0055,909.09
...,...,...,...,...,...
27415,Kodiak,AK,12971,4173.4409,3.11
27416,Kotzebue,AK,3370,4337.2125,0.78
27417,Ketchikan,AK,13818,4850.1431,2.85
27418,Fairbanks,AK,63789,8241.7460,7.74


In [55]:
# explicit ascending sort on area (smallest area first)
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by area asc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Sand Creek,WI,2,0.0010,2000.00
1,Wilton,AL,5,0.0032,1562.50
2,Eminence,IN,5,0.0033,1515.15
3,Chandlers Valley,PA,3,0.0033,909.09
4,Scuddy,KY,5,0.0055,909.09
...,...,...,...,...,...
27415,Kodiak,AK,12971,4173.4409,3.11
27416,Kotzebue,AK,3370,4337.2125,0.78
27417,Ketchikan,AK,13818,4850.1431,2.85
27418,Fairbanks,AK,63789,8241.7460,7.74


In [56]:
# descending sort on area (largest area first)
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by area desc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Dillingham,AK,2494,9629.3839,0.26
1,Fairbanks,AK,63789,8241.7460,7.74
2,Ketchikan,AK,13818,4850.1431,2.85
3,Kotzebue,AK,3370,4337.2125,0.78
4,Kodiak,AK,12971,4173.4409,3.11
...,...,...,...,...,...
27415,Scuddy,KY,5,0.0055,909.09
27416,Eminence,IN,5,0.0033,1515.15
27417,Chandlers Valley,PA,3,0.0033,909.09
27418,Wilton,AL,5,0.0032,1562.50


In [57]:
# order by column position: 4 is the fourth item in the select list (area)
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by 4

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Sand Creek,WI,2,0.0010,2000.00
1,Wilton,AL,5,0.0032,1562.50
2,Eminence,IN,5,0.0033,1515.15
3,Chandlers Valley,PA,3,0.0033,909.09
4,Scuddy,KY,5,0.0055,909.09
...,...,...,...,...,...
27415,Kodiak,AK,12971,4173.4409,3.11
27416,Kotzebue,AK,3370,4337.2125,0.78
27417,Ketchikan,AK,13818,4850.1431,2.85
27418,Fairbanks,AK,63789,8241.7460,7.74


In [58]:
# order by column position 4 (area), descending
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by 4 desc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Dillingham,AK,2494,9629.3839,0.26
1,Fairbanks,AK,63789,8241.7460,7.74
2,Ketchikan,AK,13818,4850.1431,2.85
3,Kotzebue,AK,3370,4337.2125,0.78
4,Kodiak,AK,12971,4173.4409,3.11
...,...,...,...,...,...
27415,Scuddy,KY,5,0.0055,909.09
27416,Eminence,IN,5,0.0033,1515.15
27417,Chandlers Valley,PA,3,0.0033,909.09
27418,Wilton,AL,5,0.0032,1562.50


In [59]:
# sort on two columns: by state first, then by city within each state
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by state, city

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Adak,AK,203,8.5194,23.83
1,Akiachak,AK,567,1.6435,345.00
2,Akiak,AK,416,1.0816,384.62
3,Akutan,AK,758,10.2331,74.07
4,Alakanuk,AK,837,27.6211,30.30
...,...,...,...,...,...
27415,Wolf,WY,18,9.9283,1.81
27416,Worland,WY,7585,574.2321,13.21
27417,Wright,WY,1823,91.4109,19.94
27418,Wyarno,WY,29,55.9848,0.52


In [60]:
# mixed directions: state descending, city ascending within each state
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by state desc, city

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Afton,WY,4587,156.7301,29.27
1,Aladdin,WY,270,347.4918,0.78
2,Albin,WY,223,95.6675,2.33
3,Alcova,WY,246,949.8109,0.26
4,Alpine,WY,940,259.2399,3.63
...,...,...,...,...,...
27415,White Mountain,AK,186,0.6840,271.93
27416,Whittier,AK,313,151.0624,2.07
27417,Willow,AK,2624,1447.3309,1.81
27418,Wrangell,AK,2484,2397.6934,1.04


In [61]:
# order by + limit: the 10 cities with the smallest area
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by area
limit 10

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Sand Creek,WI,2,0.001,2000.0
1,Wilton,AL,5,0.0032,1562.5
2,Eminence,IN,5,0.0033,1515.15
3,Chandlers Valley,PA,3,0.0033,909.09
4,Scuddy,KY,5,0.0055,909.09
5,West Newton,IN,15,0.0057,2631.58
6,Pricedale,PA,9,0.0061,1475.41
7,King,WI,96,0.0062,15483.87
8,Ira,IA,17,0.0064,2656.25
9,Ono,PA,93,0.0073,12739.73


In [62]:
# order by desc + limit: the 10 cities with the largest area
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population, area, density
from cities
order by area desc
limit 10

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population,area,density
0,Dillingham,AK,2494,9629.3839,0.26
1,Fairbanks,AK,63789,8241.746,7.74
2,Ketchikan,AK,13818,4850.1431,2.85
3,Kotzebue,AK,3370,4337.2125,0.78
4,Kodiak,AK,12971,4173.4409,3.11
5,Healy,AK,1041,4019.3218,0.26
6,Winnemucca,NV,16189,3906.6283,4.14
7,Roswell,NM,57285,3830.6619,14.95
8,Anchorage,AK,243654,3806.352,64.01
9,Delta Junction,AK,4821,3722.7955,1.29


## You try it - using the zip_codes table, find the top 5 zip codes with the highest density, display the zip, city, state, and density

# Lab: SQL - WHERE Clauses

In [64]:
# where with a single equality condition: only rows whose state is 'CA'
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, population
from cities
where state = 'CA'
order by city

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,population
0,Acampo,7538
1,Acton,7626
2,Adelanto,33967
3,Adin,380
4,Agoura Hills,25631
...,...,...
1158,Yuba City,77646
1159,Yucaipa,54163
1160,Yucca Valley,26085
1161,Zamora,179


In [65]:
# two conditions joined with and: state is 'CA' and density is at least 1000
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, population
from cities
where state = 'CA' and density >= 1000
order by city

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,population
0,Alameda,78462
1,Alamo,15646
2,Albany,21040
3,Alhambra,84864
4,Aliso Viejo,52124
...,...,...
331,Woodbridge,3881
332,Woodland Hills,72622
333,Yolo,420
334,Yorba Linda,70739


In [69]:
# two parenthesized "and" groups combined with or:
# a different density threshold is applied to each state
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population
from cities
where (state = 'CA' and density >= 5000)
      or
      (state = 'WY' and density >= 100)
order by state, city

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population
0,Alameda,CA,78462
1,Albany,CA,21040
2,Alhambra,CA,84864
3,Aliso Viejo,CA,52124
4,Anaheim,CA,368626
...,...,...,...
130,Winnetka,CA,49158
131,Diamondville,WY,672
132,Fe Warren Afb,WY,574
133,Mills,WY,1722


In [76]:
# in (...) matches any state in the list; density must also be above 3000
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city, state, population
from cities
where state in ('CA', 'NY', 'TN')
      and density > 3000
order by state, city

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,population
0,Alameda,CA,78462
1,Albany,CA,21040
2,Alhambra,CA,84864
3,Aliso Viejo,CA,52124
4,Altadena,CA,37711
...,...,...,...
354,Woodside,NY,86421
355,Wyandanch,NY,15033
356,Yonkers,NY,180532
357,Collegedale,TN,1566


## You try it - using the zip_codes table, find zip codes that are in California, Arizona, or Washington state with a population > 10,000, display the zip, city, state, and population with the lowest population first

# Lab: SQL - Aggregation, GROUP BY Clauses

In [94]:
# count(*) with no group by returns one row: the number of rows in zip_codes
rollback_before_flag = True
rollback_after_flag = True

query = """

select  count(*)
from zip_codes

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,count
0,32723


In [98]:
# several aggregate functions over the whole zip_codes table, each with an alias
rollback_before_flag = True
rollback_after_flag = True

query = """

select  count(*) as total_zips,
        sum(population) as total_population, 
        min(population) as min_population, 
        max(population) as max_population, 
        avg(population) as average_population, 
        stddev(population) as standard_deviation_population
from zip_codes

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,total_zips,total_population,min_population,max_population,average_population,standard_deviation_population
0,32723,326246880,1,122814,9969.9563,14663.013029


In [99]:
# same aggregates, computed only over the rows kept by the where clause
rollback_before_flag = True
rollback_after_flag = True

query = """

select  count(*) as total_zips,
        sum(population) as total_population, 
        min(population) as min_population, 
        max(population) as max_population, 
        avg(population) as average_population, 
        stddev(population) as standard_deviation_population
from zip_codes
where state in ('CA', 'WA', 'OR')

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,total_zips,total_population,min_population,max_population,average_population,standard_deviation_population
0,2735,50501106,1,109414,18464.755393,20757.137677


In [100]:
# group by state: one row of aggregates per state,
# sorted by the total_population alias, largest first
rollback_before_flag = True
rollback_after_flag = True

query = """

select  state,
        count(*) as total_zips,
        sum(population) as total_population, 
        min(population) as min_population, 
        max(population) as max_population, 
        avg(population) as average_population, 
        stddev(population) as standard_deviation_population
from zip_codes
group by state
order by total_population desc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,state,total_zips,total_population,min_population,max_population,average_population,standard_deviation_population
0,CA,1737,39140087,5,109414,22533.153138,22536.927071
1,TX,1911,27883996,6,122814,14591.311355,18664.45761
2,FL,975,20596370,5,75666,21124.482051,16196.552582
3,NY,1755,19618280,5,112425,11178.507123,17898.792718
4,IL,1382,12821487,7,111850,9277.486975,15188.901294
5,PA,1781,12790950,3,74971,7181.892195,10903.551494
6,OH,1188,11639989,13,71451,9797.970539,12692.097701
7,GA,726,10297534,28,90675,14183.931129,16972.506942
8,NC,802,10155624,10,78475,12662.872818,15027.721576
9,MI,976,9957465,6,68102,10202.320697,12114.154991


In [101]:
# where filters rows before grouping: states starting with T or A
# (like 'T%', like 'A%') plus the listed states CA, WA and OR
rollback_before_flag = True
rollback_after_flag = True

query = """

select  state,
        count(*) as total_rows,
        sum(population) as total_population, 
        min(population) as min_population, 
        max(population) as max_population, 
        avg(population) as average_population, 
        stddev(population) as standard_deviation_population
from zip_codes
where state like 'T%' or state like 'A%' or state in ('CA', 'WA', 'OR')
group by state
order by total_population desc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,state,total_rows,total_population,min_population,max_population,average_population,standard_deviation_population
0,CA,1737,39140087,5,109414,22533.153138,22536.927071
1,TX,1911,27883996,6,122814,14591.311355,18664.45761
2,WA,586,7279972,1,78767,12423.16041,14902.72942
3,AZ,394,6948967,3,74382,17636.972081,18577.49471
4,TN,622,6644470,17,97819,10682.427653,13837.574991
5,AL,637,4864630,5,55122,7636.78179,9106.072447
6,OR,412,4081047,6,72831,9905.453883,14463.935208
7,AR,587,2990472,6,58553,5094.500852,9099.974932
8,AK,212,732669,4,62135,3455.985849,8311.533944


In [103]:
# group by two columns (city, state); like 'B%y' keeps cities that
# start with B and end with y
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city,
        state,
        count(*) as total_zips,
        sum(population) as total_population, 
        min(population) as min_population, 
        max(population) as max_population, 
        avg(population) as average_population, 
        stddev(population) as standard_deviation_population
from zip_codes
where state in ('CA', 'WA', 'OR') and city like 'B%y'
group by city, state
order by total_population desc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,total_zips,total_population,min_population,max_population,average_population,standard_deviation_population
0,Berkeley,CA,9,127127,2971,29190,14125.222222,7777.207689
1,Brawley,CA,1,27255,27255,27255,27255.0,
2,Buckley,WA,1,15788,15788,15788,15788.0,
3,Baker City,OR,1,12433,12433,12433,12433.0,
4,Big Bear City,CA,1,11361,11361,11361,11361.0,
5,Benton City,WA,1,9644,9644,9644,9644.0,
6,Burney,CA,1,4810,4810,4810,4810.0,
7,Browns Valley,CA,1,2643,2643,2643,2643.0,
8,Bay City,OR,1,1724,1724,1724,1724.0,
9,Bradley,CA,1,1604,1604,1604,1604.0,


## You try it - using the zip_codes table, find all cities, their state, and the number of zip codes in that city, in which the city starts with a C

# Lab: SQL - HAVING Clauses, Pre- vs. Post-Aggregation Filtering

In [114]:
# where filters rows before aggregation (city starts with B);
# having filters the groups after aggregation (total population above 10000)
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city,
        state,
        count(*) as total_zips,
        sum(population) as total_population, 
        min(population) as min_population, 
        max(population) as max_population, 
        avg(population) as average_population, 
        stddev(population) as standard_deviation_population
from zip_codes
where city like 'B%'
group by city, state
having sum(population) > 10000
order by total_population desc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,total_zips,total_population,min_population,max_population,average_population,standard_deviation_population
0,Brooklyn,NY,37,2600747,13244,102624,70290.459459,22795.834681
1,Bronx,NY,25,1441455,4434,103732,57658.200000,25879.753776
2,Baltimore,MD,22,630874,933,58448,28676.090909,15749.242554
3,Buffalo,NY,28,562780,1666,54081,20099.285714,11867.085913
4,Bakersfield,CA,11,559247,12028,84809,50840.636364,20025.636750
...,...,...,...,...,...,...,...,...
406,Brooklyn,MI,1,10259,10259,10259,10259.000000,
407,Bradley,IL,1,10121,10121,10121,10121.000000,
408,Breckenridge,CO,1,10120,10120,10120,10120.000000,
409,Belfair,WA,1,10056,10056,10056,10056.000000,


In [115]:
# having on a row count: keep only the city/state groups with more than
# 10 zip codes, sorted by number of zip codes, largest first
rollback_before_flag = True
rollback_after_flag = True

query = """

select  city,
        state,
        count(*) as total_zips,
        sum(population) as total_population, 
        min(population) as min_population, 
        max(population) as max_population, 
        avg(population) as average_population, 
        stddev(population) as standard_deviation_population
from zip_codes
where city like 'B%'
group by city, state
having count(*) > 10
order by total_zips desc

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,city,state,total_zips,total_population,min_population,max_population,average_population,standard_deviation_population
0,Brooklyn,NY,37,2600747,13244,102624,70290.459459,22795.834681
1,Birmingham,AL,30,500176,1219,55122,16672.533333,13647.101783
2,Buffalo,NY,28,562780,1666,54081,20099.285714,11867.085913
3,Bronx,NY,25,1441455,4434,103732,57658.2,25879.753776
4,Baltimore,MD,22,630874,933,58448,28676.090909,15749.242554
5,Baton Rouge,LA,18,378971,85,42094,21053.944444,12904.442566
6,Boston,MA,13,151778,1149,29108,11675.230769,10791.746531
7,Bakersfield,CA,11,559247,12028,84809,50840.636364,20025.63675


## You try it - using the zip_codes table, find the cities and their state, that start with C, that have more than 5 zip codes, and have a minimum population < 10000

# Lab: SQL - Set Operations

# Lab: SQL - Join Operations

# Lab: SQL - Type 1 Subqueries

# Lab: SQL - Type 2 Subqueries

# Lab: SQL - INSERT Statements

# Lab: SQL - UPDATE Statements

# Lab: SQL - DELETE Statements

# Lab: Basic Pie Charts, Grids of Subplots

# Lab: Basic Scatter Plots, Line Plots

# Lab: Basic Bar Charts, Histograms

# Lab: Basic Box Plots, Violin Plots

# Lab: Google Maps - Display Map by Type, Size, Zoom

# Lab: Google Maps - Location Markers, Location Symbols

# Lab: Google Maps - Heatmaps, Choropleths

# Lab: Google Maps - Driving Directions, Traffic Layers, Transit Layers