# SQL Refresher - Solutions

In [1]:
import math
import numpy as np
import pandas as pd
import psycopg2

In [2]:
#
# function to run a select query and return the rows in a pandas dataframe
# pandas can load whole-number postgres columns as float64 (for example, when a column contains NULLs)
# if every value in such a column is a whole number, convert the column to an integer type
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return its rows in a pandas dataframe.

    The query is executed on the module-level ``connection``.  Any float64
    column whose non-NaN values are all whole numbers that fit in a 64-bit
    integer is converted to the nullable ``Int64`` dtype, so NaN entries
    become ``<NA>`` instead of forcing the column to stay float.

    Args:
        query: SQL text passed to ``pd.read_sql_query``.
        rollback_before_flag: if true, roll back the connection before
            running the query.
        rollback_after_flag: if true, roll back the connection after
            running the query, even when the query raises.

    Returns:
        The query result as a pandas DataFrame.
    """

    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # Roll back even on failure so a bad query does not leave the
        # connection inside an open transaction.
        if rollback_after_flag:
            connection.rollback()

    # Floats at or beyond 2**63 in magnitude (including +/-inf) cannot be
    # stored in Int64, so such columns are left as float64.
    int64_limit = 2.0 ** 63

    # fix the float columns that really should be integers
    for column in df:

        if df[column].dtype != "float64":
            continue

        values = df[column].values
        present = values[~np.isnan(values)]

        in_range = np.abs(present) < int64_limit
        whole = present == np.floor(present)

        # An all-NaN column has nothing to contradict the conversion, so it
        # is converted too (``all()`` of an empty array is True).
        if in_range.all() and whole.all():
            df[column] = df[column].astype('Int64')

    return df
    

In [4]:
# Open the shared database connection that the query helper relies on.
# NOTE(review): the credentials are hard-coded here; consider reading them
# from the environment instead.
connection = psycopg2.connect(
    user="postgres",
    password="ucb",
    host="postgres",
    port="5432",
    database="postgres",
)

In [5]:
# Open a cursor on the connection.  my_select_query_pandas does not use it;
# that helper hands the connection itself to pandas.
cursor = connection.cursor()

## You try it - select all the columns and all the rows from the zip_codes table

In [6]:
# Roll back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Every column and every row of zip_codes.
query = """

select * 
from zip_codes

"""

my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
)

Unnamed: 0,zip,latitude,longitude,city,state,population,area,density,time_zone
0,08074,39.7158,-75.1640,Richwood,NJ,15,0.0886,169.39,America/New_York
1,08240,39.4873,-74.5318,Pomona,NJ,2293,1.5196,1508.93,America/New_York
2,08876,40.5880,-74.6874,Somerville,NJ,22059,15.1172,1459.20,America/New_York
3,10001,40.7506,-73.9972,New York,NY,22924,0.6675,34341.44,America/New_York
4,32026,30.0541,-82.1815,Raiford,FL,1907,0.6333,3011.38,America/New_York
...,...,...,...,...,...,...,...,...,...
32718,47367,40.0827,-85.3872,Oakville,IN,23,0.0866,265.47,America/Indiana/Indianapolis
32719,63079,38.2606,-91.0998,Stanton,MO,24,0.3523,68.12,America/Chicago
32720,63738,37.0893,-89.9574,Brownwood,MO,31,0.1171,264.70,America/Chicago
32721,68954,40.6227,-98.2374,Inland,NE,14,1.7437,8.03,America/Chicago


## You try it - select the following columns from the zip_codes table: zip, city, state, and population; then add a derived column for a 30% increase in population and give it a meaningful alias

In [8]:
# Roll back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Four base columns plus a derived column: population scaled by 1.3,
# aliased as population_plus_30_pct.
query = """

select zip, city, state, population, (population * 1.3) as population_plus_30_pct
from zip_codes

"""

my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
)

Unnamed: 0,zip,city,state,population,population_plus_30_pct
0,08074,Richwood,NJ,15,19.5
1,08240,Pomona,NJ,2293,2980.9
2,08876,Somerville,NJ,22059,28676.7
3,10001,New York,NY,22924,29801.2
4,32026,Raiford,FL,1907,2479.1
...,...,...,...,...,...
32718,47367,Oakville,IN,23,29.9
32719,63079,Stanton,MO,24,31.2
32720,63738,Brownwood,MO,31,40.3
32721,68954,Inland,NE,14,18.2


## You try it - using the zip_codes table, find the top 5 zip codes with the highest density; display the zip, city, state, and density

In [9]:
# Roll back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Sort by density, highest first, and keep only the first five rows.
query = """

select zip, city, state, density
from zip_codes
order by density desc
limit 5

"""

my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
)

Unnamed: 0,zip,city,state,density
0,20052,Washington,DC,172372.59
1,10162,New York,NY,146388.52
2,10028,New York,NY,144096.64
3,10075,New York,NY,141792.32
4,20390,Washington,DC,138322.26


## You try it - using the zip_codes table, find zip codes that are in California, Arizona, or Washington state with a population > 10,000; display the zip, city, state, and population with the lowest population first

In [11]:
# Roll back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Keep rows for CA, AZ, or WA with population above 10000, ordered by
# population ascending (lowest first).
query = """

select zip, city, state, population
from zip_codes
where state in ('CA', 'AZ', 'WA') and population > 10000
order by population

"""

my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
)

Unnamed: 0,zip,city,state,population
0,98070,Vashon,WA,10036
1,98528,Belfair,WA,10056
2,98346,Kingston,WA,10106
3,96019,Shasta Lake,CA,10111
4,93219,Earlimart,CA,10167
...,...,...,...,...
1420,92335,Fontana,CA,99284
1421,90201,Bell Gardens,CA,102433
1422,91331,Pacoima,CA,105799
1423,90650,Norwalk,CA,105886


## You try it - using the zip_codes table, find all cities whose names start with C, along with their state and the number of zip codes in each city; sort by state, then by city

In [13]:
# Roll back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Count zip codes per (city, state) for cities whose name starts with "C",
# ordered by state and then city.
query = """

select city, state, count(*) as total_zips
from zip_codes
where city like 'C%'
group by city, state
order by state, city 

"""

my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
)

Unnamed: 0,city,state,total_zips
0,Cantwell,AK,1
1,Chalkyitsik,AK,1
2,Chefornak,AK,1
3,Chevak,AK,1
4,Chignik,AK,1
...,...,...,...
2495,Cody,WY,1
2496,Cokeville,WY,1
2497,Cora,WY,1
2498,Cowley,WY,1


## You try it - using the zip_codes table, find the cities (and their state) whose names start with C, that have more than 5 zip codes, and that have a minimum population < 10000; sort by state, then by city

In [15]:
# Roll back both before and after the query runs.
rollback_before_flag = rollback_after_flag = True

# Group cities starting with "C" by (city, state), then keep only groups
# with more than 5 zip codes and a smallest population under 10000.
query = """

select city, state, count(*) as total_zips, min(population) as min_population
from zip_codes
where city like 'C%'
group by city, state
having count(*) > 5 and min(population) < 10000
order by state, city 

"""

my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
)

Unnamed: 0,city,state,total_zips,min_population
0,Colorado Springs,CO,30,112
1,Clearwater,FL,9,8093
2,Columbus,GA,6,6336
3,Cedar Rapids,IA,6,2168
4,Chicago,IL,56,782
5,Charlotte,NC,24,6725
6,Canton,OH,12,948
7,Cincinnati,OH,45,2429
8,Cleveland,OH,29,610
9,Columbus,OH,30,2440
