# Project 3, Setup geodesic distances and zip code and population data to enhance our BART model

University of California, Berkeley

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering


# Included Modules and Packages

Import support for Geodesic calculations

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

from geographiclib.geodesic import Geodesic

# Supporting code

Python support for geodesic calculations

In [2]:
def my_calculate_box(point, miles):
    """Return the four edge points of a box reaching `miles` from `point`.

    `point` is a (latitude, longitude) pair.  The result is the tuple
    (left, right, top, bottom); each element is the (latitude, longitude)
    reached by solving the direct geodesic problem on the WGS84 ellipsoid
    from `point` at azimuths 270, 90, 0 and 180 degrees respectively.
    """
    wgs84 = Geodesic.WGS84
    origin_lat, origin_lon = point[0], point[1]

    # Geodesic.Direct is given the distance in meters: miles -> km -> m.
    distance_m = miles * 1.60934 * 1000

    def travel(azimuth):
        # Walk distance_m from the origin along the given azimuth.
        leg = wgs84.Direct(origin_lat, origin_lon, azimuth, distance_m)
        return (leg['lat2'], leg['lon2'])

    left, right, top, bottom = (travel(azimuth) for azimuth in (270, 90, 0, 180))

    return (left, right, top, bottom)

In [3]:
def my_station_get_zips(station, miles):
    """Print the zip codes in a box around a station and return their total population.

    Looks up the station's latitude/longitude in the `stations` table, builds
    a box extending `miles` from that point with my_calculate_box(), and
    selects every row of `zip_codes` whose latitude/longitude fall inside the
    box.  Each zip code and its population is printed, followed by the total.

    Parameters:
        station: station name, matched exactly against stations.station.
        miles:   box half-width in miles (may be fractional).

    Returns:
        The summed population as an int (0 when no zip code matches).

    Raises:
        ValueError: if `station` is not present in the stations table.

    Uses the notebook-level `connection` and `cursor` objects.
    """

    # Start from a clean transaction state.
    connection.rollback()

    # Echo a readable form of the lookup.  This string is for display only;
    # the statement actually executed passes the station name as a bound
    # parameter, so quotes in the name cannot alter the SQL.
    print("select latitude, longitude from stations where station = '" + station + "'")

    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    station_rows = cursor.fetchall()

    # Nothing was modified; close the read-only transaction.
    connection.rollback()

    if not station_rows:
        raise ValueError("station not found in stations table: " + repr(station))

    # Station names are expected to be unique; if several rows match, the
    # last one is used.
    latitude, longitude = station_rows[-1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    # Latitude is bounded by the bottom/top points, longitude by left/right.
    cursor.execute(
        "select zip, population from zip_codes "
        " where latitude >= %s"
        " and latitude <= %s"
        " and longitude >= %s"
        " and longitude <= %s"
        " order by 1 ",
        (bottom[0], top[0], left[1], right[1]),
    )
    zip_rows = cursor.fetchall()

    connection.rollback()

    print("\n-------------------------------------------------------------------------------")
    print("  Zip Codes within " + str(miles) + " mile(s) of " + station + " BART Station")
    print("-------------------------------------------------------------------------------\n")

    total_population = 0

    for zip_code, population in zip_rows:
        print("     zip:", zip_code, "  population: ", f'{population:10,}')
        total_population += population

    print("\n-------------------------------------------------------------------------------")
    print("  Total Population: ", f'{total_population:10,}')
    print("-------------------------------------------------------------------------------")

    return int(total_population)

In [4]:
# Connection settings for the course PostgreSQL instance.
# NOTE(review): the credentials are hard-coded in the notebook.
db_settings = {
    "user": "postgres",
    "password": "ucb",
    "host": "postgres",
    "port": "5432",
    "database": "postgres",
}

# Notebook-wide connection used by every function and cell below.
connection = psycopg2.connect(**db_settings)

In [5]:
# Single shared cursor; the functions and cells in this notebook all execute
# their statements through it.
cursor = connection.cursor()

# Find all zip codes, with population, within 1 mile of the Downtown Berkeley station

Our stations table has each station, along with the latitude and longitude point for the station

The function above, my_station_get_zips(), takes a station name and a distance of X miles (which can be a decimal, such as 1.5). It finds all zip codes whose geographic center lies within a box extending X miles from the station, and prints each zip code's population along with the total population.

Zip code data is pulled from the zip_codes table

Week 2 has an optional module that covers geodesic distance calculation and how to use the box method




## Since this is the first one, a solution code cell is provided for you to execute and then pattern the rest after



In [6]:
# Worked example: zip codes and total population within 1 mile of Downtown
# Berkeley.  The returned total is shown as the cell result.
my_station_get_zips('Downtown Berkeley', 1)

select latitude, longitude from stations where station = 'Downtown Berkeley'

-------------------------------------------------------------------------------
  Zip Codes within 1 mile(s) of Downtown Berkeley BART Station
-------------------------------------------------------------------------------

     zip: 94702   population:      17,092
     zip: 94703   population:      21,937
     zip: 94704   population:      29,190
     zip: 94709   population:      11,740
     zip: 94720   population:       2,971

-------------------------------------------------------------------------------
  Total Population:      82,930
-------------------------------------------------------------------------------


82930

# Find all zip codes, with population, within 1, 2, 3, 4, and 5 miles of the Downtown Berkeley station

Write a loop to call the function my_station_get_zips() for each of 1, 2, 3, 4, and 5 miles




In [7]:
# Repeat the Downtown Berkeley report for each whole-mile radius, 1 through 5.
for miles in (1, 2, 3, 4, 5):
    my_station_get_zips('Downtown Berkeley', miles)

select latitude, longitude from stations where station = 'Downtown Berkeley'

-------------------------------------------------------------------------------
  Zip Codes within 1 mile(s) of Downtown Berkeley BART Station
-------------------------------------------------------------------------------

     zip: 94702   population:      17,092
     zip: 94703   population:      21,937
     zip: 94704   population:      29,190
     zip: 94709   population:      11,740
     zip: 94720   population:       2,971

-------------------------------------------------------------------------------
  Total Population:      82,930
-------------------------------------------------------------------------------
select latitude, longitude from stations where station = 'Downtown Berkeley'

-------------------------------------------------------------------------------
  Zip Codes within 2 mile(s) of Downtown Berkeley BART Station
--------------------------------------------------------------------------

# 3.5.4 Find all zip codes, with population, within 2 miles of  the Powell Street station

Use the my_station_get_zips() function

In [8]:
# Zip codes and total population within 2 miles of the Powell Street station.
my_station_get_zips('Powell Street', 2)

select latitude, longitude from stations where station = 'Powell Street'

-------------------------------------------------------------------------------
  Zip Codes within 2 mile(s) of Powell Street BART Station
-------------------------------------------------------------------------------

     zip: 94102   population:      31,067
     zip: 94103   population:      28,735
     zip: 94104   population:         546
     zip: 94105   population:       9,155
     zip: 94107   population:      29,689
     zip: 94108   population:      14,550
     zip: 94109   population:      56,677
     zip: 94111   population:       3,620
     zip: 94114   population:      34,754
     zip: 94115   population:      35,004
     zip: 94123   population:      25,941
     zip: 94133   population:      26,527
     zip: 94158   population:       7,291

-------------------------------------------------------------------------------
  Total Population:     303,556
-----------------------------------------------

303556

# 3.5.5 Find all zip codes, with population, within 2 miles of  the Coliseum station

Use the my_station_get_zips() function

In [9]:
# Zip codes and total population within 2 miles of the Coliseum station.
my_station_get_zips('Coliseum', 2)

select latitude, longitude from stations where station = 'Coliseum'

-------------------------------------------------------------------------------
  Zip Codes within 2 mile(s) of Coliseum BART Station
-------------------------------------------------------------------------------

     zip: 94601   population:      52,299
     zip: 94603   population:      34,593
     zip: 94613   population:         861
     zip: 94621   population:      35,287

-------------------------------------------------------------------------------
  Total Population:     123,040
-------------------------------------------------------------------------------


123040

# Create a new table to store population at distances for each station

Understanding how much population opportunity exists at various distances from the stations may inform our strategy.

In [10]:
# Remove any previous stations_geopop table so the cell that creates it can
# be re-run.  The rollback first clears whatever transaction is open; the
# commit makes the drop permanent.
connection.rollback()

query = """

drop table if exists stations_geopop;

"""

cursor.execute(query)

connection.commit()


In [11]:
# Create stations_geopop: one row per station with the population found
# within the 1, 2 and 3 mile boxes (pop_1m, pop_2m, pop_3m).
connection.rollback()

query = """

create table stations_geopop (
  station varchar(32),
  pop_1m numeric,
  pop_2m numeric,
  pop_3m numeric
);


"""

cursor.execute(query)

connection.commit()

In [12]:
def my_station_get_population(station, miles):
    """Return the total population of the zip codes in a box around a station.

    Looks up the station's latitude/longitude in the `stations` table, builds
    a box extending `miles` from that point with my_calculate_box(), and sums
    the population of every `zip_codes` row whose latitude/longitude fall
    inside the box.  Nothing is printed.

    Parameters:
        station: station name, matched exactly against stations.station.
        miles:   box half-width in miles (may be fractional).

    Returns:
        The summed population as an int (0 when no zip code matches).

    Raises:
        ValueError: if `station` is not present in the stations table.

    Uses the notebook-level `connection` and `cursor` objects.
    """

    # Start from a clean transaction state.
    connection.rollback()

    # The station name is passed as a bound parameter so that quotes in the
    # name cannot alter the SQL.
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    station_rows = cursor.fetchall()

    # Nothing was modified; close the read-only transaction.
    connection.rollback()

    if not station_rows:
        raise ValueError("station not found in stations table: " + repr(station))

    # Station names are expected to be unique; if several rows match, the
    # last one is used.
    latitude, longitude = station_rows[-1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    # Latitude is bounded by the bottom/top points, longitude by left/right.
    cursor.execute(
        "select zip, population from zip_codes "
        " where latitude >= %s"
        " and latitude <= %s"
        " and longitude >= %s"
        " and longitude <= %s"
        " order by 1 ",
        (bottom[0], top[0], left[1], right[1]),
    )
    zip_rows = cursor.fetchall()

    connection.rollback()

    return int(sum(population for _zip_code, population in zip_rows))

In [13]:

# Build one row per station holding the population found within the 1, 2 and
# 3 mile boxes.  Rows are collected in a plain list and the DataFrame is
# constructed once at the end, rather than growing it one row at a time.

connection.rollback()

query = "select station from stations "

cursor.execute(query)

connection.rollback()

# All station names are fetched up front because my_station_get_population()
# reuses the same cursor for its own queries.
rows = cursor.fetchall()

records = [
    [row[0]] + [my_station_get_population(row[0], radius) for radius in (1, 2, 3)]
    for row in rows
]

df = pd.DataFrame(records, columns=['Station', 'Pop_1m', 'Pop_2m', 'Pop_3m'])

df



Unnamed: 0,Station,Pop_1m,Pop_2m,Pop_3m
0,12th Street,16062,175958,298398
1,16th Street Mission,63489,339093,554106
2,19th Street,16062,165215,298398
3,24th Street Mission,108915,315201,621661
4,Antioch,0,66933,110721
5,Ashby,68219,173897,226433
6,Balboa Park,106589,253123,548413
7,Bay Fair,41059,93041,286050
8,Berryessa,28726,197640,293347
9,Castro Valley,0,110328,165604


In [14]:
# Write the per-station populations to CSV (header row, no index column) so
# the copy statement in the next cell can load the file into stations_geopop.
df.to_csv('/user/projects/project-3-timothy-majidzadeh/code/stations_geopop.csv', index=False)

In [15]:
# Bulk-load the CSV into stations_geopop.  `csv header` skips the first line
# of the file and NULL '' reads empty fields as NULL.
# NOTE(review): this form of copy names a file path inside the SQL, so it
# presumably relies on the database server seeing the same /user/projects
# path as the notebook - confirm.
connection.rollback()

query = """

copy stations_geopop (station, pop_1m, pop_2m, pop_3m)
from '/user/projects/project-3-timothy-majidzadeh/code/stations_geopop.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)

connection.commit()

In [16]:
# Run a select over the freshly loaded table.  The result is not fetched
# here; the `query` string is left in the notebook namespace, where a later
# cell passes `query` to my_select_query_pandas().
connection.rollback()

query = """

select * from stations_geopop

"""

cursor.execute(query)

connection.commit()

In [17]:
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return its rows as a pandas DataFrame.

    Parameters:
        query:                SQL text passed to pandas.read_sql_query.
        rollback_before_flag: when true, roll back the connection before
                              running the query.
        rollback_after_flag:  when true, roll back the connection after
                              running the query.

    Returns:
        The result as a DataFrame.  Every float64 column whose non-missing
        values are all finite whole numbers is converted to the nullable
        'Int64' dtype, so integer data that came back as float (for example
        because of NULLs) displays as integers.  Columns containing a
        fractional or infinite value are left as float64.

    Uses the notebook-level `connection` object.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        series = df[column]

        if series.dtype != "float64":
            continue

        present = series.dropna().to_numpy()

        # The finiteness test runs first: an infinite value cannot be stored
        # in an integer column, so such a column must stay float.
        if np.all(np.isfinite(present)) and np.all(np.mod(present, 1) == 0):
            df[column] = series.astype('Int64')

    return(df)
    

In [22]:
# Load the result of `query` into a DataFrame, rolling back before and after.
# NOTE(review): `query` is whatever the notebook namespace currently holds -
# presumably the "select * from stations_geopop" text assigned earlier;
# confirm no intervening cell reassigned it.
rollback_before_flag = True
rollback_after_flag = True
df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df

Unnamed: 0,station,pop_1m,pop_2m,pop_3m
0,12th Street,16062,175958,298398
1,16th Street Mission,63489,339093,554106
2,19th Street,16062,165215,298398
3,24th Street Mission,108915,315201,621661
4,Antioch,0,66933,110721
5,Ashby,68219,173897,226433
6,Balboa Park,106589,253123,548413
7,Bay Fair,41059,93041,286050
8,Berryessa,28726,197640,293347
9,Castro Valley,0,110328,165604


In [26]:
# Stations grouped together as the MacArthur community.
macarthur_stations = ['MacArthur', '12th Street', '19th Street']
in_macarthur_community = df['station'].isin(macarthur_stations)
macarthur_community_df = df[in_macarthur_community]
macarthur_community_df

Unnamed: 0,station,pop_1m,pop_2m,pop_3m
0,12th Street,16062,175958,298398
2,19th Street,16062,165215,298398
26,MacArthur,22811,143953,271301


In [27]:
# Column totals for the MacArthur community.  The text `station` column is
# summed too, which concatenates the station names (see the output below).
macarthur_community_df.sum()

station    12th Street19th StreetMacArthur
pop_1m                               54935
pop_2m                              485126
pop_3m                              868097
dtype: object

In [28]:
# Stations grouped together as the Coliseum community.
coliseum_stations = ['Coliseum', 'Fruitvale', 'Lake Merritt', 'OAK']
in_coliseum_community = df['station'].isin(coliseum_stations)
coliseum_community_df = df[in_coliseum_community]
coliseum_community_df

Unnamed: 0,station,pop_1m,pop_2m,pop_3m
11,Coliseum,0,123040,267162
21,Fruitvale,52299,90602,317296
25,Lake Merritt,16062,175958,281357
32,OAK,0,84499,154627


In [29]:
# Column totals for the Coliseum community.  The text `station` column is
# summed too, which concatenates the station names (see the output below).
coliseum_community_df.sum()

station    ColiseumFruitvaleLake MerrittOAK
pop_1m                                68361
pop_2m                               474099
pop_3m                              1020442
dtype: object

In [30]:
# Stations grouped together as the Bay Fair community.
bayfair_stations = ['Bay Fair', 'San Leandro', 'West Dublin', 'Dublin', 'Castro Valley']
in_bayfair_community = df['station'].isin(bayfair_stations)
bayfair_community_df = df[in_bayfair_community]
bayfair_community_df

Unnamed: 0,station,pop_1m,pop_2m,pop_3m
7,Bay Fair,41059,93041,286050
9,Castro Valley,0,110328,165604
16,Dublin,54420,54420,90736
42,San Leandro,48088,123740,224179
48,West Dublin,0,54420,90736


In [31]:
# Column totals for the Bay Fair community.  The text `station` column is
# summed too, which concatenates the station names (see the output below).
bayfair_community_df.sum()

station    Bay FairCastro ValleyDublinSan LeandroWest Dublin
pop_1m                                                143567
pop_2m                                                435949
pop_3m                                                857305
dtype: object

In [32]:
# Stations grouped together as the Daly City community.
dalycity_stations = ['Daly City', 'Balboa Park', 'Glen Park']
in_dalycity_community = df['station'].isin(dalycity_stations)
dalycity_community_df = df[in_dalycity_community]
dalycity_community_df

Unnamed: 0,station,pop_1m,pop_2m,pop_3m
6,Balboa Park,106589,253123,548413
14,Daly City,0,231056,330204
22,Glen Park,115068,287877,624151


In [34]:
# Column totals for the Daly City community.  The text `station` column is
# summed too, which concatenates the station names (see the output below).
dalycity_community_df.sum()

station    Balboa ParkDaly CityGlen Park
pop_1m                            221657
pop_2m                            772056
pop_3m                           1502768
dtype: object