# Project 3: Map displays of customers and population served by target BART lines

University of California, Berkeley

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering

## imports

In [1]:
import math
import numpy as np
import pandas as pd

import psycopg2

import json

import gmaps
import gmaps.geojson_geometries

from geographiclib.geodesic import Geodesic

## my_select_query_pandas() - function to run a select query and return rows in a Pandas dataframe

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)
    

## Connect to the Postgres database

In [3]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

## Create a cursor for the connection

In [4]:
cursor = connection.cursor()

# Project 3: Google Maps of customers and population

## Connect to Google Maps using your api key;  edit the file gmap_api_key.txt and put in your api key

In [5]:
f = open('gmap_api_key.txt', 'r')
my_api_key = f.read()
f.close()

gmaps.configure(api_key=my_api_key)

In [6]:
AGM_berkeley_store = (37.85616355778589, -122.25991778607198)

gmaps.figure(center = AGM_berkeley_store, zoom_level=9)

Figure(layout=FigureLayout(height='420px'))

In [7]:
lat_lon = {'latitude': [37.85616355778589], 'longitude': [-122.25991778607198]}
df = pd.DataFrame(lat_lon)

# Map with Marker for the Berkeley AGM store

In [8]:
#Map with marker for Store
fig = gmaps.figure(center = AGM_berkeley_store, zoom_level= 10)

df_markers = df[['latitude','longitude']]

marker_layer = gmaps.marker_layer(df_markers)

fig.add_layer(marker_layer)

fig

Figure(layout=FigureLayout(height='420px'))

# Map centered at AGM store with closest BART station

In [9]:
# Map with Marker for closest BART station
lat_lon = {'latitude': [37.85616355778589, 37.85297402163031], 'longitude': [-122.25991778607198, -122.27012579956241]}

df = pd.DataFrame(lat_lon)

fig = gmaps.figure(center = AGM_berkeley_store, zoom_level= 12)

df_markers = df[['latitude','longitude']]

marker_layer = gmaps.marker_layer(df_markers)

#Add transit layer: 
fig.add_layer(gmaps.transit_layer())

fig.add_layer(marker_layer)

fig

Figure(layout=FigureLayout(height='420px'))

# Heatmap of AGM Customers relative to BART lines

In [10]:
#Heat Map using Customer data for AGM store
rollback_before_flag = True
rollback_after_flag = True

query = """

select z.latitude, z.longitude
from customers as cu
     join zip_codes as z
         on cu.zip = z.zip
where cu.closest_store_id = 1
order by 1,2

"""

#Create customer data frame with lat & longitude
df_customer = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

fig = gmaps.figure(center= AGM_berkeley_store, zoom_level= 10)

#create Marker for the Berkeley Store location
lat_lon = {'latitude': [37.85616355778589], 'longitude': [-122.25991778607198]}
df_lat_lon = pd.DataFrame(lat_lon)
df_markers = df_lat_lon[['latitude','longitude']]

#Add Marker Layer
marker_layer = gmaps.marker_layer(df_markers)
fig.add_layer(marker_layer)

#Add transit layer: 
fig.add_layer(gmaps.transit_layer())

#Add heatmap layer
heatmap_layer = gmaps.heatmap_layer(df_customer)

fig.add_layer(heatmap_layer)

fig


Figure(layout=FigureLayout(height='420px'))

# Heatmap of AGM Berkeley Store Customers within 1 mile of red/orange BART lines

In [11]:
#Heatmap of customers within 1 mile of red/orange lines

#Read the Station_zips_customers csv from Chris's work: 
red_orange_customer_df = pd.read_csv('data/station_zips_customers.csv')

#Create dataframe with customer data, join on customer id to get lat/long
rollback_before_flag = True
rollback_after_flag = True

query = """

select z.latitude, z.longitude, cu.customer_id
from customers as cu
     join zip_codes as z
         on cu.zip = z.zip
where cu.closest_store_id = 1
order by 1,2, 3

"""

#Create customer data frame with lat & longitude
df_customer_id_lat_lon = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

#Join customer red_orange_customer dataframe with customer lat and lon in the dataframe
df_red_orange_custid = pd.merge(red_orange_customer_df, df_customer_id_lat_lon, on = 'customer_id')
df_red_orange_custid_lat_lon = df_red_orange_custid.drop(columns = ['station', 'zip', 'customer_id', 'street'])

#Heat Map of customers within 1 mile of red/orange line stations
fig = gmaps.figure(center= AGM_berkeley_store, zoom_level= 10)

#create Marker for the Berkeley Store location
lat_lon = {'latitude': [37.85616355778589], 'longitude': [-122.25991778607198]}
df_lat_lon = pd.DataFrame(lat_lon)
df_markers = df_lat_lon[['latitude','longitude']]

#Add Marker Layer for Berkeley Store
marker_layer = gmaps.marker_layer(df_markers)
fig.add_layer(marker_layer)

#Add heatmap layer for Customers within 1 mi of 
heatmap_layer = gmaps.heatmap_layer(df_red_orange_custid_lat_lon)

fig.add_layer(heatmap_layer)

fig

Figure(layout=FigureLayout(height='420px'))

# Heatmap of general population within 1 mile of red/orange lines

In [12]:
#Create dataframe from zip
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from zip_codes

"""

df_zip_codes = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
#Convert zip codes to int data type for join
df_zip_codes['zip'] = pd.to_numeric(df_zip_codes['zip'])

#Read the Station_zips csv from Chris's work, join zip data for lat/long 
df_red_orange_stations_zip = pd.read_csv('data/station_zips.csv')

#Join customer red_orange_customer dataframe with customer lat and lon in the dataframe
df_red_orange_stations_zip_lat_lon = pd.merge(df_red_orange_stations_zip, df_zip_codes, on = 'zip')

#drop duplicate lat long to get a dataframe with unique lat lon for population heatmap purposes
df_red_orange_stations_zip_lat_lon = df_red_orange_stations_zip_lat_lon.drop_duplicates(subset =\
                                                                    ['latitude', 'longitude'], keep = 'last')
#drop columns to just keep lat, long, population_x
df_red_orange_stations_zip_lat_lon = df_red_orange_stations_zip_lat_lon.drop(columns =\
                        ['id','station', 'zip', 'city', 'state', 'population_y', 'area', 'density', 'time_zone'])

#From the population within 1 mile of a red/orange station, repeat the row x population count\
#to create heat map dataframe

#First, create empty lists for each latitude and longitude to append to n times the population
lat = []
lon = []

for index, row in df_red_orange_stations_zip_lat_lon.iterrows():
    lat.append([(row['latitude'])] * int(row['population_x']))
    lon.append([(row['longitude'])] * int(row['population_x']))
    
#Flatten lat and lon lists for appending to dataframe
flat_lat = [item for sublist in lat for item in sublist]
flat_lon = [item for sublist in lon for item in sublist]

#Convert lists to dataframe and merge on index
df_lat = pd.DataFrame (flat_lat, columns = ['latitude'])
df_lon = pd.DataFrame(flat_lon, columns = ['longitude'])
df_population_heatmap_lat_lon = df_lat.join(df_lon)

#Heat Map of population density within 1 mile of red/orange line stations
fig = gmaps.figure(center= AGM_berkeley_store, zoom_level= 10)

#create Marker for the AGM Berkeley Store location
lat_lon = {'latitude': [37.85616355778589], 'longitude': [-122.25991778607198]}
df_lat_lon = pd.DataFrame(lat_lon)
df_markers = df_lat_lon[['latitude','longitude']]

#Add Marker Layer for Berkeley Store
marker_layer = gmaps.marker_layer(df_markers)
fig.add_layer(marker_layer)

#Add heatmap layer for Customers within 1 mi of 
heatmap_layer = gmaps.heatmap_layer(df_population_heatmap_lat_lon)

fig.add_layer(heatmap_layer)

fig

Figure(layout=FigureLayout(height='420px'))

In [13]:
#Drop stations table if exists
connection.rollback()

query = """

drop table if exists stations;

"""

cursor.execute(query)

connection.commit()

In [14]:
#Create stations table
connection.rollback()

query = """

create table stations (
    station varchar(32),
    latitude numeric(9,6),
    longitude numeric(9,6),
    transfer_time numeric(3),
    primary key (station))

"""

cursor.execute(query)

connection.commit()

In [15]:
#Load stations csv into table
connection.rollback()

query = """

copy stations
from '/user/projects/project-3-mmartin131/exercise/stations.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)

connection.commit()

In [16]:
#Todo: create dataframe of 4 selected stations lat/lon



rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from stations
where station in ('Downtown Berkeley', 'North Berkeley', 'Powell Street', 'West Oakland')


"""
df_pilot_stations = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df_pilot_stations

Unnamed: 0,station,latitude,longitude,transfer_time
0,Downtown Berkeley,37.869799,-122.268197,323
1,North Berkeley,37.873915,-122.282552,289
2,Powell Street,37.784,-122.408,286
3,West Oakland,37.8049,-122.2951,283


In [17]:
# Map with selected 4 stations for pilot and the population density heatmap 

#Heat Map of population density within 1 mile of red/orange line stations
fig = gmaps.figure(center= AGM_berkeley_store, zoom_level= 10)

#create Marker for 4 pilot BART stations
df_markers = df_pilot_stations[['latitude','longitude']]

#Add Marker Layer for Berkeley Store
marker_layer = gmaps.marker_layer(df_markers)
fig.add_layer(marker_layer)

#Add heatmap layer for Customers within 1 mi of stations
heatmap_layer = gmaps.heatmap_layer(df_population_heatmap_lat_lon)

fig.add_layer(heatmap_layer)

fig

Figure(layout=FigureLayout(height='420px'))

In [19]:
#Calculate number of customers within 1 mi of red/orange lines
red_orange_customer_df.shape

(4415, 4)

In [24]:
#Calculate population within 1 mile of red/orange lines: 
df_red_orange_stations_population = pd.merge(df_red_orange_stations_zip, df_zip_codes, on = 'zip')
df_red_orange_stations_population = df_red_orange_stations_population.drop_duplicates()
red_orange_population_total = df_red_orange_stations_population['population_x'].sum()
print(red_orange_population_total)

1664536


In [20]:
#Calculate number of customers within 1 mi 4 pilot stations
pilots = ['Downtown Berkeley', 'North Berkeley', 'Powell Street', 'West Oakland']
red_orange_customer_df[red_orange_customer_df.station.isin(pilots)].shape

(1613, 4)

In [26]:
#Calculate population reached within 1 mi of 4 pilot stations
pilot_population = df_red_orange_stations_population[df_red_orange_stations_population.station.isin(pilots)]

pilot_population_sum = pilot_population['population_x'].sum()
print(pilot_population_sum)

300683


In [27]:
#Calculate customers reached when expanded to yellow line
#Read the Station_zips_customersYellow expansion csv from Chris's work: 
yellow_expansion_customer_df = pd.read_csv('data/station_zips_customers_yellowexpansion.csv')
yellow_expansion_customer_df.shape

(5232, 4)

In [28]:
#Calculate population reached when expanded to yellow line
#Read the station zips Yellow expansion csv from Chris's work: 
df_yellow_expansion_staion_zip = pd.read_csv('data/station_zips_yellowexpansion.csv')
df_yellow_expansion_population = pd.merge(df_yellow_expansion_staion_zip, df_zip_codes, on = 'zip')
df_yellow_expansion_population = df_yellow_expansion_population.drop_duplicates()
yellow_expansion_population_total = df_yellow_expansion_population['population_x'].sum()
print(yellow_expansion_population_total)

1933485
