# A* Algorithm Implmentation: Closest Store by BART

**imports**

In [6]:
import neo4j
import math
import numpy as np
import pandas as pd
import re
import random

import psycopg2
from scipy import spatial
from geographiclib.geodesic import Geodesic

**Driver Connection**

In [7]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")

**Helper Functions**

In [32]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#
    
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)

def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query) 
    

def my_neo4j_create_customer_node(customer_id, zip_code):
    "create a node with label Station"
    
    query = """
    
    CREATE (:Customer {
                        customer_id: $customer_id,
                        zip_code: $zip_code
                    })
    
    """
    
    session.run(query, customer_id=customer_id, zip_code = zip_code)

def create_relationship_one_way_customer_station(from_customer, to_station, weight):
    "create a relationship one way between two stations with a weight"
    
    query = """
    
    MATCH (from:Customer), 
          (to:Station)
    WHERE from.customer_id = $from_customer and to.name = $to_station
    CREATE (from)-[:DIST {weight: $weight}]->(to)
    
    """
    
    session.run(query, from_customer=from_customer, to_station=to_station, weight=weight)
    
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")

def my_calculate_box(point, miles):
    "Given a point and miles, calculate the box in form left, right, top, bottom"
    
    geod = Geodesic.WGS84

    kilometers = miles * 1.60934
    meters = kilometers * 1000

    g = geod.Direct(point[0], point[1], 270, meters)
    left = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 90, meters)
    right = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 0, meters)
    top = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 180, meters)
    bottom = (g['lat2'], g['lon2'])
    
    return(left, right, top, bottom)

def random_point(p, m): 
    l, r, t, b = my_calculate_box(p, m)
    
    random_latitude = random.uniform(l[0], r[0])
    random_longitude = random.uniform(b[1], t[1])
        
    return (random_latitude, random_longitude)

## Graph Creation plan 
- Customer nodes: 
    - randomly generate latitudes and longitudes within the customer's zip code 
    - find the closest bart station to them, create a connection 
    Note: only using bay area zip codes
- Store nodes: 
    - find the closest bart station to the store 

### Customer Zip Code -> Random (lat, long) pairs

In [68]:
rollback_before_flag = True
rollback_after_flag = True

query = """

SELECT customers.customer_id, customers.zip, zip_codes.latitude, zip_codes.longitude
FROM customers
left join zip_codes 
on customers.zip = zip_codes.zip 
where customers.zip like '9%'

"""

customers = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
customers['zip'].value_counts()

98117    315
98125    296
94602    260
94530    258
98119    257
        ... 
94565      1
94588      1
94070      1
94516      1
94957      1
Name: zip, Length: 258, dtype: int64

In [69]:
random_locs = customers.apply(lambda x: random_point((x['latitude'], x['longitude']), 1), axis=1)
customers["random_lat"] = random_locs.apply(lambda x: x[0])
customers["random_long"] = random_locs.apply(lambda x: x[1])
rand_customers_loc = customers[['customer_id', 'zip', 'random_lat', 'random_long']]
rand_customers_loc

Unnamed: 0,customer_id,zip,random_lat,random_long
0,1,94609,37.834299,-122.2643
1,2,94609,37.834299,-122.2643
2,3,94609,37.834299,-122.2643
3,4,94609,37.834299,-122.2643
4,5,94609,37.834299,-122.2643
...,...,...,...,...
15347,15348,98421,47.259198,-122.3995
15348,15349,98421,47.259198,-122.3995
15349,15350,98421,47.259198,-122.3995
15350,15351,98416,47.262498,-122.4812


## Get Stations and Coordinates

In [70]:
rollback_before_flag = True
rollback_after_flag = True

query = """

SELECT *
FROM stations

"""

stations = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
print(stations.shape)
stations

(50, 4)


Unnamed: 0,station,latitude,longitude,transfer_time
0,12th Street,37.803608,-122.272006,282
1,16th Street Mission,37.764847,-122.420042,287
2,19th Street,37.807869,-122.26898,67
3,24th Street Mission,37.752,-122.4187,277
4,Antioch,37.996281,-121.783404,0
5,Ashby,37.853068,-122.269957,299
6,Balboa Park,37.721667,-122.4475,48
7,Bay Fair,37.697,-122.1265,63
8,Berryessa,37.368361,-121.874655,288
9,Castro Valley,37.690748,-122.075679,0


## Building customers nodes, and their relations to their closest stations

In [71]:
tree = spatial.KDTree(stations[['latitude', 'longitude']])
dists, inds = tree.query(rand_cfustomers_loc[['random_lat', 'random_long']])

In [72]:
rand_customers_loc['distance_miles'] = dists
rand_customers_loc['travel_time'] = (dists / 3)

rand_customers_loc['station'] = stations.loc[inds,'station'].values
rand_customers_loc

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rand_customers_loc['distance_miles'] = dists


Unnamed: 0,customer_id,zip,random_lat,random_long,distance_miles,travel_time,station
0,1,94609,37.834299,-122.2643,0.006732,0.002244,MacArthur
1,2,94609,37.834299,-122.2643,0.006732,0.002244,MacArthur
2,3,94609,37.834299,-122.2643,0.006732,0.002244,MacArthur
3,4,94609,37.834299,-122.2643,0.006732,0.002244,MacArthur
4,5,94609,37.834299,-122.2643,0.006732,0.002244,MacArthur
...,...,...,...,...,...,...,...
15347,15348,98421,47.259198,-122.3995,9.251539,3.083846,Pittsburg
15348,15349,98421,47.259198,-122.3995,9.251539,3.083846,Pittsburg
15349,15350,98421,47.259198,-122.3995,9.251539,3.083846,Pittsburg
15350,15351,98416,47.262498,-122.4812,9.259214,3.086405,Pittsburg


In [73]:
rand_customers_loc.loc[:, 'station'].value_counts()

Pittsburg               7214
El Cerrito del Norte     844
Downtown Berkeley        578
Civic Center             469
Richmond                 458
Fruitvale                440
Orinda                   400
MacArthur                385
Lake Merritt             375
19th Street              337
Lafayette                318
Walnut Creek             306
El Cerrito Plaza         294
Rockridge                290
North Berkeley           246
San Leandro              239
West Oakland             203
Embarcadero              189
Bay Fair                 173
Montgomery Street        168
16th Street Mission      162
Balboa Park              153
24th Street Mission      145
OAK                      128
Glen Park                127
Pleasant Hill             97
Daly City                 77
Powell Street             74
Coliseum                  67
Millbrae                  55
North Concord             48
Colma                     45
South San Francisco       42
Concord                   36
Castro Valley 