# BART / Caltrain Location Data

### Imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from sqlalchemy import create_engine
from sqlalchemy import inspect
# NOTE(review): this connection URL hardcodes a database user and host IP.
# `cnx` is not referenced by any cell visible in this notebook — confirm it is
# still needed, and consider loading the URL from configuration instead.
cnx = create_engine('postgresql://matthewmurray@52.53.245.4/policedata')
import geopandas
import geopandas.tools
from shapely.geometry import Point
import pickle
import requests
%matplotlib inline

### Collect & Process Data

In [2]:
import os

# Read the BART API key from the environment so the secret never has to be
# written into (and committed with) the notebook. Falls back to an empty
# string -- the previous hardcoded value -- when BART_API_KEY is unset.
api_key = os.environ.get('BART_API_KEY', '')
# Stations request (cmd=stns) with JSON output requested via json=y.
bart_url = 'http://api.bart.gov/api/stn.aspx?cmd=stns&orig=ssan&key={}&json=y'.format(api_key)

In [3]:
# Fetch the station list. The timeout keeps the cell from hanging indefinitely
# when the API is unreachable, and raise_for_status() surfaces HTTP error
# responses before the body is parsed as JSON.
r = requests.get(bart_url, timeout=30)
r.raise_for_status()
json_doc = r.json()

In [4]:
# Load every station record from the API response into a DataFrame.
bart_df = pd.DataFrame.from_dict(json_doc['root']['stations']['station'])
# Keep only the rows whose county is 'sanfrancisco', discard the address-style
# fields, and rename the coordinate/name columns to the names used for the
# Caltrain table (station_name, latitude, longitude).
bart_df = (
    bart_df
    .loc[lambda stations: stations['county'] == 'sanfrancisco']
    .drop(['abbr','address','city','county','state','zipcode'], axis=1)
    .rename(columns={'gtfs_latitude':'latitude','gtfs_longitude':'longitude','name':'station_name'})
)
# Preview three randomly chosen rows.
bart_df.sample(3)

Unnamed: 0,latitude,longitude,station_name
8,37.779732,-122.414123,Civic Center/UN Plaza
20,37.733064,-122.433817,Glen Park
3,37.75247,-122.418143,24th St. Mission


There are only a handful of Caltrain stations in the San Francisco area, so I collected their latitude and longitude manually from Wikipedia.

In [5]:
# Caltrain stations keyed by station name; each value is [latitude, longitude].
caltrains = {
    'South San Francisco': [37.655833, -122.405],
    'Bayshore Station': [37.7075, -122.401944],
    '22nd St. Station': [37.757222, -122.3925],
    'San Francisco': [37.776389, -122.394444],
}

In [6]:
# Turn the {name: [latitude, longitude]} mapping into a three-column table.
# With orient='index' the station names become the index and the coordinates
# land in columns 0 and 1; reset_index() then exposes the names as a column
# called 'index', which is renamed along with the two coordinate columns.
cal_df = (
    pd.DataFrame.from_dict(caltrains, orient='index')
    .reset_index()
    .rename(columns={'index': 'station_name', 0: 'latitude', 1: 'longitude'})
)

In [7]:
# Display the Caltrain table to check the column names and coordinates.
cal_df

Unnamed: 0,station_name,latitude,longitude
0,South San Francisco,37.655833,-122.405
1,Bayshore Station,37.7075,-122.401944
2,22nd St. Station,37.757222,-122.3925
3,San Francisco,37.776389,-122.394444


In [8]:
# Stack the Caltrain and BART tables and coerce both coordinate columns to
# float, element by element.
transport_df = pd.concat([cal_df, bart_df]).assign(
    latitude=lambda stations: stations['latitude'].map(float),
    longitude=lambda stations: stations['longitude'].map(float),
)
# Build one shapely Point per station. Point takes (x, y), i.e. (longitude, latitude).
transport_df['geometry'] = transport_df.apply(
    lambda station: Point(station['longitude'], station['latitude']),
    axis=1,
)
# The set_index/reset_index round trip moves station_name to the first column
# and replaces the duplicated post-concat index with a fresh 0..n-1 range.
# No CRS is assigned to the GeoDataFrame here.
transport_df = geopandas.GeoDataFrame(
    transport_df.set_index('station_name').reset_index(),
    geometry='geometry',
)
transport_df

Unnamed: 0,station_name,latitude,longitude,geometry
0,South San Francisco,37.655833,-122.405,POINT (-122.405 37.655833)
1,Bayshore Station,37.7075,-122.401944,POINT (-122.401944 37.7075)
2,22nd St. Station,37.757222,-122.3925,POINT (-122.3925 37.757222)
3,San Francisco,37.776389,-122.394444,POINT (-122.394444 37.776389)
4,16th St. Mission,37.765062,-122.419694,POINT (-122.419694 37.765062)
5,24th St. Mission,37.75247,-122.418143,POINT (-122.418143 37.75247)
6,Balboa Park,37.721585,-122.447506,POINT (-122.447506 37.721585)
7,Civic Center/UN Plaza,37.779732,-122.414123,POINT (-122.414123 37.779732)
8,Embarcadero,37.792874,-122.39702,POINT (-122.39702 37.792874)
9,Glen Park,37.733064,-122.433817,POINT (-122.433817 37.733064)


I intend to use these coordinates to calculate, for each crime incident, the distance to its nearest station.

In [10]:
# Keep only the geometry column (one Point per station); this is the object
# that gets pickled in the "Save Data" step.
station_locations = transport_df['geometry']

### Save Data

In [11]:
# Serialize the station geometries to disk. The context manager guarantees the
# file handle is flushed and closed, even if pickling raises part-way through.
with open('station_locations.pkl', 'wb') as pkl_file:
    pickle.dump(station_locations, pkl_file)