In [1]:
pip install folium

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install shapely

Note: you may need to restart the kernel to use updated packages.


In [81]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd

import folium
from shapely.ops import nearest_points
from calendar import monthrange

In [None]:

#Function to determine closeset citibike station to closest hospital facility
def create_gdf(df, x, y):
    return gpd.GeoDataFrame(df, 
    geometry=gpd.points_from_xy(df[y], df[x]), crs={'init':'EPSG:4326'})



In [5]:
#Merge Citibike Data
feb20data = pd.read_csv('202002-citibike-tripdata.csv')
mar20data = pd.read_csv('202003-citibike-tripdata.csv')

hosp = pd.read_csv('NYC_Health___Hospitals_patient_care_locations_-_2011.csv').dropna(subset = ['Postcode'])

data = pd.concat([feb20data,mar20data],ignore_index = True)


In [21]:
days = monthrange(2020,2)[1] + monthrange(2020,3)[1]

60


In [6]:
#Need to isolate unique stations in this dataset
#This is a separate data set needed for 1a
stations = data.drop(columns =['tripduration','starttime','stoptime','end station id','end station name','end station latitude',\
                               'end station longitude','bikeid','usertype','birth year','gender']).drop_duplicates(subset = ['start station id'])

In [7]:
#-----------------
#1a. Distance to Hospital
#-----------------

#Convert Columns to gpd in order to get nearest station/hospital pair
station_gdf = create_gdf(stations,'start station latitude','start station longitude')
hosp_gdf = create_gdf(hosp,'Latitude','Longitude')

In [8]:
#Show on map
m = folium.Map(location = [40.7128,-74.0060])

locs_stations = zip(station_gdf['start station latitude'], station_gdf['start station longitude'])

locs_hosp = zip(hosp['Latitude'], hosp['Longitude'])

for location in locs_stations:
    folium.CircleMarker(location=location, 
        color='red',   radius=2).add_to(m)
    
for location in locs_hosp:
    folium.CircleMarker(location=location, 
        color='blue', radius=2).add_to(m)
m

In [10]:
# Get the nearest geometry
dest_unary = hosp_gdf['geometry'].unary_union
nearest_geom_col = []
nearest_hosp_col = []

for i in range(len(station_gdf)):
    nearest_geom = nearest_points(station_gdf.iloc[i,4], dest_unary)
    match_geom = hosp_gdf.loc[hosp_gdf.geometry == nearest_geom[1]]
    match_value = match_geom['geometry'].to_numpy()[0]
    nearest_geom_col.append(match_value)
station_gdf['nearest_geom'] = nearest_geom_col

# Get the nearest Hospital name
for i in range(len(station_gdf)):
    nearest_geom = nearest_points(station_gdf.iloc[i,4], dest_unary)
    match_geom = hosp_gdf.loc[hosp_gdf.geometry == nearest_geom[1]]
    match_value = match_geom['Facility Name'].to_numpy()[0]
    nearest_hosp_col.append(match_value)

station_gdf['nearest_hosp'] = nearest_hosp_col

In [11]:
distance = []

for i in range(len(station_gdf)):
    dis = station_gdf.iloc[i,4].distance(station_gdf.iloc[i,5])
    distance.append(dis)
station_gdf['hosp_dist'] = distance

In [53]:
stations = pd.DataFrame(station_gdf)
stations = stations.drop(columns = ['geometry','nearest_geom'])
stations.columns = ['stat_id','stat_name','station_lat','station_long','nearest_hosp','hosp_dist']
stations = stations.sort_values(by = ['stat_id'])
stations = stations.reset_index(drop = True)
stations.head()

Unnamed: 0,stat_id,stat_name,station_lat,station_long,nearest_hosp,hosp_dist
0,72,W 52 St & 11 Ave,40.767272,-73.993929,Bellevue Hospital Center,0.032876
1,79,Franklin St & W Broadway,40.719116,-74.006667,Judson Health Center,0.011241
2,82,St James Pl & Pearl St,40.711174,-74.000165,Smith Communicare Health Center,0.002979
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,Fort Greene Child Health Clinic,0.010232
4,116,W 17 St & 8 Ave,40.741776,-74.001497,Judson Health Center,0.020867


In [57]:
#-----------------
#1b. Average Age
#-----------------

#Drop bad data
#Remove data over 2 Standard Deviations away
age_avg = data.drop(data.index[(data['birth year'] < data['birth year'].mean() - 2*data['birth year'].std())])

#Calculate Age & replace birth year column
age_avg['Age'] = ((int(dt.datetime.now().year)) - age_avg['birth year']).astype(int)
age_avg = age_avg.drop(columns = ['birth year'])
age_avg = age_avg.groupby('start station id')['Age'].mean().to_frame()
age_avg = age_avg.rename(columns = {'Age':'Average Age'})
age_avg = age_avg.reset_index()
age_avg = age_avg.rename(columns = {'start station id':'stat_id'})
age_avg.head()

Unnamed: 0,stat_id,Average Age
0,72,39.850289
1,79,41.789203
2,82,39.073649
3,83,38.431391
4,116,38.777283


In [58]:
#-----------------
#1c. Average Start Trips 
#-----------------

start_avg = data.groupby('start station id')['start station name'].value_counts().to_frame()
start_avg = start_avg.rename(columns = {'start station name':'Daily_Avg_Starts'})
start_avg['Daily_Avg_Starts'] = start_avg['Daily_Avg_Starts'].div(days)
start_avg = start_avg.reset_index()
start_avg = start_avg.rename(columns = {'start station id':'stat_id'})
start_avg.head()


Unnamed: 0,stat_id,start station name,Daily_Avg_Starts
0,72,W 52 St & 11 Ave,89.05
1,79,Franklin St & W Broadway,40.066667
2,82,St James Pl & Pearl St,26.4
3,83,Atlantic Ave & Fort Greene Pl,36.583333
4,116,W 17 St & 8 Ave,129.716667


In [59]:
#-----------------
#1d. Average End Trips 
#-----------------
#Average trips by end station, grouped by number of days in the data set
end_avg = data.groupby('end station id')['end station name'].value_counts().to_frame()
end_avg = end_avg.rename(columns = {'end station id':'stat_id','end station name':'Daily_Avg_Ends'})
end_avg['Daily_Avg_Ends'] = end_avg['Daily_Avg_Ends'].div(days)
end_avg = end_avg.reset_index()
end_avg = end_avg.rename(columns = {'end station id':'stat_id'})
end_avg.head()

Unnamed: 0,stat_id,end station name,Daily_Avg_Ends
0,72,W 52 St & 11 Ave,89.683333
1,79,Franklin St & W Broadway,40.95
2,82,St James Pl & Pearl St,26.633333
3,83,Atlantic Ave & Fort Greene Pl,37.333333
4,116,W 17 St & 8 Ave,130.6


In [76]:
#-----------------
#1e. Combine Data
#-----------------
final = pd.merge(stations,age_avg,how = 'left', on = 'stat_id')
final = pd.merge(final, end_avg,how = 'left', on = 'stat_id')
final = pd.merge(final, start_avg,how = 'left', on = 'stat_id')
final = final.drop(columns = ['start station name','end station name'])

In [99]:
final.head()

Unnamed: 0,stat_id,stat_name,station_lat,station_long,nearest_hosp,hosp_dist,Average Age,Daily_Avg_Ends,Daily_Avg_Starts
0,72,W 52 St & 11 Ave,40.767272,-73.993929,Bellevue Hospital Center,0.032876,39.850289,89.683333,89.05
1,79,Franklin St & W Broadway,40.719116,-74.006667,Judson Health Center,0.011241,41.789203,40.95,40.066667
2,82,St James Pl & Pearl St,40.711174,-74.000165,Smith Communicare Health Center,0.002979,39.073649,26.633333,26.4
3,83,Atlantic Ave & Fort Greene Pl,40.683826,-73.976323,Fort Greene Child Health Clinic,0.010232,38.431391,37.333333,36.583333
4,116,W 17 St & 8 Ave,40.741776,-74.001497,Judson Health Center,0.020867,38.777283,130.6,129.716667


In [100]:
knn = KNeighborsRegressor()
X = final.loc[:,'hosp_dist':'Daily_Avg_Ends']
y = final.loc[:,'Daily_Avg_Starts']

In [101]:
#Split Data
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.30, random_state = 0)

#Normalize Data - Scale the Features
scaler = MinMaxScaler(feature_range=(0,1))

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [102]:
#Fit the training data
knn.fit(X_train_scaled,y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='uniform')

In [103]:
#Test model with test data
y_pred = knn.predict(X_test_scaled)
print(y_pred)

[ 20.39666667  11.87666667  12.03333333  67.65666667  89.18333333
   8.54        28.05333333  12.52333333  37.44         3.99333333
  71.66666667  60.39666667  54.81666667  23.56666667  15.96
  34.53333333  25.30333333  21.28        35.55         3.46
  17.84       138.90333333   7.08333333  28.17333333 111.74666667
  44.51666667  22.45666667  44.80666667   6.21333333  10.80333333
  14.64666667  23.91333333  52.02        49.52333333  20.39666667
  20.39666667   2.33333333  13.71        18.65        18.96
  28.36333333  12.87        27.85        28.79666667 151.07333333
  36.56666667  22.69        36.11333333  32.19666667   8.84
  22.91333333 110.88         4.10333333  53.13333333   8.77666667
  27.85        85.02666667   7.89         6.03        72.58333333
 119.25666667  14.44666667   8.88333333   9.09333333 120.94
   6.19666667  10.39666667  35.07333333  33.58333333  46.33
  83.07        31.1         26.7         81.50666667  11.
   9.64333333  65.76666667 127.57        57.42       1

In [89]:
#Determine accuracy of models with RMSE
mean_squared_error(y_test,y_pred)

113.89153789386403

In [123]:
#Return the top 5 busiest stations based off the training set
final_info = final.loc[:,'stat_id':'nearest_hosp']
results = pd.DataFrame(X_test)
results['Predicted_Daily_Avg_Starts'] = list(y_pred)
results = results.join(final_info,how = 'left').sort_values(by = 'Predicted_Daily_Avg_Starts',ascending = False)
plot_results = results.head()

In [124]:
#Show on map
m2 = folium.Map(location = [40.7128,-74.0060])

top_stations = zip(plot_results['station_lat'], plot_results['station_long'])


for location in top_stations:
    folium.CircleMarker(location=location, 
        color='green',   radius=4).add_to(m2)
m2