In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta, date, time
import pickle

import folium
from folium import plugins
from folium.plugins import HeatMap

from haversine import haversine

In [2]:
#run in Anaconda Prompt
#conda install -c conda-forge folium 
#conda install -c conda-forge haversine 

In [3]:
# Read the pickled trip dataframes.
# The context managers guarantee that each file handle is closed even when
# unpickling raises, which the manual open()/close() pairs did not.
# NOTE(security): pickle.load can execute arbitrary code while loading, so
# only point these paths at files produced by a trusted source.
with open(r'../data/LA_geo', 'rb') as infile:
    df_LA_geo = pickle.load(infile)

with open(r'../data/LA', 'rb') as infile:
    df_LA = pickle.load(infile)

In [4]:
#df_LA = pd.read_csv('../data/LA.csv') 
#df_LA_geo = pd.read_csv('../data/LA_geo.csv') 

In [5]:
# Preview the first two rows of the geo-enriched trip data.
df_LA_geo.head(2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,trip_duration,trip_duration_in_hours,start_lat,start_lon,end_lat,end_lon
0,2018-01-01 00:04:00,2018-01-01 00:25:00,3063,3018,5889,Walk-up,Pershing Square,Grand & Olympic,0 days 00:21:00,0.35,34.048326,-118.253513,34.043732,-118.260139
1,2018-01-01 00:05:00,2018-01-01 00:25:00,3063,3018,6311,Walk-up,Pershing Square,Grand & Olympic,0 days 00:20:00,0.333333,34.048326,-118.253513,34.043732,-118.260139


**Prepare the data**

In [6]:
# Define the LA centre as numeric [lat, lon].
# Floats (instead of the former strings) let the value be used directly in
# arithmetic or distance computations as well as a folium map location.
center = np.array([34.053691, -118.242766])

# Prepare dataframes: one row per distinct (name, lat, lon) start station.
LA_stations = (
    df_LA_geo[['start_station_name', 'start_lat', 'start_lon']]
    .drop_duplicates()
    .reset_index(drop=True)
)
print(len(LA_stations), "stations in LA.")

# Keep only stations whose latitude AND longitude are known, then pair
# lat & lon into one [lat, lon] list per station. This is done column-wise
# rather than with a row-by-row iterrows() loop.
stations_with_coords = LA_stations.dropna(subset=['start_lat', 'start_lon'])
namelist = stations_with_coords['start_station_name'].tolist()
coordlist = stations_with_coords[['start_lat', 'start_lon']].values.tolist()

df_LA_stations = pd.DataFrame(data={'station_name': namelist, 'coord': coordlist})
df_LA_stations.head(2)

132 stations in LA.


Unnamed: 0,station_name,coord
0,Pershing Square,"[34.04832571326165, -118.25351303799283]"
1,Grand & Olympic,"[34.043732, -118.260139]"


**Add station ids to "df_LA_stations"**

In [7]:
# Look up the id(s) recorded for each start station name.
df_station_id = df_LA_geo[['start_station_name', 'start_station_id']].drop_duplicates()

# Join the id onto the station table.
# When this cell is re-run, df_LA_stations already carries an 'id' column;
# dropping it first avoids ending up with two 'id' columns after the rename.
# errors='ignore' makes the drop a no-op on the first run.
df_merged = (
    df_LA_stations
    .drop(columns='id', errors='ignore')
    .merge(df_station_id, how='left', left_on='station_name', right_on='start_station_name')
    .rename(columns={'start_station_id': 'id'})
)[['station_name', 'coord', 'id']]

# Independent copy so later changes to df_LA_stations do not touch df_merged.
df_LA_stations = df_merged.copy()
df_LA_stations.head(2)

Unnamed: 0,station_name,coord,id
0,Pershing Square,"[34.04832571326165, -118.25351303799283]",3063
1,Grand & Olympic,"[34.043732, -118.260139]",3018


**Add origin and destination column to df_trips**

In [8]:
# Add origin and destination columns to df_trips.
# dropna() returns a new frame, so df_LA_geo itself is left untouched.
df_trips = df_LA_geo.dropna()

# Build the [lat, lon] pairs column-wise instead of calling a Python lambda
# once per row (apply with axis=1). Each Series reuses df_trips' index so the
# pairs stay attached to their own rows after the dropna above.
df_trips = df_trips.assign(
    orig=pd.Series(
        df_trips[['start_lat', 'start_lon']].values.tolist(),
        index=df_trips.index,
        dtype=object,
    ),
    dest=pd.Series(
        df_trips[['end_lat', 'end_lon']].values.tolist(),
        index=df_trips.index,
        dtype=object,
    ),
)
df_trips.head(2)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,trip_duration,trip_duration_in_hours,start_lat,start_lon,end_lat,end_lon,orig,dest
0,2018-01-01 00:04:00,2018-01-01 00:25:00,3063,3018,5889,Walk-up,Pershing Square,Grand & Olympic,0 days 00:21:00,0.35,34.048326,-118.253513,34.043732,-118.260139,"[34.04832571326165, -118.25351303799283]","[34.043732, -118.260139]"
1,2018-01-01 00:05:00,2018-01-01 00:25:00,3063,3018,6311,Walk-up,Pershing Square,Grand & Olympic,0 days 00:20:00,0.333333,34.048326,-118.253513,34.043732,-118.260139,"[34.04832571326165, -118.25351303799283]","[34.043732, -118.260139]"


**Frequently used stations**

Draw a bigger circle for more frequently used stations

In [9]:
# Get info about the most frequently used bike stations.
# Count how often each station appears as a trip start and as a trip end.
# The column names are kept as before: 'start_station_name' holds the number
# of departures and 'end_station_id' the number of arrivals.
df_start_station = pd.DataFrame(df_LA_geo.groupby('start_station_id')['start_station_name'].count())
df_end_station = pd.DataFrame(df_LA_geo.groupby('end_station_id')['end_station_id'].count())

# Combine both counts per station id.
# An outer join keeps stations that only ever appear on one side; their
# missing count becomes 0 so total_count is never NaN. The former left join
# silently dropped stations that were only used as a destination.
df_joined = (
    df_start_station
    .join(df_end_station, how='outer')
    .fillna(0)
    .astype(int)
    .rename_axis('start_station_id')  # keep the index label used so far
)
df_joined = df_joined.assign(total_count=df_joined['start_station_name'] + df_joined['end_station_id'])

# Keep the five busiest stations.
df_joined = df_joined.sort_values(by='total_count', ascending=False).head(5)
df_joined

Unnamed: 0_level_0,start_station_name,end_station_id,total_count
start_station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4214,16293,13373,29666
3005,12175,13252,25427
4215,8525,15669,24194
4210,11391,9907,21298
3014,9084,9787,18871


**Bike Stations**

In [10]:
# define new map function
def bikestation_map(location, df_stations):
    """Draw one circle marker per bike station on a folium map.

    Parameters
    ----------
    location : sequence of two numbers
        [lat, lon] the map is centred on.
    df_stations : DataFrame-like
        Must provide a "coord" column of [lat, lon] pairs. When a
        "station_name" column is present, each marker's popup shows that
        station's name; otherwise the markers get no popup.

    Returns
    -------
    folium.Map
        The map with all station markers added.
    """
    station_map = folium.Map(
        location=location,
        tiles='OpenStreetMap',
        zoom_start=11,
        control_scale=True,
        max_zoom=20)

    coords = df_stations["coord"]
    if "station_name" in df_stations:
        names = df_stations["station_name"]
    else:
        names = [None] * len(coords)

    # add station point map
    for coord, name in zip(coords, names):
        # parse_html=True escapes characters such as '&' in station names.
        popup = None if name is None else folium.Popup(str(name), parse_html=True)
        folium.CircleMarker(
            radius=5,
            location=coord,
            popup=popup,
            color='crimson',
            # `alpha` is not a documented path option; fill_opacity states the
            # apparent intent explicitly (0.2 is also the library default).
            fill_opacity=0.2,
            fill_color='crimson'
        ).add_to(station_map)
    return station_map

In [11]:
# Draw a marker for every entry of df_LA_stations on a map centred on `center`.
bikestation_map(center, df_LA_stations)

**Heatmap to find the best coverage (highest density of bike stations) in the city**

In [12]:
def heat_map(df, station_id=None):
    """Build a heat map of station density.

    The map is centred on the notebook-level `center`.

    Parameters
    ----------
    df : DataFrame-like
        Must provide a 'coord' column of [lat, lon] pairs; every pair
        contributes one point to the heat layer.
    station_id : optional
        Not used by this function. Kept, now with a default, so existing
        calls that pass it keep working.

    Returns
    -------
    folium.Map
        The map with the heat layer added.
    """
    # define new map
    density_map = folium.Map(
        location=center,
        tiles='OpenStreetMap',
        zoom_start=11,
        control_scale=True,
        max_zoom=20)
    # add heat map; a plain list of points is the documented input form
    density_map.add_child(
        plugins.HeatMap(list(df['coord']), radius=20))
    #heat map for destination points looks pretty much identical
    return density_map

In [13]:
# Heat map over all station coordinates. The id array is passed as the second
# argument, which heat_map accepts but does not read.
heat_map(df_LA_stations, np.array(df_LA_stations['id']))

**Get location of top 5 frequently used stations by their id**

In [14]:
# define new map function
def getLocation(station_id):
    """Mark the given stations on a map centred on the notebook-level `center`.

    Parameters
    ----------
    station_id : iterable
        Station ids to look up in the 'id' column of the notebook-level
        `df_LA_stations`. For an id with several rows, the first row is used.

    Returns
    -------
    folium.Map
        The map with one marker per requested id; each popup shows the
        station name.

    Raises
    ------
    IndexError
        If an id is not present in `df_LA_stations`.
    """
    location_map = folium.Map(
        location=center,
        tiles='OpenStreetMap',
        zoom_start=11,
        control_scale=True,
        max_zoom=20)
    # add station point map
    for station in station_id:
        matches = df_LA_stations.loc[df_LA_stations['id'] == station]
        if matches.empty:
            # Same exception type as the former `.values[0]` lookup, but with
            # a message that names the offending id.
            raise IndexError(
                "Station id {!r} not found in df_LA_stations".format(station))
        first_match = matches.iloc[0]
        folium.CircleMarker(
            radius=5,
            location=first_match['coord'],
            # parse_html=True escapes characters such as '&' in station names.
            popup=folium.Popup(str(first_match['station_name']), parse_html=True),
            color='crimson',
            # `alpha` is not a documented path option; fill_opacity states the
            # apparent intent explicitly (0.2 is also the library default).
            fill_opacity=0.2,
            fill_color='crimson'
        ).add_to(location_map)
    return location_map

In [15]:
#top 5 stations (ids hard-coded; they are the index values of the df_joined top-5 table)
getLocation([4214, 3005, 4215, 4210, 3014])

**Trip Movement**

In [16]:
def _add_trip_endpoint(target_map, location, label, color):
    """Add one circle marker for a trip endpoint to `target_map`."""
    folium.CircleMarker(
        radius=5,
        location=location,
        popup=label,
        color=color,
        # `alpha` is not a documented path option; fill_opacity states the
        # apparent intent explicitly (0.2 is also the library default).
        fill_opacity=0.2,
        fill_color=color).add_to(target_map)


def trips_map(origins_list, destinations_list):
    """Draw trips as origin marker, destination marker and a connecting line.

    The map is centred on the notebook-level `center`.

    Parameters
    ----------
    origins_list : iterable of [lat, lon]
        Start point of each trip (drawn in crimson, popup 'Origin').
    destinations_list : iterable of [lat, lon]
        End point of each trip (drawn in blue, popup 'Destination').
        The two iterables are paired with zip, so surplus entries of the
        longer one are ignored.

    Returns
    -------
    folium.Map
        The map with all trips added.
    """
    # define new map
    movement_map = folium.Map(
        location=center,
        tiles='OpenStreetMap',
        zoom_start=11,
        control_scale=True,
        max_zoom=20,
        height=1000)

    #draw trips
    for orig, dest in zip(origins_list, destinations_list):
        _add_trip_endpoint(movement_map, orig, 'Origin', 'crimson')
        _add_trip_endpoint(movement_map, dest, 'Destination', 'blue')
        # Trip movement
        folium.PolyLine(
            (orig, dest),
            color="gray",
            weight=1,
            opacity=1).add_to(movement_map)
    return movement_map

First slice the dataframe to reduce the number of entries

In [17]:
###########SLICING###################
# Largest number of trips that is still drawn without a runtime warning.
# Used both for the default slice and for the warning threshold below, so
# the two can no longer drift apart.
MAX_TRIPS = 1000

#first MAX_TRIPS trips
df_trips_sliced = df_trips.iloc[:MAX_TRIPS, :]

#all trips of one station (e.g. most frequently used station --> 4214)
# NOTE: station_id is only defined here; the slice above does not use it.
station_id = 4214

###########SLICING###################

#columns as list
orig_list = df_trips_sliced['orig'].tolist()
dest_list = df_trips_sliced['dest'].tolist()

#warning when you want to print too many trips
# (only fires when the slicing above is changed to select more rows)
trip_count = len(df_trips_sliced)
if trip_count > MAX_TRIPS:
    print('Runtime warning: You are going to print ' + str(trip_count) + ' trips. Recommended are trips < ' + str(MAX_TRIPS) + '.')

#trip map
trips_map(orig_list, dest_list)