In [170]:
import pandas as pd
import numpy as np
import haversine as hv
from sklearn.metrics.pairwise import haversine_distances
from math import radians
import math
import folium
from folium import plugins
from folium.plugins import HeatMap
import seaborn as sns
%matplotlib inline
from haversine import haversine, Unit

In [8]:
# Einlesen der Daten
la = pd.read_csv("la_2019.csv", parse_dates=["start_time","end_time"], low_memory=False)

In [9]:
# Überblick über Daten
print(la.head())
print(la.tail())

           start_time            end_time  start_station_id  end_station_id  \
0 2019-01-01 00:07:00 2019-01-01 00:14:00              3046            3051   
1 2019-01-01 00:08:00 2019-01-01 00:14:00              3046            3051   
2 2019-01-01 00:18:00 2019-01-01 00:50:00              3030            3075   
3 2019-01-01 00:20:00 2019-01-01 00:50:00              3030            3075   
4 2019-01-01 00:22:00 2019-01-01 00:50:00              3030            3075   

  bike_id user_type start_station_name end_station_name  
0   06468   Walk-up         2nd & Hill   7th & Broadway  
1   12311   Walk-up         2nd & Hill   7th & Broadway  
2   05992   Walk-up         Main & 1st   Broadway & 9th  
3   05860   Walk-up         Main & 1st   Broadway & 9th  
4   06006   Walk-up         Main & 1st   Broadway & 9th  
                start_time            end_time  start_station_id  \
290337 2019-12-31 23:35:51 2020-01-02 17:13:50              4491   
290338 2019-12-31 23:41:52 2019-12-31 23:

In [10]:
print(la.count())
print(la.info())

start_time            290342
end_time              290342
start_station_id      290342
end_station_id        290342
bike_id               290342
user_type             290342
start_station_name    290342
end_station_name      290342
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290342 entries, 0 to 290341
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   start_time          290342 non-null  datetime64[ns]
 1   end_time            290342 non-null  datetime64[ns]
 2   start_station_id    290342 non-null  int64         
 3   end_station_id      290342 non-null  int64         
 4   bike_id             290342 non-null  object        
 5   user_type           290342 non-null  object        
 6   start_station_name  290342 non-null  object        
 7   end_station_name    290342 non-null  object        
dtypes: datetime64[ns](2), int64(2), object(4)
memory usage: 17.7+ MB
None


In [11]:
#Überprüfen, ob es null-Values gibt
la.isnull()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
290337,False,False,False,False,False,False,False,False
290338,False,False,False,False,False,False,False,False
290339,False,False,False,False,False,False,False,False
290340,False,False,False,False,False,False,False,False


In [12]:
la.dropna(axis=0, inplace=True)
la

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name
0,2019-01-01 00:07:00,2019-01-01 00:14:00,3046,3051,06468,Walk-up,2nd & Hill,7th & Broadway
1,2019-01-01 00:08:00,2019-01-01 00:14:00,3046,3051,12311,Walk-up,2nd & Hill,7th & Broadway
2,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,05992,Walk-up,Main & 1st,Broadway & 9th
3,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,05860,Walk-up,Main & 1st,Broadway & 9th
4,2019-01-01 00:22:00,2019-01-01 00:50:00,3030,3075,06006,Walk-up,Main & 1st,Broadway & 9th
...,...,...,...,...,...,...,...,...
290337,2019-12-31 23:35:51,2020-01-02 17:13:50,4491,4491,5903,Monthly Pass,Main & Winston,Main & Winston
290338,2019-12-31 23:41:52,2019-12-31 23:50:58,4491,3022,18912,Monthly Pass,Main & Winston,3rd & Santa Fe
290339,2019-12-31 23:43:19,2019-12-31 23:47:41,3051,3064,12298,Annual Pass,7th & Broadway,Grand & 8th
290340,2019-12-31 23:48:17,2019-12-31 23:53:55,3064,3074,19053,Annual Pass,Grand & 8th,Hope & Olympic


In [13]:
# no missing values 

len(la)-len(la.dropna())

0

In [14]:
la.describe()

Unnamed: 0,start_station_id,end_station_id
count,290342.0,290342.0
mean,3419.887526,3414.230294
std,585.993606,583.676484
min,3000.0,3000.0
25%,3028.0,3027.0
50%,3051.0,3051.0
75%,4245.0,4227.0
max,4493.0,4493.0


In [15]:
# hinzufügen der Dauer der Fahrt
la["duration"] = (la["end_time"] - la["start_time"]).astype("timedelta64[m]")
la["user_type"].size



290342

In [16]:
#remove data with same start and end station, that are shorter or equal to one minutes
print(la[((la["start_station_name"] == la["end_station_name"]) & (la["duration"] <= 1.0))].size)
filtered_data = la[ ~((la["start_station_name"] == la["end_station_name"]) & (la["duration"] <= 1.0)) ]



53325


In [17]:
#remove data with same start and end station, that are longer or equal to 24h
print(la[(la["duration"] > 1440.0)].size)
filtered_data_2 = la[ ~ (la["duration"] > 1440.0) ]

11520


In [18]:
la["user_type"].unique()

array(['Walk-up', 'Monthly Pass', 'Annual Pass', 'One Day Pass',
       'Flex Pass', 'Testing'], dtype=object)

# Add Cordinates to Data

In [21]:
# new data set with station information (locations)
# https://gbfs.bcycle.com/bcycle_lametro/station_information.json
coordinates = pd.read_csv("station_information.csv")
# drop unnecessary columns
coordinates = coordinates.drop(columns=['ttl', 'data__stations__rental_uris__ios','data__stations__rental_uris__android','version','last_updated'])
coordinates.head(3)

Unnamed: 0,data__stations__lon,data__stations__lat,data__stations___bcycle_station_type,data__stations__region_id,data__stations__address,data__stations__name,data__stations__station_id
0,-118.25854,34.0485,Kiosk and Station,bcycle_lametro_region_1,Reinstalled 6/15,7th & Flower,bcycle_lametro_3005
1,-118.25667,34.04554,Kiosk and Station,bcycle_lametro_region_1,729 S Olive Street,Olive & 8th,bcycle_lametro_3006
2,-118.25459,34.05048,Kiosk and Station,bcycle_lametro_region_1,557 S 5th Street,5th & Grand,bcycle_lametro_3007


In [22]:
# adding coordinates to stations
coordinates["ziped_coords_start"] = list(zip(coordinates["data__stations__lat"],coordinates["data__stations__lon"]))
la = la.merge(coordinates["ziped_coords_start"].to_frame(),how = "left",left_on=la["start_station_name"],right_on=coordinates["data__stations__name"])
la = la.drop(columns=["key_0"])
# dropping stations without location data
la = la.dropna(axis=0)
la.head(3)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,ziped_coords_start
0,2019-01-01 00:07:00,2019-01-01 00:14:00,3046,3051,6468,Walk-up,2nd & Hill,7th & Broadway,7.0,"(34.05287, -118.24749)"
1,2019-01-01 00:08:00,2019-01-01 00:14:00,3046,3051,12311,Walk-up,2nd & Hill,7th & Broadway,6.0,"(34.05287, -118.24749)"
2,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,32.0,"(34.05194, -118.24353)"


In [23]:
coordinates["ziped_coords_end"] = list(zip(coordinates["data__stations__lat"],coordinates["data__stations__lon"]))
la = la.merge(coordinates["ziped_coords_end"].to_frame(),how = "left",left_on=la["end_station_name"],right_on=coordinates["data__stations__name"])
la = la.drop(columns=["key_0"])

la = la.dropna(axis=0)
la.head(3)

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,ziped_coords_start,ziped_coords_end
2,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,32.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"
3,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,5860,Walk-up,Main & 1st,Broadway & 9th,30.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"
4,2019-01-01 00:22:00,2019-01-01 00:50:00,3030,3075,6006,Walk-up,Main & 1st,Broadway & 9th,28.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"


# Make datasets for User Types and map

In [24]:
la["user_type"].unique()

array(['Walk-up', 'Monthly Pass', 'Annual Pass', 'One Day Pass',
       'Flex Pass', 'Testing'], dtype=object)

In [25]:
wu = la[la["user_type"]=="Walk-up"]
print(wu["user_type"].size)
mp = la[la["user_type"]=="Monthly Pass"]
print(mp["user_type"].size)
ap = la[la["user_type"]=="Annual Pass"]
print(ap["user_type"].size)
odp = la[la["user_type"]=="One Day Pass"]
print(odp["user_type"].size)
fp = la[la["user_type"]=="Flex Pass"]
print(fp["user_type"].size)
test = la[la["user_type"]=="Testing"]
print(test["user_type"].size)

59201
144482
17489
9289
270
33


In [26]:
la_cleaned = la

In [27]:
ids = la["bike_id"].unique()
print(ids.size)

3076


In [28]:
la.head()

Unnamed: 0,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,ziped_coords_start,ziped_coords_end
2,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,5992,Walk-up,Main & 1st,Broadway & 9th,32.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"
3,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,5860,Walk-up,Main & 1st,Broadway & 9th,30.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"
4,2019-01-01 00:22:00,2019-01-01 00:50:00,3030,3075,6006,Walk-up,Main & 1st,Broadway & 9th,28.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"
5,2019-01-01 00:23:00,2019-01-01 00:51:00,3030,3075,6304,Walk-up,Main & 1st,Broadway & 9th,28.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"
6,2019-01-01 00:24:00,2019-01-01 00:51:00,3030,3075,5846,Walk-up,Main & 1st,Broadway & 9th,27.0,"(34.05194, -118.24353)","(34.04211, -118.25619)"


In [29]:
log_map_start = folium.Map(location=(34.052235, -118.243683),tiles='OpenStreetMap',zoom_start=11, control_scale=True, max_zoom=20)
for index, row in wu.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_start"], popup=row["start_station_name"], color="green", fill_color="green",weight=2).add_to(log_map_start)
for index, row in mp.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_start"], popup=row["start_station_name"], color="blue", fill_color="blue",weight=2).add_to(log_map_start)    
for index, row in odp.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_start"], popup=row["start_station_name"], color="red", fill_color="red",weight=2).add_to(log_map_start)
for index, row in fp.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_start"], popup=row["start_station_name"], color="yellow", fill_color="yellow",weight=2).add_to(log_map_start)    
for index, row in ap.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_start"], popup=row["start_station_name"], color="orange", fill_color="orange",weight=2).add_to(log_map_start)
log_map_start

In [39]:
log_map_end = folium.Map(location=(34.052235, -118.243683),tiles='OpenStreetMap',zoom_start=11, control_scale=True, max_zoom=20)
for index, row in wu.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_end"], popup=row["end_station_name"], color="green", fill_color="green",weight=2).add_to(log_map_end)
for index, row in mp.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_end"], popup=row["end_station_name"], color="blue", fill_color="blue",weight=2).add_to(log_map_end)    
for index, row in odp.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_end"], popup=row["end_station_name"], color="red", fill_color="red",weight=2).add_to(log_map_end)
for index, row in fp.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_end"], popup=row["end_station_name"], color="yellow", fill_color="yellow",weight=2).add_to(log_map_end)    
for index, row in ap.iloc[:50].iterrows():
    folium.CircleMarker(radius=5, location=row["ziped_coords_end"], popup=row["end_station_name"], color="orange", fill_color="orange",weight=2).add_to(log_map_end)
log_map_end

# Clustering

In [115]:
#distance
la["lat1"], la["lon1"] = list(zip(*la["ziped_coords_start"]))
la["lat2"], la["lon2"] = list(zip(*la["ziped_coords_end"]))
lat1 = la["lat1"].tolist()
lat2 = la["lat2"].tolist()
lon1 = la["lon1"].tolist()
lon2 = la["lon2"].tolist()
listsize = len(Lat1List)

In [147]:
distance = []
radius = 6371  # km
# notice that if you do listsize, the loop will fail in the last element
for n in range(listsize):
     
    dlat = math.radians(lat2[n] - lat1[n])
    dlon = math.radians(lon2[n] - lon1[n])
    a = (math.sin(dlat / 2) * math.sin(dlat / 2) +
         math.cos(math.radians(lat1[n])) * math.cos(math.radians(lat2[n])) *
         math.sin(dlon / 2) * math.sin(dlon / 2))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    d = radius * c
    distance.append(round(d,2))
distance = list(distance)

In [161]:
print(len(distance))
print(len(la))

230764
230764


In [171]:
la.insert(loc=0, column='distance', value=distance)

In [174]:
la

Unnamed: 0,distance,start_time,end_time,start_station_id,end_station_id,bike_id,user_type,start_station_name,end_station_name,duration,ziped_coords_start,ziped_coords_end,lat1,lon1,lat2,lon2,start_dist_centre,end_dist_centre
2,1.60,2019-01-01 00:18:00,2019-01-01 00:50:00,3030,3075,05992,Walk-up,Main & 1st,Broadway & 9th,32.0,"(34.05194, -118.24353)","(34.04211, -118.25619)",34.05194,-118.24353,34.04211,-118.25619,0.035703,1.611013
3,1.60,2019-01-01 00:20:00,2019-01-01 00:50:00,3030,3075,05860,Walk-up,Main & 1st,Broadway & 9th,30.0,"(34.05194, -118.24353)","(34.04211, -118.25619)",34.05194,-118.24353,34.04211,-118.25619,0.035703,1.611013
4,1.60,2019-01-01 00:22:00,2019-01-01 00:50:00,3030,3075,06006,Walk-up,Main & 1st,Broadway & 9th,28.0,"(34.05194, -118.24353)","(34.04211, -118.25619)",34.05194,-118.24353,34.04211,-118.25619,0.035703,1.611013
5,1.60,2019-01-01 00:23:00,2019-01-01 00:51:00,3030,3075,06304,Walk-up,Main & 1st,Broadway & 9th,28.0,"(34.05194, -118.24353)","(34.04211, -118.25619)",34.05194,-118.24353,34.04211,-118.25619,0.035703,1.611013
6,1.60,2019-01-01 00:24:00,2019-01-01 00:51:00,3030,3075,05846,Walk-up,Main & 1st,Broadway & 9th,27.0,"(34.05194, -118.24353)","(34.04211, -118.25619)",34.05194,-118.24353,34.04211,-118.25619,0.035703,1.611013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255636,0.60,2019-12-31 23:34:46,2019-12-31 23:42:28,3063,3069,12019,Walk-up,Pershing Square,Broadway & 3rd,7.0,"(34.04804, -118.25374)","(34.05088, -118.24825)",34.04804,-118.25374,34.05088,-118.24825,1.037348,0.446916
255637,0.00,2019-12-31 23:35:51,2020-01-02 17:13:50,4491,4491,5903,Monthly Pass,Main & Winston,Main & Winston,2497.0,"(34.04744, -118.24794)","(34.04744, -118.24794)",34.04744,-118.24794,34.04744,-118.24794,0.661892,0.661892
255638,1.38,2019-12-31 23:41:52,2019-12-31 23:50:58,4491,3022,18912,Monthly Pass,Main & Winston,3rd & Santa Fe,9.0,"(34.04744, -118.24794)","(34.04607, -118.23309)",34.04744,-118.24794,34.04607,-118.23309,0.661892,1.192647
255639,0.40,2019-12-31 23:48:17,2019-12-31 23:53:55,3064,3074,19053,Annual Pass,Grand & 8th,Hope & Olympic,5.0,"(34.04613, -118.25759)","(34.04417, -118.26117)",34.04613,-118.25759,34.04417,-118.26117,1.449995,1.843891
