In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium
import geopandas

from datetime import time
from scipy import sparse

from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans

In [2]:
myMap = folium.Map(location=[42.36, -71.11], zoom_start=15)
myMap

In [3]:
stations = pd.read_csv("hubway_2011_07_through_2013_11/hubway_stations.csv")
for i in stations.index:
    lon = stations.iloc[i]['lng']
    lat = stations.iloc[i]['lat']
    folium.Circle(location = [lat, lon], radius = 20, popup = stations.iloc[i]['station'], tooltip = stations.iloc[i]['municipal']).add_to(myMap)

In [4]:
stations.head()

Unnamed: 0,id,terminal,station,municipal,lat,lng,status
0,3,B32006,Colleges of the Fenway,Boston,42.340021,-71.100812,Existing
1,4,C32000,Tremont St. at Berkeley St.,Boston,42.345392,-71.069616,Existing
2,5,B32012,Northeastern U / North Parking Lot,Boston,42.341814,-71.090179,Existing
3,6,D32000,Cambridge St. at Joy St.,Boston,42.361285,-71.06514,Existing
4,7,A32000,Fan Pier,Boston,42.353412,-71.044624,Existing


In [5]:
myMap

In [6]:
trips = pd.read_csv("hubway_2011_07_through_2013_11/hubway_trips.csv")
trips['s_date'] = pd.to_datetime(trips['start_date'])
trips['sDate'] = trips['s_date'].dt.date
trips['sTime'] = trips['s_date'].dt.time

In [7]:
trips.head()

Unnamed: 0,seq_id,hubway_id,status,duration,start_date,strt_statn,end_date,end_statn,bike_nr,subsc_type,zip_code,birth_date,gender,s_date,sDate,sTime
0,1,8,Closed,9,7/28/2011 10:12:00,23.0,7/28/2011 10:12:00,23.0,B00468,Registered,'97217,1976.0,Male,2011-07-28 10:12:00,2011-07-28,10:12:00
1,2,9,Closed,220,7/28/2011 10:21:00,23.0,7/28/2011 10:25:00,23.0,B00554,Registered,'02215,1966.0,Male,2011-07-28 10:21:00,2011-07-28,10:21:00
2,3,10,Closed,56,7/28/2011 10:33:00,23.0,7/28/2011 10:34:00,23.0,B00456,Registered,'02108,1943.0,Male,2011-07-28 10:33:00,2011-07-28,10:33:00
3,4,11,Closed,64,7/28/2011 10:35:00,23.0,7/28/2011 10:36:00,23.0,B00554,Registered,'02116,1981.0,Female,2011-07-28 10:35:00,2011-07-28,10:35:00
4,5,12,Closed,12,7/28/2011 10:37:00,23.0,7/28/2011 10:37:00,23.0,B00554,Registered,'97214,1983.0,Female,2011-07-28 10:37:00,2011-07-28,10:37:00


In [8]:
#Filtering Origin-Distination on hourly base
time_min = time(0, 0, 0)
hourly_trips = []
for hour in range(1, 24):
    time_max = time(hour, 0, 0)
    period = (trips['sTime'] >= time_min) & (trips['sTime'] < time_max)
    trips_during_period = trips.loc[period]
    hourly_trips.append(trips_during_period)
    time_min = time_max

In [9]:
#Aggregate Origin-Destination occuring between 5:00 am and 10:00 am and preprocessing
frames = [hourly_trips[5], hourly_trips[6], hourly_trips[7], hourly_trips[8], hourly_trips[9]]
earlier_morning_trips = pd.concat(frames)
X = earlier_morning_trips[["strt_statn", "end_statn", "duration"]]
imp = SimpleImputer(strategy='constant', fill_value=-1)
X = imp.fit_transform(X)

In [10]:
#Getting adress of stations involving Origin-Destination occuring between 5:00 am and 10:00 am
class StationInfos:
    def __init__(self, lat, lng):
        self.lat = lat
        self.lng = lng
class Trip:
    def __init__(self, start_station, end_station, duration):
        self.start_station = start_station
        self.end_station = end_station
        self.duration = duration
stations.set_index('id')
Xdf_station_datas = pd.DataFrame(columns=["start_lat", "start_lng", "end_lat", "end_lng"])
Xdf_trips = pd.DataFrame(columns=["start_lat", "start_lng", "end_lat", "end_lng", "duration"])
df_index = 0
for i in range(len(X[:, 0])):
    for j in stations.index:
        if j == int(X[i, 0]):
            for k in stations.index:
                if k == int(X[i, 1]):
                    startStation = StationInfos(stations.loc[int(X[i, 0]), 'lat'], stations.loc[int(X[i, 0]), 'lng'])
                    endStation = StationInfos(stations.loc[int(X[i, 1]), 'lat'], stations.loc[int(X[i, 1]), 'lng'])
                    trip = Trip(startStation, endStation, X[i, 2])
                    
                    dataFrame = pd.DataFrame({'start_lat': [trip.start_station.lat],
                                             'start_lng': [trip.start_station.lng],
                                             'end_lat': [trip.end_station.lat],
                                             'end_lng': [trip.end_station.lng],
                                             'duration': [trip.duration]},
                                            index=[df_index])
                    
                    Xdf_trips = Xdf_trips.append(dataFrame)
                    
                    df_index = df_index + 1
                    break
            break

In [11]:
Xdf_trips.head()

Unnamed: 0,start_lat,start_lng,end_lat,end_lng,duration
0,42.352096,-71.070378,42.360583,-71.056868,25720.0
1,42.341332,-71.076847,42.341332,-71.076847,1619.0
2,42.341332,-71.076847,42.341332,-71.076847,1646.0
3,42.344763,-71.09788,42.354979,-71.063348,822.0
4,42.347527,-71.105828,42.347433,-71.076163,630.0


In [12]:
folium.features.ColorLine(
        [[42.352096, -71.070378], [42.360583, -71.056868]],
        [0, 1, 2, 3],
        colormap=['b', 'g', 'y', 'r'],
        nb_steps=4,
        weight=0,
        opacity=0
).add_to(myMap)
myMap
#gdf = geopandas.GeoDataFrame(stations, geometry=geopandas.points_from_xy(stations.lng, stations.lat))
#gdf.head()

In [11]:
#Preprocessing of datas of type category
#np_station = np.array(stations['station'])
#np_municipal = np.array(stations['municipal'])
#enc = preprocessing.OneHotEncoder(categories=[np_station, np_municipal, np_station, np_municipal])
#enc.fit(np.array(od_station_datas))
#X = enc.transform(np.array(od_station_datas))

In [13]:
#Clustering of Origin-Destination
kmeans = KMeans(n_clusters=50, random_state=0).fit(np.array(Xdf_trips))
kmeans.labels_

array([26, 45, 45, ..., 38,  0, 14], dtype=int32)

In [14]:
kmeans.cluster_centers_

array([[ 4.23533740e+01, -7.10822396e+01,  4.23553314e+01,
        -7.10814495e+01,  1.92399914e+02],
       [ 4.23509890e+01, -7.10736440e+01,  4.23521750e+01,
        -7.10555470e+01,  3.13728000e+06],
       [ 4.23413320e+01, -7.10768470e+01,  4.23520960e+01,
        -7.10703780e+01,  5.35108300e+06],
       [ 4.23589034e+01, -7.10839362e+01,  4.23505912e+01,
        -7.10913998e+01,  1.99439800e+05],
       [ 4.23402990e+01, -7.11099270e+01,  4.23440230e+01,
        -7.10570540e+01,  2.36222000e+06],
       [ 4.23204940e+01, -7.10511220e+01,  4.23834050e+01,
        -7.11075930e+01,  9.16140000e+05],
       [ 4.23546060e+01, -7.10817009e+01,  4.23555841e+01,
        -7.10799466e+01,  2.94853538e+04],
       [ 4.23507012e+01, -7.10941090e+01,  4.23524026e+01,
        -7.10971772e+01,  1.06786600e+05],
       [ 4.23340730e+01, -7.11052210e+01,  4.23474330e+01,
        -7.10761630e+01,  4.57909000e+05],
       [ 4.23537861e+01, -7.10815217e+01,  4.23549519e+01,
        -7.10819537e+01

In [15]:
from matplotlib import colors as mcolors
colors = dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS)
by_hsv = sorted((tuple(mcolors.rgb_to_hsv(mcolors.to_rgba(color)[:3])), name)
                for name, color in colors.items())
color_names = [name for hsv, name in by_hsv]

myMap = folium.Map(location=[42.36, -71.11], zoom_start=15)
for i in stations.index:
    lon = stations.iloc[i]['lng']
    lat = stations.iloc[i]['lat']
    folium.Circle(location = [lat, lon], radius = 20, popup = stations.iloc[i]['station'], tooltip = stations.iloc[i]['municipal']).add_to(myMap)

i = 0
for j in range(len(kmeans.cluster_centers_[:, 0])):
    start_lat = kmeans.cluster_centers_[j,0]
    start_lng = kmeans.cluster_centers_[j,1]
    end_lat = kmeans.cluster_centers_[j,2]
    end_lng = kmeans.cluster_centers_[j,3]
    folium.Marker(
        location = [start_lat, start_lng],
        popup = 'Start station ' + str(j),
        icon = folium.Icon(color="green")
    ).add_to(myMap)
    folium.Marker(
        location = [end_lat, end_lng],
        popup = 'End station ' + str(j),
        icon = folium.Icon(color="red")
    ).add_to(myMap)
    #if i < 50:
    #    i = i + 2
myMap

In [15]:
#myMap = folium.Map(location=[42.36, -71.11], zoom_start=15)
#for i in stations.index:
#    lon = stations.iloc[i]['lng']
#    lat = stations.iloc[i]['lat']
#    folium.Circle(location = [lat, lon], radius = 20, popup = stations.iloc[i]['station'], tooltip = stations.iloc[i]['municipal']).add_to(myMap)

od_clusters = []
temp_cluster = []
for j in range(50):
    for i in range(len(kmeans.labels_[:])):
        if j == kmeans.labels_[i]:
            temp_cluster.append(i)
    od_clusters.append(temp_cluster)
    temp_cluster = []        

In [17]:
cluster0 = np.array(od_clusters[0])
cluster0
#for i in range(len(cluster0[:]))

#Xdf_trips.iloc[]

array([     4,     44,     46, ..., 313220, 313231, 313235])

In [19]:
len(od_clusters)

50