In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

In [2]:
bus_stops = pd.read_csv('Capital_District_Bus_Stops_20240421.csv')

In [3]:
bus_stops.head()

Unnamed: 0,Description,Stop ID,Longitude,Latitude,Routes,Municipality,Nearest Intersection,Bikerack,Bench,Shelter,Info Booth,Signpost,Discontinued,Location
0,1160 Central Ave (Sunset Lanes),193,,,"1, 807",Colonie,UNKNOWN,No,No,No,No,Yes,No,
1,River St & 101st St,852,,,802,Troy,REGATTA PLACE,No,No,No,No,Yes,No,
2,Whitehall Rd & Picotte Dr,1194,,,"801, 803",Albany,UNKNOWN,No,No,No,No,No,No,
3,Rt 50 & 5th St,1737,-73.957117,42.832875,450,Glenville,UNKNOWN,No,No,No,No,Yes,No,"(42.832875, -73.957117)"
4,Craig St & Duane Ave,3606,,,"353, 873",Schenectady,LINCOLN AVENUE,No,No,No,No,Yes,No,


In [4]:
bus_stops_slim = bus_stops[['Stop ID', 'Routes']]
bus_stops_slim.columns = ['stop_id','routes']
bus_stops_slim.head()

Unnamed: 0,stop_id,routes
0,193,"1, 807"
1,852,802
2,1194,"801, 803"
3,1737,450
4,3606,"353, 873"


In [5]:
def split_routes(row):
    row.routes = str(row.routes).split(',')
    return row

bus_stops_slim = bus_stops_slim.apply(split_routes, axis=1)
bus_stops_slim.head()

Unnamed: 0,stop_id,routes
0,193,"[1, 807]"
1,852,[802]
2,1194,"[801, 803]"
3,1737,[450]
4,3606,"[353, 873]"


In [6]:
stop_map = {}
for index,row in bus_stops_slim.iterrows():
    for route in row.routes:
        this_route = stop_map.get(route, {})
        this_route[row.stop_id] = this_route.get(row.stop_id, 0) + 1 
        stop_map[route] = this_route



In [7]:
stop_map

{'1': {193: 1,
  190: 1,
  3937: 1,
  3488: 1,
  143: 1,
  2663: 1,
  12875: 1,
  32: 1,
  10525: 1,
  145: 1,
  211: 1,
  2509: 1,
  222: 1,
  158: 1,
  205: 1,
  3239: 1,
  156: 1,
  3250: 1,
  223: 1,
  10267: 1,
  3301: 1,
  10087: 1,
  1789: 1,
  10514: 1,
  10524: 1,
  136: 1,
  10506: 1,
  209: 1,
  12874: 1,
  3414: 1,
  203: 1,
  165: 1,
  10502: 1,
  152: 1,
  195: 1,
  139: 1,
  218: 1,
  3413: 1,
  207: 1,
  10523: 1,
  2706: 1,
  1787: 1,
  1788: 1,
  3419: 1,
  216: 1,
  10522: 1,
  164: 1,
  10503: 1,
  1357: 1,
  10515: 1,
  3500: 1,
  198: 1,
  141: 1,
  206: 1,
  148: 1,
  220: 1,
  31: 1,
  214: 1,
  3987: 1,
  189: 1,
  159: 1,
  2665: 1,
  221: 1,
  3248: 1,
  3455: 1},
 ' 807': {193: 1,
  190: 1,
  3937: 1,
  3488: 1,
  143: 1,
  2663: 1,
  12875: 1,
  32: 1,
  145: 1,
  211: 1,
  2509: 1,
  222: 1,
  158: 1,
  205: 1,
  3239: 1,
  156: 1,
  3250: 1,
  223: 1,
  10267: 1,
  3301: 1,
  10087: 1,
  1789: 1,
  10514: 1,
  136: 1,
  10506: 1,
  209: 1,
  12874: 1,
  3

In [8]:
index = stop_map.keys()
rows = [stop_map[k] for k in index]
routes_df = pd.DataFrame(rows, index=index)
routes_df = routes_df.fillna(0)
routes_df

Unnamed: 0,193,190,3937,3488,143,2663,12875,32,10525,145,...,15599,12631,12773,12630,10324,20020,11952,13015,13014,2895
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
807,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
803,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [9]:
for cluster_count in range(2,5):
    cluster_model = KMeans(n_clusters=cluster_count)
    cluster_model.fit(routes_df)
    cluster_labels = cluster_model.predict(routes_df)
    route_cluster_df = pd.DataFrame(cluster_labels, index=routes_df.index, columns=['cluster'])
    print(f'*********************** Clusters: {cluster_count}****************************')
    print(route_cluster_df['cluster'].value_counts())


*********************** Clusters: 2****************************
1    138
0      4
Name: cluster, dtype: int64
*********************** Clusters: 3****************************
2    136
0      4
1      2
Name: cluster, dtype: int64
*********************** Clusters: 4****************************
0    135
3      4
2      2
1      1
Name: cluster, dtype: int64


In [17]:
cluster_count = 3
cluster_model = KMeans(n_clusters=cluster_count)
cluster_model.fit(routes_df)
cluster_labels = cluster_model.predict(routes_df)
route_cluster_df = pd.DataFrame(cluster_labels, index=routes_df.index, columns=['cluster'])
print(f'*********************** Clusters: {cluster_count}****************************')
print(route_cluster_df['cluster'].value_counts())



*********************** Clusters: 3****************************
1    134
0      6
2      2
Name: cluster, dtype: int64


In [23]:
route_cluster_df[route_cluster_df.cluster ==0].head()


Unnamed: 0,cluster
803,0
801,0
922,0
923,0
106,0


In [24]:
route_cluster_df[route_cluster_df.cluster ==1].head()


Unnamed: 0,cluster
1,1
807,1
802,1
801,1
450,1


In [25]:
route_cluster_df[route_cluster_df.cluster ==2].head()


Unnamed: 0,cluster
806,2
805,2
