In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
import os
from datetime import datetime

In [2]:
df = pd.DataFrame()

for file in os.listdir():
  if '14' in file:  
    df2 = pd.read_csv(file)
    df = pd.concat([df,df2],axis = 0)

In [3]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


The column Date/Time must have a datetime format.

In [5]:
format_date = '%m/%d/%Y %H:%M:%S'

In [8]:
df.iloc[:,0] = df.iloc[:,0].apply(lambda x : datetime.strptime(x,format_date))

In [9]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,2014-04-01 00:11:00,40.769,-73.9549,B02512
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512


We create a new column which contains the day in week.

In [12]:
day_in_week = {0 : 'Monday',
              1 : 'Tuesday',
              2 : 'Wednesday',
              3 : 'Thursday',
              4 : 'Friday',
              5 : 'Saturday',
              6 : 'Sunday'}

In [16]:
df['day'] = df.iloc[:,0].apply(lambda x : x.weekday()).replace(day_in_week)

We create a new column which contains the hour of each trip.

In [17]:
df['hour'] = df.iloc[:,0].apply(lambda x : x.hour)

In [18]:
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,day,hour
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,Tuesday,0
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,Tuesday,0
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,Tuesday,0
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,Tuesday,0
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,Tuesday,0


We create a new column which consider intervals of three hours.

In [21]:
hour_interval = lambda x : f'{3*(x//3)}-{3*(x//3)+3}'

In [25]:
df['interval_hour'] = df.hour.apply(hour_interval)

In [26]:
df.sample(10)

Unnamed: 0,Date/Time,Lat,Lon,Base,day,hour,interval_hour
466995,2014-09-15 18:04:00,40.7795,-73.9758,B02617,Monday,18,18-21
668023,2014-09-03 21:48:00,40.7266,-73.9848,B02682,Wednesday,21,21-24
262833,2014-09-29 09:28:00,40.8155,-73.9597,B02598,Monday,9,9-12
503177,2014-09-18 16:17:00,40.7095,-73.9558,B02617,Thursday,16,15-18
415494,2014-08-14 15:09:00,40.6677,-73.998,B02617,Thursday,15,15-18
548066,2014-06-13 16:49:00,40.7529,-73.9657,B02682,Friday,16,15-18
279756,2014-04-17 21:07:00,40.676,-73.999,B02617,Thursday,21,21-24
123002,2014-04-17 18:30:00,40.7143,-73.9568,B02598,Thursday,18,18-21
706498,2014-08-17 17:02:00,40.7225,-74.0106,B02682,Sunday,17,15-18
449121,2014-08-17 05:40:00,40.6463,-73.7768,B02617,Sunday,5,3-6


We construct, for each day in week and each interval of time, 4 clusters.

In [46]:
for day in df.day.unique():
    for interval_hour in df.interval_hour.unique():
        
        df_new = df[(df.day == day)&(df.interval_hour == interval_hour)]
        labels = KMeans(n_clusters = 4).fit_predict(df_new[['Lon','Lat']])
        df.loc[(df.day == day)&(df.interval_hour == interval_hour),'labels'] = labels

In [47]:
for day in df.day.unique():
    for interval_hour in df.interval_hour.unique():
        
        labels_sorted = df[(df.day == day)&(df.interval_hour == interval_hour)]\
        ['labels'].value_counts().sort_values().index.tolist()
        df.loc[(df.day == day)&(df.interval_hour == interval_hour),'labels'] \
        = df[(df.day == day)&(df.interval_hour == interval_hour)]['labels']\
        .replace({label:new_label for new_label,label in enumerate(labels_sorted)})

In [44]:
labels_sorted = df[(df.day == 'Friday')&(df.interval_hour=='18-21')]['labels']\
.value_counts().sort_values().index.tolist()

df.loc[(df.day == 'Friday')&(df.interval_hour=='18-21'),'labels'] = \
df[(df.day == 'Friday')&(df.interval_hour=='18-21')]['labels'].replace({label:new_label\
                        for new_label,label in enumerate(labels_sorted)})

In [49]:
df[(df.day == 'Sunday')&(df.interval_hour=='18-21')]['labels']\
.value_counts()

3.0    61679
2.0     9807
1.0     6329
0.0     1500
Name: labels, dtype: int64

In [55]:
Lon_mean = df.Lon.mean()
Lat_mean = df.Lat.mean()

In [94]:
def scatter_map(day,interval_hour):

 df_new = df[(df.day == day)&(df.interval_hour == interval_hour)].rename(columns = \
                                                            {'labels' : 'Hot spot'})

 fig = px.scatter_mapbox(df_new.sample(3000),\
                         lat="Lat", lon="Lon",     color="Hot spot", size_max=5, zoom=2)


 fig.update_layout(
    margin ={'l':0,'t':0,'b':0,'r':0},
    mapbox = {
        'center': {'lon': Lon_mean, 'lat': Lat_mean},
        'style': "carto-positron",
        'zoom': 10})

 return fig



In [95]:
 scatter_map('Tuesday','12-15').show()

In [96]:
if not os.path.exists("images_kmeans"):
    os.mkdir("images_kmeans")


In [98]:
for day in df.day.unique():
    for interval_hour in df.interval_hour.unique():

      fig = scatter_map(day,interval_hour)

      fig.write_image(f"images_kmeans/{day}_{interval_hour}.png")