## DBSCAN clustering to find place where most time was spent in a day

DBSCAN reference: http://geoffboeing.com/2014/08/visualizing-summer-travels/


In [1]:
# Import some initial libraries
import pandas as pd, numpy as np, matplotlib.pyplot as plt, time
from sklearn.cluster import DBSCAN
from sklearn import metrics
import math

In [2]:
# After install, import more libraries
# These must be installed using the command above.
from geopy.distance import great_circle 
from shapely.geometry import MultiPoint

In [3]:
# define the number of kilometers in one radian, a needed paramter
kms_per_radian = 6371.0088

### Read in data here

In [4]:
# load the data set
# df = pd.read_csv('gpslog58.csv', encoding='utf-8')
df = pd.read_csv('alldata2.csv', encoding='utf-8') 

df.columns = ['obs','dateandtimestamp', 'lon', 'lat','sats','temp','sample_time']
print df.head(5)
print df.count()

   obs  dateandtimestamp         lon        lat  sats   temp  \
0    0      180207173301 -120.663994  35.303253     8  73.84   
1    1      180207173302 -120.664085  35.303272     8  73.84   
2    2      180207173303 -120.664047  35.303341     9  73.84   
3    3      180207173304 -120.664116  35.303329     9  73.84   
4    4      180207173305 -120.664124  35.303329     9  73.84   

           sample_time  
0  2018-02-07 17:33:01  
1  2018-02-07 17:33:02  
2  2018-02-07 17:33:03  
3  2018-02-07 17:33:04  
4  2018-02-07 17:33:05  
obs                 406740
dateandtimestamp    406740
lon                 406740
lat                 406740
sats                406740
temp                406740
sample_time         406740
dtype: int64


In [5]:
mask1= df.dateandtimestamp >= 180215160000 #enter start date here (8am)
df1=df[mask1]
df1.count()
#need small data frame because of memory error

obs                 162825
dateandtimestamp    162825
lon                 162825
lat                 162825
sats                162825
temp                162825
sample_time         162825
dtype: int64

In [6]:
mask2= df1.dateandtimestamp <= 180216060000 #enter finish date here (10pm)
df2=df1[mask2]
df2.count()

obs                 37281
dateandtimestamp    37281
lon                 37281
lat                 37281
sats                37281
temp                37281
sample_time         37281
dtype: int64

In [7]:
# To prepare for the DBSCAN, represent points consistently as (lat, lon) in a matrix data structure
coords = df2.as_matrix(columns=['lat', 'lon'])

In [8]:
# define DBSCAN input paramters epsilon and min_samples
# Read about the paramters here: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html

# epsilon is your guess on the max distance between stationary points
# the units are in kilometers, so .03 would be .03 kilometers or 30 meters
epsilon = 0.010 / kms_per_radian 

# minimum_samples, this is the number of datapoints required to be considered "stationary"
# Remember, each datapoins is 1 second apart, so 30 or 60 (1 minute) or 300 (5 minutes)?
ms = 300 # this represents 30 seconds in one place to be a stationary_point

### DBSCAN

In [9]:
# This is the cell that calls the DBSCAN algorithm. This may take a long time, depending on
# the number of points in your datafile. You must wait for the * in the In[*] to complete.
# On a big dataset, this took about 5 minutes on my pretty powerful desktop

db = DBSCAN(eps=epsilon, min_samples=ms, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
cluster_labels = db.labels_
num_clusters = len(set(cluster_labels))
clusters = pd.Series([coords[cluster_labels == n] for n in range(num_clusters)])

In [10]:
np.unique(cluster_labels)

array([-1,  0,  1,  2,  3,  4], dtype=int64)

In [11]:
pd.options.mode.chained_assignment = None  # suppress default='warn'

In [12]:
# Add cluster labels column to original dataframe
# so we know if each point is a "stationary" point (does NOT have a value of -1)
# or if each point is a "travel" point (has a value of -1)
df2['cluster_labels']=cluster_labels
print df2.head(5)

           obs  dateandtimestamp         lon        lat  sats   temp  \
243915  243915      180215160000 -120.660507  35.299767     9  62.41   
243916  243916      180215160001 -120.660507  35.299782     9  62.41   
243917  243917      180215160002 -120.660515  35.299793     9  62.41   
243918  243918      180215160003 -120.660530  35.299805     9  62.41   
243919  243919      180215160004 -120.660530  35.299816     9  62.41   

                sample_time  cluster_labels  
243915  2018-02-15 16:00:00              -1  
243916  2018-02-15 16:00:01              -1  
243917  2018-02-15 16:00:02              -1  
243918  2018-02-15 16:00:03              -1  
243919  2018-02-15 16:00:04              -1  


In [13]:
# This cell takes the groups of stationary points (clusters) and calculates the 
# centermost point of each cluster. This would best represent the lat/lon of the cluster
def get_centermost_point(cluster):
    centroid = (MultiPoint(cluster).centroid.x, MultiPoint(cluster).centroid.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

# This is a way to call the above function on every point in the staionary_points clusters
vfunc = np.vectorize(get_centermost_point)
centermost_points = vfunc(clusters[1:(len(clusters)-1)])

### Analysis on Stationary Points

In [14]:
stationary_points2 = df2.loc[df2['cluster_labels'] != -1]

In [15]:
# Build a new dictionary that we will use to create a new DataFrame called stationary_points
data = {'lon': centermost_points[1],
       'lat': centermost_points[0]}
# from these lats/lons create a new DataFrame of stationary points
stationary_points = pd.DataFrame(data)
print stationary_points.head(5)
print len(stationary_points)

         lat         lon
0  35.300808 -120.662415
1  35.297527 -120.660988
2  35.303242 -120.665009
3  35.293953 -120.664330
4


In [16]:
#find elapsed time in each location
df_time1=stationary_points2.groupby(['cluster_labels']).first() #first occurances of cluster label
df_time2=stationary_points2.groupby(['cluster_labels']).last() #last occurances

print df_time1.head(5)
df_time1.count()

                   obs  dateandtimestamp         lon        lat  sats   temp  \
cluster_labels                                                                 
0               243976      180215160103 -120.660965  35.300537     8  61.53   
1               244274      180215160614 -120.662239  35.300907    10  61.53   
2               247791      180215170724 -120.660873  35.297810    10  68.56   
3               257484      180215200804 -120.664665  35.303062     7  74.71   
4               263423      180215215123 -120.664314  35.294186     8  72.96   

                        sample_time  
cluster_labels                       
0               2018-02-15 16:01:03  
1               2018-02-15 16:06:14  
2               2018-02-15 17:07:24  
3               2018-02-15 20:08:04  
4               2018-02-15 21:51:23  


obs                 5
dateandtimestamp    5
lon                 5
lat                 5
sats                5
temp                5
sample_time         5
dtype: int64

In [17]:
def elapsed (x): #function to calculate elaspsed time at cluster x
    time2 = pd.to_datetime(df_time2.sample_time.loc[x])
    time1 = pd.to_datetime(df_time1.sample_time.loc[x])
    timedif = time2-time1
    return timedif

print (elapsed(1)) #test to see if it worked

0 days 03:57:10


In [18]:
#set up dataframe
elapsed_col = pd.DataFrame(np.zeros(len(df_time1.index))) #make empty dataframe
elapsed_col.columns = ['elapsed_time'] # Give the column a name

elapsed_col.tail(5) #check to see if it worked

Unnamed: 0,elapsed_time
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [19]:
for index, row in elapsed_col.iterrows(): #fill empty dataframe with elapsed time values
    elapsed_col.elapsed_time.loc[index]= elapsed(index)
elapsed_col.head(10)

Unnamed: 0,elapsed_time
0,0 days 03:59:36
1,0 days 03:57:10
2,0 days 01:49:37
3,0 days 01:24:30
4,0 days 05:09:45


In [20]:
places = pd.concat([df_time1,elapsed_col],axis=1) #combine data frame
places = places.drop(['obs','dateandtimestamp','sats','temp','sample_time'],axis=1)
print places
print places.max()

                       lon        lat     elapsed_time
cluster_labels                                        
0              -120.660965  35.300537  0 days 03:59:36
1              -120.662239  35.300907  0 days 03:57:10
2              -120.660873  35.297810  0 days 01:49:37
3              -120.664665  35.303062  0 days 01:24:30
4              -120.664314  35.294186  0 days 05:09:45
lon                    -120.661
lat                     35.3031
elapsed_time    0 days 05:09:45
dtype: object


### Checking Results

In [21]:
#set up dataframe
check_time = pd.DataFrame(np.zeros(len(places.index))) #make empty dataframe
check_time.columns = ['check']

In [22]:
for index, row in check_time.iterrows(): #fill empty dataframe with elapsed time values
    df5 = stationary_points2[stationary_points2.cluster_labels == index]
    check_time.check.loc[index]=len(df5)
check_time.head(10)

Unnamed: 0,check
0,3085.0
1,3259.0
2,5416.0
3,4008.0
4,17689.0


In [23]:
#checked to make sure elapsed time made sense
places2 = pd.concat([places,check_time],axis=1) #combine data frame
print (places2.sort_values(by='check', ascending=False))

                       lon        lat     elapsed_time    check
cluster_labels                                                 
4              -120.664314  35.294186  0 days 05:09:45  17689.0
2              -120.660873  35.297810  0 days 01:49:37   5416.0
3              -120.664665  35.303062  0 days 01:24:30   4008.0
1              -120.662239  35.300907  0 days 03:57:10   3259.0
0              -120.660965  35.300537  0 days 03:59:36   3085.0


### put in lat, lon in google maps