In [165]:
import pandas as pd

import numpy as np

import os,sys

from shapely.geometry import Point, Polygon

import copy

from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score 

import matplotlib.pyplot as plt

from matplotlib.ticker import MultipleLocator, FormatStrFormatter

import time

In [166]:
def Get_Shift(Startstamp,Endstamp,Duration,Distance):
    
    '''
    Input:
    
    (1) Startstamp,
    
    (2) Endstamp,
    
    (3) Duration,
    
    (4) Distance
    
    Output:

    (1) 'The number of Orders'
    
    (2) 'Avg gap between orders (sec)'

    (3) 'Shift Duration (sec)'

    (4) 'Occupied Time (sec)'

    (5) 'Shift Start Time'
    
    (6) 'Shift End Time', 

    (7) 'Occupied Distance (m)'

    '''
    
    Result=list()
    
    '''Clean the data'''
        
    k=1

    while k in range(1,len(Startstamp),1):

        if Startstamp[k] <= Endstamp[k-1] and Endstamp[k] <= Endstamp[k-1]:

            Startstamp.pop(k)

            Endstamp.pop(k)

            Duration.pop(k)

            Distance.pop(k)

        k+=1
    
    if len(Startstamp)==1:
        
        Result.append([1,Duration[0],Duration[0],Startstamp[0],Endstamp[0],Distance[0]])
        
    else:
                
        '''Calculation'''  
        
        s=0
        
        i=1
        
        order_num=1
        
        occupied_duration=Duration[i-1]
        
        occupied_distance=Distance[i-1]
        
        while i<len(Startstamp):
            
            if Startstamp[i]-Endstamp[i-1]<3600*2:
                
                order_num+=1
                
                if Endstamp[i-1]<Startstamp[i]:
                
                    occupied_duration+=Duration[i]

                    occupied_distance+=Distance[i]
                    
                else:

                    occupied_duration+=Endstamp[i]-Endstamp[i-1]

                    occupied_distance+=((Endstamp[i]-Endstamp[i-1])/(Endstamp[i]-Startstamp[i]))*Distance[i]
                
                if i==len(Startstamp)-1:
                    
                    shift_duration=Endstamp[i]-Startstamp[s]
                    
                    Result.append([order_num,shift_duration,occupied_duration,Startstamp[s],Endstamp[i],occupied_distance])
                    
                i+=1
                
            else:
                
                shift_duration=Endstamp[i-1]-Startstamp[s]
                
                Result.append([order_num,shift_duration,occupied_duration,Startstamp[s],Endstamp[i-1],occupied_distance])
                
                s=i

                i+=1
                
                order_num=1

                occupied_duration=Duration[i-1]
        
                occupied_distance=Distance[i-1]
                
    return Result

def Get_Gap(Startstamp,Endstamp):
    
        
    if len(Startstamp)==1:
        
        gap=0
        
    else:
        
        gaps=list()
        
        k=1
        
        while k in range(1,len(Startstamp),1):
            
            if Startstamp[k] >= Endstamp[k-1]:
            
                gaps.append(Startstamp[k] - Endstamp[k-1])
                
            k+=1
            
        if len(gaps)>0:
            
            gap=np.array(gaps).mean()

            gap=round(gap,2)

        else:

            gap=0
        
    return gap
        
def explode(df, col):
    df[col] = df[col].apply(lambda x: [x] if not isinstance(x, list) else x)
    return df.drop(col, axis=1).join(
        pd.DataFrame(list(df[col])).stack().reset_index(level=1, drop=True).rename(col)
    )

Get_hour=lambda x:int(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(x))[11:13])+float(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(x))[14:16])/60.0

Get_3600=lambda x:round(x/3600.0,2)

Get_10=lambda x:round(x/10.0,2)

Get_1000=lambda x:round(x/10.0,2)



## Data Initialization

In [167]:
Load_path='./Data/GAIA/'

dt='20161101'

GPS_Df=pd.read_csv(os.path.join(Load_path,'gps_'+dt+'.csv'),header=None,names=['Driver_id','Order_id',\
                                                                               'Timestamp','Longitude',\
                                                                               'Latitude'])

GPS_Df=GPS_Df.drop_duplicates(subset=['Driver_id','Order_id'])

GPS_Df=GPS_Df.reset_index(drop=True)

GPS_Df=GPS_Df[['Driver_id','Order_id']]

GPS_Df


Order_Df=pd.read_csv(os.path.join(Load_path,'order_'+dt+'.csv'),header=None,names=['Order_id','Startstamp','Endstamp',\
                                                                                   'Pickup_Longitude','Pickup_Latitude',\
                                                                                   'Dropoff_Longitude','Dropoff_Latitude'])

Get_distance=lambda lng1,lat1,lng2,lat2:int(Point(lng1,lat1).distance(Point(lng2,lat2))*111000*1.2)

Order_Df['Duration']=Order_Df.apply(lambda x:x['Endstamp']-x['Startstamp'],axis=1)

Order_Df['Distance']=Order_Df.apply(lambda x:Get_distance(x['Pickup_Longitude'],x['Pickup_Latitude'],x['Dropoff_Longitude'],x['Dropoff_Latitude']),axis=1)

Order_Df=Order_Df.loc[Order_Df['Duration']>0]

Order_Df['Speed']=Order_Df.apply(lambda x:x['Distance']/x['Duration'],axis=1)

Order_Df=Order_Df[['Order_id','Startstamp','Endstamp','Duration','Distance','Speed']]


'''

The data was then cleaned by removing drivers who showed unrealistic data trends 

such as having a second trip that began prior to the conclusion of the first trip 

or having trips that were seen as outliers in terms of the distance, speed, or time traveled. 

The outliers were defined as data that is not within the middle 99\% of the data set for each of the variables.


'''

Duration_arr=np.sort(np.array(Order_Df['Duration']))

Duration_min_=Duration_arr[int(0.01 * len(Duration_arr))]
    
Duration_max_=Duration_arr[int(0.99 * len(Duration_arr))]

Distance_arr=np.sort(np.array(Order_Df['Distance']))

Distance_min_=Distance_arr[int(0.01 * len(Distance_arr))]
    
Distance_max_=Distance_arr[int(0.99 * len(Distance_arr))]

Speed_arr=np.sort(np.array(Order_Df['Speed']))

Speed_min_=Speed_arr[int(0.01 * len(Speed_arr))]

Speed_max_=Speed_arr[int(0.99 * len(Speed_arr))]



Order_Df=Order_Df.loc[(Order_Df['Duration']>=Duration_min_)&(Order_Df['Duration']<=Duration_max_)]

Order_Df=Order_Df.loc[(Order_Df['Distance']>=Distance_min_)&(Order_Df['Distance']<=Distance_max_)]

Order_Df=Order_Df.loc[(Order_Df['Speed']>=Speed_min_)&(Order_Df['Speed']<=Speed_max_)]



Order_Df



Unnamed: 0,Order_id,Startstamp,Endstamp,Duration,Distance,Speed
0,eb9dd4095d9850e6287cefd813775a6c,1477964797,1477966507,1710,7111,4.158480
1,387a742fa5a3fbe4a1f215ac58ea33a8,1477985585,1477987675,2090,23760,11.368421
2,9cf55f8e6e02a1e0f792df06e5d85011,1478004952,1478006217,1265,11883,9.393676
3,5feeae0307e15203484b9ffceef89855,1477989840,1477991065,1225,8009,6.537959
4,ad4b52cb15b90c44c8f20a8b1e57a622,1477958005,1477958577,572,5905,10.323427
5,ad551eb23b72e2a77ae5e81e22f2dbd0,1477997663,1477998786,1123,7880,7.016919
6,ad4b52cb15b90c44c8f20a8b1e57a622,1477958005,1477958577,572,5905,10.323427
7,db46d8931c1ac3f61d8278df67c2a365,1477958918,1477960167,1249,9651,7.726982
8,908e7f068da5768c492ab69dbf81efda,1477960528,1477961815,1287,7870,6.114996
9,2d48affae032a4ca29d220660732686d,1477962166,1477963634,1468,9382,6.391008


In [169]:
'''Collect the data into a list array'''

Behavior_Df=Order_Df.merge(GPS_Df,on='Order_id',how='left')

Behavior_Df=Behavior_Df.dropna()

Behavior_Df['Startstamp']=Behavior_Df['Startstamp'].astype('int32')

Behavior_Df['Endstamp']=Behavior_Df['Endstamp'].astype('int32')

Behavior_Df=Behavior_Df[['Driver_id','Order_id','Startstamp','Endstamp','Duration','Distance']]

Behavior_Df=Behavior_Df.drop_duplicates(subset=['Driver_id','Order_id'])

Behavior_Df=Behavior_Df.sort_values(by=['Driver_id','Startstamp'])

Behavior_Df=Behavior_Df.reset_index(drop=True)

Stat_Df = pd.DataFrame([])

Stat_Df['Startstamp']=Behavior_Df.groupby('Driver_id').apply(lambda x: x['Startstamp'].to_list())

Stat_Df['Endstamp']=Behavior_Df.groupby('Driver_id').apply(lambda x: x['Endstamp'].to_list())

Stat_Df['Duration']=Behavior_Df.groupby('Driver_id').apply(lambda x: x['Duration'].to_list())

Stat_Df['Distance']=Behavior_Df.groupby('Driver_id').apply(lambda x: x['Distance'].to_list())

Stat_Df['Driver_id']=Stat_Df.index

Stat_Df=Stat_Df.reset_index(drop=True)

Stat_Df=Stat_Df[['Driver_id','Startstamp','Endstamp','Duration','Distance']]

Stat_Df['Transition']=Stat_Df.apply(lambda x:Get_Shift(x['Startstamp'],x['Endstamp'],\
                                                      x['Duration'],x['Distance']),axis=1)


Stat_Df['Gap']=Stat_Df.apply(lambda x:Get_Gap(x['Startstamp'],x['Endstamp']),axis=1)

Stat_Df=Stat_Df[['Driver_id','Transition','Gap']]

Stat_Df=explode(Stat_Df,'Transition')

Stat_Df=Stat_Df.reset_index(drop=True)


'''

(1) 'The number of Orders'

(2) 'Shift Duration (sec)'

(3) 'Occupied Time (sec)'

(4) 'Shift Start Time'

(5) 'Shift End Time', 

(6) 'Occupied Distance (m)'

'''


Cols=['The number of Orders', 'Shift Duration (sec)', 'Occupied Time (sec)', 'Shift Start Time', 'Shift End Time','Occupied Distance (m)']

for i in range(len(Cols)):
    
    Stat_Df[Cols[i]]=Stat_Df.apply(lambda x:x['Transition'][i],axis=1)
    

Stat_Df=Stat_Df[['Driver_id','Gap']+Cols]

Stat_Df['Shift']=1

Stat_Df





Unnamed: 0,Driver_id,Gap,The number of Orders,Shift Duration (sec),Occupied Time (sec),Shift Start Time,Shift End Time,Occupied Distance (m),Shift
0,0000131d486b69eb77ab6e9e7cca9f4c,1187.00,2,3119,1932,1477972625,1477975744,13808.000000,1
1,000211a97ba2b768ef941593bf1ae1a5,19088.50,1,966,966,1477963781,1477964747,5897.000000,1
2,000211a97ba2b768ef941593bf1ae1a5,19088.50,3,6251,5342,1478002015,1478008266,26215.653614,1
3,000770ada4aa5d856b5a766fb6bf7cf1,1593.75,5,11649,5274,1477962292,1477973941,31479.000000,1
4,0007a380800445346fff19afc4071ce3,0.00,1,1419,1419,1477963840,1477965259,6544.000000,1
5,0008c9d2ac91c00ef8b3ef80eba02a06,5840.43,3,4074,2729,1477967006,1477971080,19702.000000,1
6,0008c9d2ac91c00ef8b3ef80eba02a06,5840.43,2,3937,3431,1477984364,1477988301,23593.000000,1
7,0008c9d2ac91c00ef8b3ef80eba02a06,5840.43,3,10959,2406,1478005496,1478016455,18851.000000,1
8,00095da02ea38d4d8eaa120201e38ae9,0.00,1,1522,1522,1477965144,1477966666,6060.000000,1
9,000d0e4bd461b1f964daeb6d21868757,11802.60,5,7148,5668,1477940181,1477947329,58447.000000,1


In [172]:
'''

Consequently, the data was then summarised into one row per driver, 

with each row describing the total number of shifts, average number 

of orders, average gap between orders, average shift duration, average 

occupied time, first shift start time and average occupied distance.


'''


'''Gap Between two Shifts'''

Stat_Df=Stat_Df.sort_values(by=['Driver_id','Shift Start Time'])

Gap_Df=pd.DataFrame([])

Gap_Df['Shift Start Time']=Stat_Df.groupby('Driver_id').apply(lambda x: x['Shift Start Time'].to_list())

Gap_Df['Shift End Time']=Stat_Df.groupby('Driver_id').apply(lambda x: x['Shift End Time'].to_list())

Gap_Df['Driver_id']=Gap_Df.index

def Get_gap(Start,End):
    if len(Start)==1:
        return 0.0
    else:
        gap=list()
        for i in range(1,len(Start),1):
            gap.append(Start[i]-End[i-1])
        return round(np.array(gap).mean()/3600.0,2)

Gap_Df['Average Gap between shifts (Hour)']=Gap_Df.apply(lambda x:Get_gap(x['Shift Start Time'],x['Shift End Time']),axis=1)

Gap_Df = Gap_Df[['Driver_id','Average Gap between shifts (Hour)']]

Gap_Df = Gap_Df.reset_index(drop=True)

Gap_Df


'''Total Number of Shifts'''

Sum_Df=Stat_Df.groupby('Driver_id').sum()

Sum_Df['Driver_id']=Sum_Df.index

Sum_Df=Sum_Df.rename(columns={'Shift':'Total Number of Shifts'})

Sum_Df=Sum_Df.reset_index(drop=True)

Sum_Df=Sum_Df[['Driver_id','Total Number of Shifts']]



'''Average Values'''

Avg_Df=Stat_Df.groupby('Driver_id').mean()

Avg_Df['Driver_id']=Avg_Df.index

Avg_Df=Avg_Df.rename(columns={'The number of Orders':'Average Order Number',\
                              'Shift Duration (sec)':'Average Shift Duration (sec)',\
                              'Occupied Time (sec)':'Average Occupied Time (sec)',\
                              'Occupied Distance (m)':'Average Occupied Distance (m)',\
                              'Gap': 'Average Gap between orders (sec)'})

Avg_Df=Avg_Df.reset_index(drop=True)


Avg_Df=Avg_Df[['Driver_id',\
               'Average Order Number',\
               'Average Shift Duration (sec)',\
               'Average Occupied Time (sec)',\
               'Average Occupied Distance (m)',\
               'Average Gap between orders (sec)']]



'''First Shift Start Time'''


First_Df=Stat_Df.sort_values(by=['Driver_id','Shift Start Time'])

First_Df=First_Df.drop_duplicates(subset=['Driver_id'],keep='first')

First_Df=First_Df[['Driver_id','Shift Start Time']]

First_Df=First_Df.rename(columns={'Shift Start Time':'First Shift Start Time'})

First_Df=First_Df.reset_index(drop=True)

First_Df

'''Collection'''

'''Merge'''

df=Sum_Df.merge(Avg_Df,on='Driver_id')

df=df.merge(First_Df,on='Driver_id')

df=df.merge(Gap_Df,on='Driver_id')

df=df[['Driver_id',\
       'Total Number of Shifts',\
       'Average Order Number',\
       'Average Shift Duration (sec)',\
       'Average Occupied Time (sec)',\
       'First Shift Start Time',\
       'Average Occupied Distance (m)',\
       'Average Gap between orders (sec)',\
       'Average Gap between shifts (Hour)']]

df['Average Shift Duration (Hour)']=df.apply(lambda x:Get_3600(x['Average Shift Duration (sec)']),axis=1)

df['Average Occupied Time (Hour)']=df.apply(lambda x:Get_3600(x['Average Occupied Time (sec)']),axis=1)

df['Average Gap between orders (Hour)']=df.apply(lambda x:Get_3600(x['Average Gap between orders (sec)']),axis=1)

df['First Shift Start Hour']=df.apply(lambda x:Get_hour(x['First Shift Start Time']),axis=1)

df['Average Occupied Distance (km)']=df.apply(lambda x:Get_1000(x['Average Occupied Distance (m)']),axis=1)

df=df[['Driver_id',\
       'Total Number of Shifts',\
       'Average Order Number',\
       'Average Shift Duration (sec)',\
       'Average Occupied Time (sec)',\
       'Average Shift Duration (Hour)',\
       'Average Occupied Time (Hour)',\
       'First Shift Start Time',\
       'First Shift Start Hour',\
       'Average Occupied Distance (m)',\
       'Average Occupied Distance (km)',\
       'Average Gap between orders (Hour)',\
       'Average Gap between shifts (Hour)']]

df=df.round({'Average Order Number': 2,\
             'Average Shift Duration (sec)':0,\
             'Average Occupied Time (sec)':0,\
             'Average Occupied Distance (m)':2,\
             'First Shift Start Hour': 2})

df


Unnamed: 0,Driver_id,Total Number of Shifts,Average Order Number,Average Shift Duration (sec),Average Occupied Time (sec),Average Shift Duration (Hour),Average Occupied Time (Hour),First Shift Start Time,First Shift Start Hour,Average Occupied Distance (m),Average Occupied Distance (km),Average Gap between orders (Hour),Average Gap between shifts (Hour)
0,0000131d486b69eb77ab6e9e7cca9f4c,1,2.00,3119.0,1932.0,0.87,0.54,1477972625,11.95,13808.00,1380.80,0.33,0.00
1,000211a97ba2b768ef941593bf1ae1a5,2,2.00,3608.0,3154.0,1.00,0.88,1477963781,9.48,16056.33,1605.63,5.30,10.35
2,000770ada4aa5d856b5a766fb6bf7cf1,1,5.00,11649.0,5274.0,3.24,1.47,1477962292,9.07,31479.00,3147.90,0.44,0.00
3,0007a380800445346fff19afc4071ce3,1,1.00,1419.0,1419.0,0.39,0.39,1477963840,9.50,6544.00,654.40,0.00,0.00
4,0008c9d2ac91c00ef8b3ef80eba02a06,3,2.67,6323.0,2855.0,1.76,0.79,1477967006,10.38,20715.33,2071.53,1.62,4.23
5,00095da02ea38d4d8eaa120201e38ae9,1,1.00,1522.0,1522.0,0.42,0.42,1477965144,9.87,6060.00,606.00,0.00,0.00
6,000d0e4bd461b1f964daeb6d21868757,1,5.00,7148.0,5668.0,1.99,1.57,1477940181,2.93,58447.00,5844.70,3.28,0.00
7,000d7d7fe559c61e98d0a51e23e69bf7,2,3.50,8173.0,4854.0,2.27,1.35,1477966852,10.33,31157.50,3115.75,1.16,5.10
8,000db06fb2f4a700c4105348643f593e,1,1.00,1161.0,1161.0,0.32,0.32,1477968693,10.85,3255.00,325.50,0.00,0.00
9,000f63bea1c5b36b439d6fa60ee6cac7,2,5.50,8693.0,5495.0,2.41,1.53,1477967652,10.57,34297.00,3429.70,0.64,4.58
