In [4]:
import pandas as pd

import numpy as np

import os,sys

from shapely.geometry import Point, Polygon

import copy

from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score 

import matplotlib.pyplot as plt

from matplotlib.ticker import MultipleLocator, FormatStrFormatter

import time

In [18]:
def Get_Shift(Startstamp,Endstamp,Duration,Distance,Max_gap=3600*2):
    '''
    Group one driver's orders into shifts.

    Consecutive orders belong to the same shift while the idle time between
    the end of one order and the start of the next is below ``Max_gap``.
    The inputs are parallel sequences, one entry per order, and are expected
    to be sorted by start time.

    Input:

    (1) Startstamp: start time of every order

    (2) Endstamp: end time of every order

    (3) Duration: duration of every order

    (4) Distance: distance of every order

    (5) Max_gap: idle time at or above which a new shift starts
        (default 3600*2, i.e. two hours when the stamps are in seconds)

    Output:

    A list with one entry per shift, each entry being the list

    (1) 'The number of Orders'

    (2) 'Shift Duration (sec)'

    (3) 'Occupied Time (sec)'

    (4) 'Shift Start Time'

    (5) 'Shift End Time',

    (6) 'Occupied Distance(m)'

    An empty input gives an empty list.
    '''

    Result=list()

    if len(Startstamp)==1:
        # A driver with a single order: the shift duration is reported as the
        # order's own Duration rather than recomputed from the stamps.
        Result.append([1,Duration[0],Duration[0],Startstamp[0],Endstamp[0],Distance[0]])
        return Result

    last=len(Startstamp)-1

    s=0  # index of the first order of the shift being accumulated
    order_num=0
    occupied_duration=0
    occupied_distance=0

    for i in range(len(Startstamp)):

        order_num+=1
        occupied_duration+=Duration[i]
        occupied_distance+=Distance[i]

        # Close the shift at the very last order as well as at every large gap.
        # Closing only at gaps would drop a final order that starts its own shift.
        if i==last or Startstamp[i+1]-Endstamp[i]>=Max_gap:

            shift_duration=Endstamp[i]-Startstamp[s]

            Result.append([order_num,shift_duration,occupied_duration,Startstamp[s],Endstamp[i],occupied_distance])

            s=i+1
            order_num=0
            occupied_duration=0
            occupied_distance=0

    return Result

def explode(df, col):
    '''
    Return a copy of ``df`` with one row per element of the lists in ``col``.

    Cells of ``col`` that are not lists are treated as one-element lists.
    All other columns are repeated for every element, the exploded column is
    placed last, and each output row keeps the index label of the row it came
    from. An empty list produces a single row with NaN in ``col``.

    The input frame is left untouched, and rows are matched on the frame's own
    index labels, so the index does not have to be the default 0..n-1 range.
    '''
    as_lists = df[col].apply(lambda x: x if isinstance(x, list) else [x])
    # Series.explode flattens exactly one level, so list elements that are
    # themselves lists stay intact as single cells.
    exploded = as_lists.explode().rename(col)
    return df.drop(col, axis=1).join(exploded)

## Data Initialization

In [13]:
Load_path='./Data/GAIA/'

dt='20161101'

# Driver/order pairs.
# Only the (driver, order) mapping is kept from the GPS file, so the other
# three columns are not loaded at all.
GPS_Df=pd.read_csv(os.path.join(Load_path,'gps_'+dt+'.csv'),header=None,
                   names=['Driver_id','Order_id','Timestamp','Longitude','Latitude'],
                   usecols=['Driver_id','Order_id'])

GPS_Df=GPS_Df.drop_duplicates(subset=['Driver_id','Order_id']).reset_index(drop=True)


# Orders.
Order_Df=pd.read_csv(os.path.join(Load_path,'order_'+dt+'.csv'),header=None,
                     names=['Order_id','Startstamp','Endstamp',
                            'Pickup_Longitude','Pickup_Latitude',
                            'Dropoff_Longitude','Dropoff_Latitude'])


def Get_distance(lng1,lat1,lng2,lat2):
    '''
    Approximate trip distance between two lon/lat points, truncated to an int.

    The planar distance in degrees is scaled by 111000 and then by 1.2.
    NOTE(review): presumably 111000 is metres per degree and 1.2 a detour
    factor for road distance; confirm with the author.
    '''
    return int(Point(lng1,lat1).distance(Point(lng2,lat2))*111000*1.2)


Order_Df['Duration']=Order_Df['Endstamp']-Order_Df['Startstamp']

Order_Df['Distance']=[Get_distance(*coords) for coords in zip(Order_Df['Pickup_Longitude'],
                                                              Order_Df['Pickup_Latitude'],
                                                              Order_Df['Dropoff_Longitude'],
                                                              Order_Df['Dropoff_Latitude'])]

# Element-wise division: an order with zero duration yields inf/NaN here
# instead of aborting the whole cell.
Order_Df['Speed']=Order_Df['Distance']/Order_Df['Duration']

Order_Df=Order_Df[['Order_id','Startstamp','Endstamp','Duration','Distance','Speed']]


# Outlier removal.
# An order is kept only if its duration, distance and speed each lie between
# the values found at the 1% and 99% positions of the sorted raw data.
# All six bounds are taken from the unfiltered orders before any row is dropped.
# NOTE(review): the earlier note in this cell also mentioned removing drivers
# whose trips overlap in time; no such check is performed in this cell.

def _Sorted_with_bounds(values,lower=0.01,upper=0.99):
    '''Return (sorted array, value at the lower position, value at the upper position).'''
    ordered=np.sort(np.array(values))
    return ordered,ordered[int(lower*len(ordered))],ordered[int(upper*len(ordered))]


Duration_arr,Duration_min_,Duration_max_=_Sorted_with_bounds(Order_Df['Duration'])

Distance_arr,Distance_min_,Distance_max_=_Sorted_with_bounds(Order_Df['Distance'])

Speed_arr,Speed_min_,Speed_max_=_Sorted_with_bounds(Order_Df['Speed'])


# between() is inclusive on both ends, matching the >= / <= bounds above.
Order_Df=Order_Df.loc[Order_Df['Duration'].between(Duration_min_,Duration_max_)
                      &Order_Df['Distance'].between(Distance_min_,Distance_max_)
                      &Order_Df['Speed'].between(Speed_min_,Speed_max_)]


Order_Df



Unnamed: 0,Order_id,Startstamp,Endstamp,Duration,Distance,Speed
0,eb9dd4095d9850e6287cefd813775a6c,1477964797,1477966507,1710,7111,4.158480
1,387a742fa5a3fbe4a1f215ac58ea33a8,1477985585,1477987675,2090,23760,11.368421
2,9cf55f8e6e02a1e0f792df06e5d85011,1478004952,1478006217,1265,11883,9.393676
3,5feeae0307e15203484b9ffceef89855,1477989840,1477991065,1225,8009,6.537959
4,ad4b52cb15b90c44c8f20a8b1e57a622,1477958005,1477958577,572,5905,10.323427
5,ad551eb23b72e2a77ae5e81e22f2dbd0,1477997663,1477998786,1123,7880,7.016919
6,ad4b52cb15b90c44c8f20a8b1e57a622,1477958005,1477958577,572,5905,10.323427
7,db46d8931c1ac3f61d8278df67c2a365,1477958918,1477960167,1249,9651,7.726982
8,908e7f068da5768c492ab69dbf81efda,1477960528,1477961815,1287,7870,6.114996
9,2d48affae032a4ca29d220660732686d,1477962166,1477963634,1468,9382,6.391008


In [35]:
'''Collect every driver's orders and split them into shifts'''

# Attach the driver to each order. Orders without a matching GPS record end up
# with a missing Driver_id and are removed by dropna().
Behavior_Df=Order_Df.merge(GPS_Df,on='Order_id',how='left').dropna()

Behavior_Df=Behavior_Df.astype({'Startstamp':'int32','Endstamp':'int32'})

Behavior_Df=(Behavior_Df[['Driver_id','Order_id','Startstamp','Endstamp','Duration','Distance']]
             .drop_duplicates(subset=['Driver_id','Order_id'])
             .sort_values(by=['Driver_id','Startstamp'])
             .reset_index(drop=True))

# One row per driver; every column holds that driver's values as a list in
# start-time order (the frame was sorted just above).
Order_cols=['Startstamp','Endstamp','Duration','Distance']

Orders_by_driver=Behavior_Df.groupby('Driver_id')

Stat_Df=pd.DataFrame({name:Orders_by_driver[name].apply(lambda s:s.to_list()) for name in Order_cols})

Stat_Df=Stat_Df.rename_axis('Driver_id').reset_index()

Stat_Df=Stat_Df[['Driver_id']+Order_cols]

# Each driver gets a list of shifts; each shift is itself a 6-element list.
Stat_Df['Transition']=[Get_Shift(*orders) for orders in zip(Stat_Df['Startstamp'],Stat_Df['Endstamp'],
                                                            Stat_Df['Duration'],Stat_Df['Distance'])]

# .copy() so that explode() works on an independent frame rather than a slice.
Stat_Df=explode(Stat_Df[['Driver_id','Transition']].copy(),'Transition')


# Position of each value inside one shift record returned by Get_Shift:
# (1) 'The number of Orders'
# (2) 'Shift Duration (sec)'
# (3) 'Occupied Time (sec)'
# (4) 'Shift Start Time'
# (5) 'Shift End Time'
# (6) 'Occupied Distance (m)'

Cols=['The number of Orders', 'Shift Duration (sec)', 'Occupied Time (sec)', 'Shift Start Time', 'Shift End Time','Occupied Distance (m)']

# Assigning plain lists is positional, which is safe here: after explode() the
# index repeats once per shift of the same driver.
for position,name in enumerate(Cols):

    Stat_Df[name]=[shift[position] for shift in Stat_Df['Transition']]


# Constant 1 per row so that summing 'Shift' per driver counts the shifts.
Stat_Df=Stat_Df[['Driver_id']+Cols].assign(Shift=1)

Stat_Df





Unnamed: 0,Driver_id,The number of Orders,Shift Duration (sec),Occupied Time (sec),Shift Start Time,Shift End Time,Occupied Distance (m),Shift
0,0000131d486b69eb77ab6e9e7cca9f4c,2,3119,1932,1477972625,1477975744,13808,1
1,000211a97ba2b768ef941593bf1ae1a5,1,966,966,1477963781,1477964747,5897,1
1,000211a97ba2b768ef941593bf1ae1a5,4,6251,8455,1478002015,1478008266,40387,1
2,000770ada4aa5d856b5a766fb6bf7cf1,5,11649,5274,1477962292,1477973941,31479,1
3,0007a380800445346fff19afc4071ce3,1,1419,1419,1477963840,1477965259,6544,1
4,0008c9d2ac91c00ef8b3ef80eba02a06,3,4074,2729,1477967006,1477971080,19702,1
4,0008c9d2ac91c00ef8b3ef80eba02a06,2,3937,3431,1477984364,1477988301,23593,1
4,0008c9d2ac91c00ef8b3ef80eba02a06,3,10959,2406,1478005496,1478016455,18851,1
5,00095da02ea38d4d8eaa120201e38ae9,1,1522,1522,1477965144,1477966666,6060,1
6,000d0e4bd461b1f964daeb6d21868757,5,7148,5668,1477940181,1477947329,58447,1


In [41]:
'''Gap Between two Shifts'''

# Sort so that, within a driver, consecutive list entries are consecutive shifts.
Stat_Df=Stat_Df.sort_values(by=['Driver_id','Shift Start Time'])

Shifts_by_driver=Stat_Df.groupby('Driver_id')

# One row per driver (indexed by Driver_id) holding the ordered start and end
# times of all of that driver's shifts as lists.
Gap_Df=pd.DataFrame({name:Shifts_by_driver[name].apply(lambda s:s.to_list())
                     for name in ['Shift Start Time','Shift End Time']})

Gap_Df['Driver_id']=Gap_Df.index

def Get_gap(Start,End):
    '''
    Mean idle time between consecutive shifts, in hours.

    Start and End are parallel sequences of shift start and end times, in
    chronological order. Gap i is Start[i]-End[i-1]; the mean of all gaps is
    divided by 3600, so the stamps are expected to be in seconds.

    Returns NaN when there are fewer than two shifts, since no gap exists.
    '''
    if len(Start)<2:
        return np.nan
    gap=[Start[i]-End[i-1] for i in range(1,len(Start))]
    return np.array(gap).mean()/3600.0

# Mean idle time between consecutive shifts, in hours (the other time features
# below are in seconds). NaN for drivers with a single shift.
Gap_Df['Gap Between Shifts']=[Get_gap(starts,ends) for starts,ends in zip(Gap_Df['Shift Start Time'],
                                                                          Gap_Df['Shift End Time'])]

Gap_Df=Gap_Df[['Driver_id','Gap Between Shifts']].reset_index(drop=True)


'''Total Number of Shifts'''

# Every row of Stat_Df carries Shift=1, so the per-driver sum is the shift count.
# Only that column is aggregated; summing the timestamp columns has no meaning.
Sum_Df=(Stat_Df.groupby('Driver_id')['Shift'].sum()
        .rename('Total Number of Shifts')
        .reset_index())


'''Average Values'''

# Per-shift column -> name of its per-driver mean. The dict order fixes the
# column order of Avg_Df.
Avg_names={'The number of Orders':'Average Order Number',
           'Shift Duration (sec)':'Average Shift Duration (sec)',
           'Occupied Time (sec)':'Average Occupied Time (sec)',
           'Occupied Distance (m)':'Average Occupied Distance (m)'}

Avg_Df=(Stat_Df.groupby('Driver_id')[list(Avg_names)].mean()
        .rename(columns=Avg_names)
        .reset_index())


'''First Shift Start Time'''

# After sorting by start time, the first row kept per driver is the earliest shift.
First_Df=(Stat_Df.sort_values(by=['Driver_id','Shift Start Time'])
          .drop_duplicates(subset=['Driver_id'],keep='first')
          [['Driver_id','Shift Start Time']]
          .rename(columns={'Shift Start Time':'First Shift Start Time'})
          .reset_index(drop=True))


'''Collection'''

'''Merge'''

# Inner merges on Driver_id; every table above has exactly one row per driver.
df=(Sum_Df.merge(Avg_Df,on='Driver_id')
    .merge(First_Df,on='Driver_id')
    .merge(Gap_Df,on='Driver_id'))

df=df[['Driver_id',
       'Total Number of Shifts',
       'Average Order Number',
       'Average Shift Duration (sec)',
       'Average Occupied Time (sec)',
       'First Shift Start Time',
       'Average Occupied Distance (m)',
       'Gap Between Shifts']]

df


Unnamed: 0,Driver_id,Total Number of Shifts,Average Order Number,Average Shift Duration (sec),Average Occupied Time (sec),First Shift Start Time,Average Occupied Distance (m),Gap Between Shifts
0,0000131d486b69eb77ab6e9e7cca9f4c,1,2.000000,3119.000000,1932.000000,1477972625,13808.000000,
1,000211a97ba2b768ef941593bf1ae1a5,2,2.500000,3608.500000,4710.500000,1477963781,23142.000000,10.352222
2,000770ada4aa5d856b5a766fb6bf7cf1,1,5.000000,11649.000000,5274.000000,1477962292,31479.000000,
3,0007a380800445346fff19afc4071ce3,1,1.000000,1419.000000,1419.000000,1477963840,6544.000000,
4,0008c9d2ac91c00ef8b3ef80eba02a06,3,2.666667,6323.333333,2855.333333,1477967006,20715.333333,4.233194
5,00095da02ea38d4d8eaa120201e38ae9,1,1.000000,1522.000000,1522.000000,1477965144,6060.000000,
6,000d0e4bd461b1f964daeb6d21868757,1,5.000000,7148.000000,5668.000000,1477940181,58447.000000,
7,000d7d7fe559c61e98d0a51e23e69bf7,2,3.500000,8173.000000,4854.000000,1477966852,31157.500000,5.096944
8,000db06fb2f4a700c4105348643f593e,1,1.000000,1161.000000,1161.000000,1477968693,3255.000000,
9,000f63bea1c5b36b439d6fa60ee6cac7,2,5.500000,8693.000000,5495.000000,1477967652,34297.000000,4.576667
