<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from os import kill
import pandas as pd
import numpy as np
from kiwi_ridesharing.data import Kiwi

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
data = Kiwi().get_data()

In [4]:
rides = data["timestamps"]
rides["timestamp"] = pd.to_datetime(rides["timestamp"])
timestamps = rides.pivot(values='timestamp', index="ride_id", columns="event").reset_index()

In [5]:
timestamps.columns

Index(['ride_id', 'accepted_at', 'arrived_at', 'dropped_off_at',
       'picked_up_at', 'requested_at'],
      dtype='object', name='event')

In [6]:
(timestamps["picked_up_at"] - timestamps["arrived_at"]).dt.seconds

0         2.0
1         1.0
2         3.0
3         4.0
4         3.0
         ... 
194076    3.0
194077    2.0
194078    2.0
194079    1.0
194080    6.0
Length: 194081, dtype: float64

In [7]:
rides["ride_id"].nunique()

194081

In [8]:
def get_duration_in_minutes():
    
    """
    Returns Dataframe with ride_id, and ride duration in minutes
    """
    
    rides = data["rides"].copy()
    rides["ride_duration_minutes"] = round(rides["ride_duration"]/60)
    return rides[["ride_id", "ride_duration_minutes"]]

def get_duration_in_hours():
    
    """
    Returns Dataframe with ride_id, and ride duration in hours
    """
    
    rides = data["rides"].copy()
    rides["ride_duration_hours"] = round(rides["ride_duration"]/(60*60))
    return rides[["ride_id", "ride_duration_hours"]]    

def get_speed_kmh():
    
    """
    Returns Dataframe with ride_id, and average speed in kmh
    """
    rides = data["rides"].copy()
    rides["average_speed"] = round((rides["ride_distance"]/1000)/(rides["ride_duration"]/(60*60)))
    return rides[["ride_id", "average_speed"]] 

def get_ride_timestamps():
    
    """
    Returns Dataframe with ride_id, accepted_at, arrived_at, dropped_off_at,
    picked_up_at, and requested_at as columns
    """
    timestamps = data["timestamps"].copy()
    timestamps["timestamp"] = pd.to_datetime(timestamps["timestamp"])
    timestamps = timestamps.pivot(values='timestamp', index="ride_id", columns="event").reset_index()
    timestamps.rename_axis(None, axis=1, inplace=True)
    return timestamps

def get_waittime_driver():
    
    """
    Returns Dataframe with ride_id, arrived_at, picked_up_at and
    driver_wait_time in seconds
    """
    
    wait_time = get_ride_timestamps()
    wait_time["driver_wait_time"] = (wait_time["picked_up_at"] - wait_time["arrived_at"]).dt.seconds
    return wait_time[["ride_id", "arrived_at", "picked_up_at", "driver_wait_time"]]

def get_waittime_customer():
    
    """
    Returns Dataframe with ride_id, arrived_at, picked_up_at and
    customer_wait_time in seconds
    """
    
    wait_time = get_ride_timestamps()
    wait_time["customer_wait_time"] = (wait_time["arrived_at"] - wait_time["accepted_at"]).dt.seconds
    return wait_time[["ride_id", "arrived_at", "picked_up_at", "customer_wait_time"]]

def get_full_rides_data():
    
    """
    Returns a DataFrame with the all following columns:
    ['ride_id', 'requested_at', 'accepted_at', 'arrived_at', 'picked_up_at',
    'dropped_off_at', 'ride_duration_minutes', 'ride_duration_hours',
    'average_speed', 'driver_wait_time', 'customer_wait_time']
    """
    
    full_data =\
            get_duration_in_minutes()\
                .merge(
                get_duration_in_hours(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_speed_kmh(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_ride_timestamps(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_waittime_driver(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_waittime_customer(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)")
    
    return full_data[['ride_id','requested_at',
                      'accepted_at', 'arrived_at',
                      'picked_up_at', 'dropped_off_at',
                      'ride_duration_minutes', 'ride_duration_hours',
                      'average_speed','driver_wait_time','customer_wait_time']]


In [9]:
df = get_full_rides_data()

In [10]:
df[df["arrived_at"] > df["picked_up_at"]].min()

ride_id                  0001480a2c27492b1f8cb5fbc38ca667
requested_at                          2016-03-28 15:40:39
accepted_at                           2016-03-28 15:40:42
arrived_at                            2016-03-28 15:41:40
picked_up_at                          2016-03-28 15:41:39
dropped_off_at                        2016-03-28 15:51:57
ride_duration_minutes                                 0.0
ride_duration_hours                                   0.0
average_speed                                         0.0
driver_wait_time                                  82924.0
customer_wait_time                                    4.0
dtype: object

In [11]:
df[df["customer_wait_time"] == df["customer_wait_time"].max()]

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,average_speed,driver_wait_time,customer_wait_time
182580,019dbbfb6180540bcc4b509567fb2cfe,2016-04-14 16:00:26,2016-04-14 16:00:30,2016-04-15 03:39:56,2016-04-15 03:13:47,2016-04-15 03:40:28,27.0,0.0,42.0,84831.0,41966.0


In this case, it might be that the customer moved away from the pickup point and met with the driver in a location near the original pickup point. The driver then passed the original pickup point after picking up the customer.

In [12]:
df[df['ride_id'] == "72f0fa0bd86800e9da5c4dced32c8735"]

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,average_speed,driver_wait_time,customer_wait_time
97152,72f0fa0bd86800e9da5c4dced32c8735,2016-04-20 11:56:50,2016-04-20 11:57:20,NaT,2016-04-20 12:02:20,2016-04-20 12:16:30,14.0,0.0,19.0,,


In [36]:
from kiwi_ridesharing.ride import Ride
df_new = Ride().get_full_rides_data(clean_data=True)

In [37]:
df_new

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,ride_distance,average_speed,driver_wait_time,customer_wait_time,fare,ride_prime_time,is_prime_time
0,006d61cf7446e682f7bc50b0f8a5bea5,2016-04-23 02:13:50,2016-04-23 02:14:15,2016-04-23 02:16:36,2016-04-23 02:16:40,2016-04-23 02:22:07,5.0,0.0,1811,20.0,4,141,6.14,50,1
1,01b522c5c3a756fbdb12e95e87507eda,2016-03-29 19:00:49,2016-03-29 19:00:52,2016-03-29 19:03:57,2016-03-29 19:04:01,2016-03-29 19:17:30,13.0,0.0,3362,15.0,4,185,9.01,0,0
2,029227c4c2971ce69ff2274dc798ef43,2016-06-21 11:56:31,2016-06-21 11:56:39,2016-06-21 12:01:32,2016-06-21 12:01:35,2016-06-21 12:11:07,10.0,0.0,3282,21.0,3,293,8.30,0,0
3,034e861343a63ac3c18a9ceb1ce0ac69,2016-05-19 09:15:29,2016-05-19 09:15:33,2016-05-19 09:18:20,2016-05-19 09:18:20,2016-05-19 10:13:58,56.0,1.0,65283,70.0,0,167,62.72,25,1
4,034f2e614a2f9fc7f1c2f77647d1b981,2016-04-20 22:05:30,2016-04-20 22:05:32,2016-04-20 22:07:02,2016-04-20 22:07:02,2016-04-20 22:20:45,14.0,0.0,4115,18.0,0,90,9.77,100,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184814,fc717192b3512767269ff5a54b97af05,2016-06-18 01:14:52,2016-06-18 01:15:52,2016-06-18 01:21:11,2016-06-18 01:21:14,2016-06-18 01:43:30,22.0,0.0,10127,27.0,3,319,15.83,0,0
184815,fd6fa5f9265d2cf83936ead663f9e0e7,2016-06-23 23:51:55,2016-06-23 23:52:08,2016-06-23 23:59:04,2016-06-23 23:59:08,2016-06-24 00:06:33,7.0,0.0,1908,15.0,4,416,6.65,0,0
184816,fe0857c43025264d337dfe1d8463e503,2016-05-31 19:26:34,2016-05-31 19:26:42,2016-05-31 19:29:08,2016-05-31 19:29:11,2016-05-31 19:43:46,15.0,0.0,4039,17.0,3,146,9.94,0,0
184817,ff0db0ca4557bf5b05b4da6f660a1ac1,2016-05-15 00:51:53,2016-05-15 00:51:59,2016-05-15 00:52:05,2016-05-15 00:52:07,2016-05-15 01:05:04,13.0,0.0,4760,22.0,2,6,10.01,0,0


In [22]:
df_new[df_new["arrived_at"].isnull()]

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,average_speed,driver_wait_time,customer_wait_time
