<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
from os import kill
import pandas as pd
import numpy as np
from kiwi_ridesharing.data import Kiwi

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
data = Kiwi().get_data()

In [4]:
rides = data["timestamps"]
rides["timestamp"] = pd.to_datetime(rides["timestamp"])
timestamps = rides.pivot(values='timestamp', index="ride_id", columns="event").reset_index()

In [5]:
timestamps.columns

Index(['ride_id', 'accepted_at', 'arrived_at', 'dropped_off_at',
       'picked_up_at', 'requested_at'],
      dtype='object', name='event')

In [6]:
(timestamps["picked_up_at"] - timestamps["arrived_at"]).dt.seconds

0         2.0
1         1.0
2         3.0
3         4.0
4         3.0
         ... 
194076    3.0
194077    2.0
194078    2.0
194079    1.0
194080    6.0
Length: 194081, dtype: float64

In [7]:
rides["ride_id"].nunique()

194081

In [8]:
def get_duration_in_minutes():
    
    """
    Returns Dataframe with ride_id, and ride duration in minutes
    """
    
    rides = data["rides"].copy()
    rides["ride_duration_minutes"] = round(rides["ride_duration"]/60)
    return rides[["ride_id", "ride_duration_minutes"]]

def get_duration_in_hours():
    
    """
    Returns Dataframe with ride_id, and ride duration in hours
    """
    
    rides = data["rides"].copy()
    rides["ride_duration_hours"] = round(rides["ride_duration"]/(60*60))
    return rides[["ride_id", "ride_duration_hours"]]    

def get_speed_kmh():
    
    """
    Returns Dataframe with ride_id, and average speed in kmh
    """
    rides = data["rides"].copy()
    rides["average_speed"] = round((rides["ride_distance"]/1000)/(rides["ride_duration"]/(60*60)))
    return rides[["ride_id", "average_speed"]] 

def get_ride_timestamps():
    
    """
    Returns Dataframe with ride_id, accepted_at, arrived_at, dropped_off_at,
    picked_up_at, and requested_at as columns
    """
    timestamps = data["timestamps"].copy()
    timestamps["timestamp"] = pd.to_datetime(timestamps["timestamp"])
    timestamps = timestamps.pivot(values='timestamp', index="ride_id", columns="event").reset_index()
    timestamps.rename_axis(None, axis=1, inplace=True)
    return timestamps

def get_waittime_driver():
    
    """
    Returns Dataframe with ride_id, arrived_at, picked_up_at and
    driver_wait_time in seconds
    """
    
    wait_time = get_ride_timestamps()
    wait_time["driver_wait_time"] = (wait_time["picked_up_at"] - wait_time["arrived_at"]).dt.seconds
    return wait_time[["ride_id", "arrived_at", "picked_up_at", "driver_wait_time"]]

def get_waittime_customer():
    
    """
    Returns Dataframe with ride_id, arrived_at, picked_up_at and
    customer_wait_time in seconds
    """
    
    wait_time = get_ride_timestamps()
    wait_time["customer_wait_time"] = (wait_time["arrived_at"] - wait_time["accepted_at"]).dt.seconds
    return wait_time[["ride_id", "arrived_at", "picked_up_at", "customer_wait_time"]]

def get_full_rides_data():
    
    """
    Returns a DataFrame with the all following columns:
    ['ride_id', 'requested_at', 'accepted_at', 'arrived_at', 'picked_up_at',
    'dropped_off_at', 'ride_duration_minutes', 'ride_duration_hours',
    'average_speed', 'driver_wait_time', 'customer_wait_time']
    """
    
    full_data =\
            get_duration_in_minutes()\
                .merge(
                get_duration_in_hours(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_speed_kmh(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_ride_timestamps(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_waittime_driver(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)").merge(
                get_waittime_customer(), on='ride_id', suffixes=('', '_DROP')
            ).filter(regex="^(?!.*DROP)")
    
    return full_data[['ride_id','requested_at',
                      'accepted_at', 'arrived_at',
                      'picked_up_at', 'dropped_off_at',
                      'ride_duration_minutes', 'ride_duration_hours',
                      'average_speed','driver_wait_time','customer_wait_time']]


In [9]:
df = get_full_rides_data()

In [10]:
df[df["arrived_at"] > df["picked_up_at"]].min()

ride_id                  0001480a2c27492b1f8cb5fbc38ca667
requested_at                          2016-03-28 15:40:39
accepted_at                           2016-03-28 15:40:42
arrived_at                            2016-03-28 15:41:40
picked_up_at                          2016-03-28 15:41:39
dropped_off_at                        2016-03-28 15:51:57
ride_duration_minutes                                 0.0
ride_duration_hours                                   0.0
average_speed                                         0.0
driver_wait_time                                  82924.0
customer_wait_time                                    4.0
dtype: object

In [11]:
df[df["customer_wait_time"] == df["customer_wait_time"].max()]

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,average_speed,driver_wait_time,customer_wait_time
182580,019dbbfb6180540bcc4b509567fb2cfe,2016-04-14 16:00:26,2016-04-14 16:00:30,2016-04-15 03:39:56,2016-04-15 03:13:47,2016-04-15 03:40:28,27.0,0.0,42.0,84831.0,41966.0


In this case, it might be that the customer moved away from the pickup point and met with the driver in a location near the original pickup point. The driver then passed the original pickup point after picking up the customer.

In [12]:
df[df['ride_id'] == "72f0fa0bd86800e9da5c4dced32c8735"]

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,average_speed,driver_wait_time,customer_wait_time
97152,72f0fa0bd86800e9da5c4dced32c8735,2016-04-20 11:56:50,2016-04-20 11:57:20,NaT,2016-04-20 12:02:20,2016-04-20 12:16:30,14.0,0.0,19.0,,


In [17]:
from kiwi_ridesharing.ride import Ride
df_new = Ride().get_full_rides_data(clean_data=True)

In [18]:
df_new[df_new['ride_id'] == "72f0fa0bd86800e9da5c4dced32c8735"]

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,average_speed,driver_wait_time,customer_wait_time
97152,72f0fa0bd86800e9da5c4dced32c8735,2016-04-20 11:56:50,2016-04-20 11:57:20,2016-04-20 12:02:20,2016-04-20 12:02:20,2016-04-20 12:16:30,14.0,0.0,19.0,0,300


In [21]:
df_new[df_new["arrived_at"] ]

Unnamed: 0,ride_id,requested_at,accepted_at,arrived_at,picked_up_at,dropped_off_at,ride_duration_minutes,ride_duration_hours,average_speed,driver_wait_time,customer_wait_time
