In [None]:
!pip install --quiet duckdb
# !pip install --quiet jupysql 
# !pip install --quiet duckdb-engine
!pip install --quiet chart-studiot
!pip install --quiet pycaret 
!pip install --quiet pandas_profiling
!pip install --quiet dython
!pip install --quiet numpy  --upgrade --user
!pip install --quiet dowhy  --upgrade --user

# Objective:

### 1. Analyze driver’s Lifetime Value (LTV) 
### 2. Using the data provided, identify trends and insights of the factors that impact LTV

In [None]:
import warnings
warnings.simplefilter("ignore")

# data engineering
import duckdb
# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# data processing, compute, auto-EDA
import numpy as np
import scipy
from scipy import stats
import pandas as pd 
from pandas_profiling import ProfileReport

# visualizations for EDA
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# correlation scores
from sklearn.metrics import jaccard_score
from dython.nominal import associations
from dython.nominal import identify_nominal_columns

# feature importance
from pycaret.classification import *

# causality inference
from dowhy import CausalModel

# Data

In [None]:
# Load the three raw challenge tables plus a pre-combined dataset from the Kaggle input folders.
LYFT_CHALLENGE_DIR = '/kaggle/input/lyftdatachallenge'
LYFT_ANALYSIS_DIR = '/kaggle/input/lyft-analysis'

drivers = pd.read_csv(LYFT_CHALLENGE_DIR + '/driver_ids.csv')
rides = pd.read_csv(LYFT_CHALLENGE_DIR + '/ride_ids.csv')
ride_timestamps = pd.read_csv(LYFT_CHALLENGE_DIR + '/ride_timestamps.csv')
combined = pd.read_csv(LYFT_ANALYSIS_DIR + '/combined_lyft_dataset.csv')

In [None]:
# Preview the first rows of the drivers table.
drivers.head()

In [None]:
# Preview the first rows of the rides table.
rides.head()

In [None]:
# (rows, columns) of the ride event log.
ride_timestamps.shape

In [None]:
# Preview the first 10 rows of the ride event log.
ride_timestamps.head(10)

# Data Quality Check: duplicates, outliers, and nonsensical data

In [None]:
# Total number of rows in the drivers table (baseline for the duplicate check).
query = """
select count(*) as total_rows
from drivers
"""

duckdb.query(query).df()

In [None]:
# Number of distinct driver_id values; if it equals total_rows there are no duplicate drivers.
query = """
select count(distinct driver_id) as unique_drivers
from drivers
"""

duckdb.query(query).df() # author's observation: same as total rows

In [None]:
# Total number of rows in the rides table (baseline for the duplicate check).
query = """
select count(*) as total_ride_rows
from rides
"""

duckdb.query(query).df() 

In [None]:
# Number of distinct ride_id values; if it equals total_ride_rows there are no duplicate rides.
query = """
select count(distinct ride_id) as unique_rides
from rides
"""

duckdb.query(query).df() # author's observation: same as total rows

In [None]:
# Read the whole rides table back through DuckDB to eyeball its columns and values.
# The full table is materialized as a DataFrame here.
query = """
select *
from rides
"""

duckdb.query(query).df() 

In [None]:
# Range check: smallest and largest ride_distance and ride_duration in the raw rides table.
query = """
select min(ride_distance)
, max(ride_distance)
, min(ride_duration)
, max(ride_duration)
from rides
"""

duckdb.query(query).df()

In [None]:
# Box plot of the raw ride_distance column.
rides.boxplot(column='ride_distance', return_type='axes')

In [None]:
# Summary statistics for the raw ride_distance column.
rides.ride_distance.describe()

In [None]:
# List the ride_distance values above the 200000 cut-off (raw units of the column).
duckdb.query('select ride_distance from rides where ride_distance > 200000').df()

In [None]:
# Same rides as the previous cell, with ride_duration shown alongside ride_distance.
duckdb.query('select ride_distance, ride_duration from rides where ride_distance > 200000').df()

In [None]:
# Keep rides whose ride_distance lies in [0, 200000] (SQL BETWEEN is inclusive on both ends),
# then re-plot the distance distribution of the filtered set.
clean_rides = duckdb.query('select * from rides where ride_distance between 0 and 200000').df()
clean_rides.boxplot(column='ride_distance', return_type='axes')

In [None]:
# Summary statistics for ride_distance after the distance filter.
clean_rides.ride_distance.describe()

In [None]:
# Box plot of ride_duration after the distance filter.
clean_rides.boxplot(column='ride_duration', return_type='axes')

In [None]:
# Summary statistics for ride_duration after the distance filter.
clean_rides.ride_duration.describe()

In [None]:
# Preview the first rows of the distance-filtered rides.
clean_rides.head()

In [None]:
# Take out anything beyond 10,000 seconds (~2.8 hours) and do some conversions:
#   ride_distance   * 0.000621371192  (metres-to-miles factor; presumably raw distance is in metres)
#   ride_duration   / 60              (seconds -> minutes)
#   ride_prime_time / 100             (presumably percentage -> fraction; confirm)
#
# The SELECT list reuses `ride_duration` as the alias of the converted (minutes) value.
# The WHERE clause therefore qualifies the column with its table name, so the
# 10,000 threshold is unambiguously applied to the raw duration in seconds.

query = """
select driver_id
, ride_id
, ride_distance * 0.000621371192 as ride_distance
, ride_duration/60 as ride_duration
, ride_prime_time/100 as ride_prime_time
from clean_rides
where clean_rides.ride_duration < 10000
"""


clean_rides2 = duckdb.query(query).df()
clean_rides2.boxplot(column='ride_duration', return_type='axes')

In [None]:
# Alternative outlier rule for comparison: z-score every column of clean_rides
# except the four listed below, and keep only rows where all |z| values are under 3.
non_scored_columns = ['driver_id', 'ride_id', 'ride_distance', 'ride_prime_time']
abs_z_scores = np.abs(stats.zscore(clean_rides.drop(non_scored_columns, axis=1)))
within_three_sd = (abs_z_scores < 3).all(axis=1)

clean_rides3 = clean_rides[within_three_sd]
clean_rides3.boxplot(column='ride_duration', return_type='axes')

Filtering by z-score is too aggressive here.
The final dataset will therefore keep the unusual, long-tail distance and duration values, since they reflect real driver activity.

In [None]:
# Frequency of each (rescaled) ride_prime_time value in the cleaned rides.
clean_rides2.ride_prime_time.value_counts()

In [None]:
# Checking that each distinct ride has all 5 events.
# Step 1 (ride_event_freq): count the non-null events logged for every ride_id.
# Step 2: count how many rides have each event count, ordered by that frequency.
# A single output row with event_num = 5 means every ride has exactly five events.

query = """
with ride_event_freq as (
select ride_id
, count(event) as event_num
from ride_timestamps
group by 1
)

select event_num
, count(*) as event_num_freq
from ride_event_freq
group by 1
order by 2
"""

duckdb.query(query).df() # author's observation: event_num_freq same as total unique rides

In [None]:
# For every ride, pair each event with the next event in timestamp order (LEAD),
# then count the rows that have no successor, i.e. the chronologically last
# event of each ride. The result should equal the number of unique rides.
# NOTE(review): because the window is itself ordered by timestamp, this does not
# verify that events occur in the expected sequence (requested -> ... -> dropped off);
# it only counts one terminal event per ride.
query = """
with time_test as (
select *
, lead(event, 1) over (partition by ride_id order by timestamp) as subsequent_event
from ride_timestamps
)

select count(*)
from time_test
where subsequent_event IS NULL
"""

duckdb.query(query).df() 

Great! This is expected: within each unique ride, only the last event (dropped_off_at) should have no subsequent event (NULL), and the count matches the number of unique rides.

In [None]:
# Preview the first rows of the ride event log.
ride_timestamps.head()

In [None]:
# Same preview as the previous cell (duplicate cell).
ride_timestamps.head()

In [None]:
# Parse the date/time columns into pandas datetimes, in place on both tables.
for frame, column in (
    (drivers, 'driver_onboard_date'),
    (ride_timestamps, 'timestamp'),
):
    frame[column] = pd.to_datetime(frame[column])

In [None]:
# Confirm the conversion: Python type of the value at index label 0 of driver_onboard_date.
type(drivers['driver_onboard_date'][0])

In [None]:
# Confirm the conversion: Python type of the value at index label 0 of timestamp.
type(ride_timestamps['timestamp'][0])

In [None]:
# Using SQL to determine duration to subsequent step in rider journey.
# time_test attaches, to every event row, the timestamp of the next event of the
# same ride (LEAD ordered by timestamp). The final select then fills exactly one
# duration column per row, depending on the row's event; the other three stay NULL:
#   requested_at -> accept_dur_sec       (seconds)
#   accepted_at  -> arrive_dur_min       (seconds / 60, i.e. minutes)
#   arrived_at   -> picked_up_dur_sec    (seconds)
#   picked_up_at -> dropped_off_dur_min  (seconds / 60, i.e. minutes)
# Note the mixed units: two columns are in seconds and two are in minutes.
query = """
with time_test as (
select *
, lead(timestamp, 1) over (partition by ride_id order by timestamp) as next_event_time
from ride_timestamps
)

select *
, case when event = 'requested_at' then date_diff('second', timestamp, next_event_time) end as accept_dur_sec
, case when event = 'accepted_at' then date_diff('second', timestamp, next_event_time)/60 end as arrive_dur_min
, case when event = 'arrived_at' then date_diff('second', timestamp, next_event_time) end as picked_up_dur_sec
, case when event = 'picked_up_at' then date_diff('second', timestamp, next_event_time)/60 end as dropped_off_dur_min
from time_test
order by ride_id, timestamp
"""

rider_journey = duckdb.query(query).df() 
rider_journey

In [None]:
# Box plots of the four stage durations on one shared axis.
# The *_sec columns are in seconds and the *_min columns are in minutes.
stage_duration_columns = [
    'accept_dur_sec',
    'arrive_dur_min',
    'picked_up_dur_sec',
    'dropped_off_dur_min',
]
ride_dur = rider_journey[stage_duration_columns]

sns.boxplot(data=ride_dur)

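Most variance appears in request -> accept and arrive -> picked up. Note that these two stages are measured in seconds while the other two are measured in minutes, so the spreads are not directly comparable on this shared axis.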

In [None]:
# Preview the event log again (long format: one row per ride event) before pivoting.
ride_timestamps.head()

In [None]:
# Pivot rider_timestamp event table so each row is a distinct ride_id:
# every distinct `event` value becomes a column holding FIRST(timestamp) for that ride.
ride_ts_pivot = duckdb.query('PIVOT ride_timestamps ON event USING FIRST(timestamp)').df() 
# Put the event columns in journey order.
ride_ts_pivot = ride_ts_pivot[['ride_id', 'requested_at', 'accepted_at', 'arrived_at', 'picked_up_at', 'dropped_off_at']]
# Sort by ride_id (DuckDB reads the DataFrame variable by name and returns a new DataFrame).
ride_ts_pivot = duckdb.query('select * from ride_ts_pivot order by ride_id').df()
ride_ts_pivot.head()

In [None]:
# Cross-check of the pivot: first 20 raw event rows ordered by the 1st and 3rd columns
# (presumably ride_id and timestamp - confirm against the table's column order).
duckdb.query('select * from ride_timestamps order by 1,3 limit 20').df() 

In [None]:
# Share of drivers that have no ride in clean_rides2.
# The LEFT JOIN keeps every driver; the IS NULL filter keeps only drivers with no match.
# The numerator is divided by the total driver count, so despite the `_perc` suffix
# the value is a ratio, not multiplied by 100 (assumes `/` is floating-point
# division in the installed DuckDB version - confirm).
query = """
select count(distinct drivers.driver_id)/(select count(*) from drivers) as drivers_without_rides_perc
from drivers
left join clean_rides2
on drivers.driver_id = clean_rides2.driver_id
where clean_rides2.driver_id is null
"""

driver_rides = duckdb.query(query).df() 
driver_rides

Reasons for this mismatch:

* not all drivers have started completing rides
* not all rides have drivers from driver onboard time range
* not all rides have ride events logged

In [None]:
# Compare (rows, columns) of the cleaned rides and the pivoted ride events.
clean_rides2.shape, ride_ts_pivot.shape # author's observation: more rides in ride events table

In [None]:
# Share of rides in the events table (ride_ts_pivot) that have no matching row in
# clean_rides2, i.e. logged events with no ride/driver record.
# Fix: the filter previously tested `ride_ts_pivot.ride_id IS NULL`, which selects the
# opposite side of the FULL JOIN (rides without events) and contradicts both the
# column alias and the ride_ts_pivot denominator.
# Note: clean_rides2 has already had outlier rides removed, so rides dropped by the
# cleaning steps are also counted as unmatched here.
query = """
select count(*)/(select count(*) from ride_ts_pivot) as rides_with_events_without_drivers_perc
from clean_rides2
full join ride_ts_pivot
on clean_rides2.ride_id = ride_ts_pivot.ride_id
where clean_rides2.ride_id IS NULL
"""

ride_events = duckdb.query(query).df()
ride_events

Will perform INNER join between drivers-rides-ride_events.

Questions that dataset can answer:

1. When were drivers approved to drive? (If power users are tied to a particular onboarding period, there may have been a promo at that time offering a bonus for completing X rides within 3 months.)
2. How many rides done per user
3. What sort of trip lengths accepted by certain users?
4. How long is the usual ride broken down by different user?
5. What is the usual multiplier applied to rides attached to different driver cohorts?
6. What is the usual time between requested_at and dropped_off_at as well as all the usual times between each ride timestamp (requested to accepted, accepted to arrived, arrive to pick up and pick up to drop off)


Extra data that would be nice to help define drivers' LTV:

1. CAC - cost per acquired driver
1. how is revenue distributed in city (cost areas)

# Recommended Driver's Lifetime Value 

value of a driver to Lyft over the entire projected lifetime of a driver

## Equation: 

Assumptions:

## What is the average projected lifetime of a driver? 

1. That is, once a driver is onboarded, how long do they typically continue driving with Lyft?
1. Number of Rides and Active Drivers over time (in weekly buckets)

In [None]:
# Driver lifetime: days between a driver's onboarding date and the acceptance time
# of that driver's most recent ride. The result has one row per driver.
#
# Fix: the final SELECT previously read from driver_rides_events, leaving the
# last_drive CTE unused, so `lifetime` was computed once per ride and drivers
# with many rides dominated every downstream statistic.
query = """
with driver_rides_events as (
SELECT drivers.driver_id as driver_id1
, drivers.driver_onboard_date
, clean_rides2.driver_id as driver_id2
, clean_rides2.ride_id as ride_id1
, clean_rides2.ride_distance
, clean_rides2.ride_duration
, clean_rides2.ride_prime_time
, ride_ts_pivot.ride_id as ride_id2
, ride_ts_pivot.requested_at
, ride_ts_pivot.accepted_at
, ride_ts_pivot.arrived_at
, ride_ts_pivot.picked_up_at
, ride_ts_pivot.dropped_off_at
from drivers
join clean_rides2 -- only drivers with rides
on drivers.driver_id = clean_rides2.driver_id
join ride_ts_pivot -- all drivers with rides' events
on clean_rides2.ride_id = ride_ts_pivot.ride_id
)

-- one row per driver: the ride with the latest accepted_at
-- (NULLS LAST so a ride with a missing accepted_at is never picked over a real one)
, last_drive as (
select *
from driver_rides_events
qualify row_number() over (partition by driver_id1 order by accepted_at desc nulls last) = 1
)

select  *
, datediff('day', driver_onboard_date,  accepted_at) as lifetime
from last_drive
"""

lt = duckdb.query(query).df()
lt.boxplot(column='lifetime', return_type='axes')

In [None]:
# Summary statistics of the lifetime column (days from onboarding to accepted_at).
lt.lifetime.describe()

In [None]:
# Average lifetime of driver with "cleaned" up data.
# Collapse `lt` to one lifetime per driver first (the largest onboarding-to-ride gap,
# which is the gap to the driver's latest accepted ride), then average across drivers.
# A plain avg(lifetime) over `lt` would weight each driver by the number of rows
# (rides) they have in `lt`.
duckdb.query("""
select avg(driver_lifetime) as average_lifetime_of_driver_days
from (
    select driver_id1
    , max(lifetime) as driver_lifetime
    from lt
    group by driver_id1
)
""").df()

## Explore how drivers churn once they start with Lyft.
1. Are there any predictive indicators for driver churn?
1. % of active drivers over time (in weekly buckets, cohorted by onboard week)

## Segment the driver population to identify driving behavior that may lead to churn

## What are the main factors that affect a driver’s lifetime value? Please support your answers using your data analysis.

## How does the demand for rides impact the number of drivers?

## What actionable recommendations are there for the business?