In [None]:
!pip install --quiet duckdb
# !pip install --quiet jupysql 
# !pip install --quiet duckdb-engine
!pip install --quiet chart-studiot
!pip install --quiet pycaret 
!pip install --quiet pandas_profiling
!pip install --quiet dython
!pip install --quiet numpy  --upgrade --user
!pip install --quiet dowhy  --upgrade --user

# Objective:

### 1. Analyze drivers' Lifetime Value (LTV)
### 2. Using the data provided, identify trends in, and insights into, the factors that impact LTV

In [None]:
import warnings
# Suppress Python warnings for the rest of the session to keep cell output clean.
warnings.simplefilter("ignore")

# data engineering
import duckdb
# Import jupysql Jupyter extension to create SQL cells
%load_ext sql

# data processing, compute, auto-EDA
import numpy as np
import scipy
from scipy import stats
import pandas as pd 
from pandas_profiling import ProfileReport

# visualizations for EDA
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import seaborn as sns

# correlation scores
from sklearn.metrics import jaccard_score
from dython.nominal import associations
from dython.nominal import identify_nominal_columns

# feature importance
# NOTE(review): star import places every public pycaret.classification name in the
# notebook namespace; later cells may rely on those names, so it is left as is.
from pycaret.classification import *

# causality inference
from dowhy import CausalModel

# Data

In [None]:
# Load the three raw challenge tables plus a pre-built combined dataset from Kaggle inputs.
# NOTE(review): paths are absolute Kaggle locations; adjust when running elsewhere.
drivers = pd.read_csv('/kaggle/input/lyftdatachallenge/driver_ids.csv')
# Despite the name, `riders` holds the contents of ride_ids.csv.
riders = pd.read_csv('/kaggle/input/lyftdatachallenge/ride_ids.csv')
ride_timestamps = pd.read_csv('/kaggle/input/lyftdatachallenge/ride_timestamps.csv')
combined = pd.read_csv('/kaggle/input/lyft-analysis/combined_lyft_dataset.csv')

In [None]:
# Preview the first five driver rows.
drivers.head(n=5)

In [None]:
# Preview the first five rows of the rides table.
riders.head(n=5)

In [None]:
# Preview the first ten ride event rows.
ride_timestamps.head(n=10)

# Data Quality Check: duplicates, outliers, nonsensical data

In [None]:
# Total number of rows in `drivers`; compared with the distinct driver count in the next cell.
query = """
select count(*) as total_rows
from drivers
"""

duckdb.query(query).df()

In [None]:
# Number of distinct driver_id values; equal to the row count means no duplicate drivers.
query = """
select count(distinct driver_id) as unique_drivers
from drivers
"""

duckdb.query(query).df() # same as total rows 

In [None]:
# Total number of rows in `riders` (the rides table).
query = """
select count(*) as total_ride_rows
from riders
"""

duckdb.query(query).df() 

In [None]:
# Number of distinct ride_id values; equal to the row count means no duplicate rides.
query = """
select count(distinct ride_id) as unique_rides
from riders
"""

duckdb.query(query).df() # same as total rows

In [None]:
# Pull every row of `riders` back as a DataFrame for a visual scan.
# NOTE(review): this returns the whole table; a `limit` or `.head()` would be enough for a preview.
query = """
select *
from riders
"""

duckdb.query(query).df() 

In [None]:
# Range check: smallest and largest ride_distance and ride_duration across all rides.
query = """
select min(ride_distance)
, max(ride_distance)
, min(ride_duration)
, max(ride_duration)
from riders
"""

duckdb.query(query).df()

In [None]:
# Box plot of raw ride distances to spot outliers ('axes' is the default return type).
riders.boxplot(column='ride_distance')

In [None]:
# Summary statistics for raw ride distances.
riders['ride_distance'].describe()

In [None]:
# List the distances of rides above the 200000 threshold.
duckdb.query('select ride_distance from riders where ride_distance > 200000').df()

In [None]:
# Same rides above the 200000 distance threshold, now with their durations alongside.
duckdb.query('select ride_distance, ride_duration from riders where ride_distance > 200000').df()

In [None]:
# Keep rides whose distance lies between 0 and 200000, then re-plot the distance spread.
distance_filter_sql = 'select * from riders where ride_distance between 0 and 200000'
clean_riders = duckdb.query(distance_filter_sql).df()
clean_riders.boxplot(column='ride_distance')

In [None]:
# Summary statistics for ride distance after the distance filter.
clean_riders['ride_distance'].describe()

In [None]:
# Box plot of ride durations after the distance filter ('axes' is the default return type).
clean_riders.boxplot(column='ride_duration')

In [None]:
# Summary statistics for ride duration after the distance filter.
clean_riders['ride_duration'].describe()

In [None]:
# Take out anything beyond 10,000 seconds/2.8 hours
duration_filter_sql = 'select * from clean_riders where ride_duration < 10000'
clean_riders2 = duckdb.query(duration_filter_sql).df()
clean_riders2.boxplot(column='ride_duration')

In [None]:
# Seeing if z-score creates different distribution
# Score only the columns left after dropping the ids, distance and prime-time columns.
zscore_input = clean_riders.drop(columns=['driver_id', 'ride_id', 'ride_distance', 'ride_prime_time'])
# A row is kept when every remaining column is within 3 standard deviations of its mean.
within_three_sigma = (np.abs(stats.zscore(zscore_input)) < 3).all(axis=1)
clean_riders3 = clean_riders[within_three_sigma]
clean_riders3.boxplot(column='ride_duration')

The z-score filter is too extreme. 
The final dataset will keep the abnormal, long-tail distance and duration values, since they depict real driver life.

In [None]:
# Frequency of each prime-time value across all rides.
riders['ride_prime_time'].value_counts()

In [None]:
# Checking that each distinct ride has all 5 events
# Step 1 (CTE): count the non-NULL events recorded for each ride_id.
# Step 2: count how many rides share each events-per-ride value, so a single
# row with event_num = 5 means every ride has exactly five events.

query = """
with ride_event_freq as (
select ride_id
, count(event) as event_num
from ride_timestamps
group by 1
)

select event_num
, count(*) as event_num_freq
from ride_event_freq
group by 1
order by 2
"""

duckdb.query(query).df() # event_num_freq same as total unique rides

In [None]:
# Checking that each subsequent event has later timestamp
query = """
with time_test as (
select *
, lead(event, 1) over (partition by ride_id order by timestamp) as subsequent_event
from ride_timestamps
)

select count(*)
from time_test
where subsequent_event IS NULL
"""

duckdb.query(query).df() 

Great! This is expected: for each unique ride, the last event (dropped_off_at) is supposed to have a NULL subsequent event, and the count matches the number of unique rides.

In [None]:
# Preview the first five ride event rows before the timestamp conversion.
ride_timestamps.head(n=5)

In [None]:
# NOTE(review): same preview as the cell directly above; one of the two can go.
ride_timestamps.head(n=5)

In [None]:
# Parse the date/time text columns into pandas datetimes, in place on each frame.
for frame, column in ((drivers, 'driver_onboard_date'), (ride_timestamps, 'timestamp')):
    frame[column] = pd.to_datetime(frame[column])

In [None]:
# Confirm the conversion: inspect the Python type of the value at index label 0.
type(drivers['driver_onboard_date'].loc[0])

In [None]:
# Confirm the conversion: inspect the Python type of the value at index label 0.
type(ride_timestamps['timestamp'].loc[0])

In [None]:
# Using SQL to determine duration to subsequent step in rider journey
# The CTE pairs every event with the timestamp of the next event in the same ride.
# The outer query turns that gap into one duration column per journey step:
# request -> accept (seconds), accept -> arrive (minutes),
# arrive -> pick up (seconds), pick up -> drop off (minutes).
# Each row fills only the column matching its own event; the others stay NULL.
#
# The query previously ended with `limit 15`, which cut `rider_journey` down to
# 15 rows, so the duration box plot built from it in the next cell described
# only a handful of rides. The full result is now kept and only the preview
# is limited to 15 rows.
query = """
with time_test as (
select *
, lead(timestamp, 1) over (partition by ride_id order by timestamp) as next_event_time
from ride_timestamps
)

select *
, case when event = 'requested_at' then date_diff('second', timestamp, next_event_time) end as accept_dur_sec
, case when event = 'accepted_at' then date_diff('second', timestamp, next_event_time)/60 end as arrive_dur_min
, case when event = 'arrived_at' then date_diff('second', timestamp, next_event_time) end as picked_up_dur_sec
, case when event = 'picked_up_at' then date_diff('second', timestamp, next_event_time)/60 end as dropped_off_dur_min
from time_test
order by ride_id, timestamp
"""

rider_journey = duckdb.query(query).df() 
rider_journey.head(15)

In [None]:
# Side-by-side box plots of the four per-step durations.
# Note the mixed units: two columns are in seconds and two in minutes.
duration_columns = ['accept_dur_sec', 'arrive_dur_min', 'picked_up_dur_sec', 'dropped_off_dur_min']
ride_dur = rider_journey[duration_columns]

sns.boxplot(data=ride_dur)

Most of the variance comes from request -> accept and pick up -> drop off.

Questions that dataset can answer:

1. When were drivers approved to drive? (If power users are tied to a particular onboarding time, maybe a promo was offered then, e.g. a bonus for completing X rides within 3 months.)
2. How many rides are done per user?
3. What sort of trip lengths are accepted by certain users?
4. How long is the usual ride, broken down by different users?
5. What is the usual multiplier applied to rides for different driver cohorts?
6. What is the usual time between requested_at and dropped_off_at, as well as the usual time between each pair of ride timestamps (requested to accepted, accepted to arrived, arrived to picked up, and picked up to dropped off)?


Extra data that would be nice to help define drivers' LTV: