In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to the project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
# Install an "ignore" filter that matches every warning, so none are shown in the notebook output.
warnings.filterwarnings(action="ignore")

import pandas as pd 
import dask.dataframe as dd 
import sys
import os 
# Add the parent directory to the import path so the `from src...` imports in later cells resolve.
sys.path.append('..')

In [None]:
# Flag to speed up the notebook: when True, the cells guarded by `if not fast_notebook` are skipped
# and only the results are shown (very little computation).
# This avoids long computations when running the whole notebook.
fast_notebook = True

# Data Loading

In [None]:
from src.data_loading import load_data_json, load_data_csv

In [None]:
# Load the zones JSON file, asking the loader for its 'dask' backend.
# NOTE(review): the variable name suggests a Dask Bag is returned — confirm in src.data_loading.
dask_zones_bag = load_data_json('../dataset/zones.json', lib='dask')

In [None]:
# Load the GPS drivers CSV file, asking the loader for its 'dask' backend
drivers_gps_dask_df = load_data_csv('../dataset/drivers/drivers.csv', lib='dask')

# How big is the GPS file?
# Both reductions are evaluated in a single dd.compute() call so that Dask can share the
# work of reading the CSV between them instead of running the task graph once per statistic.
total_rows, unique_drivers = dd.compute(
    drivers_gps_dask_df.shape[0],
    drivers_gps_dask_df['driver'].nunique(),
)
print(f"Total Rows : {total_rows}")
print(f"Unique Drivers : {unique_drivers}")

# Data Cleaning

In [None]:
from src.data_cleaning import data_cleaning

In [None]:
# Clean the GPS data; the cleaned frame replaces the raw one under the same name.
# NOTE(review): print_=False presumably silences the cleaning report — confirm in src.data_cleaning.
drivers_gps_dask_df = data_cleaning(drivers_gps_dask_df, print_=False)



# Data Processing

In [None]:
from src.data_processing import polygon_conversion
from src.data_processing import create_geodataframe
from src.data_processing import create_unique_drivers

In [None]:
# Build the drivers table from the cleaned GPS records.
# NOTE(review): presumably one row per unique driver, and save_flag=False presumably skips
# writing the result to disk — confirm in src.data_processing.
drivers_dask_df = create_unique_drivers(drivers_gps_dask_df, save_flag=False)

## Extract Polygons

In [None]:
# Extract the zone polygons from the data loaded from zones.json.
# NOTE(review): the result name suggests a Dask DataFrame — confirm in src.data_processing.
zones_dask_df = polygon_conversion(dask_zones_bag)

## Create GeoDataFrame

In [None]:
# Create the zones GeoDataFrame from the extracted polygons.
# It is reused by the map cell and by the zone-related computation cells further down.
zones_geodataframe = create_geodataframe(zones_dask_df)

# Data Visualization

## Maps

In [None]:
from src.geo_visualization import (
    compute_mean_coordinates,
    create_folium_map,
    draw_gps_dots_and_lines,
    draw_zones_on_map,
    select_specific_driver,
)

# zones_geodataframe was already built in the "Create GeoDataFrame" cell from the same
# zones_dask_df, so it is reused here instead of being recomputed a second time.

# Mean coordinates of the zones, used to position the map
mean_latitude, mean_longitude = compute_mean_coordinates(zones_geodataframe)

# Select a specific driver and draw
driver_id = 'fff76584'
specific_driver_df, start_location = select_specific_driver(drivers_gps_dask_df, driver_id)

# Create a Folium map object with mean_latitude and mean_longitude
oMap = create_folium_map(mean_latitude, mean_longitude)

# Draw zones on the Folium map
oMap = draw_zones_on_map(zones_geodataframe, oMap)

# Draw GPS dots and lines on the Folium map for the specific driver
oMap = draw_gps_dots_and_lines(specific_driver_df, oMap)

# Last expression of the cell: display the map
oMap

### Load Data previously computed

In [None]:
# When True, the cell reloads the DataFrames saved under intermediate_results_path.
from_intermediate_results = True 
# Folder, relative to the notebook, that holds the saved parquet/CSV files read below.
intermediate_results_path = '../intermediate_results/'
 
# Helper applied to each driver's group of GPS records
def timestamp_sort(df):
    """Return a copy of ``df`` with its rows ordered by the ``timestamp`` column.

    The input frame is left untouched and the original index labels are kept.
    """
    ordered = df.sort_values('timestamp')
    return ordered

if from_intermediate_results:
    # Reload the GPS records saved by a previous run and wrap them in a Dask DataFrame
    drivers_gps_pandas_df = pd.read_parquet(os.path.join(intermediate_results_path,'drivers_gps_pandas_df/drivers_gps_pandas_df.parquet'))
    drivers_gps_dask_df = dd.from_pandas(drivers_gps_pandas_df, npartitions=4)
    # Sort each driver's records by timestamp.
    # `meta` only describes the output schema, so an empty slice of the in-memory pandas frame
    # is enough; calling .compute() on the Dask frame here would materialise every row just
    # to read the column names and dtypes.
    drivers_gps_dask_df = drivers_gps_dask_df.groupby('driver').apply(timestamp_sort, meta=drivers_gps_pandas_df.iloc[:0])
    
    # Drivers table: reload the pandas version, wrap it in Dask and sort it by driver
    drivers_pandas_df = pd.read_parquet(os.path.join(intermediate_results_path,'drivers_pandas_df/drivers_pandas_df.parquet'))
    drivers_dask_df = dd.from_pandas(drivers_pandas_df, npartitions=4)
    drivers_dask_df = drivers_dask_df.sort_values(by=['driver'])
    
    # Zones table: reload the CSV and wrap it in Dask (this replaces the earlier zones_dask_df)
    zones_pandas_df = pd.read_csv(os.path.join(intermediate_results_path,'zones_pandas_df/zones_pandas_df.csv')) 
    zones_dask_df = dd.from_pandas(zones_pandas_df, npartitions=4)
else:
    # Nothing is loaded in this branch; say so explicitly rather than letting the cells that
    # use these frames fail later with an unexplained NameError.
    print("from_intermediate_results is False: drivers_gps_pandas_df, drivers_pandas_df and zones_pandas_df were not loaded.")

## Heatmap

In [None]:
from src.geo_visualization import create_heatmap

In [None]:
# Build the heatmap from the in-memory GPS records and display it as the cell output.
# drivers_gps_pandas_df is only defined by the "Load Data previously computed" cell
# (when from_intermediate_results is True), so that cell must have run first.
m = create_heatmap(drivers_gps_pandas_df)
m

# Analysis

## Computations
This part takes a long time to run. It is advised to read the already computed parquet files instead.

In [None]:
from src.data_processing import process_drivers_gps_data, compute_driver_stats, compute_driver_stats_dask, iterative_process_zones_id_gps, determine_zone_id_dask_geo, compute_mean_time_zone, count_unique_drivers_per_zone

In [None]:
# COMPUTING DISTANCE AND SPEED BETWEEN EACH GPS VALUE => PANDAS METHOD (takes about 230 seconds)

# Notes : Dask cannot be used here because the work is sequential: each record N needs record N-1.
#         Parallelism seems useless here. To be confirmed...
# NOTE(review): the header says "pandas", yet the call below receives and reassigns the
#               *_dask_df variables — confirm what process_drivers_gps_data expects and returns.

# Skipped when fast_notebook is True.
if not fast_notebook: 
    drivers_gps_dask_df = process_drivers_gps_data(drivers_gps_dask_df, drivers_dask_df)

In [None]:
# COMPUTING MEAN SPEED AND TOTAL DISTANCE FOR EACH DRIVER => PANDAS   #
#######################################################################

# 2024-01-12 - about 1500 seconds
# Skipped when fast_notebook is True; otherwise the result replaces drivers_pandas_df.
# NOTE(review): save_flag=False presumably skips writing the result to disk — confirm.
if not fast_notebook: 
    drivers_pandas_df = compute_driver_stats(drivers_gps_pandas_df, drivers_pandas_df, drivers_dask_df, save_flag=False)

In [None]:
# COMPUTING MEAN SPEED AND TOTAL DISTANCE FOR EACH DRIVER => DASK     #
#######################################################################

# 2024-01-12 - Cell execution time : 71.882 seconds, which is about 20 times faster than the Pandas method
# Skipped when fast_notebook is True; otherwise the result replaces drivers_dask_df.
if not fast_notebook: 
    drivers_dask_df = compute_driver_stats_dask(drivers_gps_dask_df, save_flag=False)

In [None]:
# DETERMINE ZONE ID FOR EACH GPS VALUE. ITERATIVE METHOD WITH GEODATAFRAMES AND R-TREE SPATIAL INDEX #
######################################################################################################

# 2024-01-06 - Cell execution time : 53456.204 seconds => about 15 hours

# Notes : This was very long to compute. The result is saved in 3 different formats for further use.
#         So this cell is deactivated (skipped when fast_notebook is True) and replaced by the
#         loading of the saved file.
#         It is useful for comparing with other methods such as Dask or Spark.

if not fast_notebook: 
    gdf_drivers = iterative_process_zones_id_gps(drivers_gps_pandas_df, zones_geodataframe)

In [None]:
# DETERMINE ZONE ID FOR EACH GPS VALUE. PARALLEL METHOD WITH DASK & GEOPANDAS #
###############################################################################

# 2024-01-12 - Cell execution time : 875.037 seconds => about 15 min
# Skipped when fast_notebook is True; otherwise the result replaces drivers_gps_dask_df.
if not fast_notebook: 
    drivers_gps_dask_df = determine_zone_id_dask_geo(drivers_gps_dask_df, zones_geodataframe, save_flag=False)

In [None]:
# COMPUTING MEAN TIME ON ZONE AND FAVORITE ZONE => DASK    #
############################################################

# 2024-01-12 - Cell execution time : 472.539 seconds
# Skipped when fast_notebook is True; otherwise the result replaces drivers_pandas_df.
# NOTE(review): confirm the returned frame still carries the speed/distance columns that the
#               plotting cells read from drivers_pandas_df.
if not fast_notebook: 
    drivers_pandas_df = compute_mean_time_zone(drivers_gps_dask_df, save_flag=False)

In [None]:
# COMPUTING UNIQUE DRIVERS PER ZONE   #
#######################################

# 2024-01-13 - figures recorded from a previous run:

# Total GPS Count: 3949761
# Count zone_id = -1 : 129870
# The total is OK !!!

# Skipped when fast_notebook is True. The return value of the call, if any, is not kept.
if not fast_notebook:   
    count_unique_drivers_per_zone(drivers_gps_pandas_df, drivers_gps_dask_df, zones_geodataframe, save_flag=False)

## Visualizations

In [None]:
# Import the plotting helpers by name rather than with `import *`, so it is clear where each
# function used by the cells below comes from and nothing else leaks into the notebook namespace.
from src.visualization_analysis import (
    plot_favorite_zone_by_time,
    plot_favorite_zone_by_values,
    plot_gps_count_by_zone,
    plot_gps_count_distribution,
    plot_gps_data_distribution,
    plot_mean_speed_distribution,
    plot_speed_distance_relationship,
    plot_total_distance_distribution,
    plot_unique_drivers_by_zone,
)

In [None]:
# Plot the distribution of mean speeds.
# drivers_pandas_df comes from the intermediate-results parquet file, unless the computation
# cells above replaced it (fast_notebook False).
plot_mean_speed_distribution(drivers_pandas_df)
    

In [None]:
# Plot the distribution of total distances traveled, from the per-driver table drivers_pandas_df.
plot_total_distance_distribution(drivers_pandas_df)



In [None]:
# Plot the distribution of GPS counts, from the per-driver table drivers_pandas_df.
plot_gps_count_distribution(drivers_pandas_df)
    

In [None]:
# Plot the relationship between mean speed and total distance, from drivers_pandas_df.
plot_speed_distance_relationship(drivers_pandas_df)
    

In [None]:
# Plot the distribution of favorite zones by time spent.
# NOTE(review): presumably relies on the columns produced by compute_mean_time_zone being present
#               in drivers_pandas_df — confirm against the saved parquet file.
plot_favorite_zone_by_time(drivers_pandas_df)
    

In [None]:
# Plot the distribution of favorite zones by number of records, from drivers_pandas_df.
plot_favorite_zone_by_values(drivers_pandas_df)
    

In [None]:
# Plot the number of GPS lines per zone.
# zones_pandas_df is only defined by the "Load Data previously computed" cell, which reads it
# from zones_pandas_df.csv when from_intermediate_results is True.
plot_gps_count_by_zone(zones_pandas_df)
    

In [None]:
# Plot the number of unique drivers per zone, from the zones table reloaded from zones_pandas_df.csv.
plot_unique_drivers_by_zone(zones_pandas_df)
    

In [None]:
# Plot the distribution of GPS data by hour.
# drivers_gps_pandas_df is the in-memory GPS table reloaded from the intermediate-results parquet file.
plot_gps_data_distribution(drivers_gps_pandas_df)