### Libraries and used functions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import datetime as dt

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import requests
import plotly.express as px
import plotly.graph_objects as go

from IPython.display import HTML

from scipy.stats import f_oneway
from scipy.stats import pearsonr
from scipy.stats import mannwhitneyu

%matplotlib inline
# from pydantic_settings import BaseSettings
#from ydata_profiling import ProfileReport

import sys
import os
sys.path.append('../utils/')
#sys.path.append('..\\utils\\')

from functions import get_zone_info
from functions import haversine_distance as hsd
from functions import main_concurrent

In [None]:
def _generate_custom_scale(cmap_name, n_steps, dark_start=True):
    """
    Sample n_steps evenly spaced colors from a matplotlib colormap.

    Shared implementation behind generate_custom_blue_scale and
    generate_custom_green_scale, which previously duplicated this logic.

    Args:
    - cmap_name: Name of the matplotlib colormap to sample (e.g. 'Blues').
    - n_steps: Number of steps in the color scale.
    - dark_start: If True, sampling starts at 0.3 of the colormap range
      (skipping the lightest shades); otherwise it starts at 0.

    Returns:
    - A list of n_steps color strings formatted as "rgb(r,g,b)".
    """
    cmap = plt.get_cmap(cmap_name)
    # Start from 0.3 to avoid too light colors
    lower_bound = 0.3 if dark_start else 0
    plotly_colors = []
    for position in np.linspace(lower_bound, 1, n_steps):
        red, green, blue, _alpha = cmap(position)
        # Plotly expects 0-255 integer channels; the alpha channel is dropped
        plotly_colors.append(
            "rgb({},{},{})".format(int(red*255), int(green*255), int(blue*255))
        )
    return plotly_colors


def generate_custom_blue_scale(n_steps, dark_start=True):
    """
    Generate a custom blue color scale with n_steps.

    Args:
    - n_steps: Number of steps in the color scale.
    - dark_start: If True, start with a darker blue; otherwise, start lighter.

    Returns:
    - A list of color strings in RGB format.
    """
    return _generate_custom_scale('Blues', n_steps, dark_start)

# Default 24-step blue scale
n_steps = 24  # Define the number of steps you want in your color scale
custom_blue_scale = generate_custom_blue_scale(n_steps, dark_start=True)

# Green Scale
def generate_custom_green_scale(n_steps, dark_start=True):
    """
    Generate a custom green color scale with n_steps.

    Args:
    - n_steps: Number of steps in the color scale.
    - dark_start: If True, start with a darker green; otherwise, start lighter.

    Returns:
    - A list of color strings in RGB format.
    """
    return _generate_custom_scale('Greens', n_steps, dark_start)

# Test the function
generate_custom_green_scale(5)



# # Using the custom color scale in Plotly
# fig = go.Figure(data=go.Contour(
#     z=[[10, 10.625, 12.5, 15.625, 20],
#        [5.625, 6.25, 8.125, 11.25, 15.625],
#        [2.5, 3.125, 5.0, 8.125, 12.5],
#        [0.625, 1.25, 3.125, 6.25, 10.625],
#        [0, 0.625, 2.5, 5.625, 10]],
#     colorscale=custom_blue_scale,
#     # other attributes here
# ))

# fig.show()

## Data Cleaning and Preprocessing. Exploratory and Confirmatory Data Analysis

### 1. Data Cleaning and Processing.

In [None]:
# Load the raw GreenMobility (CSV) and ShareNow (Excel) trip exports
gm_original = pd.read_csv('../data/raw/GM_preparedData.csv')
sn_original = pd.read_excel('../data/raw/DriveNow/DTU - data til case_LTMZones1.xlsx')

#   ----------------------------------------------------------------------------
#   SHARENOW PART
#   ----------------------------------------------------------------------------

# Map the raw ShareNow column names to the English names used in this notebook
dict_sn_renamecols = {
    'TurID': 'TripID',
    'BilID': 'CarID',
    'PersonID': 'UserID',
    'Latitude (Start)': 'LatitudeStart',
    'Longitude (Start)': 'LongitudeStart',
    'Latitude (Slut)': 'LatitudeEnd',
    'Longitude (Slut)': 'LongitudeEnd',
    'Reservationstidspunkt': 'ReservationTime',
    'Start tidspunkt': 'StartTime',
    'Slut tidspunkt': 'EndTime',
    'Alder': 'Age',
    'Køn': 'Gender',
    'Batteristatus (start)': 'BatteryStart',
    'Batteristatus (slut)': 'BatteryEnd',
    'Km kørt': 'KmDriven',
}

sn_processing = sn_original.rename(columns=dict_sn_renamecols)

# Check the data types of the dimensions.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
sn_processing.info()

# Convert BatteryStart and BatteryEnd to numeric; unparseable values become NaN
for col in ['BatteryStart', 'BatteryEnd']:
    sn_processing[col] = pd.to_numeric(sn_processing[col], errors='coerce')

# Convert location coordinates to numeric; unparseable values become NaN
coordinates = [
    'LatitudeEnd',
    'LatitudeStart',
    'LongitudeEnd',
    'LongitudeStart'
]

for col in coordinates:
    sn_processing[col] = pd.to_numeric(sn_processing[col], errors='coerce')

# Auxiliary columns
# NOTE(review): assumes read_excel already parsed StartTime/EndTime as datetimes — confirm
trip_duration = sn_processing['EndTime'] - sn_processing['StartTime']
sn_processing['TripDurationHours'] = trip_duration.dt.total_seconds()/3600
sn_processing['TripDurationMinutes'] = sn_processing['TripDurationHours']*60  #   trip duration in min
# A zero-duration trip yields inf (or NaN for 0/0) here
sn_processing['AvgSpeed'] = sn_processing['KmDriven']/sn_processing['TripDurationHours'] #   avg speed
sn_processing['Month'] = sn_processing['StartTime'].dt.strftime("%B").astype('category')  #   month
sn_processing['Weekday'] = sn_processing['StartTime'].dt.day_name().astype('category')    #   day of the week
sn_processing['StartHour'] = sn_processing['StartTime'].dt.hour.astype('category')    #   hour of the day
sn_processing['Date'] = sn_processing['StartTime'].dt.date  #   date

# Distance between trip start and end coordinates, computed by the project helper
# haversine_distance (imported as hsd)
sn_processing['HaversineDistance'] = hsd(
    sn_processing['LatitudeStart'], sn_processing['LongitudeStart'],
    sn_processing['LatitudeEnd'], sn_processing['LongitudeEnd']
)

# Custom cut for age interval
# Long-format lookup table: one row per integer age (18..90) with its bracket label
dfbrackets = pd.DataFrame({
    'ranges': [range(18, 30), range(30, 40), range(40, 50), range(50, 60), range(60, 91)],
    'brackets': ['18-29', '30-39', '40-49', '50-59', '60-90']
}).explode('ranges')

dfbrackets['ranges'] = dfbrackets['ranges'].astype(int)

# Plain dict built once from the table above. get_bracket is applied row by row
# to whole trip tables, so a hash lookup replaces a boolean-mask scan of
# dfbrackets on every call.
_bracket_by_age = dict(zip(dfbrackets['ranges'].tolist(), dfbrackets['brackets'].tolist()))

def get_bracket(age):
    """
    Return the age-bracket label for a single age value.

    Args:
    - age: Age to classify. Whole-number values (int or float) between 18 and
      90 inclusive map to a bracket.

    Returns:
    - The bracket label ('18-29', '30-39', '40-49', '50-59' or '60-90'), or
      None when the age is missing (NaN), fractional, or outside 18-90.
    """
    # NaN never compares equal to a key, so missing ages fall through to None
    return _bracket_by_age.get(age)

# Age bracket for every ShareNow trip (None when the age has no bracket)
sn_processing['AgeBracket'] = sn_processing['Age'].apply(get_bracket)

#   ----------------------------------------------------------------------------
#   GREENMOBILITY PART
#   ----------------------------------------------------------------------------

# Map the raw GreenMobility column names to the names shared with the ShareNow frame
# NOTE(review): 'birthday' is renamed to 'Age' without any conversion — confirm the
# raw column already holds an age in years.
dict_gm_renamecols = {
    'id': 'TripID',
    'vehicleId': 'CarID',
    'id2': 'UserID',
    'startPositionLat': 'LatitudeStart',
    'startPositionLng': 'LongitudeStart',
    'endPositionLat': 'LatitudeEnd',
    'endPositionLng': 'LongitudeEnd',
    'tripStart': 'StartTime',
    'tripEnd': 'EndTime',
    'birthday': 'Age',
    'key': 'Gender',
    'DriveLength': 'KmDriven',
}

# List the columns to be kept for GreenMobility
gmkeepcols_list = ['TripID','CarID',
                   'UserID','LatitudeStart',
                   'LongitudeStart','LatitudeEnd',
                   'LongitudeEnd','StartTime',
                   'EndTime','Age','Gender','KmDriven',
                   'FromZoneID','ToZoneID']

gm_processing = gm_original.rename(columns=dict_gm_renamecols)
# .copy() makes the column subset an independent frame, so the derived columns
# added below do not trigger pandas' SettingWithCopyWarning.
gm_processing = gm_processing[gmkeepcols_list].copy()

# Convert StartTime and EndTime to Datetime; values not matching the format become NaT
for time_col in ('StartTime', 'EndTime'):
    gm_processing[time_col] = pd.to_datetime(gm_processing[time_col], errors='coerce', format='%d%b%y:%H:%M:%S')

gm_trip_duration = gm_processing['EndTime'] - gm_processing['StartTime']
gm_processing['TripDurationHours'] = gm_trip_duration.dt.total_seconds()/3600
gm_processing['TripDurationMinutes'] = gm_processing['TripDurationHours']*60  #   trip duration in min
# A zero-duration trip yields inf (or NaN for 0/0) here
gm_processing['AvgSpeed'] = gm_processing['KmDriven']/gm_processing['TripDurationHours'] #   avg speed
gm_processing['Month'] = gm_processing['StartTime'].dt.strftime("%B").astype('category')  #   month
gm_processing['Weekday'] = gm_processing['StartTime'].dt.day_name().astype('category')    #   day of the week
gm_processing['StartHour'] = gm_processing['StartTime'].dt.hour.astype('category')    #   hour of the day
gm_processing['Date'] = gm_processing['StartTime'].dt.date  #   date
gm_processing['AgeBracket'] = gm_processing['Age'].apply(get_bracket)   #   age bracket

### 1.1 Data cleaning and processing: SHARE NOW

In [None]:
# Summary statistics on numerical variables for SN
describecols = ['Age','BatteryStart','BatteryEnd','KmDriven','TripDurationHours','TripDurationMinutes','AvgSpeed']

# AvgSpeed is KmDriven / TripDurationHours, so zero-duration trips produce +/-inf,
# which would turn the mean into inf and the std into NaN. Mask them as NaN for
# the summary only (same treatment as the GreenMobility summary); sn_processing
# itself is left untouched.
sn_summary = (
    sn_processing[describecols]
    .replace([np.inf, -np.inf], np.nan)
    .describe()
    .T
    .rename(columns={'std':'st deviation','50%':'median'})
    .drop(columns='count')
)

display(HTML('<p style="font-size:20px; font-family:Consolas;">Descriptive statistics for Share Now</p>'))
sn_summary.style.format("{:.2f}")

##### Dropping infeasible records for KmDriven, TripDurationMinutes, and Battery Levels

In [None]:
# 1. Keep trips with a strictly positive distance below 350 km
#    (drops zero/negative KmDriven and anything at or above 350 km)
sn_valid_distance = (sn_processing['KmDriven'] > 0) & (sn_processing['KmDriven'] < 350)

# 2. Keep trips with a non-negative duration
sn_valid_duration = sn_processing['TripDurationMinutes'] >= 0

# 3. Keep trips whose battery level did not increase during the trip
#    (BatteryStart >= BatteryEnd), with a non-negative start level and a
#    strictly positive end level
sn_valid_battery = (
    (sn_processing['BatteryStart'] >= sn_processing['BatteryEnd'])
    & (sn_processing['BatteryStart'] >= 0)
    & (sn_processing['BatteryEnd'] > 0)
)

# 4. Keep trips with an average speed below 100 km/h (this also drops inf and NaN speeds)
sn_valid_speed = sn_processing['AvgSpeed'] < 100

# 5. Keep trips shorter than 6 hours
#    NOTE(review): this step used to be described as a 5-hour cut-off while the
#    code has always used 6 — confirm the intended threshold.
sn_valid_hours = sn_processing['TripDurationHours'] < 6

# All conditions are row-wise, so one combined mask equals applying them in sequence.
# .copy() detaches the result so later cells can add or overwrite columns without
# a SettingWithCopyWarning.
sn_processing = sn_processing[
    sn_valid_distance & sn_valid_duration & sn_valid_battery & sn_valid_speed & sn_valid_hours
].copy()

In [None]:
# Descriptive statistics restricted to ShareNow trips longer than 3 km
sn_trips_over_3km = sn_processing.loc[sn_processing['KmDriven'] > 3, describecols]
sn_trips_over_3km.describe().T.style.format("{:.2f}")

In [None]:
# Descriptive statistics for trips whose duration lies below the 85% quantile
sn_duration_p85 = sn_processing['TripDurationMinutes'].quantile(0.85)
sn_below_p85 = sn_processing['TripDurationMinutes'] < sn_duration_p85
sn_processing.loc[sn_below_p85, describecols].describe().T.style.format("{:.2f}")

#### Population insights

We can see that most SN trips are made by people younger than 30. However, they tend to drive shorter distances, and they do so at a higher average speed.

In [None]:
# Five-step blue scale used as the continuous color scale of the SN bar charts
n_steps = 5  # Define the number of steps you want in your color scale
custom_blue_scale = generate_custom_blue_scale(n_steps, dark_start=True)

# Trips of at least 3 km: number of trips and mean distance per age bracket
sn_trips_3km_plus = sn_processing[sn_processing['KmDriven'] >= 3]
agg_df = (
    sn_trips_3km_plus
    .groupby('AgeBracket')['KmDriven']
    .agg(['mean', 'count'])
    .reset_index()
)

# Bar height = number of trips, bar color = mean distance
fig = px.bar(
    agg_df,
    x='AgeBracket',
    y='count',
    color='mean',
    color_continuous_scale=custom_blue_scale,
    labels={'mean': 'Km', 'count': 'Number of trips', 'AgeBracket':'Age group'},
)

fig.update_layout(
    width=500,
    height=600,
    template=None,
    font={'family': 'Aptos', 'size': 18},
)
fig.update_traces()
fig.update_xaxes(showgrid=False)
fig.show()


In [None]:
# Trips of at least 3 km: number of trips and mean average speed per age bracket
sn_speed_source = sn_processing[sn_processing['KmDriven'] >= 3]
spped = (
    sn_speed_source
    .groupby('AgeBracket')['AvgSpeed']
    .agg(['mean', 'count'])
    .reset_index()
)

# Bar height = number of trips, bar color = mean average speed
fig = px.bar(
    spped,
    x='AgeBracket',
    y='count',
    color='mean',
    color_continuous_scale=custom_blue_scale,
    labels={'mean': 'Km/h', 'count': 'Number of trips', 'AgeBracket':'Age group'},
)

fig.update_layout(
    width=500,
    height=600,
    template=None,
    font={'family': 'Aptos', 'size': 18},
)
fig.update_traces()
fig.update_xaxes(showgrid=False)
fig.show()

#### Check data distributions: KmDriven, TripDurationMinutes, AvgSpeed
Cuts: KmDriven of at least $3$ km; TripDurationMinutes below the $85\%$ quantile.

In [None]:
columns = ['KmDriven', 'TripDurationMinutes', 'AvgSpeed']

# Build the plotted subset once: trips of at least 3 km whose duration is below
# the 85% quantile. Neither the quantile nor the filter depends on the loop
# variable, so they are computed outside the loop.
sn_plot_duration_cut = sn_processing['TripDurationMinutes'].quantile(0.85)
sn_plot_data = sn_processing[
    (sn_processing['KmDriven'] >= 3)
    & (sn_processing['TripDurationMinutes'] < sn_plot_duration_cut)
]

# Set the seaborn style before any axes is created. Styles only affect axes
# created afterwards, so setting it inside the loop left the first panel with a
# different look from the others.
sns.set_style('whitegrid')

# Set the figure size
plt.figure(figsize=(15, 10))

# One panel per variable: boxplot for the trip duration, histogram otherwise
for i, column in enumerate(columns):
    plt.subplot(2, 3, i+1)
    if column == 'TripDurationMinutes':
        sns.boxplot(data=sn_plot_data, y=column, color='lightblue')
    else:
        sns.histplot(data=sn_plot_data, x=column, bins=30, color='lightblue')
    plt.grid(False)
    plt.title(column)
plt.tight_layout()

# Show the plot
plt.show()


In [None]:
# Average number of ShareNow trips per start hour, averaged over calendar days
sn_processing['Date'] = sn_processing['StartTime'].dt.date

# Step 1: trips per (date, start hour)
days = (
    sn_processing
    .groupby(['Date', 'StartHour'])
    .agg(TripID=('TripID', 'count'))
    .reset_index()
)

# Step 2: mean of those daily counts per start hour
dfplot = (
    days
    .groupby('StartHour')
    .agg(AvgDemand=('TripID', 'mean'))
    .reset_index()
)

In [None]:
# 24-hour "clock" chart of the average hourly demand held in dfplot
r = dfplot['AvgDemand'].tolist()
theta = np.arange(0,360,15)   # one tick every 15 degrees = one hour
width = [15]*24               # every bar spans exactly one hour

# Label only every sixth hour (0:00, 6:00, 12:00, 18:00)
ticktexts = [str(i)+":00" if i % 6 == 0 else '' for i in np.arange(24)]

fig = go.Figure(go.Barpolar(
    r=r,
    # Barpolar's theta is the bar CENTER. StartHour h covers [h:00, h+1:00), so
    # its bar must be centered half an hour (7.5 degrees) AFTER the h:00 tick.
    # The previous offset of -7.5 drew every hour one slot too early (hour 0
    # appeared between the 23:00 and 0:00 ticks).
    theta=[i+7.5 for i in theta],
    width=width,
    marker_color=dfplot['AvgDemand'],
    marker_colorscale=custom_blue_scale,
    marker_line_color="white",
    marker_line_width=2,
    opacity=0.9
))

fig.update_layout(
    template=None,
    polar=dict(
        hole=0.4,
        bgcolor='rgb(223, 223,223)',
        radialaxis=dict(
            showticklabels=False,
            ticks='',
            linewidth=2,
            linecolor='white',
            showgrid=False,
        ),
        angularaxis=dict(
            tickvals=[i for i in theta],
            ticktext=ticktexts,
            showline=True,
            direction='clockwise',
            period=24,
            linecolor='white',
            gridcolor='white',
            showticklabels=True,
            ticks=''
        )
    )
)

fig.show()

In [None]:
# Daily totals for ShareNow trips between 3 km (inclusive) and 40 km (exclusive)
sn_mid_range_trips = sn_processing[
    (sn_processing['KmDriven'] >= 3) & (sn_processing['KmDriven'] < 40)
]
agg_trips_km = (
    sn_mid_range_trips
    .groupby('Date')[['TripID', 'KmDriven']]
    .agg({'TripID': 'count', 'KmDriven': 'sum'})
    .reset_index()
)
agg_trips_km['Weekday'] = pd.to_datetime(agg_trips_km['Date']).dt.day_name()

# Per weekday: mean daily trip count and mean daily kilometres, then
#   AvgDistance = mean daily km / mean daily trips (2 decimals)
#   AvgDemand   = mean daily trips rounded to an integer
# assign() evaluates its keywords in order, so KmDriven is divided by the
# unrounded TripID before TripID itself is rounded.
daily_trip_km = (
    agg_trips_km
    .groupby('Weekday')[['TripID', 'KmDriven']]
    .mean()
    .reset_index()
    .assign(
        KmDriven=lambda d: np.round(d['KmDriven'] / d['TripID'], decimals=2),
        TripID=lambda d: np.round(d['TripID'], decimals=0).astype(int),
    )
    .rename(columns={'TripID': 'AvgDemand', 'KmDriven': 'AvgDistance'})
)

orderlist = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Bar height = average daily demand, bar color = average distance per trip
fig = px.bar(
    daily_trip_km,
    x='Weekday',
    y='AvgDemand',
    category_orders={'Weekday': orderlist},
    color='AvgDistance',
    color_continuous_scale=custom_blue_scale,
    labels={'AvgDistance': 'Km', 'AvgDemand': 'Number of trips'},
)

fig.update_layout(
    width=700,
    height=600,
    template=None,
    font={'family': 'Aptos', 'size': 20},
)
fig.update_traces()
fig.update_yaxes(showgrid=False)
# Replace the full weekday names on the x axis with three-letter abbreviations
fig.update_xaxes(
    ticktext=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
    tickvals=[0, 1, 2, 3, 4, 5, 6],
    showgrid=False,
    title=None,
)

fig.show()

In [None]:
# Distance driven on Monday trips versus Sunday trips
monday = sn_processing.loc[sn_processing['Weekday'] == 'Monday', 'KmDriven']
sunday = sn_processing.loc[sn_processing['Weekday'] == 'Sunday', 'KmDriven']

# Two-sided Mann-Whitney U test on the two samples
statistic, p_value = mannwhitneyu(monday, sunday, alternative='two-sided')

# Report the test statistic and its p-value
print("Mann-Whitney U statistic:", statistic)
print("p-value:", p_value)


In [None]:
# Number of distinct ShareNow users in each age bracket
sn_users_per_bracket = sn_processing.groupby('AgeBracket')['UserID'].unique().apply(len)
sn_users_per_bracket.reset_index().rename(columns={'UserID': 'Number of users'})

In [None]:
# Average-speed samples, one per age bracket, in bracket order
sn_bracket_labels = ['18-29', '30-39', '40-49', '50-59', '60-90']
sn_speeds_by_bracket = [
    sn_processing.loc[sn_processing['AgeBracket'] == label, 'AvgSpeed']
    for label in sn_bracket_labels
]
(age_group_18_29,
 age_group_30_39,
 age_group_40_49,
 age_group_50_59,
 age_group_60_90) = sn_speeds_by_bracket

# One-way ANOVA across the five age brackets
f_statistic, p_value = f_oneway(*sn_speeds_by_bracket)

# Interpret the p-value at the 5% significance level
print(
    "There is a statistically significant difference between the mean of average speed across age groups."
    if p_value < 0.05
    else "There is no statistically significant difference between the mean of average speed across age groups."
)


In [None]:
# Extract the 'Age' and 'AvgSpeed' columns, keeping only complete pairs.
# 'Age' is never cleaned upstream; a NaN in either input would make the
# correlation NaN, so incomplete rows are dropped first (a no-op when there are
# no missing values).
sn_age_speed = sn_processing[['Age', 'AvgSpeed']].dropna()
age = sn_age_speed['Age']
avg_speed = sn_age_speed['AvgSpeed']

# Perform the Pearson correlation test
correlation, p_value = pearsonr(age, avg_speed)

# Print the correlation coefficient and p-value
print("Pearson correlation coefficient:", correlation)
print("p-value:", p_value)


### 1.2 Data cleaning and processing: GREEN MOBILITY

In [None]:
# Numerical columns summarised for GreenMobility (this rebinds describecols)
describecols = ['Age','KmDriven','TripDurationHours','TripDurationMinutes','AvgSpeed']

# Turn +/-inf (e.g. AvgSpeed of zero-duration trips) into NaN, in place, so the
# summary statistics stay finite
gm_infinite_values = [np.inf, -np.inf]
gm_processing.replace(to_replace=gm_infinite_values, value=np.nan, inplace=True)

gm_summary = gm_processing[describecols].describe().T
gm_summary.style.format("{:.2f}")

In [None]:
# Keep GreenMobility trips that have a positive age, no missing value in the key
# numeric columns, an average speed below 100 km/h and more than 3 km driven.
# All conditions are row-wise, so they are combined into a single mask.
gm_required_cols = ['Age', 'KmDriven', 'TripDurationMinutes', 'AvgSpeed']
gm_keep_rows = (
    (gm_processing['Age'] > 0)
    & gm_processing[gm_required_cols].notna().all(axis=1)
    & (gm_processing['AvgSpeed'] < 100)
    & (gm_processing['KmDriven'] > 3)
)
gm_processing = gm_processing[gm_keep_rows]
gm_processing[describecols].describe().T.style.format("{:.2f}")

In [None]:
# Row-wise validity filters for GreenMobility: complete key columns, positive
# distance, duration above 3 minutes, positive age, average speed below
# 100 km/h and more than 3 km driven
gm_complete_cols = ['Age', 'KmDriven', 'TripDurationMinutes', 'AvgSpeed']
gm_valid_rows = (
    gm_processing[gm_complete_cols].notna().all(axis=1)
    & (gm_processing['KmDriven'] > 0)
    & (gm_processing['TripDurationMinutes'] > 3)
    & (gm_processing['Age'] > 0)
    & (gm_processing['AvgSpeed'] < 100)
    & (gm_processing['KmDriven'] > 3)
)
gm_processing = gm_processing[gm_valid_rows]

# Finally keep only trips below the 90% duration quantile of the rows that
# survived the filters above. The quantile is taken from the current frame, so
# re-running this cell trims the data again.
gm_duration_p90 = gm_processing['TripDurationMinutes'].quantile(0.90)
gm_processing = gm_processing[gm_processing['TripDurationMinutes'] < gm_duration_p90]

In [None]:
# Round KmDriven to whole kilometres (integer dtype). gm_processing is a
# filtered slice at this point, so build a new frame with assign() instead of
# writing a column into the slice (which triggers SettingWithCopyWarning).
gm_processing = gm_processing.assign(
    KmDriven=np.round(gm_processing['KmDriven']).astype(int)
)
gm_processing[describecols].describe().T.style.format("{:.2f}")

In [None]:
# Histogram of GreenMobility trip durations in minutes (50 bins)
plt.figure(figsize=(10, 5))
gm_duration_ax = sns.histplot(
    data=gm_processing,
    x='TripDurationMinutes',
    bins=50,
    color='seagreen',
)
sns.set_style('whitegrid')
gm_duration_ax.grid(False)
gm_duration_ax.set_title('Trip Duration Minutes')
plt.show()

In [None]:
columns = ['KmDriven', 'TripDurationMinutes', 'AvgSpeed']

# Canvas for the GreenMobility distribution panels
plt.figure(figsize=(15, 10))

# One panel per variable on a 2x3 grid: boxplot for the trip duration,
# 30-bin histogram for everything else
for panel_number, column in enumerate(columns, start=1):
    plt.subplot(2, 3, panel_number)
    is_duration = column == 'TripDurationMinutes'
    if not is_duration:
        sns.histplot(data=gm_processing, x=column, bins=30, color='seagreen')
    else:
        sns.boxplot(data=gm_processing, y=column, color='seagreen')
    sns.set_style('whitegrid')
    plt.grid(False)
    plt.title(column)

plt.tight_layout()

# Render the figure
plt.show()


In [None]:
# Number of distinct GreenMobility users per age bracket (cleaned data)
gm_users_per_bracket = gm_processing.groupby('AgeBracket')['UserID'].unique().apply(len)
gm_users_per_bracket.reset_index().rename(columns={'UserID': 'Number of users'})

In [None]:
# Reload the raw GreenMobility export and rebuild an UNFILTERED working copy
# (gm_test) with the same derived columns as gm_processing
gm_original = pd.read_csv('../data/raw/GM_preparedData.csv')
# sn_original = pd.read_excel('../data/raw/DriveNow/DTU - data til case_LTMZones1.xlsx')

#   ----------------------------------------------------------------------------
#   GREENMOBILITY PART
#   ----------------------------------------------------------------------------

# Map the raw GreenMobility column names to the names used in this notebook
# NOTE(review): 'birthday' is renamed to 'Age' without any conversion — confirm the
# raw column already holds an age in years.
dict_gm_renamecols = {
    'id': 'TripID',
    'vehicleId': 'CarID',
    'id2': 'UserID',
    'startPositionLat': 'LatitudeStart',
    'startPositionLng': 'LongitudeStart',
    'endPositionLat': 'LatitudeEnd',
    'endPositionLng': 'LongitudeEnd',
    'tripStart': 'StartTime',
    'tripEnd': 'EndTime',
    'birthday': 'Age',
    'key': 'Gender',
    'DriveLength': 'KmDriven',
}

# List the columns to be kept for GreenMobility
gmkeepcols_list = ['TripID','CarID',
                   'UserID','LatitudeStart',
                   'LongitudeStart','LatitudeEnd',
                   'LongitudeEnd','StartTime',
                   'EndTime','Age','Gender','KmDriven',
                   'FromZoneID','ToZoneID']

gm_test = gm_original.rename(columns=dict_gm_renamecols)
# .copy() makes the column subset an independent frame, so the derived columns
# added below (and in later cells) do not trigger SettingWithCopyWarning.
gm_test = gm_test[gmkeepcols_list].copy()

# Convert StartTime and EndTime to Datetime; values not matching the format become NaT
for time_col in ('StartTime', 'EndTime'):
    gm_test[time_col] = pd.to_datetime(gm_test[time_col], errors='coerce', format='%d%b%y:%H:%M:%S')

gm_test_duration = gm_test['EndTime'] - gm_test['StartTime']
gm_test['TripDurationHours'] = gm_test_duration.dt.total_seconds()/3600
gm_test['TripDurationMinutes'] = gm_test['TripDurationHours']*60  #   trip duration in min
# A zero-duration trip yields inf (or NaN for 0/0) here
gm_test['AvgSpeed'] = gm_test['KmDriven']/gm_test['TripDurationHours'] #   avg speed
gm_test['Month'] = gm_test['StartTime'].dt.strftime("%B").astype('category')  #   month
gm_test['Weekday'] = gm_test['StartTime'].dt.day_name().astype('category')    #   day of the week
gm_test['StartHour'] = gm_test['StartTime'].dt.hour.astype('category')    #   hour of the day
gm_test['Date'] = gm_test['StartTime'].dt.date  #   date
gm_test['AgeBracket'] = gm_test['Age'].apply(get_bracket)   #   age bracket

In [None]:
# Number of distinct GreenMobility users per age bracket (unfiltered data)
gm_test_users_per_bracket = gm_test.groupby('AgeBracket')['UserID'].unique().apply(len)
gm_test_users_per_bracket.reset_index().rename(columns={'UserID': 'Number of users'})

In [None]:
# Five-step green scale used as the continuous color scale of the GM bar charts
n_steps = 5  # Define the number of steps you want in your color scale
custom_green_scale = generate_custom_green_scale(n_steps, dark_start=True)

# Number of trips and mean distance per age bracket
agg_df = (
    gm_processing
    .groupby('AgeBracket')['KmDriven']
    .agg(['mean', 'count'])
    .reset_index()
)

# Bar height = number of trips, bar color = mean distance
fig = px.bar(
    agg_df,
    x='AgeBracket',
    y='count',
    color='mean',
    color_continuous_scale=custom_green_scale,
    labels={'mean': 'Km', 'count': 'Number of trips', 'AgeBracket':'Age group'},
)

fig.update_layout(
    width=500,
    height=600,
    template=None,
    font={'family': 'Aptos', 'size': 18},
)
fig.update_traces()
fig.update_xaxes(showgrid=False)
fig.show()


In [None]:
# Number of trips and mean average speed per age bracket
spped = (
    gm_processing
    .groupby('AgeBracket')['AvgSpeed']
    .agg(['mean', 'count'])
    .reset_index()
)

# Bar height = number of trips, bar color = mean average speed
fig = px.bar(
    spped,
    x='AgeBracket',
    y='count',
    color='mean',
    color_continuous_scale=custom_green_scale,
    labels={'mean': 'Km/h', 'count': 'Number of trips', 'AgeBracket':'Age group'},
)

fig.update_layout(
    width=500,
    height=600,
    template=None,
    font={'family': 'Aptos', 'size': 18},
)
fig.update_traces()
fig.update_xaxes(showgrid=False)
fig.show()

In [None]:
# Seven-step green scale for the weekday chart
custom_green_scale = generate_custom_green_scale(7, dark_start=True)

# Daily totals: number of trips and kilometres driven per calendar date
agg_trips_km = (
    gm_processing
    .groupby('Date')[['TripID', 'KmDriven']]
    .agg({'TripID': 'count', 'KmDriven': 'sum'})
    .reset_index()
)
agg_trips_km['Weekday'] = pd.to_datetime(agg_trips_km['Date']).dt.day_name()

# Per weekday: mean daily trip count and mean daily kilometres, then
#   AvgDistance = mean daily km / mean daily trips (2 decimals)
#   AvgDemand   = mean daily trips rounded to an integer
# assign() evaluates its keywords in order, so KmDriven is divided by the
# unrounded TripID before TripID itself is rounded.
daily_trip_km = (
    agg_trips_km
    .groupby('Weekday')[['TripID', 'KmDriven']]
    .mean()
    .reset_index()
    .assign(
        KmDriven=lambda d: np.round(d['KmDriven'] / d['TripID'], decimals=2),
        TripID=lambda d: np.round(d['TripID'], decimals=0).astype(int),
    )
    .rename(columns={'TripID': 'AvgDemand', 'KmDriven': 'AvgDistance'})
)

orderlist = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Bar height = average daily demand, bar color = average distance per trip
fig = px.bar(
    daily_trip_km,
    x='Weekday',
    y='AvgDemand',
    category_orders={'Weekday': orderlist},
    color='AvgDistance',
    color_continuous_scale=custom_green_scale,
    labels={'AvgDistance': 'Km', 'AvgDemand': 'Number of trips'},
)

fig.update_layout(
    width=700,
    height=600,
    template=None,
    font={'family': 'Aptos', 'size': 20},
)
fig.update_traces()
fig.update_yaxes(showgrid=False)
# Replace the full weekday names on the x axis with three-letter abbreviations
fig.update_xaxes(
    ticktext=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
    tickvals=[0, 1, 2, 3, 4, 5, 6],
    showgrid=False,
    title=None,
)

fig.show()

In [None]:
# Average number of GreenMobility trips per start hour:
# first count trips per (date, start hour), then average those counts per hour
days = gm_processing.groupby(['Date','StartHour']).agg({'TripID':'count'}).reset_index()
dfplot = days.groupby('StartHour').agg({'TripID':'mean'},).reset_index().rename(columns={'TripID':'AvgDemand'})

# 24-step green scale for the hourly clock chart
custom_green_scale = generate_custom_green_scale(24, dark_start=True)

r = dfplot['AvgDemand'].tolist()
theta = np.arange(0,360,15)   # one tick every 15 degrees = one hour
width = [15]*24               # every bar spans exactly one hour

# Label only every sixth hour (0:00, 6:00, 12:00, 18:00)
ticktexts = [str(i)+":00" if i % 6 == 0 else '' for i in np.arange(24)]

fig = go.Figure(go.Barpolar(
    r=r,
    # Barpolar's theta is the bar CENTER. StartHour h covers [h:00, h+1:00), so
    # its bar must be centered half an hour (7.5 degrees) AFTER the h:00 tick.
    # The previous offset of -7.5 drew every hour one slot too early (hour 0
    # appeared between the 23:00 and 0:00 ticks).
    theta=[i+7.5 for i in theta],
    width=width,
    marker_color=dfplot['AvgDemand'],
    marker_colorscale=custom_green_scale,
    marker_line_color="white",
    marker_line_width=2,
    opacity=0.9
))

fig.update_layout(
    template=None,
    polar=dict(
        hole=0.4,
        bgcolor='rgb(223, 223,223)',
        radialaxis=dict(
            showticklabels=False,
            ticks='',
            linewidth=2,
            linecolor='white',
            showgrid=False,
        ),
        angularaxis=dict(
            tickvals=[i for i in theta],
            ticktext=ticktexts,
            showline=True,
            direction='clockwise',
            period=24,
            linecolor='white',
            gridcolor='white',
            showticklabels=True,
            ticks=''
        )
    )
)

fig.show()

In [None]:
# Display the hourly average-demand table (columns StartHour, AvgDemand)
dfplot

In [None]:
# Distance between trip start and end coordinates for the unfiltered
# GreenMobility trips, computed by the project helper haversine_distance (hsd)
gm_test['HaversineDistance'] = hsd(
    gm_test['LatitudeStart'],
    gm_test['LongitudeStart'],
    gm_test['LatitudeEnd'],
    gm_test['LongitudeEnd'],
)

# Scatterplot of gm_test['KmDriven'] vs. gm_test['HaversineDistance'];
# the very low alpha keeps dense regions readable
plt.figure(figsize=(10, 5))
sns.scatterplot(
    data=gm_test,
    x='KmDriven',
    y='HaversineDistance',
    color='seagreen',
    alpha=0.005,
)
sns.set_style('whitegrid')
sns.despine()
plt.grid(False)
plt.title('KmDriven vs. HaversineDistance')
plt.show()

In [None]:
# Scatterplot of KmDriven vs. HaversineDistance for ShareNow trips between
# 3 km (inclusive) and 40 km (exclusive)
sn_scatter_data = sn_processing[
    (sn_processing['KmDriven'] >= 3) & (sn_processing['KmDriven'] < 40)
]

plt.figure(figsize=(10, 5))
sns.scatterplot(
    data=sn_scatter_data,
    x='KmDriven',
    y='HaversineDistance',
    color='darkblue',
    alpha=0.005,
)
sns.set_style('whitegrid')
sns.despine()
plt.grid(False)
plt.title('KmDriven vs. HaversineDistance')
plt.show()

In [None]:
# Descriptive statistics of the raw GreenMobility export, one row per numeric column
gm_raw_summary = gm_original.describe().T
gm_raw_summary.style.format("{:.2f}")

### 1.3 Data cleaning and processing: Donkey Republic