#  Module 1 : Parts modales

**Description** : Le but de ce module est de mener un premier calcul des parts modales kilométriques i.e. les distances journalières moyennisées par mode et par motif.

**Durée estimée première partie** : 7 jours

**Objectifs spécifiques** :
- [x] Sous-échantillonnage des résidents et visiteurs par canton (basé sur le GPS)
- [x] Rééchantillonnage des jours d’observation pour avoir un calendrier continu par usager
- [x] Intégrer le détail des transits
- [x] Distinguer de façon aussi systématique que possible les jours sans déplacement des jours
non détectés, et comparaison statistique aux jours sans déplacement dans d’autres bases de
données
- [x] Recodage des modes selon besoin des cantons
- [x] Calcul liminaire des parts modales kilométriques et par déplacements
- [x] Ajout des données d’équipement (e.g. type de motorisation principale du ménage)
- [ ] Documenter les hypothèses et limites du calcul liminaire des parts modales (e.g. aspects
saisonniers, échantillonnage, perte de signal, moyennisation des données longitudinales, ...)

**Résultats attendus** : Parts modales kilométriques par mode pour les résidents et visiteurs de chaque canton en vue du calcul des émissions carbone. Il doit être possible de calculer les parts modales en tenant compte des jours non-mobiles.

**Sous-échantillonnage** :
- Vaud : résident·es du canton + visiteur·euses
- Genève : résident·es du canton + visiteur·euses

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from shapely import geometry, ops
from shapely.geometry import MultiLineString, LineString, Point
import os
import concurrent.futures
from shapely.ops import unary_union
from shapely.geometry import JOIN_STYLE, Polygon, MultiPolygon

import pycountry
import xyt

import time

from panel_functions import *

### Charger les données

In [None]:
%%time
# Définir le CRS du projet (EPSG:4326 for WGS84)
target_crs = 'EPSG:4326'
print("CRS du projet: WGS84 \n")

#Charger les étapes
legs_ = pd.read_pickle('../Data/dumps_motiontag/storyline_time_space_filters/legs_filter.pkl')
print("Fichier étape chargé")

#Charger les activités
staypoints = pd.read_pickle('../Data/dumps_motiontag/storyline_formated/staypoints.pkl').reset_index(drop=True)
print("Fichier activité chargé")

#Charger les user_statistics
usr_stats = pd.read_pickle('../Data/processed_feuille_de_route/gps_user_statistics.pkl')
print("Fichier statistiques utilisateur·ices chargé")

#Charger les trips
trips = pd.read_csv('../Data/dumps_motiontag/Trips2023-04-01--2023-08-31.csv')
print("Fichier des déplacements chargé")

#Charger les bases officielles pour le sous-échantillonage géographique https://opendata.swiss/de/dataset/vm-uvek-zones-2017/resource/29b98f2c-42f2-4e72-b8b1-a39500ed0ad0
TAZ = gpd.read_file('../../Vague1/Verkehrszonen_Schweiz_NPVM_2017_shp/Verkehrszonen_Schweiz_NPVM_2017.shp')
TAZ = TAZ[['ID_Agglo', 'N_Agglo', 'N_KT', 'ID_Gem', 'geometry']]
TAZ = TAZ.to_crs(crs=target_crs)
#repare anomalies
TAZ['geometry'] = TAZ['geometry'].buffer(0)
shp_KT = TAZ.dissolve(by='N_KT').reset_index()
print("Fichier Zones de traffic chargé")

# Get world countries GeoDataFrame
def get_world_countries():
    """Read the world countries shapefile and keep only the sovereign name and geometry columns."""
    kept_columns = ['SOVEREIGNT', 'geometry']
    return gpd.read_file('../Data/other_shp/countries/ne_110m_admin_0_countries.shp')[kept_columns]
world_countries = get_world_countries()
print("Fichier Map Monde chargé")

# Get perimetre panel GeoDataFrame
perimetre_panel = gpd.read_file('../Data/other_shp/perimetre_panel/perimetre_panel_08.01.24.shp')
perimetre_panel = perimetre_panel.to_crs(crs=target_crs)
perimetre_panel = perimetre_panel[['COMM_ID','COMM_NAME','Typo_panel','geometry']]
perimetre_panel['panel_area'] = 1

perimetre_panel_full = perimetre_panel.dissolve().geometry.apply(lambda p: close_holes(p))
perimetre_panel_full = gpd.GeoDataFrame(geometry=[perimetre_panel_full.iloc[0]], crs=target_crs)
perimetre_panel_full['panel_area'] = 1
print("Fichier Périmètre panel chargé")

#### Ne garder que les user_id dans user_stats
L'utilisateur 'CH9872' est enlevé du fichier legs car il ne contient qu'une observation (une seule étape).
Toutes les autres observations sont conservées (n = 2806)

In [None]:
legs = legs_.loc[legs_.user_id_motiontag.isin(usr_stats.user_id_motiontag.unique())].copy()

#### Ajouter un ID par usager-jour

In [None]:
# Add the per-user-day identifier (user_id_day).
# legs_date: calendar date of the leg start, converted back to a pandas datetime.
legs.insert(1, 'legs_date',legs.started_at.dt.date)
legs['legs_date'] = pd.to_datetime(legs['legs_date'])

# user_id_day = "<user_id_fors>_<YYYYMMDD>", built from the leg start time;
# month and day are zero-padded to two digits.
legs.insert(
    1,"user_id_day",legs["user_id_fors"]
    + "_" 
    + legs.started_at.dt.year.astype(str)
    + legs.started_at.dt.month.astype(str).str.zfill(2)
    + legs.started_at.dt.day.astype(str).str.zfill(2),
)

#### Ajouter le *next activity_id* aux étapes

In [None]:
# Sort 'points' and 'legs' by 'started_at' to ensure data is in chronological order
staypoints.sort_values(by=['user_id_fors','started_at'], inplace=True, ignore_index=True)
legs.sort_values(by=['user_id_fors','started_at'], inplace=True)

In [None]:
legs.head(1)

In [None]:
# Attach to every staypoint the leg that ends exactly when it starts
# (previous_leg_id / previous_mode) and the leg that starts exactly when it
# ends (next_leg_id / next_mode). Both links rely on exact timestamp equality
# within the same user.
result = staypoints.copy()
result['finished_at'] = pd.to_datetime(result['finished_at'], format='%Y-%m-%d %H:%M:%S')
result.sort_values(by=['user_id_fors','finished_at'], inplace=True)


# Legs renamed so that their end time lines up with the staypoint start time.
previous_leg = legs[['user_id_fors', 'finished_at', 'leg_id', 'mode']].rename(columns={'finished_at': 'started_at', 'leg_id': 'previous_leg_id', 'mode':'previous_mode'})
previous_leg['started_at'] = pd.to_datetime(previous_leg['started_at'], format='%Y-%m-%d %H:%M:%S')
previous_leg.sort_values(by=['user_id_fors','started_at'], inplace=True)
# Drops any leg with a missing value in one of the four selected columns (mode included).
previous_leg.dropna(inplace=True)

# Left merge: staypoints without a matching leg keep NaN in the new columns.
result = pd.merge(result, previous_leg, on=['user_id_fors','started_at'], how='left')


# Same construction for the following leg: its start time must equal the staypoint end time.
next_leg = legs[['user_id_fors', 'started_at', 'leg_id', 'mode']].rename(columns={'started_at': 'finished_at', 'leg_id': 'next_leg_id', 'mode':'next_mode'})
next_leg['finished_at'] = pd.to_datetime(next_leg['finished_at'], format='%Y-%m-%d %H:%M:%S')
next_leg.sort_values(by=['user_id_fors','finished_at'], inplace=True)
next_leg.dropna(inplace=True)

# Left merge: staypoints without a matching leg keep NaN in the new columns.
result = pd.merge(result, next_leg, on=['user_id_fors','finished_at'], how='left')


# Restore chronological order per user (no column is dropped here).
result.sort_values(by=['user_id_fors','started_at'], inplace=True)

# Replace the staypoints table with the enriched version.
staypoints = result.copy()

In [None]:
staypoints.head(2)

In [None]:
legs = pd.merge(legs, staypoints[['activity_id', 'previous_leg_id','purpose']],
               left_on='leg_id', right_on='previous_leg_id', how='left')
legs.rename(columns={'activity_id':'leading_stay_id','purpose':'leading_stay_purpose'}, inplace=True)
del legs['previous_leg_id']

###  Ajouter la durée et la longueur des étapes

In [None]:
%%time 
# Add length in meters
legs['length'] = legs.to_crs('EPSG:2056').length
# Add the duration in seconds
legs['duration'] = (legs['finished_at'] - legs['started_at']).dt.total_seconds()

### Extraire les aires géographiques et les sous-échantillons (Genève et Vaud)
Nous utilisons les zones de trafic du Modèle Voyageur de l'ARE.

We want to sample :
- all the residents of Canton de Genève
- all the activities that happen in Canton de Genève

To do that we flag all destination cantons in the column _leading_stay_id_in_KT_

In [None]:
#staypoints_ = staypoints.copy()
#staypoints = staypoints_.copy()

In [None]:
%%time
# Spatial join of the staypoints with the world countries layer; the matched
# country name is stored in 'activity_in_country'.
staypoints = gpd.sjoin(staypoints, world_countries, how='left', predicate='within').rename(columns={'SOVEREIGNT':'activity_in_country'})

# Staypoints that fall in no country polygon are labelled 'Unknown'
staypoints['activity_in_country'] = staypoints['activity_in_country'].fillna('Unknown')
staypoints.drop(columns=['index_right'], inplace=True)

# Spatial join with the traffic zones (TAZ) to get the canton of each staypoint
staypoints = gpd.sjoin(staypoints, TAZ[['N_KT', 'geometry']], how='left', predicate='within').rename(columns={'N_KT': 'activity_in_KT'})
staypoints.drop(columns=['index_right'], inplace=True)
# Corner cases: any staypoint matched to a traffic zone is forced to
# 'Switzerland', overriding the country found by the world join.
staypoints.loc[~staypoints.activity_in_KT.isna(),'activity_in_country'] = 'Switzerland'
staypoints['activity_in_KT'] = staypoints['activity_in_KT'].fillna('Other')

# Spatial join with the dissolved panel perimeter: 'panel_area' is 1 for
# staypoints inside the perimeter and 0 otherwise.
staypoints = gpd.sjoin(staypoints, perimetre_panel_full.dissolve(), how='left', predicate='within')
staypoints.loc[staypoints.panel_area.isna(),'panel_area'] = 0
staypoints['panel_area'] = staypoints.panel_area.astype(int)
staypoints.drop(columns=['index_right'], inplace=True)

# Add the home canton and household motorisation columns from the user statistics
staypoints = pd.merge(staypoints, usr_stats[['KT_home_survey','user_id_fors','car_in_HH_count','main_motor']], on='user_id_fors', how='left')

In [None]:
%%time
legs = pd.merge(legs, staypoints[['activity_id','activity_in_KT','panel_area','KT_home_survey']].dropna(subset='activity_id'),
                left_on='leading_stay_id',
                right_on='activity_id',
                how='left')
del legs['activity_id']

#### Cartographie pour vérifier les filtres

In [None]:
staypoints.head(2)

In [None]:
%autoreload
xyt.plot_gps(staypoints[staypoints.activity_in_KT == 'GE'].rename(columns={'user_id_fors':'user_id'}).dropna()[:2000], geo_columns='geometry')

#### Ajouter indicateur si valeur extrême

In [None]:
len(legs)

In [None]:
# Function to add quantile columns
def add_quantile_flags(group, column_prefix):
    """Flag rows whose ``length_leg`` lies above the 95th, 98th and 99th percentile.

    Three boolean columns named ``extreme95_length_<column_prefix>``,
    ``extreme98_length_<column_prefix>`` and ``extreme99_length_<column_prefix>``
    are written into ``group`` itself, which is then returned.
    """
    lengths = group['length_leg']
    for label, level in (('95', 0.95), ('98', 0.98), ('99', 0.99)):
        group[f'extreme{label}_length_{column_prefix}'] = lengths > lengths.quantile(level)
    return group

# Mode columns for which the extreme-length flags are computed separately
grouping_columns = ['detected_mode', 'mode']

# For each mode column, compute the quantile flags within every mode value and
# merge them back onto `legs` by leg_id.
for mode_col in grouping_columns:
    # Work on the three columns needed for the flags only
    quant_detected_mode = legs[['leg_id', mode_col, 'length_leg']].copy()
    
    # Collects one flagged subset per mode value
    dfs_to_concat = []
    
    # NOTE(review): a NaN entry in unique() selects no rows with `==`, so legs
    # without a mode value presumably end up with NaN flags after the left
    # merge below - confirm.
    for mode in quant_detected_mode[mode_col].unique():
        # Rows of the current mode value (the `subset = 0` line is a leftover reset)
        subset = 0
        subset = quant_detected_mode[quant_detected_mode[mode_col] == mode].copy()
        
        # Quantiles are computed within this mode only
        subset = add_quantile_flags(subset, mode_col)
        
        # Keep the flagged subset for concatenation
        dfs_to_concat.append(subset)
    
    # Stack the per-mode subsets into one table
    concatenated_df = pd.concat(dfs_to_concat, ignore_index=True)
    
    # Keep only leg_id and the three flag columns, without duplicate rows
    drop_columns = ['length_leg', mode_col]
    concatenated_df.drop(columns=drop_columns, inplace=True)
    concatenated_df.drop_duplicates(inplace=True)
    
    # Left merge of the flags onto legs by leg_id; the result is then
    # de-duplicated on (leg_id, mode, detected_mode, user_id_day).
    legs = pd.merge(legs,concatenated_df, on='leg_id', how='left').drop_duplicates(subset=['leg_id','mode','detected_mode','user_id_day'])

In [None]:
len(legs)

In [None]:
legs.groupby('mode')['length_leg'].mean() / 1000

In [None]:
legs[legs.extreme95_length_mode].groupby('mode')['length_leg'].mean() / 1000

In [None]:
legs[legs.extreme99_length_mode].groupby('mode')['length_leg'].mean() / 1000

#### Arranger le fichier

In [None]:
legs.columns

In [None]:
#Remove 'usr_w_constant_bad_signal', length


col_order = col = ['leg_id', 'user_id_day', 'user_id_fors', 'user_id_motiontag', 'type',
 'geometry', 'legs_date', 'started_at', 'started_at_timezone', 
 'finished_at','finished_at_timezone', 
 'length_leg', 'detected_mode', 'mode', 'leading_stay_purpose',
 'confirmed_at', 'started_on', 'misdetected_completely', 'merged',
 'created_at', 'updated_at', 'started_at_in_timezone',
 'finished_at_in_timezone', 'confirmed_at_in_timezone',
 'created_at_in_timezone', 'updated_at_in_timezone',
 'point_per_linestring', 'max_signlalloss_meters',
 'rel_max_signalloss', 'low_quality_legs_1', 'low_quality_legs_2', 'leading_stay_id',
 'duration', 'activity_in_KT', 'panel_area', 'KT_home_survey',
    'extreme95_length_detected_mode',
       'extreme98_length_detected_mode', 'extreme99_length_detected_mode',
       'extreme95_length_mode', 'extreme98_length_mode',
       'extreme99_length_mode']


legs = legs[col_order].dropna(subset='user_id_fors')
legs.rename(columns={'panel_area':'activity_in_panel_area'}, inplace=True)
legs.sort_values(by=['user_id_fors','started_at'], inplace=True)

#### Exporter les données pour l'app streamlit

In [None]:
# Output for streamlit app
legs_nogeometry = legs.copy()
del legs_nogeometry['geometry']
del legs_nogeometry['user_id_motiontag']
#legs_nogeometry.to_pickle('../Data/processed_feuille_de_route/legs_nogeometry.pkl')

# And for other usages
#legs.to_pickle('../Data/processed_feuille_de_route/legs.pkl')

In [None]:
len(legs)

In [None]:
import pandas as pd

def get_daily_modal_distances(df):
    """Sum leg lengths per user, day and mode on a gap-free daily calendar.

    Parameters
    ----------
    df : pandas.DataFrame
        Legs table with the columns ``user_id_fors``, ``user_id_day`` (whose
        last eight characters are the date written as ``YYYYMMDD``),
        ``mode`` and ``length``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per user and calendar day between the user's first and last
        observed day, sorted by user and day, with the columns
        ``user_id_fors``, ``user_id_day``, one column per mode (summed
        ``length``), ``legs_date`` and ``days_without_track`` (1 for days
        added to close a gap, 0 otherwise). Columns whose name starts with
        ``Mode::`` hold 0 where no distance was recorded. An input without
        any usable row yields an empty frame with the non-mode columns.
    """
    id_cols = ['user_id_fors', 'user_id_day']

    # Total length per (user, user-day, mode); the cast tolerates numeric strings.
    totals = (
        df['length'].astype(float)
        .groupby([df['user_id_fors'], df['user_id_day'], df['mode']])
        .sum()
    )
    if totals.empty:
        return pd.DataFrame(columns=id_cols + ['legs_date', 'days_without_track'])

    # One column per mode; combinations that never occur become NaN.
    wide = totals.unstack(level=-1)
    mode_value_cols = list(wide.columns)
    daily = wide.reset_index()
    daily['legs_date'] = pd.to_datetime(daily['user_id_day'].str[-8:], format='%Y%m%d')

    # Continuous calendar per user, from the first to the last observed day.
    span = daily.groupby('user_id_fors')['legs_date'].agg(['min', 'max'])
    calendar = pd.DataFrame(
        [
            (user, day)
            for user, first, last in zip(span.index, span['min'], span['max'])
            for day in pd.date_range(first, last, freq='D')
        ],
        columns=['user_id_fors', 'legs_date'],
    )
    # Both merge keys must share one datetime dtype: merging a datetime key
    # against an object-dtype key is not reliable.
    calendar['legs_date'] = calendar['legs_date'].astype(daily['legs_date'].dtype)

    filled = calendar.merge(daily, on=['user_id_fors', 'legs_date'], how='left')

    # Calendar days without any observed leg have no user_id_day yet.
    untracked = filled['user_id_day'].isna()
    filled['days_without_track'] = untracked.astype(int)
    filled.loc[untracked, 'user_id_day'] = (
        filled.loc[untracked, 'user_id_fors']
        + "_"
        + filled.loc[untracked, 'legs_date'].dt.strftime('%Y%m%d')
    )

    # Only the 'Mode::' columns are zero-filled; other columns keep their NaN.
    zero_fill_cols = [
        col for col in mode_value_cols
        if isinstance(col, str) and col.startswith('Mode::')
    ]
    filled[zero_fill_cols] = filled[zero_fill_cols].fillna(0)

    return filled[id_cols + mode_value_cols + ['legs_date', 'days_without_track']]


#### Fonctions pour l'app streamlit

In [1]:
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from shapely import geometry, ops
from shapely.geometry import MultiLineString, LineString, Point
import os
import concurrent.futures
from shapely.ops import unary_union
from shapely.geometry import JOIN_STYLE, Polygon, MultiPolygon

import pycountry
import xyt

import time

from panel_functions import *

In [2]:
legs_nogeometry = pd.read_pickle('../Streamlit/data/legs_nogeometry.pkl')
usr_stats = pd.read_pickle('../Streamlit/data/usr_stats_nogeometry.pkl')

In [8]:
# SET PARAMETERS

#KT = st.sidebar.selectbox('**Sélectionner le canton pour échantillonnage**', ['GE', 'VD', 'Tous'])
#weight = st.sidebar.selectbox('**Sélectionner la pondération**', ['wgt_cant_trim_gps', 'Aucun']) #,'wgt_agg_trim_gps'
#period_of_tracking = st.sidebar.selectbox("**Sélectionner la période d'observation à considérer\*\***", ['active_days_count', 'days_with_track','days_in_range'])
#
#mode_aggreg = st.sidebar.selectbox("**Sélectionner le niveau d'aggrégation des modes**", 
#                                   ["Motiontag", "MRMT", "Niveau 1", "Niveau 2"])
#
#bad_users= st.sidebar.checkbox('Inclure les utilisateurs avec mauvais signal récurrent', value=False)
#visitors = st.sidebar.checkbox('Inclure les visiteurs', value=False)
#airplane = st.sidebar.checkbox('Inclure les étapes en avion', value=False)
#incl_signal_loss = st.sidebar.checkbox('Inclure les étapes avec une perte de signal importante (recommandé)', value=True)
#outliers = st.sidebar.selectbox('**Exclure les distances extrêmes**', ['Quantile95', 'Quantile98', 'Quantile99', 'Aucune'], index=2)
#mode_col = st.sidebar.selectbox("**Sélectionner la colonne des modes de transport**", ['detected_mode', 'mode'])

mode_col = 'detected_mode'

bad_users = False
period_of_tracking = 'days_in_range'

airplane = False
KT = 'GE'
visitors = False

incl_signal_loss = 'Non'


weight = 'wgt_cant_trim_gps'

outliers = 'Aucune'

mode_aggreg = 'Motiontag'

In [9]:
# SUBSET USERS
# Exploratory, script-level version of the filtering logic that is wrapped in
# calculate_dmd further down; it reads the parameters set in the cell above.

# Keep only users without constant bad signal unless bad_users is set
bad_users_condition = (usr_stats['usr_w_constant_bad_signal'] == 0) if not bad_users else np.full(len(usr_stats), True)

usr_stats_sub_list = usr_stats.loc[bad_users_condition, 'user_id_fors'].to_list()

# Dictionary mapping each retained user ID to its period-of-tracking value
active_days_mapping = usr_stats.set_index('user_id_fors').loc[usr_stats_sub_list, period_of_tracking].to_dict()

# Dictionary mapping each retained user ID to its weight
# If weight is 'Aucun', every user gets the weight 1
if weight == 'Aucun':
    weight_mapping = usr_stats.set_index('user_id_fors').loc[usr_stats_sub_list].apply(lambda x: 1, axis=1).to_dict()
else:
    weight_mapping = usr_stats.set_index('user_id_fors').loc[usr_stats_sub_list, weight].to_dict()

# SUBSET LEGS
legs_sub = legs_nogeometry[legs_nogeometry.user_id_fors.isin(usr_stats_sub_list)].copy()

# Drop airplane legs unless airplane is set (an all-True array otherwise)
airplane_condition = (legs_sub[mode_col] != 'Mode::Airplane') if not airplane else np.full(len(legs_sub), True)


# Residents of canton KT, optionally extended to legs whose activity is in KT
resid_condition = (legs_sub['KT_home_survey'] == KT) if KT != 'Tous' else np.full(len(legs_sub), True)
visit_condition = (legs_sub['activity_in_KT'] == KT) if visitors else np.full(len(legs_sub), True)
if visitors:
    resident_visit_condition = resid_condition | visit_condition
else:
    resident_visit_condition = resid_condition

# Filter legs flagged as low quality (signal loss)
# NOTE(review): signal_loss_threshold is only bound for the two labels below;
# any other value than "Non" reaches the lookup with the name unbound - confirm
# the accepted values.
if incl_signal_loss == "0.05 de perte":
    signal_loss_threshold = 'low_quality_legs_1'
elif incl_signal_loss == "0.07 de perte":
    signal_loss_threshold = 'low_quality_legs_2'
signal_loss_condition = (legs_sub[signal_loss_threshold] == 0) if incl_signal_loss != "Non" else np.full(len(legs_sub), True)

# Exclude legs flagged as extreme for the chosen quantile and mode column
# NOTE(review): same pattern - outlier_threshold is only bound for the three
# quantile labels; "Aucune" skips the lookup.
if outliers == "Quantile95":
    outlier_threshold = f"extreme95_length_{mode_col}"
elif outliers == "Quantile98":
    outlier_threshold = f"extreme98_length_{mode_col}"
elif outliers == "Quantile99":
    outlier_threshold = f"extreme99_length_{mode_col}"

outliers_condition = (~legs_sub[outlier_threshold]) if outliers != "Aucune" else np.full(len(legs_sub), True)

In [None]:
combined_dmd_condition = airplane_condition & resident_visit_condition & signal_loss_condition & outliers_condition

In [10]:
outliers_condition

array([ True,  True,  True, ...,  True,  True,  True])

In [7]:
outliers_condition

0         True
1         True
2         True
3         True
4         True
          ... 
669272    True
669273    True
669274    True
669275    True
669276    True
Name: extreme95_length_detected_mode, Length: 661273, dtype: bool

In [None]:
resident_visit_condition

In [None]:
pd.Series(signal_loss_condition)

In [None]:
airplane_condition

In [None]:
outliers_condition=legs_sub.copy()

In [None]:
legs_sub[mode_col] != 'Mode::Airplane' if not airplane else np.full(len(legs_sub), True)

In [None]:
len(legs_nogeometry)

In [None]:
#Compute daily modal distances
def get_daily_modal_distances(df, mode_col):
    """Sum leg lengths per user, day and mode on a gap-free daily calendar.

    Parameters
    ----------
    df : pandas.DataFrame
        Legs table with the columns ``user_id_fors``, ``user_id_day`` (whose
        last eight characters are the date written as ``YYYYMMDD``),
        ``length_leg`` and ``mode_col``. The frame is not modified.
    mode_col : str
        Name of the column holding the transport mode of each leg.

    Returns
    -------
    pandas.DataFrame
        One row per user and calendar day between the user's first and last
        observed day, sorted by user and day, with the columns
        ``user_id_fors``, ``user_id_day``, one column per mode (summed
        ``length_leg``), ``legs_date`` and ``days_without_track`` (1 for days
        added to close a gap, 0 otherwise). Columns whose name starts with
        ``Mode::`` hold 0 where no distance was recorded. An input without
        any usable row yields an empty frame with the non-mode columns.
    """
    id_cols = ['user_id_fors', 'user_id_day']

    # Total length per (user, user-day, mode); the cast tolerates numeric strings.
    totals = (
        df['length_leg'].astype(float)
        .groupby([df['user_id_fors'], df['user_id_day'], df[mode_col]])
        .sum()
    )
    if totals.empty:
        return pd.DataFrame(columns=id_cols + ['legs_date', 'days_without_track'])

    # One column per mode; combinations that never occur become NaN.
    wide = totals.unstack(level=-1)
    mode_value_cols = list(wide.columns)
    daily = wide.reset_index()
    daily['legs_date'] = pd.to_datetime(daily['user_id_day'].str[-8:], format='%Y%m%d')

    # Continuous calendar per user, from the first to the last observed day.
    span = daily.groupby('user_id_fors')['legs_date'].agg(['min', 'max'])
    calendar = pd.DataFrame(
        [
            (user, day)
            for user, first, last in zip(span.index, span['min'], span['max'])
            for day in pd.date_range(first, last, freq='D')
        ],
        columns=['user_id_fors', 'legs_date'],
    )
    # Both merge keys must share one datetime dtype: merging a datetime key
    # against an object-dtype key is not reliable.
    calendar['legs_date'] = calendar['legs_date'].astype(daily['legs_date'].dtype)

    filled = calendar.merge(daily, on=['user_id_fors', 'legs_date'], how='left')

    # Calendar days without any observed leg have no user_id_day yet.
    untracked = filled['user_id_day'].isna()
    filled['days_without_track'] = untracked.astype(int)
    filled.loc[untracked, 'user_id_day'] = (
        filled.loc[untracked, 'user_id_fors']
        + "_"
        + filled.loc[untracked, 'legs_date'].dt.strftime('%Y%m%d')
    )

    # Only the 'Mode::' columns are zero-filled; other columns keep their NaN.
    zero_fill_cols = [
        col for col in mode_value_cols
        if isinstance(col, str) and col.startswith('Mode::')
    ]
    filled[zero_fill_cols] = filled[zero_fill_cols].fillna(0)

    return filled[id_cols + mode_value_cols + ['legs_date', 'days_without_track']]

In [None]:
#Compute daily modal distances
def calculate_dmd(legs_nogeom, usr_stats, KT, weight, period_of_tracking, bad_users, visitors, airplane, incl_signal_loss, outliers, mode_col):
    """Compute weighted daily modal distances per user.

    Legs are filtered by user quality, mode, canton, signal loss and outlier
    flags. The remaining lengths are summed per user and mode, multiplied by
    the user weight and divided by the user's tracking period.

    Parameters
    ----------
    legs_nogeom : pandas.DataFrame
        Legs table. Columns read here: ``user_id_fors``, ``mode_col``,
        ``KT_home_survey``, ``activity_in_KT`` and, depending on the options,
        ``low_quality_legs_1``/``low_quality_legs_2`` and
        ``extreme<q>_length_<mode_col>``.
    usr_stats : pandas.DataFrame
        User table with ``user_id_fors``, ``usr_w_constant_bad_signal``, the
        ``period_of_tracking`` column and, unless ``weight`` is ``'Aucun'``,
        the ``weight`` column.
    KT : str
        Canton code used for sampling, or ``'Tous'`` for no canton filter.
    weight : str
        Name of the weight column, or ``'Aucun'`` to weight every user by 1.
    period_of_tracking : str
        Name of the ``usr_stats`` column used as the per-user divisor.
    bad_users : bool
        Keep users flagged with constant bad signal when true.
    visitors : bool
        Also keep legs whose ``activity_in_KT`` equals ``KT``.
    airplane : bool
        Keep ``'Mode::Airplane'`` legs when true.
    incl_signal_loss : str
        ``"Non"`` (no filter), ``"0.05 de perte"`` or ``"0.07 de perte"``.
    outliers : str
        ``"Aucune"`` (no filter), ``"Quantile95"``, ``"Quantile98"`` or
        ``"Quantile99"``.
    mode_col : str
        Name of the mode column to aggregate on.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_id_fors`` with one integer column per ``Mode::``
        column. Users with a missing weight, a missing tracking period or a
        tracking period of zero are dropped.

    Raises
    ------
    ValueError
        If ``incl_signal_loss`` or ``outliers`` is not one of the accepted
        values.
    """
    signal_loss_columns = {
        "0.05 de perte": 'low_quality_legs_1',
        "0.07 de perte": 'low_quality_legs_2',
    }
    outlier_prefixes = {
        "Quantile95": "extreme95",
        "Quantile98": "extreme98",
        "Quantile99": "extreme99",
    }

    # Validate the options up front so a typo fails with a clear message.
    if incl_signal_loss != "Non" and incl_signal_loss not in signal_loss_columns:
        raise ValueError(
            f"Invalid incl_signal_loss {incl_signal_loss!r}. "
            f"Expected 'Non' or one of {list(signal_loss_columns)}."
        )
    if outliers != "Aucune" and outliers not in outlier_prefixes:
        raise ValueError(
            f"Invalid outliers {outliers!r}. "
            f"Expected 'Aucune' or one of {list(outlier_prefixes)}."
        )

    # SUBSET USERS
    users = usr_stats.set_index('user_id_fors')
    if not bad_users:
        users = users[users['usr_w_constant_bad_signal'] == 0]

    # Per-user divisor (tracking period) and multiplier (weight).
    active_days_mapping = users[period_of_tracking].to_dict()
    if weight == 'Aucun':
        weight_mapping = dict.fromkeys(users.index, 1)
    else:
        weight_mapping = users[weight].to_dict()

    # SUBSET LEGS
    legs_sub = legs_nogeom[legs_nogeom.user_id_fors.isin(users.index)].copy()

    # Every filter is combined into one boolean Series that shares the index
    # of legs_sub, so `&` can never misalign rows.
    keep = pd.Series(True, index=legs_sub.index)

    if not airplane:
        keep = keep & (legs_sub[mode_col] != 'Mode::Airplane')

    # Residents of the canton, optionally extended to legs ending in the canton.
    if KT != 'Tous':
        in_scope = legs_sub['KT_home_survey'] == KT
        if visitors:
            in_scope = in_scope | (legs_sub['activity_in_KT'] == KT)
        keep = keep & in_scope

    if incl_signal_loss != "Non":
        keep = keep & (legs_sub[signal_loss_columns[incl_signal_loss]] == 0)

    if outliers != "Aucune":
        flag_col = f"{outlier_prefixes[outliers]}_length_{mode_col}"
        # eq(True) treats a missing flag as "not extreme" instead of failing on `~`.
        keep = keep & ~legs_sub[flag_col].eq(True)

    dmd = get_daily_modal_distances(legs_sub[keep], mode_col)

    # Total distance per user for every 'Mode::' column.
    sum_mode_per_user = dmd.filter(like='Mode::').groupby(dmd['user_id_fors']).sum()

    user_index = sum_mode_per_user.index
    user_weights = pd.Series(user_index.map(weight_mapping), index=user_index)
    user_days = pd.Series(user_index.map(active_days_mapping), index=user_index)

    sum_mode_per_user_w = sum_mode_per_user.mul(user_weights, axis=0).div(user_days, axis=0)
    # A tracking period of zero gives +/-inf, which cannot be cast to int:
    # such users are dropped together with those lacking a weight or period.
    sum_mode_per_user_w = sum_mode_per_user_w.replace([np.inf, -np.inf], np.nan).dropna()

    return sum_mode_per_user_w.astype(int)

In [None]:
# Aggregate modes in dmd
def dmd_aggreg_modes(dmd, level):
    """Collapse the ``Mode::`` columns of ``dmd`` into coarser mode groups.

    ``level`` selects the grouping: "Motiontag" returns a plain copy of
    ``dmd``; "MRMT", "Niveau 1" and "Niveau 2" return a new frame holding one
    column per group, each the row-wise sum of the member columns present in
    ``dmd`` (NaN where none of them has a value). The 'Avion' column is
    removed when it is entirely NaN. Any other ``level`` raises ValueError.
    """
    df = dmd.copy()

    # Finest level: nothing to aggregate.
    if level == "Motiontag":
        return df

    groupings = {
        "MRMT": {
            'Voiture conducteur': ['Mode::Car', 'Mode::Carsharing','Mode::Ecar'],
            'Taxi': ['Mode::TaxiUber'],
            '2RM': ['Mode::KickScooter','Mode::Motorbike'],
            'Train': ['Mode::RegionalTrain','Mode::Train'],
            'Bus': ['Mode::Bus'],
            'Tram/Métro': ['Mode::LightRail','Mode::Subway','Mode::Tram'],
            'Bateau': ['Mode::Boat'],
            'Marche': ['Mode::Walk'],
            'Vélo conventionnel': ['Mode::Bicycle', 'Mode::Bikesharing'],
            'Vélo électrique': ['Mode::Ebicycle'],
            'Engins assimilés à des véhicules': ['Mode::Other'],
            'Avion': ['Mode::Airplane'],
        },
        "Niveau 1": {
            'Voiture': ['Mode::Car', 'Mode::Carsharing','Mode::Ecar','Mode::TaxiUber'],
            '2RM': ['Mode::KickScooter', 'Mode::Motorbike'],
            'Train': ['Mode::Train','Mode::RegionalTrain'],
            'Autre TP': ['Mode::Bus','Mode::LightRail','Mode::Subway','Mode::Tram','Mode::Boat'],
            'Marche': ['Mode::Walk'],
            'Vélo': ['Mode::Bicycle', 'Mode::Bikesharing','Mode::Ebicycle'],
            'Autre': ['Mode::Other'],
            'Avion': ['Mode::Airplane'],
        },
        "Niveau 2": {
            'TIM': ['Mode::Car', 'Mode::Carsharing','Mode::Ecar', 'Mode::KickScooter','Mode::Motorbike','Mode::TaxiUber'],
            'TP': ['Mode::Boat','Mode::Bus','Mode::LightRail','Mode::RegionalTrain', 'Mode::Subway','Mode::Train', 'Mode::Tram'],
            'MD': ['Mode::Bicycle', 'Mode::Bikesharing','Mode::Ebicycle', 'Mode::Walk'],
            'Avion': ['Mode::Airplane'],
            'Autre': ['Mode::Other'],
        },
    }

    # Pick the grouping whose name equals the requested level.
    for name, candidate in groupings.items():
        if level == name:
            mode_mapping = candidate
            break
    else:
        raise ValueError("Invalid level. Please choose Motiontag, MRMT, Niveau 1 or Niveau 2 for the desired level.")

    # One aggregated column per group, summing only the members present in df.
    for group_label, members in mode_mapping.items():
        present = [member for member in members if member in df.columns]
        df[group_label] = df[present].sum(axis=1, min_count=1)

    new_dmd = df[list(mode_mapping.keys())].copy()

    # Drop the airplane group when it carries no value at all.
    if 'Avion' in new_dmd.columns and new_dmd['Avion'].isnull().all():
        new_dmd.drop(columns=['Avion'], inplace=True)

    return new_dmd

In [None]:
dmd_w = calculate_dmd(legs_nogeometry, usr_stats, KT, weight, period_of_tracking, bad_users, visitors, airplane, incl_signal_loss, outliers, mode_col)

In [None]:
dmd_w

In [None]:
import pandas as pd

def dmd_aggreg_modes(dmd, level):
    """Collapse the ``Mode::`` columns of ``dmd`` into coarser mode groups.

    ``level`` selects the grouping: "Motiontag" returns a plain copy of
    ``dmd``; "MRMT", "Niveau 1" and "Niveau 2" return a new frame holding one
    column per group, each the row-wise sum of the member columns present in
    ``dmd`` (NaN where none of them has a value). The 'Avion' column is
    removed when it is entirely NaN. Any other ``level`` raises ValueError.
    """
    df = dmd.copy()

    # Finest level: nothing to aggregate.
    if level == "Motiontag":
        return df

    groupings = {
        "MRMT": {
            'Voiture conducteur': ['Mode::Car', 'Mode::Carsharing','Mode::Ecar'],
            'Taxi': ['Mode::TaxiUber'],
            '2RM': ['Mode::KickScooter','Mode::Motorbike'],
            'Train': ['Mode::RegionalTrain','Mode::Train'],
            'Bus': ['Mode::Bus'],
            'Tram/Métro': ['Mode::LightRail','Mode::Subway','Mode::Tram'],
            'Bateau': ['Mode::Boat'],
            'Marche': ['Mode::Walk'],
            'Vélo conventionnel': ['Mode::Bicycle', 'Mode::Bikesharing'],
            'Vélo électrique': ['Mode::Ebicycle'],
            'Engins assimilés à des véhicules': ['Mode::Other'],
            'Avion': ['Mode::Airplane'],
        },
        # NOTE(review): this group also contains Mode::TaxiUber but is labelled
        # 'Voiture conducteur'; an earlier definition of this function in the
        # notebook labels the same group 'Voiture' - confirm which is intended.
        "Niveau 1": {
            'Voiture conducteur': ['Mode::Car', 'Mode::Carsharing','Mode::Ecar','Mode::TaxiUber'],
            '2RM': ['Mode::KickScooter', 'Mode::Motorbike'],
            'Train': ['Mode::Train','Mode::RegionalTrain'],
            'Autre TP': ['Mode::Bus','Mode::LightRail','Mode::Subway','Mode::Tram','Mode::Boat'],
            'Marche': ['Mode::Walk'],
            'Vélo': ['Mode::Bicycle', 'Mode::Bikesharing','Mode::Ebicycle'],
            'Autre': ['Mode::Other'],
            'Avion': ['Mode::Airplane'],
        },
        "Niveau 2": {
            'TIM': ['Mode::Car', 'Mode::Carsharing','Mode::Ecar', 'Mode::KickScooter','Mode::Motorbike','Mode::TaxiUber'],
            'TP': ['Mode::Boat','Mode::Bus','Mode::LightRail','Mode::RegionalTrain', 'Mode::Subway','Mode::Train', 'Mode::Tram'],
            'MD': ['Mode::Bicycle', 'Mode::Bikesharing','Mode::Ebicycle', 'Mode::Walk'],
            'Avion': ['Mode::Airplane'],
            'Autre': ['Mode::Other'],
        },
    }

    # Pick the grouping whose name equals the requested level.
    for name, candidate in groupings.items():
        if level == name:
            mode_mapping = candidate
            break
    else:
        raise ValueError("Invalid level. Please choose Motiontag, MRMT, Niveau 1 or Niveau 2 for the desired level.")

    # One aggregated column per group, summing only the members present in df.
    for group_label, members in mode_mapping.items():
        present = [member for member in members if member in df.columns]
        df[group_label] = df[present].sum(axis=1, min_count=1)

    new_dmd = df[list(mode_mapping.keys())].copy()

    # Drop the airplane group when it carries no value at all.
    if 'Avion' in new_dmd.columns and new_dmd['Avion'].isnull().all():
        new_dmd.drop(columns=['Avion'], inplace=True)

    return new_dmd

In [None]:
# Possible values: 'GE', 'VD', 'Tous'
KT = 'Tous'
# Possible values: 'wgt_agg_trim_gps', 'wgt_cant_gps', 'wgt_agg_gps', 'wgt_cant_trim_gps', 'Aucun'
weight = 'wgt_agg_trim_gps'

# Column of usr_stats used as the per-user tracking period
# Possible values: 'active_days_count', 'days_with_track'
period_of_tracking = 'active_days_count'

visitors = False
airplane = False
# calculate_dmd compares this option with 'Non' (no signal-loss filter),
# '0.05 de perte' and '0.07 de perte'; a boolean is not an accepted value.
incl_signal_loss = 'Non'

# calculate_dmd takes eleven arguments and returns a single DataFrame.
# bad_users, outliers and mode_col are the values set in the parameter cell above.
dmd = calculate_dmd(
    legs_nogeometry,
    usr_stats,
    KT=KT,
    weight=weight,
    period_of_tracking=period_of_tracking,
    bad_users=bad_users,
    visitors=visitors,
    airplane=airplane,
    incl_signal_loss=incl_signal_loss,
    outliers=outliers,
    mode_col=mode_col,
)

dmd

In [None]:
dist_user_id_day = dmd_aggreg_modes(dmd, level='Niveau 1')
dist_user_id_day.head()

In [None]:
# Same computation restricted to the legs flagged as low quality (low_quality_legs_1 == 1)
legs_nogeometry_lql = legs_nogeometry[legs_nogeometry.low_quality_legs_1 == 1].copy().reset_index(drop=True)

# calculate_dmd takes eleven arguments. The subset already contains only
# low-quality legs, so the signal-loss filter is switched off ('Non');
# filtering on low_quality_legs_1 == 0 here would leave nothing.
# bad_users, outliers and mode_col are the values set in the parameter cell above.
dmd_lql = calculate_dmd(
    legs_nogeometry_lql,
    usr_stats,
    KT=KT,
    weight=weight,
    period_of_tracking=period_of_tracking,
    bad_users=bad_users,
    visitors=visitors,
    airplane=airplane,
    incl_signal_loss='Non',
    outliers=outliers,
    mode_col=mode_col,
)

dist_user_id_day_lql = dmd_aggreg_modes(dmd_lql, level='Niveau 1')
dist_user_id_day_lql.head()

In [None]:
dist_user_id_day_lql.sum() / dist_user_id_day.sum()

In [None]:
dist_user_id_day_lql.sum().sum() / dist_user_id_day.sum().sum()

In [None]:
(dist_user_id_day_lql / dist_user_id_day).fillna(0).mean()

In [None]:
len(legs_nogeometry[legs_nogeometry.low_quality_legs_1 == 1]) / len(legs_nogeometry)

In [None]:
sum_mode_per_user_w.head()

In [None]:
len(sum_mode_per_user_w)

In [None]:
modal_share = pd.DataFrame(sum_mode_per_user.sum()) #/ len(sum_mode_per_user))/ sum_mode_per_user.sum().sum() *100 
modal_share.astype(int).rename(columns={0:'Distance_cumulée_metre'}).T

In [None]:
import matplotlib.pyplot as plt

# Plotting a Pie Chart
plt.figure(figsize=(10, 6))
mode_means = sum_mode_per_user_w.sum() / sum_mode_per_user_w.sum().sum()
plt.pie(mode_means, labels=mode_means.index, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Modal Shares')
plt.show()

In [None]:
legs.started_at.min()

In [None]:
legs.started_at.max()

In [None]:
import matplotlib.pyplot as plt


# Plot the polygons with no background, grey lines
fig, ax = plt.subplots(figsize=(8, 8))

# Plot the polygons
perimetre_panel.plot(ax=ax, facecolor='none', linewidth=1)

# Remove axes
ax.set_axis_off()

In [None]:
# Save the plot to a PNG file
output_file = "../Data/temp_files/contour_panel.png"
plt.savefig(output_file, bbox_inches='tight', pad_inches=0, transparent=True)
plt.close()

In [None]:
perimetre_panel#.plot()

In [None]:
TAZ

In [None]:
TAZ[TAZ.N_KT=='VD'].plot()