#  Module 1 : Parts modales

**Description** : Le but de ce module est de mener un premier calcul des parts modales kilométriques i.e. les distances journalières moyennisées par mode et par motif.

**Durée estimée première partie** : 7 jours

**Objectifs spécifiques** :
- [ ] Sous-échantillonnage des résidents et visiteurs par canton (basé sur le GPS)
- [ ] Rééchantillonnage des jours d’observation pour avoir un calendrier continu par usager
- [ ] Intégrer le détail des transits
- [ ] Distinguer de façon aussi systématique que possible les jours sans déplacement des jours
non détectés et comparer statistiquement aux jours sans déplacement dans d’autres bases de
données
- [ ] Recodage des modes et motifs selon besoin des cantons
- [ ] Calcul liminaire des parts modales kilométriques et par déplacements
- [ ] Ajout des données d’équipement (e.g. type de motorisation principale du ménage)
- [ ] Documenter les hypothèses et limites du calcul liminaire des parts modales (e.g. aspects
saisonniers, échantillonnage, perte de signal, moyennisation des données longitudinales, ...)

**Résultats attendus** : Parts modales kilométriques par mode pour les résidents et visiteurs de chaque canton en vue du calcul des émissions carbone. Il doit être possible de calculer les parts modales en tenant compte des jours non-mobiles.

**Sous-échantillonnage** :
- Vaud : résident·es du canton
- Genève : résident·es du canton

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from shapely import geometry, ops
from shapely.geometry import MultiLineString, LineString, Point
import os
import concurrent.futures
from shapely.ops import unary_union
import xyt

import time

### Charger les données

In [None]:
%%time
# Project CRS (EPSG:4326, i.e. WGS84)
target_crs = 'EPSG:4326'
print("CRS du projet: WGS84 \n")

# --- Legs ---
# Ask which leg file to load: the random sample or the full data frame
choice = input("Do you want to load a sample or the full leg data frame? \n Enter 'sample' or 'full': ")

leg_files = {
    'sample': '../Data/time_space_filters/legs_filtered_randsample.pkl',
    'full': '../Data/time_space_filters/legs_filtered.pkl',
}
file_path = leg_files.get(choice.strip().lower())
if file_path is None:
    # Stop here: carrying on would use an undefined (or stale) `file_path`
    raise ValueError("Invalid choice. Please enter 'sample' or 'full'.")

# NOTE: pd.read_pickle unpickles arbitrary objects; only load trusted files
try:
    legs = pd.read_pickle(file_path)
except FileNotFoundError:
    print("File not found. Please check the file path.")
    # Re-raise so the following statements never run on a missing or stale `legs`
    raise
else:
    print("Fichier étape chargé")
legs = gpd.GeoDataFrame(legs, geometry="geometry")

del legs['canton_dep']

# --- Staypoints (activities) ---
staypoints = pd.read_pickle('../Data/time_space_filters/staypoints_filtered.pkl').reset_index(drop=True)
staypoints = gpd.GeoDataFrame(staypoints, geometry="geometry")
print("Fichier activité chargé")

# --- User statistics ---
usr_stats = pd.read_csv('../Data/gps_user_statistics.csv')
print("Fichier statistiques utilisateur·ices chargé")

###  Formater les données

In [None]:
# Staypoints: parse the timestamps, harmonise the id column names and
# expose the x / y coordinates of the geometry as plain columns
staypoints['started_at'] = pd.to_datetime(staypoints['started_at'])
staypoints['finished_at'] = pd.to_datetime(staypoints['finished_at'])
staypoints = staypoints.rename(columns={'IDNO': 'user_id', 'id': 'activity_id'})
staypoints['lon'] = staypoints.geometry.x
staypoints['lat'] = staypoints.geometry.y

# Legs: parse the timestamps and harmonise the id column names
legs['started_at'] = pd.to_datetime(legs['started_at'])
legs['finished_at'] = pd.to_datetime(legs['finished_at'])
legs = legs.rename(columns={'IDNO': 'user_id', 'id': 'leg_id'})

### Ajouter le *next activity_id* aux étapes

In [None]:
# Order both tables chronologically within each user.
# Staypoints get a fresh 0..n-1 index; legs keep their existing index labels.
staypoints = staypoints.sort_values(by=['user_id', 'started_at'], ignore_index=True)
legs = legs.sort_values(by=['user_id', 'started_at'])

In [None]:
# Attach to every leg the id of the staypoint whose `previous_leg_id` equals
# the leg's `leg_id`; legs without such a staypoint get a missing value.
# NOTE(review): assumes at most one staypoint per previous_leg_id, otherwise
# legs are duplicated by the join - confirm.
legs = (
    pd.merge(
        legs,
        staypoints[['activity_id', 'previous_leg_id']],
        how='left',
        left_on='leg_id',
        right_on='previous_leg_id',
    )
    .rename(columns={'activity_id': 'leading_stay_id'})
    .drop(columns='previous_leg_id')
)

###  Ajouter la durée et la longueur des étapes

In [None]:
%%time 
# Leg length, measured on the geometries reprojected to EPSG:2056
legs['length'] = legs.geometry.to_crs('EPSG:2056').length
# Leg duration in seconds, from start to end timestamp
legs['duration'] = legs['finished_at'].sub(legs['started_at']).dt.total_seconds()

### Extraire les aires géographiques et les sous-échantillons (Genève et Vaud)
Nous utilisons les zones de trafic du Modèle Voyageur de l'ARE.

We want to sample :
- all the residents of Canton de Genève
- all the activities that happen in Canton de Genève

In [None]:
%%time

# Canton code to sub-sample (hard-coded here; compared against TAZ.N_KT)
# NOTE(review): `TAZ` and `dom` are not defined in the visible cells - they
# must be loaded before this cell runs.
n_kt = 'GE'

# Merge the traffic zones of the canton into a single geometry
shp_KT = unary_union(TAZ[TAZ.N_KT == n_kt].geometry)

# Ids of the users whose `dom` geometry lies within the canton
list_residents_N_KT = dom.loc[dom.within(shp_KT), 'IDNO'].tolist()

# Legs of these residents
legs_N_KT = legs.loc[legs.user_id.isin(list_residents_N_KT)].copy()

# Ids of the activities reached by these legs. Only a missing
# `leading_stay_id` is discarded: a bare DataFrame.dropna() would also drop
# every leg that has a NaN in any other column.
list_staypoints_residents_N_KT = legs_N_KT['leading_stay_id'].dropna().tolist()

In [None]:
%%time
# Among the residents' activities, keep the ids of those located within the canton
staypoints_N_KT = staypoints.loc[staypoints['activity_id'].isin(list_staypoints_residents_N_KT)]
list_activity_id_in_KT = staypoints_N_KT.loc[staypoints_N_KT.within(shp_KT), 'activity_id'].tolist()

# Flag the legs whose following activity is one of them (1) or not (0)
legs_N_KT['leading_stay_id_in_KT'] = (
    legs_N_KT['leading_stay_id'].isin(list_activity_id_in_KT).astype('int64')
)

In [None]:
# Day key "<user_id>_<YYYYMMDD>" built from the start of each leg.
# strftime yields the same key format that get_daily_modal_distances parses
# and rebuilds, and leaves the key missing (instead of producing
# float-formatted parts) when `started_at` is NaT.
legs_N_KT.insert(
    1,
    'user_id_day',
    legs_N_KT['user_id'] + '_' + legs_N_KT['started_at'].dt.strftime('%Y%m%d'),
)

# Calendar day of the leg as a datetime at midnight, placed before `user_id_day`
legs_N_KT.insert(1, 'leg_date', pd.to_datetime(legs_N_KT['started_at'].dt.date))

In [None]:
%autoreload
# Map the first 2000 (at most) fully populated rows of the in-canton activities
xyt.plot_gps(
    staypoints.loc[staypoints['activity_id'].isin(list_activity_id_in_KT)].dropna().head(2000),
    geo_columns='geometry',
)

In [None]:
# Draw up to 20 distinct users. Sampling the raw `user_id` column would pick
# legs rather than users (favouring users with many legs, possibly returning
# duplicates) and would raise when fewer than 20 rows are available.
distinct_users = legs_N_KT['user_id'].drop_duplicates()
usr = distinct_users.sample(n=min(20, len(distinct_users))).tolist()

# Legs of the sampled users
df_ = legs_N_KT.loc[legs_N_KT.user_id.isin(usr)]
df_.head()

In [None]:
# Time span of the sampled legs: latest leg date minus earliest leg date
df_['leg_date'].max() - df_['leg_date'].min()

In [None]:
import pandas as pd

def get_daily_modal_distances(df):
    """Return the distance per user, day and mode on a gap-free daily calendar.

    Leg lengths are summed per ``user_id`` / ``user_id_day`` / ``mode`` and
    spread into one column per mode. Every calendar day between a user's
    first and last observed day is then added, so that days without any leg
    show up as rows holding 0 in the ``Mode::`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per leg with the columns ``user_id``, ``user_id_day`` (whose
        last 8 characters are the day as ``YYYYMMDD``), ``mode`` and
        ``length``. The frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per user and calendar day, sorted by ``user_id`` and
        ``leg_date``, with the columns ``user_id``, ``user_id_day``, one
        column per mode, ``leg_date`` and ``resample`` (True for the days
        that were added to close a gap). An empty frame with the non-mode
        columns is returned when there is nothing to aggregate.
    """
    empty_result = pd.DataFrame(columns=['user_id', 'user_id_day', 'leg_date', 'resample'])
    if df.empty:
        return empty_result

    # Total length per user, day and mode, with one column per mode
    daily = (
        df['length'].astype(float)
        .groupby([df['user_id'], df['user_id_day'], df['mode']])
        .sum()
        .unstack('mode')
        .reset_index()
    )
    if daily.empty:
        return empty_result

    # The day is encoded in the last 8 characters of the day key
    daily['date'] = pd.to_datetime(daily['user_id_day'].str[-8:], format='%Y%m%d')

    # Continuous calendar: every day between each user's first and last day
    spans = daily.groupby('user_id')['date'].agg(['min', 'max'])
    calendar = pd.concat(
        [
            pd.DataFrame({'user_id': user, 'leg_date': pd.date_range(first, last, freq='D')})
            for user, first, last in spans.itertuples(name=None)
        ],
        ignore_index=True,
    )
    # Keep both merge keys on exactly the same datetime dtype
    calendar['leg_date'] = calendar['leg_date'].astype(daily['date'].dtype)

    # Outer join: calendar days without legs come in with a missing `date`
    filled = pd.merge(
        daily,
        calendar,
        how='outer',
        left_on=['user_id', 'date'],
        right_on=['user_id', 'leg_date'],
    )

    # Rows that only exist in the calendar are the resampled (added) days
    filled['resample'] = filled['date'].isna()
    filled = filled.drop(columns='date')

    # Rebuild the day key "<user_id>_<YYYYMMDD>" for the added days
    filled['user_id_day'] = filled['user_id_day'].fillna(
        filled['user_id'] + '_' + filled['leg_date'].dt.strftime('%Y%m%d')
    )

    # No leg recorded for a mode on a given day means a distance of 0
    mode_columns = [
        col for col in filled.columns
        if isinstance(col, str) and col.startswith('Mode::')
    ]
    if mode_columns:
        filled[mode_columns] = filled[mode_columns].fillna(0)

    return filled.sort_values(by=['user_id', 'leg_date'])


In [None]:
# Daily distance per mode for the sampled users, on a continuous per-user calendar
dmd = get_daily_modal_distances(df_)
dmd

###  Get the mean distance per user in meters

In [None]:
# Distance columns of the daily table (names containing 'Mode::')
mode_columns = dmd.filter(like='Mode::')

# Average per user over all of their days; days holding 0 count in the mean
mean_mode_per_user = mode_columns.groupby(dmd['user_id']).mean()
mean_mode_per_user

###  Get the sum distance per user in meters

In [None]:
import pandas as pd

# Distance columns of the daily table (names containing 'Mode::')
mode_columns = dmd.filter(like='Mode::')

# Total per user over all of their days
sum_mode_per_user = mode_columns.groupby(dmd['user_id']).sum()

# Number of daily rows per user in `dmd`
sum_mode_per_user['days_in_range_count'] = mode_columns.groupby(dmd['user_id']).size()

sum_mode_per_user



In [None]:
# Provider-side user statistics, restricted to the sampled users
user_stat = pd.read_csv('../Data/dumps_fors/UserStatistics.EPFL-Panel.2023-04-24--2023-06-05.csv', sep=';')
stats = user_stat.loc[
    user_stat['IDNO'].isin(usr),
    ['IDNO', 'inactive_days_count', 'days_in_range_count'],
]

# Put them next to the per-user totals; users without statistics keep NaN.
# NOTE(review): `days_in_range_count` exists on both sides, so pandas
# suffixes the two columns with _x / _y - confirm this is intended.
sum_mode_per_user_ = (
    sum_mode_per_user.reset_index()
    .merge(stats, how='left', left_on='user_id', right_on='IDNO')
    .drop(columns='IDNO')
)
sum_mode_per_user_

In [None]:
def get_user_activity_stats(count_act):
    """Summarise, per user, the observation window and the days with activity.

    Parameters
    ----------
    count_act : pandas.DataFrame
        One row per activity with the columns ``user_id`` and ``started_at``
        (datetimes or anything ``pd.to_datetime`` can parse). The frame is
        left untouched.

    Returns
    -------
    pandas.DataFrame
        One row per user with the columns ``user_id``,
        ``first_activity_date`` and ``last_activity_date`` (``datetime.date``
        values), ``days_in_range`` (calendar days from first to last date,
        both included), ``active_days_count`` (distinct dates with at least
        one activity) and ``missing_days`` (days of the range without any
        activity).
    """
    # Work on a private table so the caller's frame is not modified
    days = count_act[['user_id']].copy()
    # Calendar day of each activity, as a datetime at midnight
    days['day'] = pd.to_datetime(pd.to_datetime(count_act['started_at']).dt.date)

    per_user = days.groupby('user_id')['day']
    user_stats = per_user.agg(['min', 'max']).reset_index()

    # Length of the observation window, first and last day included
    user_stats['days_in_range'] = (user_stats['max'] - user_stats['min']).dt.days + 1

    # Distinct days on which the user has at least one activity
    user_stats['active_days_count'] = user_stats['user_id'].map(per_user.nunique())

    # Days of the window without any activity. Comparing the window with the
    # length of its own date range, as done previously, always gave 0.
    user_stats['missing_days'] = user_stats['days_in_range'] - user_stats['active_days_count']

    # Report the window bounds as plain dates
    user_stats['min'] = user_stats['min'].dt.date
    user_stats['max'] = user_stats['max'].dt.date

    return user_stats.rename(
        columns={'min': 'first_activity_date', 'max': 'last_activity_date'}
    )

In [None]:
# Start times of the staypoints belonging to the sampled users
staypoints_ = staypoints.loc[staypoints.user_id.isin(usr),['user_id','started_at']]

# Per-user activity-day statistics for that subset
get_user_activity_stats(staypoints_)

In [None]:
# Staypoint start times of user CH2158, with the calendar day of each start
test = (
    staypoints.loc[staypoints['user_id'].isin(['CH2158']), ['user_id', 'started_at']]
    .assign(date=lambda frame: frame['started_at'].dt.date)
)
test

In [None]:
# Number of distinct calendar days for that user (a missing date counts as one value)
test['date'].nunique(dropna=False)