# Explorer les valeurs extrêmes
Dans ce notebook nous explorons les valeurs extrêmes liées aux pertes de signal, aux durées par motif et aux distances par mode.
Nous explorons aussi qui sont les usagers qui disparaissent.

In [None]:
%load_ext autoreload

import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from shapely import geometry, ops
from shapely.geometry import MultiLineString, LineString, Point
import os
from shapely import wkb
import binascii
import pandas as pd
import geopandas as gpd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import xyt

import time

## Charger les données

In [None]:
# Load the activities (staypoints) and renumber the rows from 0.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading;
# only point it at dumps produced by this project.
staypoints = (
    pd.read_pickle('../Data/dumps_motiontag/storyline_formated/staypoints.pkl')
    .reset_index(drop=True)
)
print("Fichier activité chargé")

# Load the legs; unlike the staypoints, the index stored in the pickle is kept.
legs = pd.read_pickle('../Data/processed_feuille_de_route/legs.pkl')
print("Fichier étape chargé")

## Durée des activités aux waypoints

In [None]:
# Keep only the columns needed for this section and derive the length of each
# stay in minutes (finished_at - started_at, converted from seconds).
durations = staypoints[['purpose', 'started_at', 'finished_at']].assign(
    duration=lambda stays: (stays['finished_at'] - stays['started_at']).dt.total_seconds() / 60
)
durations.head(3)

In [None]:
# Stays longer than 1440 min (24 h) whose purpose is anything other than 'home'.
durations.loc[durations['duration'].gt(1440) & durations['purpose'].ne('home')]

In [None]:
# Violin plot of the stay duration for each purpose.
fig, ax = plt.subplots(figsize=(10, 6))

# Purposes are ordered left to right by increasing mean duration; the mean is
# taken over ALL stays, including those dropped from the plot below.
sorted_order = durations.groupby('purpose')['duration'].mean().sort_values().index

# Only stays shorter than 1440 min (24 h) are drawn; cut=0 stops each violin at
# the observed extremes, and the single-colour palette keeps the violins white.
sns.violinplot(
    data=durations.loc[durations['duration'].lt(1440)],
    x='purpose',
    y='duration',
    hue='purpose',
    order=sorted_order,
    palette=['white'],
    inner=None,
    cut=0,
    linewidth=1.5,
    ax=ax,
)

# Title and axis labels
ax.set_title("Distribution de la durées des 'waypoints' par motif \n tronqué à 1440 min (24h) \n", fontsize=12)
ax.set_xlabel("Motif de l'étape \n", fontsize=12)
ax.set_ylabel('Durée (minutes) \n', fontsize=12)

# Tick labels: slanted and right-aligned on x so long purpose names stay readable
plt.xticks(rotation=45, fontsize=12, ha='right')
plt.yticks(fontsize=12)

# Optional: hide the legend (disabled)
#plt.legend().set_visible(False)

# Optional: export the figure as PNG (disabled)
#plt.savefig('../Data/temp_files/duration_distribution_per_purpose.png', format='png',bbox_inches='tight')

# Render the figure
plt.show()

In [None]:
# Share of staypoints per purpose, in percent. It is attached to the summary
# further down, where pandas aligns it on the purpose index.
total_count = durations['purpose'].value_counts(normalize=True) * 100

# Descriptive statistics of the duration (minutes) for each purpose.
# The quartiles are requested explicitly: passing `percentiles` replaces the
# pandas default [.25, .5, .75], so with only [0.95, 0.98] the '25%' and '75%'
# labels renamed below never matched a column and the quartiles were missing.
summary = durations.groupby('purpose')['duration'].describe(
    percentiles=[0.25, 0.5, 0.75, 0.95, 0.98]
)
summary = summary.rename(columns={
    'mean': 'Durée moyenne [min]',
    'std': 'Écart-type',
    'min': 'Durée min  [min]',
    '25%': '1er Quartile',
    '50%': 'Médiane',
    '75%': '3e Quartile',
    '95%': '95e Percentile',
    '98%': '98e Percentile',
    'max': 'Durée max  [min]'
})

# Replace the raw count by the share of occurrences
summary['Occurence (%)'] = total_count
del summary['count']

# Keep only one decimal place
summary = summary.round(1)

# Longest mean duration first
summary = summary.sort_values(by='Durée moyenne [min]', ascending=False)

summary

## Modes détectés et modes inférés

In [None]:
# List the columns available in the legs table before selecting from it.
legs.columns

In [None]:
# Columns used to compare the `detected_mode` column with the `mode` column,
# together with the confirmation / update timestamps.
infer = legs.loc[:, [
    'detected_mode',
    'mode',
    'misdetected_completely',
    'confirmed_at',
    'updated_at',
]]
infer.head()

In [None]:
# Fraction of legs whose `mode` differs from `detected_mode`
# (a missing value on either side counts as a difference).
int(infer['detected_mode'].ne(infer['mode']).sum()) / len(infer)

In [None]:
# Legs whose `mode` differs from `detected_mode`.
infer.loc[infer['detected_mode'].ne(infer['mode'])]

In [None]:
# Fraction of legs with no `confirmed_at` value.
int(infer['confirmed_at'].isna().sum()) / len(infer)

In [None]:
# Number of legs with no `updated_at` value.
int(infer['updated_at'].isna().sum())

In [None]:
%autoreload
xyt.plot_gps(legs[legs.misdetected_completely=='t'].rename(columns={'user_id_fors':'user_id'}), geo_columns='geometry')


In [None]:
# Legs flagged misdetected_completely == 't', keeping only rows without any missing value.
(
    legs.loc[legs['misdetected_completely'].eq('t')]
    .rename(columns={'user_id_fors': 'user_id'})
    .dropna()
)

In [None]:
# Relative frequency of each value of the `misdetected_completely` flag.
legs['misdetected_completely'].value_counts(normalize=True)

## Distances des étapes par mode

In [None]:
# List the columns of the legs table again before building the distance view.
legs.columns

In [None]:
# Per-leg view with the columns needed for the length / duration statistics;
# `duration` is finished_at - started_at expressed in minutes.
distances = legs[[
    'leg_id',
    'started_at',
    'finished_at',
    'length',
    'detected_mode',
    'mode',
    'misdetected_completely',
]].assign(
    duration=lambda leg: (leg['finished_at'] - leg['started_at']).dt.total_seconds() / 60
)
distances.head()

In [None]:
# Explore
#distances[(distances.length < 10000) & (distances['mode'] == 'Mode::Airplane')]

In [None]:
# Statistical summary of the leg duration (minutes) grouped by `detected_mode`.
# The quartiles are requested explicitly: passing `percentiles` replaces the
# pandas default [.25, .5, .75], so with only [0.95, 0.98] the '25%' and '75%'
# labels renamed below never matched a column.
summary = distances.groupby('detected_mode')['duration'].describe(
    percentiles=[0.25, 0.5, 0.75, 0.95, 0.98]
)
# Drop the fractional part as the former `.astype(int)` did, but through the
# nullable Int64 dtype: a mode with a single leg has a NaN standard deviation,
# which a plain int cast rejects with an error.
summary = np.trunc(summary).astype('Int64')
summary = summary.rename(columns={
    'mean': 'Durée moyenne [min]',
    'std': 'Écart-type',
    'min': 'Durée min  [min]',
    '25%': '1er Quartile',
    '50%': 'Médiane',
    '75%': '3e Quartile',
    '95%': '95e Percentile',
    '98%': '98e Percentile',
    'max': 'Durée max  [min]'
})
summary

In [None]:
# Statistical summary of the leg `length` grouped by `detected_mode`.
# The quartiles are requested explicitly: passing `percentiles` replaces the
# pandas default [.25, .5, .75], so with only [0.95, 0.98] the '25%' and '75%'
# labels renamed below never matched a column.
summary = distances.groupby('detected_mode')['length'].describe(
    percentiles=[0.25, 0.5, 0.75, 0.95, 0.98]
)
# Drop the fractional part as the former `.astype(int)` did, but through the
# nullable Int64 dtype: a mode with a single leg has a NaN standard deviation,
# which a plain int cast rejects with an error.
summary = np.trunc(summary).astype('Int64')
summary = summary.rename(columns={
    'mean': 'Distance moyenne [m]',
    'std': 'Écart-type',
    'min': 'Distance min  [m]',
    '25%': '1er Quartile',
    '50%': 'Médiane',
    '75%': '3e Quartile',
    '95%': '95e Percentile',
    '98%': '98e Percentile',
    'max': 'Distance max [m]'
})
summary

In [None]:
# Statistical summary of the leg `length` grouped by `mode`, restricted to the
# legs whose `misdetected_completely` flag is 'f'.
# The quartiles are requested explicitly: passing `percentiles` replaces the
# pandas default [.25, .5, .75], so with only [0.95, 0.98] the '25%' and '75%'
# labels renamed below never matched a column.
summary = distances[distances.misdetected_completely == 'f'].groupby('mode')['length'].describe(
    percentiles=[0.25, 0.5, 0.75, 0.95, 0.98]
)
# Drop the fractional part as the former `.astype(int)` did, but through the
# nullable Int64 dtype: a mode with a single leg has a NaN standard deviation,
# which a plain int cast rejects with an error.
summary = np.trunc(summary).astype('Int64')
summary = summary.rename(columns={
    'mean': 'Distance moyenne [m]',
    'std': 'Écart-type',
    'min': 'Distance min  [m]',
    '25%': '1er Quartile',
    '50%': 'Médiane',
    '75%': '3e Quartile',
    '95%': '95e Percentile',
    '98%': '98e Percentile',
    'max': 'Distance max [m]'
})
summary

In [None]:
# Map the suspicious airplane legs: mode 'Mode::Airplane' with length_leg below 10000.
# NOTE(review): the distance summaries read the `length` column whereas this
# filter reads `length_leg` — confirm both columns exist and which one is meant.
extreme_plane = (
    legs.loc[legs['mode'].eq('Mode::Airplane') & legs['length_leg'].lt(10000)]
    .rename(columns={'user_id_fors': 'user_id'})
)
xyt.plot_gps(extreme_plane, geo_columns='geometry')


In [None]:
# Airplane legs with length_leg below 1000.
# NOTE(review): the map cell uses a 10000 threshold — confirm 1000 is intended here.
legs.loc[legs['mode'].eq('Mode::Airplane') & legs['length_leg'].lt(1000)]

## Précision de détection

In [None]:
# Total number of legs (rows of the legs table).
legs.shape[0]

In [None]:
# Pourcentage de trace gps validées
len(legs[['mode','detected_mode','confirmed_at']].dropna()) / len(legs)

In [None]:
# Keep the legs that have a confirmation timestamp ...
condition1 = legs['confirmed_at'].notna()
# ... and whose confirmation came less than 48 hours after `created_at`.
condition2 = (
    pd.to_datetime(legs['confirmed_at']) - pd.to_datetime(legs['created_at'])
).dt.total_seconds().div(3600).lt(48)

df = legs.loc[condition1 & condition2, ['mode', 'detected_mode']]

# Per `mode`: number of legs where `detected_mode` equals `mode`
match_counts = df.loc[df['mode'].eq(df['detected_mode'])].groupby('mode').size()

# Per `mode`: total number of legs
total_counts = df.groupby('mode').size()

# Ratio of the two (a fraction in [0, 1]); a mode without any match is absent
# from `match_counts`, so its ratio comes out as NaN and is set to 0.
similarity_percentage = match_counts.div(total_counts).fillna(0)

pd.DataFrame(similarity_percentage)

## Pertes de signal

## Discontinuités temporelles