#  Générer les statistiques usager

**Description** : Le but de ce notebook est de générer des statistiques sur le suivi des usagers. Un fichier user_statistics a déjà été livré par Motiontag mais celui-ci donne des statistiques sur une période donnée (2023-04-24--2023-06-05), or la date de début et de fin de tracking pour chaque répondant peut être variable. Notre objectif est donc de reproduire le fichier user_statistics par individu.

**Objectifs spécifiques** :
- [ ] Obtenir le nombre de jours avec observation et le nombre de jours non détectés
- [ ] Connaître les périodes d'observation pour chaque personne
- [ ] Connaître le canton de résidence de chaque répondant·e
- [ ] Ajouter des variables fournies par Motiontag sur les devices

**Résultats attendus** : Un fichier avec une ligne par répondant·e.

**Sous-échantillonnage** :
Tout l'échantillon

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

In [None]:
import geopandas as gpd
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

from shapely import geometry, ops
from shapely.geometry import MultiLineString, LineString, Point
import os
import concurrent.futures
from shapely.ops import unary_union
import xyt

import time

from panel_functions import *

### Charger les données

In [None]:
%%time
# Define the project CRS (EPSG:4326, i.e. WGS84); reused by every geo step below
target_crs = 'EPSG:4326'
print("CRS du projet: WGS84 \n")

# Load the legs
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files
legs = pd.read_pickle('../Data/dumps_motiontag/storyline_time_space_filters/legs_filter.pkl')
print("Fichier étape chargé")

# Load the activities (staypoints)
staypoints = pd.read_pickle('../Data/dumps_motiontag/storyline_formated/staypoints.pkl')
#staypoints = gpd.GeoDataFrame(staypoints, geometry="geometry", crs=target_crs)
print("Fichier activité chargé")


# Load the wave-1 mobility questionnaire to get the declared home location
columns_to_keep = ['longitude_Q14', 'latitude_Q14', 'IDNO', 'Pays', 'Numéro_INSEE', 'Numéro_OFS']
home_declared = pd.read_csv('../../Vague1/EPFL_vague1_v4.csv', low_memory=False, usecols=columns_to_keep)
# Drop respondents without a declared address (only rows with longitude > 0 are kept)
home_declared = home_declared[home_declared.longitude_Q14 > 0]
# One point per respondent, built from the declared longitude/latitude.
# Named `home_points` so the `geometry` module imported from shapely is not overwritten.
home_points = [Point(xy) for xy in zip(home_declared['longitude_Q14'], home_declared['latitude_Q14'])]
# Single municipality identifier: Numéro_OFS when Pays == '1', Numéro_INSEE otherwise
# NOTE(review): the test compares against the string '1'; confirm 'Pays' is read as text,
# otherwise every respondent falls into the Numéro_INSEE branch
home_declared['ID_municipality_survey'] = home_declared.apply(lambda x: x['Numéro_OFS'] if x['Pays'] == '1' else x['Numéro_INSEE'], axis=1)
home_declared = home_declared.drop(columns=['Numéro_OFS', 'Numéro_INSEE', 'longitude_Q14','latitude_Q14'])
# Create a GeoDataFrame from the points
home_declared = gpd.GeoDataFrame(home_declared, geometry=home_points, crs=target_crs)
home_declared = home_declared.rename(columns={'geometry':'home_geometry_from_survey','IDNO':'user_id_fors'})
# Declare the renamed column as the active geometry again
home_declared = gpd.GeoDataFrame(home_declared, geometry='home_geometry_from_survey', crs=target_crs)
print("Fichier domicile chargé")

# Load the user_statistics file delivered by Motiontag (not usable as is)
usr_stat_motion_tag  = pd.read_csv('../Data/dumps_motiontag/UserStatistics.EPFL-Panel.2023-04-24--2023-06-05.csv', sep=';')
print("Fichier user_statistics chargé (FORS et Motiontag)")

# Load the weighting file prepared by Alexis Gumy and keep only the columns used here
weights = pd.read_csv('../Data/weights_lasur/data_wgt.csv', sep=',').rename(columns={'IDNO':'user_id_fors'})
w_cols = ['gdr','prof','age_fr','user_id_fors','wgt_cant_gps','wgt_agg_gps','wgt_cant_trim_gps','wgt_agg_trim_gps']
weights = weights[w_cols]
print("Fichier de pondération chargé")

# Load the full wave-1 mobility questionnaire (all columns)
full_survey = pd.read_csv('../../Vague1/EPFL_vague1_v4.csv', low_memory=False)
print("Fichier Questionnaire vague 1 chargé")

# Load the official traffic zones used for geographic sub-sampling https://opendata.swiss/de/dataset/vm-uvek-zones-2017/resource/29b98f2c-42f2-4e72-b8b1-a39500ed0ad0
TAZ = gpd.read_file('../../Vague1/Verkehrszonen_Schweiz_NPVM_2017_shp/Verkehrszonen_Schweiz_NPVM_2017.shp')
TAZ = TAZ[['ID_Agglo', 'N_Agglo', 'N_KT', 'ID_Gem', 'geometry']]
TAZ = TAZ.to_crs(crs=target_crs)
# Repair geometry anomalies with a zero-width buffer
TAZ['geometry'] = TAZ['geometry'].buffer(0)
# Merge the zones into one geometry per N_KT value
shp_KT = TAZ.dissolve(by='N_KT').reset_index()
print("Fichier Zones de traffic chargé")

# Load the correspondence table between Motiontag and FORS user ids
match_userid = pd.read_csv('../Data/dumps_fors/correspondance_user_id.csv')
print("Fichier table de correspondance user id chargé")

# Load the panel perimeter, reproject it to the project CRS and keep the useful columns
perimetre_panel = gpd.read_file('../Data/other_shp/perimetre_panel/perimetre_panel_08.01.24.shp')
perimetre_panel = perimetre_panel.to_crs(crs=target_crs)
perimetre_panel = perimetre_panel[['COMM_ID','COMM_NAME','Typo_panel','geometry']]

###  Aperçu des données

In [None]:
staypoints.head(2)

In [None]:
legs.head(2)

### Calculer les statistiques par répondant·e

####  Get usr activity statistics based on staypoints

In [None]:
len(staypoints.user_id_motiontag.unique())

In [None]:
def get_user_activity_stats(staypoints):
    """
    Summarise, per user, the period covered by a table of events.

    Parameters:
        staypoints (DataFrame): Events (staypoints or legs) with the columns
            'user_id_motiontag', 'user_id_fors' and 'started_at'. The input
            is not modified.

    Returns:
        DataFrame with one row per ('user_id_motiontag', 'user_id_fors') pair
        and the columns, in this order: 'user_id_motiontag', 'user_id_fors',
        'first_activity_date', 'last_activity_date', 'days_in_range'
        (inclusive number of days between first and last date),
        'active_days_count' (distinct dates with at least one event) and
        'days_without_event' (days_in_range - active_days_count).
    """
    # Only the identifiers and the event date are needed; work on a small copy
    events = staypoints[['user_id_motiontag', 'user_id_fors']].copy()
    events['date'] = pd.to_datetime(staypoints['started_at']).dt.date

    # First and last event date per user
    # NOTE: groupby drops rows whose key is missing (pandas default dropna=True)
    user_stats = (
        events.groupby(['user_id_motiontag', 'user_id_fors'])['date']
        .agg(['min', 'max'])
        .reset_index()
    )

    # Inclusive length of the observation period, in days
    period = pd.to_datetime(user_stats['max']) - pd.to_datetime(user_stats['min'])
    user_stats['days_in_range'] = period.dt.days + 1

    # Number of distinct dates with at least one event, per Motiontag user
    active_days = (
        events.groupby('user_id_motiontag')['date']
        .nunique()
        .reset_index(name='active_days_count')
    )
    user_stats = pd.merge(user_stats, active_days, on='user_id_motiontag', how='left')

    # Days inside the period on which nothing was recorded
    user_stats['days_without_event'] = user_stats['days_in_range'] - user_stats['active_days_count']

    # Rename the min/max columns
    return user_stats.rename(columns={'min': 'first_activity_date', 'max': 'last_activity_date'})

In [None]:
usr_stats = get_user_activity_stats(staypoints)

In [None]:
len(usr_stats.user_id_motiontag.unique())

In [None]:
# IMPORTANT Check the value of active_days_count
#id_ = 'CH10068'
#usr_stats[usr_stats.user_id_motiontag == id_]['active_days_count'].iloc[0]

In [None]:
# Must be identical to:
#len(count_act[count_act.user_id == id_].date.unique())

In [None]:
#len(usr_stats)

In [None]:
# Same per-user statistics computed on the legs; the two count columns are renamed
# so they can sit next to the staypoint-based ones (shared columns get _x/_y suffixes)
legs_stats = get_user_activity_stats(legs).rename(
    columns={'active_days_count': 'days_with_track', 'days_without_event': 'days_without_track'}
)
usr_stats = usr_stats.merge(legs_stats, how='left', on=['user_id_motiontag', 'user_id_fors'])

# Users without any leg receive the sentinel value 99
has_no_leg = ~usr_stats['user_id_motiontag'].isin(legs['user_id_motiontag'])
usr_stats.loc[has_no_leg, 'days_without_track'] = 99

usr_stats['days_without_track'] = usr_stats['days_without_track'].astype(int)

#### Get the max period of observation between legs and staypoints

In [None]:
# Convert date columns to datetime (_x comes from the staypoints, _y from the legs)
date_cols = ['first_activity_date_x', 'last_activity_date_x', 'first_activity_date_y', 'last_activity_date_y']
usr_stats[date_cols] = usr_stats[date_cols].apply(pd.to_datetime)

# Observation period = earliest start and latest end over both sources
usr_stats['first_activity_date'] = usr_stats[['first_activity_date_x', 'first_activity_date_y']].min(axis=1)
usr_stats['last_activity_date'] = usr_stats[['last_activity_date_x', 'last_activity_date_y']].max(axis=1)
# Inclusive length of that merged period. Taking the max of the two individual
# ranges would under-count whenever staypoints and legs only partly overlap.
usr_stats['days_in_range'] = (usr_stats['last_activity_date'] - usr_stats['first_activity_date']).dt.days + 1

# Drop the per-source columns
columns_to_drop = ['first_activity_date_x', 'last_activity_date_x', 'days_in_range_x',
                   'first_activity_date_y', 'last_activity_date_y', 'days_in_range_y']
usr_stats = usr_stats.drop(columns=columns_to_drop)

#### Add stats on track and stay confirmation by usr

In [None]:
# Per user: number of legs and number of legs with a non-null 'confirmed_at'
confirmation_track_count = (
    legs.groupby('user_id_motiontag')
    .agg(confirmed_tracks_count=('confirmed_at', 'count'), tracks_count=('leg_id', 'count'))
    .reset_index()
)
confirmation_track_count['unconfirmed_tracks_count'] = (
    confirmation_track_count['tracks_count'] - confirmation_track_count['confirmed_tracks_count']
)
# Share of confirmed legs in percent, truncated to an integer
track_ratio = confirmation_track_count['confirmed_tracks_count'] / confirmation_track_count['tracks_count']
confirmation_track_count['confirmation_tracks_score'] = (track_ratio * 100).astype(int)
confirmation_track_count.head(2)

In [None]:
# Per user: number of staypoints and number of staypoints with a non-null 'confirmed_at'
confirmation_stay_count = (
    staypoints.groupby('user_id_motiontag')
    .agg(confirmed_stay_count=('confirmed_at', 'count'), stay_count=('activity_id', 'count'))
    .reset_index()
)
confirmation_stay_count['unconfirmed_stay_count'] = (
    confirmation_stay_count['stay_count'] - confirmation_stay_count['confirmed_stay_count']
)
# Share of confirmed staypoints in percent, truncated to an integer
stay_ratio = confirmation_stay_count['confirmed_stay_count'] / confirmation_stay_count['stay_count']
confirmation_stay_count['confirmation_stay_score'] = (stay_ratio * 100).astype(int)
confirmation_stay_count.head(2)

In [None]:
# Attach the confirmation statistics (legs first, then staypoints) to the user table
merge_on_col_track = ['user_id_motiontag','confirmed_tracks_count','tracks_count','confirmation_tracks_score']
merge_on_col_stay = ['user_id_motiontag','confirmed_stay_count','stay_count','confirmation_stay_score']
for confirmation_table in (confirmation_track_count[merge_on_col_track],
                           confirmation_stay_count[merge_on_col_stay]):
    usr_stats = usr_stats.merge(confirmation_table, how='left', on='user_id_motiontag')

# Keep the columns in a fixed, readable order
cols = (
    ['user_id_motiontag', 'user_id_fors']
    + ['first_activity_date', 'last_activity_date', 'days_in_range', 'active_days_count']
    + ['days_without_event', 'days_with_track', 'days_without_track']
    + ['confirmed_tracks_count', 'tracks_count', 'confirmation_tracks_score']
    + ['confirmed_stay_count', 'stay_count', 'confirmation_stay_score']
)
usr_stats = usr_stats[cols]

In [None]:
stat_day_confirmation = usr_stats[['confirmed_tracks_count','tracks_count','confirmed_stay_count','stay_count']]
stat_day_confirmation.describe().astype(int)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.patheffects as path_effects


# Black-and-white box plot of the per-user confirmation counts
fig, ax = plt.subplots(figsize=(8, 8))

# Whiskers and caps share the same line style
outline = {'color': 'black', 'linewidth': 1.5}

stat_day_confirmation.plot.box(
    ax=ax,
    notch=True,          # notch showing the confidence interval around the median
    patch_artist=True,   # filled boxes
    boxprops={'facecolor': 'white', 'color': 'black'},
    medianprops={'color': 'black', 'linewidth': 2},
    whiskerprops=dict(outline),
    capprops=dict(outline),
    flierprops={'marker': 'o', 'color': 'black', 'alpha': 0.5},
)

# Title and x label are intentionally blank; the y label is in French
ax.set_title(' ', fontsize=16)
ax.set_xlabel(' ', fontsize=16)
ax.set_ylabel("Nombre d'événements GPS", fontsize=12)

# French names for the four plotted columns
french_labels = ['Etapes \nconfirmées', 'Toutes les \nétapes ', 'Arrêts \nconfirmés', 'Tous les \narrêts']
ax.set_xticklabels(french_labels, fontsize=12)
ax.tick_params(axis='y', labelsize=12)

# Dashed black grid
ax.grid(True, linestyle='--', linewidth=0.7, alpha=0.7, color='black')

# Save the plot as a PNG file
#plt.savefig('../Data/temp_files/box_plot_observations.png', format='png')

plt.show()

In [None]:
usr_stats[['confirmation_tracks_score','confirmation_stay_score']].describe().astype(int)

#### Add users with constant bad signal

In [None]:
len(legs.loc[legs.usr_w_constant_bad_signal == 1, 'user_id_motiontag'].unique().tolist())

In [None]:
# 1 for users having at least one leg with usr_w_constant_bad_signal == 1, else 0
bad_signal_users = legs.loc[legs.usr_w_constant_bad_signal == 1, 'user_id_motiontag']
usr_stats['usr_w_constant_bad_signal'] = usr_stats['user_id_motiontag'].isin(bad_signal_users).astype(int)

#### Add home location from GPS data

In [None]:
%%time 
# Obtenir la localisation du domicile sur la base des données GPS
import pandas as pd
from shapely.geometry import Point
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Step 1: Keep only the staypoints whose purpose is 'home' (as a copy, so `staypoints` is left untouched)
staypoints_ = staypoints.loc[staypoints.purpose == 'home'].copy()#.dropna()

# Step 2: Define a function to cluster points within a specified radius using 'lon' and 'lat' columns
def cluster_points(points_, eps, min_samples):
    """
    Label points with DBSCAN after standardising their coordinates.

    Parameters:
        points_ (DataFrame): DataFrame with 'lon' and 'lat' columns.
        eps (float): DBSCAN neighbourhood radius.
        min_samples (int): Number of neighbours required for a core point.

    Returns:
        Cluster label of each input row (DBSCAN ``labels_``), or an empty
        list when the DataFrame is empty.
    """
    # Nothing to cluster: return an empty list
    if points_.empty:
        return []

    # NOTE(review): DBSCAN runs on the standardised lon/lat, so `eps` is expressed
    # in standardised units, not in degrees -- confirm this is intended
    standardized = StandardScaler().fit_transform(points_[['lon', 'lat']])

    return DBSCAN(eps=eps, min_samples=min_samples).fit(standardized).labels_

# Step 3: Cluster the selected staypoints in a single DBSCAN run (all users together)
# and store the label of each point in a 'cluster' column
staypoints_['cluster'] = cluster_points(staypoints_, eps=0.00045, min_samples=15)

# Step 4: Define a function to calculate the centroid of each cluster
def calculate_centroid(points):
    """
    Return the mean position of a set of points.

    Parameters:
        points (DataFrame): DataFrame with 'lon' and 'lat' columns.

    Returns:
        Point whose x is the mean of 'lon' and whose y is the mean of 'lat'.
    """
    mean_lon, mean_lat = (points[axis].mean() for axis in ('lon', 'lat'))
    return Point(mean_lon, mean_lat)

# Step 5: Collect, for each user, the centroid of the most frequent cluster
most_recurrent_clusters = []

# Step 6: Find the most recurrent cluster of each user and store its centroid
for user_id, user_group in staypoints_.groupby('user_id_motiontag'):
    # DBSCAN labels noise points -1. Noise is not a cluster, so it must not win the
    # vote when the user has real clusters; fall back to all points only when the
    # user has none.
    clustered = user_group[user_group['cluster'] != -1]
    candidates = user_group if clustered.empty else clustered

    most_recurrent_cluster = candidates['cluster'].value_counts().idxmax()

    # Points belonging to the most recurrent cluster
    most_recurrent_group = candidates[candidates['cluster'] == most_recurrent_cluster]

    # Centroid of that group = inferred home location
    centroid = calculate_centroid(most_recurrent_group)

    most_recurrent_clusters.append({'user_id_motiontag': user_id, 'centroid': centroid})

# Step 7: Build the DataFrame; explicit columns keep the merge key present
# even when no user has a 'home' staypoint
most_recurrent_clusters_df = pd.DataFrame(most_recurrent_clusters, columns=['user_id_motiontag', 'centroid'])

# Step 8: Prepare for merging
most_recurrent_clusters_df.rename(columns={'centroid':'home_geometry_from_gps'}, inplace=True)


In [None]:
usr_stats = pd.merge(usr_stats,most_recurrent_clusters_df,how='left', on='user_id_motiontag')


#### Add home location from declaration in survey

In [None]:
home_declared.head(2)

In [None]:
usr_stats = pd.merge(usr_stats,home_declared,how='left', on='user_id_fors')

#### Add administrative boundaries

In [None]:
TAZ.plot()

In [None]:
TAZ.head(2)

In [None]:
%%time
# Spatial join on the declared home: attach the canton, agglomeration and
# municipality attributes of the traffic zone intersecting the point
# NOTE(review): with predicate='intersects', a point lying on a boundary shared by
# several zones would match each of them and duplicate the user's row
usr_stats = gpd.GeoDataFrame(usr_stats, geometry="home_geometry_from_survey", crs=target_crs)
usr_stats = gpd.sjoin(usr_stats, 
                      TAZ.rename(columns={'N_KT':'KT_home_survey','ID_Agglo':'ID_Agglo_survey','N_Agglo':'N_Agglo_survey','ID_Gem':'ID_Commune_survey'}), 
                      how='left', predicate='intersects')
del usr_stats['index_right']
# Best-effort integer cast: errors='ignore' leaves the columns as they are when the cast fails
usr_stats[['ID_Commune_survey','ID_Agglo_survey']] = usr_stats[['ID_Commune_survey','ID_Agglo_survey']].astype(int, errors='ignore') 

# Switch the active geometry to the GPS-inferred home and attach its canton
usr_stats = gpd.GeoDataFrame(usr_stats, geometry="home_geometry_from_gps", crs=target_crs)
usr_stats = gpd.sjoin(usr_stats, 
                      shp_KT[['N_KT','geometry']].rename(columns={'N_KT':'KT_home_gps'}), 
                      how='left', predicate='intersects')
del usr_stats['index_right']
# Make the declared home the active geometry again
usr_stats = gpd.GeoDataFrame(usr_stats, geometry="home_geometry_from_survey", crs=target_crs)

In [None]:
usr_stats.iloc[np.where(usr_stats['user_id_fors'].str.startswith('FR').isna())].head(2)

In [None]:
# Boolean mask of the respondents whose FORS id starts with 'FR'.
# na=False keeps the mask strictly boolean, so .loc does not raise on missing ids.
mask = usr_stats['user_id_fors'].str.startswith('FR', na=False)

# French respondents get fixed labels/codes for the zone attributes
usr_stats.loc[mask, ['KT_home_gps', 'KT_home_survey', 'N_Agglo_survey']] = 'Fra'
usr_stats.loc[mask, ['ID_Agglo_survey', 'ID_Commune_survey']] = 99

# Remaining missing ids are coded 99 too, then both columns are stored as integers
for id_col in ('ID_Commune_survey', 'ID_Agglo_survey'):
    usr_stats[id_col] = usr_stats[id_col].fillna(99).astype(int)

# Calculate 'days_with_track' column
#usr_stats['days_with_track'] = usr_stats['active_days_count'] - usr_stats['days_without_track']


In [None]:
usr_stats.N_Agglo_survey.value_counts()

#### Add typo panel

In [None]:
perimetre_panel[['geometry','Typo_panel']].plot()

In [None]:
# Attach the panel typology ('Typo_panel') of the perimeter polygon intersecting
# the active geometry of usr_stats; the helper column added by sjoin is removed
usr_stats = gpd.sjoin(usr_stats, 
                      perimetre_panel[['geometry','Typo_panel']].rename(columns={'Typo_panel':'typo_panel_home_survey'}), 
                      how='left', predicate='intersects')
del usr_stats['index_right']


In [None]:
# Final column layout, grouped by theme; 'Pays' and 'ID_Commune_survey' are not
# listed and are therefore removed
id_cols = ['user_id_motiontag', 'user_id_fors']
period_cols = ['first_activity_date', 'last_activity_date', 'days_in_range', 'active_days_count',
               'days_without_event', 'days_with_track', 'days_without_track']
quality_cols = ['usr_w_constant_bad_signal',
                'confirmed_tracks_count', 'tracks_count', 'confirmation_tracks_score',
                'confirmed_stay_count', 'stay_count', 'confirmation_stay_score']
home_survey_cols = ['KT_home_survey', 'home_geometry_from_survey', 'ID_municipality_survey',
                    'ID_Agglo_survey', 'typo_panel_home_survey', 'N_Agglo_survey']
home_gps_cols = ['KT_home_gps', 'home_geometry_from_gps']

reorder_cols = id_cols + period_cols + quality_cols + home_survey_cols + home_gps_cols

# Select the columns and keep the declared home as the active geometry
usr_stats = gpd.GeoDataFrame(usr_stats[reorder_cols], geometry="home_geometry_from_survey", crs=target_crs)

#### Ajouter les pondérations

In [None]:
usr_stats = pd.merge(usr_stats, weights, on='user_id_fors', how='left')

In [None]:
usr_stats.head()

In [None]:
len(usr_stats)

#### Ajouter la motorisation des répondant·es

In [None]:
# Labels for the codes of survey question Q5_R
motor_map = {
    1: 'essence',
    2: 'diesel',
    3: 'electrique',
    4: 'hybride rechargeable',
    5: 'hybride nonrechargeable',
    6: 'gaznat_biogaz',
    7: 'GPL'
}

# Select the relevant columns, indexed by respondent id
motorization = (
    full_survey[['IDNO', 'Q4_1_1_R', 'Q4_1_2_R', 'Q5_R']]
    .rename(columns={'Q5_R': 'main_motor', 'IDNO': 'user_id_fors'})
    .set_index('user_id_fors')
    .copy()
)
# Turn the '#NULL!' and '-99' markers into NaN, then convert everything to float
motorization = motorization.replace({'#NULL!': np.nan, '-99': np.nan}).astype(float)
# Sum of the two Q4_1 columns
# NOTE(review): sum(axis=1) skips NaN, so a respondent with both answers missing gets 0
motorization['car_in_HH_count'] = motorization[['Q4_1_1_R', 'Q4_1_2_R']].sum(axis=1)
motorization = motorization.drop(columns=['Q4_1_1_R', 'Q4_1_2_R'])
# Replace the Q5_R codes by their labels
motorization['main_motor'] = motorization['main_motor'].map(motor_map)
# Reset the index BEFORE dropping duplicates: drop_duplicates ignores the index, so
# with the id in the index only one respondent per (main_motor, car_in_HH_count)
# combination would be kept. One row per respondent keeps the merge one-to-one.
motorization = motorization.reset_index().drop_duplicates(subset='user_id_fors')

usr_stats = pd.merge(usr_stats, motorization, on='user_id_fors', how='left')


In [None]:
usr_stats[usr_stats.duplicated()]

In [None]:
len(usr_stats)

In [None]:
usr_stats

#### Sauvegarde du fichier user_statistics

In [None]:
#usr_stats.to_pickle('../Data/processed_feuille_de_route/gps_user_statistics.pkl')
#usr_stats.to_csv('../Data/processed_feuille_de_route/gps_user_statistics.csv')

In [None]:
usr_stats

In [None]:
# Copy of the user statistics without the two home geometries and without the
# Motiontag identifier (usr_stats itself is left unchanged)
usr_stats_nogeometry = usr_stats.copy()

del usr_stats_nogeometry['home_geometry_from_survey']
del usr_stats_nogeometry['home_geometry_from_gps']
del usr_stats_nogeometry['user_id_motiontag']
#usr_stats_nogeometry.to_pickle('../Data/processed_feuille_de_route/usr_stats_nogeometry.pkl')

In [None]:
usr_stats_nogeometry#.columns

# Eléments remarquables de user_stats 

In [None]:
len(usr_stats.loc[usr_stats['days_without_event']>12])

In [None]:
usr_stats['days_without_event'].value_counts().sort_index()

In [None]:
import matplotlib.pyplot as plt

# Number of users for each value of 'days_without_event', ordered by value
days_without_event_freq = usr_stats['days_without_event'].value_counts().sort_index()
days_without_event_freq.plot.line(color='#97C8A6')

# Axis labels and title (French)
plt.xlabel('Jours sans événement (ni déplacement, ni activité)', size=11)
plt.ylabel('Fréquence', size=11)
plt.title('Compte des jours sans événement gps', size=11)

plt.show()


In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.patheffects as path_effects

# Per-user day counts shown in the box plot
day_columns = ['days_in_range', 'active_days_count', 'days_without_track', 'days_without_event']
stat_day_count = usr_stats[day_columns]
#stat_day_count.describe().astype(int)

fig, ax = plt.subplots(figsize=(8, 8))

# Whiskers and caps share the same line style
outline = {'color': 'black', 'linewidth': 1.5}

stat_day_count.plot.box(
    ax=ax,
    notch=True,          # notch showing the confidence interval around the median
    patch_artist=True,   # filled boxes
    boxprops={'facecolor': 'white', 'color': 'black'},
    medianprops={'color': 'black', 'linewidth': 2},
    whiskerprops=dict(outline),
    capprops=dict(outline),
    flierprops={'marker': 'o', 'color': 'black', 'alpha': 0.5},
)

# Title and x label are intentionally blank; the y label is in French
ax.set_title(' ', fontsize=16)
ax.set_xlabel(' ', fontsize=16)
ax.set_ylabel('Jours', fontsize=12)

# French names for the four plotted columns
french_labels = ["Période \n d'observations", 'Jours avec \n activité(s)', 'Jours sans \n déplacement', 'Jours sans \n observation']
ax.set_xticklabels(french_labels, fontsize=12)
ax.tick_params(axis='y', labelsize=12)

# Dashed black grid
ax.grid(True, linestyle='--', linewidth=0.7, alpha=0.7, color='black')

# Save the plot as a PNG file
#plt.savefig('../Data/temp_files/box_plot_observations.png', format='png')

plt.show()

In [None]:
usr_stats.describe()

In [None]:
len(usr_stats[usr_stats.KT_home_survey != usr_stats.KT_home_gps])

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Distance between the GPS-inferred home and the declared home of each user.
# Both geometry columns are reprojected to a metric CRS first.
target_crs_meters = 'EPSG:2056'  # Swiss projected CRS (CH1903+ / LV95)

for source in ('gps', 'survey'):
    usr_stats[f'home_geometry_from_{source}_m'] = usr_stats[f'home_geometry_from_{source}'].to_crs(target_crs_meters)

# Row-wise distance between the two reprojected points
usr_stats['distance_home_gps_survey_m'] = usr_stats['home_geometry_from_gps_m'].distance(usr_stats['home_geometry_from_survey_m'])

# Thresholds against which the distances are compared
distance_thresholds = [0, 250, 500, 1000, 2000, 3000, 4000, 5000, 7500, 10000]

# Number of users whose distance is strictly above each threshold
home_distance = usr_stats['distance_home_gps_survey_m']
count_per_threshold = [
    home_distance[home_distance > threshold].count()
    for threshold in distance_thresholds
]

# Line chart of the counts per threshold
plt.figure(figsize=(8, 6))
plt.plot(distance_thresholds, count_per_threshold, marker='o', linestyle='-', color='skyblue')
plt.xlabel('Seuils de distance entre les deux localisations de domicile (mètres)')
plt.ylabel("Compte de répondant·es qui dépassent le seuil")
plt.title("Distance à vol d'oiseau entre \n le domicile déclaré (questionnaire) et le domicile inféré (GPS)")
plt.grid(True)
plt.xticks(distance_thresholds, rotation=45)
plt.show()


In [None]:
usr_stats['distance_home_gps_survey_m'].plot.hist(bins=100)