In [1]:
import pandas as pd
import gzip
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import random

In [2]:
# City whose inhabitants and activities are extracted.
LOCAL = 'Avenches'
# LOCAL = 'Lausanne'

# Fraction of activities kept per activity type (stratified sampling).
FRACT_ACT = 0.02
# Fraction of the city's population kept in the sample.
FRACT_POP = 0.1

# Seed the global random generators so that every sampling step of the
# notebook (pandas `.sample` draws from `np.random` when no random_state is
# given, and the work-facility choice uses `random.choices`) gives the same
# result after a Restart & Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

#### Generic functions :

In [3]:
def stratified_sample(df, column, fraction, random_state=None):
    """
    Perform stratified sampling on df based on column.

    The same fraction of rows is drawn from every group of ``column``. The
    groups appear in the result in sorted key order and the index is reset
    to 0..n-1.

    ``groupby(...).sample(...)`` is used instead of
    ``groupby(...).apply(lambda x: x.sample(...))`` because ``apply`` operating
    on the grouping column is deprecated in recent pandas releases and the
    grouping column would then be missing from the result.

    :param df: Input dataframe
    :param column: Column name for stratification
    :param fraction: Fraction of rows to sample from each group
    :param random_state: Optional seed / random state forwarded to pandas;
        ``None`` (default) draws from numpy's global random state
    :return: Sampled dataframe
    """
    sampled = df.groupby(column).sample(frac=fraction, random_state=random_state)
    return sampled.reset_index(drop=True)

In [4]:
def round_to_horizon(t):
    """Round a ``datetime.time`` to the closest 5-minute mark.

    Sub-second precision is discarded, so the result always has zero seconds
    and zero microseconds.

    Because a ``datetime.time`` cannot represent 24:00, times that round up to
    midnight (23:57:30 and later) wrap around to 00:00.

    :param t: ``datetime.time`` to round
    :return: rounded ``datetime.time``
    """
    # Whole seconds since midnight; microseconds are intentionally ignored.
    seconds_since_midnight = t.hour * 3600 + t.minute * 60 + t.second

    # Round to the closest 5 minutes (300 seconds)
    rounded_seconds = round(seconds_since_midnight / 300) * 300

    # Adding the offset to a midnight datetime turns it back into a clock
    # time; an offset of 86400 seconds lands on the next day's 00:00.
    return (datetime.min + timedelta(seconds=rounded_seconds)).time()


In [5]:
def time_to_horizon_interval(t):
    """Return the 5-minute horizon index corresponding to a ``datetime.time``.

    The time is rounded to the closest 5-minute mark and expressed as a number
    of 5-minute steps since midnight: 00:00 -> 0, 07:00 -> 84, and so on.
    Times that round up to midnight (23:57:30 and later) map to 288, the index
    used for 24:00, rather than wrapping back to 0; wrapping would produce a
    negative duration for activities that end just before midnight.

    :param t: ``datetime.time`` to convert
    :return: horizon index as an ``int`` between 0 and 288
    """
    # Whole seconds since midnight; microseconds are ignored.
    seconds_since_midnight = t.hour * 3600 + t.minute * 60 + t.second

    # One horizon step is 5 minutes (300 seconds).
    return round(seconds_since_midnight / 300)


In [6]:
def convert_to_time(value):
    """Parse ``value`` as a datetime and return its time-of-day part.

    Used to detect invalid time values (for example hours >= 24): when the
    value cannot be converted, the placeholder ``1`` is returned instead of a
    ``datetime.time``.

    :param value: value understood by ``pd.to_datetime``
    :return: ``datetime.time`` on success, ``1`` when the conversion fails
    """
    try:
        # Try converting the value to datetime and extract the time
        return pd.to_datetime(value).time()
    except Exception:
        # Only ordinary errors are turned into the placeholder; a bare
        # ``except:`` would also swallow KeyboardInterrupt / SystemExit.
        return 1

In [29]:
def get_preferences(id, type, df):
    """Look up one individual's duration and start-time preference for an activity type.

    NOTE(review): a later cell of this notebook defines ``get_preferences``
    again with a third return value; once that cell has run, it replaces this
    definition.

    :param id: identifier matched against the ``id`` column of ``df``
    :param type: activity type matched against the ``type`` column of ``df``
    :param df: dataframe with ``id``, ``type``, ``duration_interval`` and
        ``start_time_interval`` columns
    :return: ``(duration, starting_time)`` taken from the first matching row,
        or the entries of the module-level ``default_durations`` /
        ``default_starting`` dictionaries for ``type`` when no row matches
    """
    is_match = (df['id'] == id) & (df['type'] == type)
    matching_rows = df[is_match]

    if not matching_rows.empty:
        # Several rows may match; only the first one is used.
        return (
            matching_rows['duration_interval'].iloc[0],
            matching_rows['start_time_interval'].iloc[0],
        )

    # No recorded activity of this type for this individual: use the defaults.
    return default_durations[type], default_starting[type]

#### Preprocessing

Read the data and keep only the relevant columns

In [30]:
# Gzipped CSV inputs (activities, population and trips).
activity_file, population_file, trip_file = (
    'Data/Original/vaud_activities.csv.gz',
    'Data/Original/vaud_population.csv.gz',
    'Data/Original/vaud_trips.csv.gz',
)

def read_gzipped_csv(file_path):
    """Read a gzip-compressed CSV file into a dataframe.

    The file is always decoded as UTF-8. Opening it in text mode without an
    explicit encoding would fall back to the platform's locale encoding and
    could garble non-ASCII text on some systems.

    :param file_path: path of the ``.csv.gz`` file
    :return: dataframe with the file's content
    """
    return pd.read_csv(file_path, compression='gzip', encoding='utf-8')

# Activities are loaded with all their columns.
activity_vaud = read_gzipped_csv(activity_file)

# Only the identifier, the home coordinates and the locality of each person are kept.
population_columns = ['id', 'home_x', 'home_y', 'local']
population_vaud = read_gzipped_csv(population_file)[population_columns]

# The trips file is currently not loaded:
# trip_vaud = read_gzipped_csv(trip_file)[['Unnamed: 0', 'id', 'mode', 'dep_time','trav_time','start_link','end_link']].drop_duplicates()

Filter the population by the city

In [31]:
# Keep only the inhabitants whose 'local' value is the configured city.
lives_in_local = population_vaud['local'] == LOCAL
population_local = population_vaud.loc[lives_in_local]

Use the filtered population to extract activities of the same city. Also count the activities by type in this city. 

In [32]:
# Activities performed by the inhabitants of the selected city.
population_local_ids = population_local['id'].unique()
done_by_local = activity_vaud['id'].isin(population_local_ids)
activity_local = activity_vaud[done_by_local]

# Activity types excluded from the analysis; the "nowork" variant excludes 'work' as well.
excluded_types = ['other', 'pt interaction', 'home']
excluded_types_and_work = excluded_types + ['work']

is_excluded = activity_local['type'].isin(excluded_types)
activity_local_filt = activity_local[~is_excluded]

is_excluded_or_work = activity_local['type'].isin(excluded_types_and_work)
activity_local_filt_nowork = activity_local[~is_excluded_or_work]

Sample the activities of this city by keeping the proportion between each type (stratified sampling)

In [33]:
# TODO: maybe use a seed so that the result is always the same?
# Caution: stratified sampling != proportional sampling
activity_local_filt_sampled = stratified_sample(
    df=activity_local_filt_nowork,
    column='type',
    fraction=FRACT_ACT,  # 0.001 to compare exact / heuristic
)

Create new columns for each activity with its characteristics (in anticipation of the initialization)

In [34]:
# Horizon encoding: one step = 5 minutes, 0 = 00:00 // 288 = 24:00 (total len = 289)
# Values fixed for each activity type:
#   group          : group number assigned to the type
#   earliest_start : t1 = earliest time to start
#   latest_start   : t2 = latest time to start
#   max_duration   : t3 = max duration
#   min_duration   : min duration
ACTIVITY_TYPE_PARAMS = {
    'education': {
        'group': 1,
        'earliest_start': 84,   # 7h
        'latest_start': 276,    # 23h
        'max_duration': 132,    # 11h
        'min_duration': 6,      # 30m
    },
    'leisure': {
        'group': 3,
        'earliest_start': 0,    # 0h
        'latest_start': 276,    # 23h
        'max_duration': 132,    # 11h
        'min_duration': 6,      # 30m
    },
    'shop': {
        'group': 4,
        'earliest_start': 84,   # 7h
        'latest_start': 240,    # 20h
        'max_duration': 132,    # 11h
        'min_duration': 6,      # 30m
    },
}
# Order in which the parameter columns are appended to the dataframe.
ACTIVITY_PARAM_COLUMNS = ('group', 'earliest_start', 'latest_start', 'max_duration', 'min_duration')

# A type without parameters would end up with missing values and make the
# integer cast below fail with an unhelpful message, so fail early and clearly.
unknown_types = set(activity_local_filt_sampled['type'].unique()) - set(ACTIVITY_TYPE_PARAMS)
if unknown_types:
    raise ValueError(f"No scheduling parameters defined for activity types: {sorted(unknown_types)}")

# Vectorised lookup: one column-wide mapping per parameter instead of a
# row-by-row loop with one `.at` write per value.
for param_name in ACTIVITY_PARAM_COLUMNS:
    value_by_type = {type_: params[param_name] for type_, params in ACTIVITY_TYPE_PARAMS.items()}
    activity_local_filt_sampled[param_name] = activity_local_filt_sampled['type'].map(value_by_type)

int_columns = ['earliest_start', 'latest_start', 'max_duration', 'min_duration', 'x', 'y', 'group']
activity_local_filt_sampled[int_columns] = activity_local_filt_sampled[int_columns].astype(int)

# The row index is used as the activity id, so it has to be unique.
assert activity_local_filt_sampled.index.is_unique

Sample individuals among the city's inhabitants and convert their home coordinates to `int`

In [35]:
# Draw a random fraction of the city's inhabitants and store their home
# coordinates as integers.
int_columns_2 = ['home_x', 'home_y']
population_local_sample = (
    population_local
    .sample(frac=FRACT_POP)
    .astype(dict.fromkeys(int_columns_2, int))
)

Filter out invalid times (only times with hours < 24 are kept)

In [36]:
# Turn off pandas' chained-assignment check: the next cells assign columns on
# dataframes that were obtained by filtering other dataframes.
pd.options.mode.chained_assignment = None

In [37]:
# 'start_time' and 'end_time' are assumed to be in a format recognized by
# pandas (like 'HH:MM:SS'); they are first parsed as durations since midnight.
clock_columns = ['start_time', 'end_time']
for clock_column in clock_columns:
    activity_local_filt[clock_column] = pd.to_timedelta(activity_local_filt[clock_column].astype(str))

# Keep only the rows whose start and end are both strictly below 24 hours.
# Rows holding NaT are filtered out by the same comparison.
one_day = pd.Timedelta('1 days')
starts_within_day = activity_local_filt['start_time'] < one_day
ends_within_day = activity_local_filt['end_time'] < one_day
activity_local_filt = activity_local_filt[starts_within_day & ends_within_day]


def _timedelta_to_clock_time(value):
    """Turn a ``pd.Timedelta`` shorter than one day into a ``datetime.time``; return anything else unchanged."""
    if isinstance(value, pd.Timedelta) and value < pd.Timedelta(days=1):
        return (datetime.min + value).time()
    return value


# Convert the remaining durations to 'datetime.time' objects.
for clock_column in clock_columns:
    activity_local_filt[clock_column] = activity_local_filt[clock_column].apply(_timedelta_to_clock_time)


Convert the time objects into horizon intervals

In [38]:
# Express start and end as horizon intervals, then derive the duration as
# their difference (each entry may use the columns created by the previous ones).
activity_local_filt = activity_local_filt.assign(
    start_time_interval=lambda frame: frame['start_time'].apply(time_to_horizon_interval),
    end_time_interval=lambda frame: frame['end_time'].apply(time_to_horizon_interval),
    duration_interval=lambda frame: frame['end_time_interval'] - frame['start_time_interval'],
)

For each individual, and for each activity type, add starting time and duration preferences

In [39]:
# Activities performed by the individuals of the population sample.
sampled_ids = population_local_sample['id']
done_by_sampled = activity_local_filt['id'].isin(sampled_ids)
rows_of_interessed = activity_local_filt.loc[done_by_sampled]

In [40]:
# Default values: for each activity type, the mean start interval and the mean
# duration over the city's activities of that type, rounded down to an integer.
def _floored_mean(values):
    """Return the mean of ``values`` rounded down, as a Python ``int``."""
    return int(np.floor(values.mean()))


default_durations = {}
default_starting = {}
facility_types = ['shop', 'leisure', 'education', 'work']
for facility in facility_types:
    is_facility_type = activity_local_filt['type'] == facility
    activities_of_type = activity_local_filt[is_facility_type]
    default_durations[facility] = _floored_mean(activities_of_type['duration_interval'])
    default_starting[facility] = _floored_mean(activities_of_type['start_time_interval'])

In [41]:
def get_preferences(id, type, df):
    """Return duration preference, start-time preference and participation flag.

    :param id: identifier matched against the ``id`` column of ``df``
    :param type: activity type matched against the ``type`` column of ``df``
    :param df: dataframe with ``id``, ``type``, ``duration_interval`` and
        ``start_time_interval`` columns
    :return: ``(duration, starting_time, 1)`` taken from the first matching
        row, or ``(default_durations[type], default_starting[type], 0)`` when
        the individual has no activity of that type
    """
    is_match = (df['id'] == id) & (df['type'] == type)
    matching_rows = df[is_match]

    if not matching_rows.empty:
        # Several rows may match; only the first one is used.
        return (
            matching_rows['duration_interval'].iloc[0],
            matching_rows['start_time_interval'].iloc[0],
            1,
        )

    # The individual did not do this activity: defaults, flagged with 0.
    return default_durations[type], default_starting[type], 0

In [43]:
facility_types = ['shop', 'leisure', 'education', 'work']

# Position of each value inside the tuple returned by get_preferences,
# together with the suffix of the column that receives it.
preference_fields = (('duration', 0), ('starting', 1), ('participation', 2))

for facility in tqdm(facility_types):

    # One (duration, starting time, participation) tuple per sampled individual.
    preferences = population_local_sample.apply(
        lambda person: get_preferences(person['id'], facility, rows_of_interessed),
        axis=1,
    )

    for suffix, position in preference_fields:
        population_local_sample[f'{facility}_{suffix}'] = preferences.apply(
            lambda values, position=position: values[position]
        )

100%|██████████| 4/4 [00:00<00:00,  4.42it/s]


In [44]:
df_work_facilities = activity_local[activity_local['type'] == 'work']
print(f"There are {df_work_facilities['facility'].nunique()} work facilities in {LOCAL}")

work_by_facility = df_work_facilities.groupby('facility')

# Number of work activities recorded at each facility.
work_facilities_count = work_by_facility['id'].count()

# Summary dataframe with the mean coordinates of each facility
facility_coords = work_by_facility[['x', 'y']].mean()

# Dictionary giving the x and y coordinates of each facility
facility_coords_dict = facility_coords.to_dict('index')

# Choose a facility for each individual of the sample, weighted by the
# number of work activities recorded there
N = len(population_local_sample)
facilities = work_facilities_count.index.tolist()
workers = work_facilities_count.tolist()
chosen_facilities = random.choices(facilities, weights=workers, k=N)

# Assign the x, y coordinates and the id of the chosen facility
work_x, work_y = [], []
for chosen in chosen_facilities:
    coords = facility_coords_dict[chosen]
    work_x.append(int(coords['x']))
    work_y.append(int(coords['y']))

population_local_sample['work_x'] = work_x
population_local_sample['work_y'] = work_y
population_local_sample['work_id'] = chosen_facilities

There are 791 work facilities in Avenches


Write the final preprocessed dataframes into .csv files

In [45]:
# Each preprocessed dataframe goes to its own CSV file, without the index column.
for frame, destination in (
    (activity_local_filt_sampled, 'Data/PreProcessed/activity.csv'),
    (population_local_sample, 'Data/PreProcessed/population.csv'),
):
    frame.to_csv(destination, index=False)