In [1]:
import gzip
from datetime import datetime, time, timedelta
from random import randint

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# City to extract: this value is compared with the `local` column of the population table.
# Swap the active line to switch city.
# LOCAL = 'Avenches'
LOCAL = 'Lausanne'

#### Generic functions :

In [3]:
def stratified_sample(df, column, fraction, random_state=None):
    """
    Perform stratified sampling on df based on column.

    Each group of `column` is sampled independently with the same fraction, so
    the relative weight of every group is preserved (up to rounding: a group
    that is too small for the fraction contributes no rows).

    :param df: Input dataframe
    :param column: Column name for stratification
    :param fraction: Fraction of rows to sample from each group
    :param random_state: Optional seed (or random generator) forwarded to pandas
        to make the sample reproducible; None keeps the draw random
    :return: Sampled dataframe with all original columns and a fresh 0..n-1 index
    """
    # GroupBy.sample keeps every column, including the grouping one, whereas
    # groupby.apply on the grouping column is deprecated behaviour in pandas.
    sampled = df.groupby(column).sample(frac=fraction, random_state=random_state)
    return sampled.reset_index(drop=True)

In [4]:
def round_to_horizon(t):
    """
    Round a time of day to the closest 5-minute mark.

    :param t: datetime.time to round
    :return: datetime.time on a 5-minute boundary, with seconds and
        microseconds set to zero. Times that round up to 24:00 wrap around
        to 00:00, because a datetime.time cannot represent 24:00.
    """
    # Whole seconds since midnight; any sub-second part is ignored so that it
    # cannot leak into the rounded result.
    seconds_since_midnight = t.hour * 3600 + t.minute * 60 + t.second

    # Round to the closest 5 minutes (300 seconds)
    rounded_seconds = round(seconds_since_midnight / 300) * 300

    # Anchor on a fixed date instead of today's date: only the time part is
    # returned, so the calendar day is irrelevant.
    return (datetime.min + timedelta(seconds=rounded_seconds)).time()


In [5]:
def time_to_horizon_interval(t):
    """
    Return the 5-minute horizon interval corresponding to a time of day.

    The day is split into 5-minute steps: interval 0 is 00:00 and interval 288
    is 24:00. The time is rounded to the closest step, so times in the last
    minutes of the day map to 288 rather than wrapping back to 0 (a wrap would
    make late end times look earlier than their start times).

    :param t: datetime.time to convert
    :return: int in the range 0..288
    """
    # Whole seconds since midnight (sub-second precision is irrelevant here)
    seconds_since_midnight = t.hour * 3600 + t.minute * 60 + t.second

    # One interval = 300 seconds; round to the closest interval
    return round(seconds_since_midnight / 300)


In [6]:
def convert_to_time(value):
    """
    Convert a raw time value to a datetime.time, flagging invalid values.

    :param value: Value to convert (typically an 'HH:MM:SS' string)
    :return: datetime.time when the value can be parsed, otherwise the
        placeholder string "out_of_range" (e.g. for hours >= 24)
    """
    # Already converted: return as-is so that applying the conversion twice
    # does not turn valid times into "out_of_range".
    if isinstance(value, time):
        return value
    try:
        # Try converting the value to datetime and extract the time
        return pd.to_datetime(value).time()
    except (ValueError, TypeError, AttributeError, OverflowError):
        # Parsing failures only; unrelated errors (e.g. KeyboardInterrupt)
        # are no longer swallowed.
        return "out_of_range"

In [7]:
def get_preferences(id, type, df):
    """
    Look up an individual's duration and starting-time preference for one activity type.

    :param id: Individual identifier, matched against df['id']
    :param type: Activity type, matched against df['type']
    :param df: Dataframe with 'id', 'type', 'duration_interval' and
        'start_time_interval' columns
    :return: Tuple (duration, starting_time). When the individual has no row
        for this activity type, the module-level defaults for the type are
        returned instead.
    """
    matches = df.loc[(df['id'] == id) & (df['type'] == type)]

    if not matches.empty:
        # Several rows may match; only the first one is used
        return (
            matches['duration_interval'].iloc[0],
            matches['start_time_interval'].iloc[0],
        )

    return default_durations[type], default_starting[type]

#### Preprocessing

Read the data and drop the irrelevant columns

In [8]:
# Specify the file paths (gzip-compressed CSV inputs, given relative to the working directory)
activity_file = 'data_original/vaud_activities.csv.gz'
population_file = 'data_original/vaud_population.csv.gz'
trip_file = 'data_original/vaud_trips.csv.gz'

# Read the gzipped CSV files
def read_gzipped_csv(file_path):
    """
    Load a gzip-compressed CSV file into a dataframe.

    :param file_path: Path of the .csv.gz file
    :return: Dataframe parsed by pandas with its default CSV options
    """
    # Open in text mode so pandas receives decoded lines
    with gzip.open(file_path, 'rt') as handle:
        return pd.read_csv(handle)

# Load the three raw tables; population and trips keep only the columns used
# downstream and have exact duplicate rows removed.
activity_vaud = read_gzipped_csv(activity_file)
population_vaud = (
    read_gzipped_csv(population_file)
    .loc[:, ['id', 'age', 'home_x', 'home_y', 'local']]
    .drop_duplicates()
)
trip_vaud = (
    read_gzipped_csv(trip_file)
    .loc[:, ['Unnamed: 0', 'id', 'mode', 'dep_time', 'trav_time', 'start_link', 'end_link']]
    .drop_duplicates()
)

Filter the population by the city

In [9]:
# Keep only the inhabitants of the selected city (individual ids are already unique)
population_local = population_vaud.loc[population_vaud['local'].eq(LOCAL)]
# print(len(population_local))
# population_local.head(1)

Use the filtered population to extract activities of the same city. Also count the activities by type in this city. 

In [10]:
# Activities performed by the inhabitants of the selected city
population_local_ids = population_local['id'].unique()
activity_local = activity_vaud[activity_vaud['id'].isin(population_local_ids)]

# Drop the activity types that are not modelled.
# The explicit copy makes activity_local_filt an independent dataframe, so the
# columns added to it by later cells are not written onto a slice of activity_vaud.
# NOTE: the home facility ids may be needed later?
activity_local_filt = activity_local[
    ~activity_local['type'].isin(['other', 'pt interaction', 'home'])
].copy()
# count_act_by_types = activity_local_filt.groupby('type')['facility'].nunique().reset_index()
# print(f"Here's the count of facilities by types in {LOCAL} : {count_act_by_types}")
# print(len(activity_local_filt))
# activity_local_filt.head(2)

Sample the activities of this city by keeping the proportion between each type (stratified sampling)

In [11]:
# TODO: maybe set a seed so that the result is always the same?
# Caution: stratified sampling != proportional sampling
activity_local_filt_sampled = stratified_sample(
    df=activity_local_filt,
    column='type',
    fraction=0.0001,  # 0.001 to compare exact / heuristic
)
# activity_local_filt_sampled.drop(columns=['start_time', 'end_time'], inplace=True)
# print(len(activity_local_filt))
# print(len(activity_local_filt_sampled))
# count_act_by_types_sampled = activity_local_filt_sampled.groupby('type')['facility'].nunique().reset_index()
# print(f"Here's the count of facilities by types in the sample : {count_act_by_types_sampled}")
# activity_local_filt_sampled.head()

Create new columns for each activity with its characteristics (in anticipation of the initialization)

In [12]:
# Time is expressed in 5-minute horizon intervals:
# 0 = 00:00 // 288 = 24:00 (total len = 289)
#
# Fixed values for each activity type:
#     group          = numeric identifier of the activity type
#     earliest_start = earliest time to start
#     latest_start   = latest time to start
#     max_duration   = maximum duration
#     min_duration   = minimum duration
# (the desired duration `des_duration` is currently not generated)
ACTIVITY_PARAMETERS = {
    # 7h -> 23h, between 30m and 11h
    'education': {'group': 1, 'earliest_start': 84, 'latest_start': 276, 'max_duration': 132, 'min_duration': 6},
    # 5h -> 23h, between 30m and 11h
    'work': {'group': 2, 'earliest_start': 60, 'latest_start': 276, 'max_duration': 132, 'min_duration': 6},
    # 0h -> 23h, between 30m and 11h
    'leisure': {'group': 3, 'earliest_start': 0, 'latest_start': 276, 'max_duration': 132, 'min_duration': 6},
    # 7h -> 20h, between 30m and 11h
    'shop': {'group': 4, 'earliest_start': 84, 'latest_start': 240, 'max_duration': 132, 'min_duration': 6},
}

# Fail early, with a readable message, on a type that has no parameters
# (it would otherwise surface as a NaN-to-int cast error below).
unknown_types = set(activity_local_filt_sampled['type']) - set(ACTIVITY_PARAMETERS)
if unknown_types:
    raise ValueError(
        f"No parameters defined for activity types: {sorted(map(str, unknown_types))}"
    )

# Vectorised lookup: one column per parameter, filled from the activity type.
# Unlike a row-by-row loop, this creates the columns even when the sample is empty.
for parameter in ('group', 'earliest_start', 'latest_start', 'max_duration', 'min_duration'):
    activity_local_filt_sampled[parameter] = activity_local_filt_sampled['type'].map(
        {activity_type: values[parameter] for activity_type, values in ACTIVITY_PARAMETERS.items()}
    )

# int_columns = ['earliest_start', 'latest_start', 'max_duration', 'min_duration', 'des_duration', 'x', 'y', 'group']
int_columns = ['earliest_start', 'latest_start', 'max_duration', 'min_duration', 'x', 'y', 'group']
activity_local_filt_sampled[int_columns] = activity_local_filt_sampled[int_columns].astype(int)

# Check that the index can serve as a unique activity id:
# print(len(activity_local_filt_sampled))
# print(activity_local_filt_sampled.index.nunique())

Sample individuals among the city's inhabitants and convert their home coordinates to `int`

In [13]:
# Draw a random 0.01 % sample of the local population (no seed is set) and
# cast the home coordinates to integers.
int_columns_2 = ['home_x', 'home_y']
population_local_sample = (
    population_local
    .sample(frac=0.0001)
    .astype({column: int for column in int_columns_2})
)
# print(len(population_local))
# print(len(population_local_sample))
# population_local_sample.head(1)
# activity_local_filt_sampled.head()

Filter out invalid times (keep only times with hours < 24)

In [14]:
# Disable pandas' chained-assignment (SettingWithCopy) warning for the rest of the notebook.
# NOTE(review): presumably needed because later cells add columns to filtered dataframes — confirm.
pd.set_option('mode.chained_assignment', None)

In [15]:
# JUST RUN ONCE !!! (takes about 2m17 with a 0.0001 sample of Lausanne)
# The two columns are overwritten in place, so re-running this cell feeds the
# already-converted values back into convert_to_time.
# Convert 'start_time' and 'end_time' columns to datetime.time format : JUST RUN ONCE
activity_local_filt['start_time'] = activity_local_filt['start_time'].apply(convert_to_time)
activity_local_filt['end_time'] = activity_local_filt['end_time'].apply(convert_to_time)

# Filter out rows with "out_of_range" value (the placeholder returned by
# convert_to_time when a value cannot be converted)
activity_local_filt = activity_local_filt[
    (activity_local_filt['start_time'] != "out_of_range") & 
    (activity_local_filt['end_time'] != "out_of_range")
]

Convert the time objects into horizon intervals (5-minute steps)

In [16]:
# Express start and end times as 5-minute horizon intervals
activity_local_filt['start_time_interval'] = activity_local_filt['start_time'].map(time_to_horizon_interval)
activity_local_filt['end_time_interval'] = activity_local_filt['end_time'].map(time_to_horizon_interval)

# Duration in intervals = end interval - start interval
activity_local_filt['duration_interval'] = activity_local_filt['end_time_interval'].sub(
    activity_local_filt['start_time_interval']
)

For each individual, and for each activity type, add starting time and duration preferences

In [17]:
# activity_local_filt.head()

In [18]:
# population_local_sample.head()

In [19]:
# Activities performed by the sampled individuals only
rows_of_interessed = activity_local_filt.loc[
    activity_local_filt['id'].isin(population_local_sample['id'])
]
# rows_of_interessed.head()

In [20]:
# Default values: simply the mean horizon interval per activity type
# (for both the starting time and the duration), rounded down.
# default_durations = {'shop': 216, 'leisure': 216, 'work': 108, 'education': 108}
# default_starting = {'shop': 12, 'leisure': 12, 'work': 48, 'education': 48}
# default_durations = {'shop': 0, 'leisure': 0, 'work': 0, 'education': 0}
# default_starting = {'shop': 0, 'leisure': 0, 'work': 0, 'education': 0}

facility_types = ['shop', 'leisure', 'work', 'education']

default_durations = {
    facility: int(np.floor(
        activity_local_filt.loc[activity_local_filt['type'] == facility, 'duration_interval'].mean()
    ))
    for facility in facility_types
}
default_starting = {
    facility: int(np.floor(
        activity_local_filt.loc[activity_local_filt['type'] == facility, 'start_time_interval'].mean()
    ))
    for facility in facility_types
}

print(default_starting)
print(default_durations)

{'shop': 163, 'leisure': 180, 'work': 119, 'education': 126}
{'shop': 6, 'leisure': 20, 'work': 65, 'education': 46}


In [21]:
# For every activity type, add two columns to the sampled population:
# `<type>_duration` and `<type>_starting`.
facility_types = ['shop', 'leisure', 'work', 'education']
for facility in tqdm(facility_types): 
    # get_preferences returns a (duration, starting time) tuple for each individual;
    # it falls back to the per-type defaults when the individual has no activity of that type
    preferences = population_local_sample.apply(
        lambda row: get_preferences(row['id'], facility, rows_of_interessed), axis=1
    )
    
    # Split the tuples into the two preference columns
    population_local_sample[f'{facility}_duration'] = preferences.apply(lambda x: x[0])
    population_local_sample[f'{facility}_starting'] = preferences.apply(lambda x: x[1])

100%|██████████| 4/4 [00:00<00:00, 79.36it/s]


In [22]:
# population_local_sample.head()

Write the final preprocessed dataframes into .csv files

In [23]:
# Export both preprocessed tables without their index column
activity_local_filt_sampled.to_csv(path_or_buf='data_preprocessed/activity.csv', index=False)
population_local_sample.to_csv(path_or_buf='data_preprocessed/population.csv', index=False)

# TEST CODE TO REMOVE

In [None]:
# Attach the coordinates of the start link, then those of the end link.
# NOTE(review): `trip_local` and `link_df` are not defined in the visible part of the notebook — confirm.
trip_local_merged = (
    trip_local
    .merge(link_df, left_on='start_link', right_index=True)
    .rename(columns={'x': 'start_x', 'y': 'start_y'})
    .merge(link_df, left_on='end_link', right_index=True)
    .rename(columns={'x': 'end_x', 'y': 'end_y'})
)

trip_local_merged.head()

# TODO: for each trip, create a new column holding the distance computed directly from the activities dataframe

In [None]:
# Straight-line (Euclidean) distance between the start and end coordinates
trip_local_merged['distance'] = np.sqrt((trip_local_merged['start_x'] - trip_local_merged['end_x']) ** 2 +
                                (trip_local_merged['start_y'] - trip_local_merged['end_y']) ** 2)
# Ensure that trav_time is in timedelta format
trip_local_merged['trav_time'] = pd.to_timedelta(trip_local_merged['trav_time'])

average_duration = trip_local_merged['trav_time'].mean()  # Timedelta; converted to minutes when printed

# Travel time in hours; zero-length trips are masked so that they yield NaN
# (ignored by the mean) instead of an infinite speed.
travel_hours = trip_local_merged['trav_time'].dt.total_seconds() / 3600
travel_hours = travel_hours.where(travel_hours > 0)

# Now, calculate the average_speed (distance is divided by 1000 to convert it to km)
trip_local_merged['average_speed'] = (trip_local_merged['distance'] / 1000) / travel_hours

# print(trip_local_merged.head(5)) # There are duplicates via the dep time or mode !!!

average_distance = trip_local_merged['distance'].mean() / 1000  # convert to km
overall_average_speed = trip_local_merged['average_speed'].mean()

# The duration is printed in minutes so that the "m" suffix is accurate
print(f"Average durations of trips in {LOCAL} : {average_duration.total_seconds() / 60:.2f}m")
print("Average Distance: {:.2f} km".format(average_distance))
print("Overall Average Speed: {:.2f} km/h".format(overall_average_speed))


In [None]:
# Work facility (id and coordinates) of each individual; to be cross-checked against the home locations.
# NOTE(review): `activity_local_filtered` is not defined in the visible part of the notebook — confirm the intended dataframe.
work_facilities_df = (
    activity_local_filtered
    .loc[activity_local_filtered['type'] == 'work', ['id', 'facility', 'x', 'y']]
    .drop_duplicates()
)

# Left join: individuals without a work activity keep empty work-facility columns
population_local_merged = (
    population_local
    .merge(work_facilities_df, on='id', how='left')
    .rename(columns={'facility': 'work_facility', 'x': 'work_facility_x', 'y': 'work_facility_y'})
)

population_local_merged.head(1)