In [82]:
import pandas as pd
import gzip
import numpy as np
from random import randint

In [83]:
# Value of the `local` column used to select the study population
LOCAL = 'Lausanne'

In [84]:
# Paths of the gzipped CSV inputs (activities, population, trips), relative to the working directory
activity_file = 'data_original/vaud_activities.csv.gz'
population_file = 'data_original/vaud_population.csv.gz'
trip_file = 'data_original/vaud_trips.csv.gz'

# Helper used to load each gzipped CSV input
def read_gzipped_csv(file_path):
    """
    Load a gzip-compressed CSV file into a dataframe.

    :param file_path: Path of the ``.csv.gz`` file to read
    :return: Dataframe parsed by ``pd.read_csv`` with its default options
    """
    # Decompress on the fly in text mode; the handle is closed when the block exits.
    with gzip.open(file_path, mode='rt') as text_stream:
        return pd.read_csv(text_stream)

# Load the raw tables. Population and trips are narrowed to the listed columns,
# then exact duplicate rows are removed; activities are kept as read.
population_columns = ['id', 'age', 'home_x', 'home_y', 'local']
trip_columns = ['Unnamed: 0', 'id', 'mode', 'dep_time', 'trav_time', 'start_link', 'end_link']

activity_vaud = read_gzipped_csv(activity_file)

population_raw = read_gzipped_csv(population_file)
population_vaud = population_raw[population_columns].drop_duplicates()

trip_raw = read_gzipped_csv(trip_file)
trip_vaud = trip_raw[trip_columns].drop_duplicates()

In [85]:
# Individuals whose `local` value equals LOCAL (individual ids are already unique)
is_resident = population_vaud['local'] == LOCAL
population_local = population_vaud.loc[is_resident]

In [86]:
# Activities performed by the selected population
population_local_ids = population_local['id'].unique()
performed_by_local = activity_vaud['id'].isin(population_local_ids)
activity_local = activity_vaud[performed_by_local]

# Discard the activity types that are not used downstream.
# TODO: the home facility ids may be needed later.
excluded_types = ['other', 'pt interaction', 'home']
is_excluded = activity_local['type'].isin(excluded_types)
activity_local_filt = activity_local[~is_excluded]

In [87]:
def stratified_sample(df, column, fraction, random_state=None):
    """
    Perform stratified sampling on df based on column.

    The same fraction is drawn inside every group of `column`, so a group
    contributes roughly ``fraction * group_size`` rows (rounded); very small
    groups may contribute no rows at all.

    :param df: Input dataframe
    :param column: Column name for stratification
    :param fraction: Fraction of rows to sample from each group
    :param random_state: Optional seed (or NumPy random state / generator) passed to pandas;
                         leave as None for a different draw on every call
    :return: Sampled dataframe with a fresh 0..n-1 index; `column` is kept as a regular column
    """
    # groupby(...).sample keeps the stratification column in the result, whereas
    # groupby(...).apply(...) is deprecated for callables that touch the grouping column.
    sampled = df.groupby(column).sample(frac=fraction, random_state=random_state)
    return sampled.reset_index(drop=True)

In [88]:
# Caution: stratified sampling != proportional sampling.
# pandas draws from NumPy's global random state when no seed is passed to `.sample`,
# so seeding it here makes the sample (and the exported CSV) identical on every run.
np.random.seed(42)  # arbitrary fixed seed

activity_local_filt_sampled = (
    stratified_sample(activity_local_filt, 'type', 0.001)  # 0.001 to compare exact / heuristic
    .drop(columns=['start_time', 'end_time'])
)

In [92]:
# Times are expressed in 5-minute steps: 0 = 00:00 // 288 = 24:00 (total len = 289)
# Parameters fixed for each activity type:
#   group        = integer code of the activity type
#   t1           = earliest time to start
#   t2           = latest time to start
#   t3           = max duration
#   min_duration = min duration
#   des_duration = (low, high) bounds, both inclusive, of the randomly drawn desired duration
activity_parameters = {
    # t1 = 7h, t2 = 23h, t3 = 9h, min = 30m, desired between 1h and 5h
    'education': {'group': 1, 't1': 84, 't2': 276, 't3': 108, 'min_duration': 6, 'des_duration': (12, 60)},
    # t1 = 5h, t2 = 23h, t3 = 11h, min = 30m, desired between 2h and 5h
    'work': {'group': 2, 't1': 60, 't2': 276, 't3': 132, 'min_duration': 6, 'des_duration': (24, 60)},
    # t1 = 0h, t2 = 24h, t3 = 11h, min = 30m, desired between 1h and 5h
    'leisure': {'group': 3, 't1': 0, 't2': 288, 't3': 132, 'min_duration': 6, 'des_duration': (12, 60)},
    # t1 = 7h, t2 = 20h, t3 = 11h, min = 30m, desired between 1h and 5h
    'shop': {'group': 4, 't1': 84, 't2': 240, 't3': 132, 'min_duration': 6, 'des_duration': (12, 60)},
}

activity_types = activity_local_filt_sampled['type']

# Fail early with a readable message instead of a NaN-to-int cast error further down.
has_parameters = activity_types.isin(list(activity_parameters))
if not has_parameters.all():
    unexpected = sorted(activity_types[~has_parameters].astype(str).unique())
    raise ValueError(f'No time-window parameters defined for activity types: {unexpected}')

# Fixed parameters: one vectorized lookup per column (the columns also exist when the sample is empty).
for column in ['group', 't1', 't2', 't3', 'min_duration']:
    value_by_type = {name: params[column] for name, params in activity_parameters.items()}
    activity_local_filt_sampled[column] = activity_types.map(value_by_type)

# Desired duration: one uniform integer draw per activity, from a seeded generator so that
# the exported file is reproducible.
rng = np.random.default_rng(42)  # arbitrary fixed seed
activity_local_filt_sampled['des_duration'] = [
    int(rng.integers(*activity_parameters[name]['des_duration'], endpoint=True))
    for name in activity_types
]

int_columns = ['t1', 't2', 't3', 'min_duration', 'des_duration', 'x', 'y', 'group']  # TODO confirm: are x and y in meters?
activity_local_filt_sampled[int_columns] = activity_local_filt_sampled[int_columns].astype(int)

# The row index is unique, so it can be used as an activity id.
assert activity_local_filt_sampled.index.is_unique

In [93]:
# Draw 0.01 % of the local population. The fixed seed keeps the exported sample identical between runs.
# NOTE(review): pandas rounds frac * len(population_local), so a small population can yield an empty sample.
population_local_sample = population_local.sample(frac=0.0001, random_state=42)


In [None]:
# Cast the home coordinates to integers (float values lose their fractional part)
int_columns_2 = ['home_x', 'home_y']
population_local_sample = population_local_sample.astype({name: int for name in int_columns_2})

In [94]:
# Export the sampled activities and the sampled population, without the index column
exports = [
    (activity_local_filt_sampled, 'data_preprocessed/activity.csv'),
    (population_local_sample, 'data_preprocessed/population.csv'),
]
for frame, destination in exports:
    frame.to_csv(destination, index=False)

# TESTS

In [None]:
# Keep only the trips made by the local population.
# `isin` must receive the id values: given the whole dataframe it compares the trip ids
# with the column labels and therefore matches nothing.
trip_local = trip_vaud[trip_vaud['id'].isin(population_local['id'])]

trip_local.head(1)

In [None]:
# `activity_local_filtered` is not defined anywhere in the notebook; the filtered local
# activities are stored in `activity_local_filt`.
print(activity_local_filt['Unnamed: 0'].nunique()) # number of rows (assumes 'Unnamed: 0' is the saved row index — TODO confirm)
print(activity_local_filt['id'].nunique()) # number of inhabitants
print(activity_local_filt['link'].nunique()) # number of links
print(activity_local_filt['facility'].nunique()) # number of facilities
# Author's observation: about 92k facilities for 15k links, i.e. several facilities share a link.
# Rows of a same link do not necessarily carry the same coordinates, but they are always very close.
# Hence: build one row per link with its mean coordinates (the first occurrence would also do),
# to be merged onto the trips afterwards.
link_df = activity_local_filt[['link', 'x', 'y']]
link_df = link_df.groupby('link').mean()
link_df.head(1)

In [None]:
# Attach the link coordinates to each trip: first for the start link, then for the end link.
# Both joins are inner joins, so a trip whose link is absent from link_df is dropped.
trips_with_start = (
    trip_local
    .merge(link_df, left_on='start_link', right_index=True)
    .rename(columns={'x': 'start_x', 'y': 'start_y'})
)
trip_local_merged = (
    trips_with_start
    .merge(link_df, left_on='end_link', right_index=True)
    .rename(columns={'x': 'end_x', 'y': 'end_y'})
)

trip_local_merged.head()

# Next step: add to each trip a column holding the distance computed from these coordinates

In [None]:
# Straight-line distance between the start and end link coordinates
delta_x = trip_local_merged['start_x'] - trip_local_merged['end_x']
delta_y = trip_local_merged['start_y'] - trip_local_merged['end_y']
trip_local_merged['distance'] = np.sqrt(delta_x ** 2 + delta_y ** 2)

# Ensure that trav_time is in timedelta format
trip_local_merged['trav_time'] = pd.to_timedelta(trip_local_merged['trav_time'])

average_duration = trip_local_merged['trav_time'].mean()
# The mean is a Timedelta; convert it explicitly so the printed value really is in minutes.
average_duration_minutes = average_duration.total_seconds() / 60

# Average speed in km/h (assumes the coordinates are in meters — TODO confirm).
# Trips with a zero travel time get NaN instead of an infinite speed, so they do not
# turn the overall mean into inf.
travel_hours = trip_local_merged['trav_time'].dt.total_seconds() / 3600
trip_local_merged['average_speed'] = (trip_local_merged['distance'] / 1000) / travel_hours.where(travel_hours > 0)

# NOTE: some trips appear duplicated, differing only by dep_time or mode (author's observation).

average_distance = trip_local_merged['distance'].mean() / 1000  # convert to km
overall_average_speed = trip_local_merged['average_speed'].mean()  # NaN speeds are skipped

print(f"Average durations of trips in {LOCAL} : {average_duration_minutes:.2f} min")
print("Average Distance: {:.2f} km".format(average_distance))
print("Overall Average Speed: {:.2f} km/h".format(overall_average_speed))


In [None]:
# Work facility (id and coordinates) of each individual, taken from the 'work' activities.
# `activity_local_filtered` is not defined anywhere in the notebook; the filtered local
# activities are stored in `activity_local_filt`.
# TODO: cross-check the result against the home locations.
work_facilities_df = activity_local_filt[activity_local_filt['type'] == 'work'][['id', 'facility', 'x', 'y']].drop_duplicates()

# Left join: individuals without a work activity keep NaN work columns, and an individual
# with several distinct work facilities appears once per facility.
population_local_merged = population_local.merge(work_facilities_df, on='id', how='left')
population_local_merged.rename(columns={'facility': 'work_facility', 'x': 'work_facility_x', 'y': 'work_facility_y'}, inplace=True)

population_local_merged.head(1)

In [None]:
# Small dataframes used to validate the approach on a handful of rows

n = 1 # number of individuals
m = 10 # number of available activities

# `activity_local_filtered` is not defined anywhere in the notebook; the filtered local
# activities are stored in `activity_local_filt`.
activity_local_filtered_sample = activity_local_filt.sample(m)
population_local_merged_sample = population_local_merged.sample(n)

In [None]:
# NOTE(review): this cell used to write to data_preprocessed/activity.csv and
# data_preprocessed/population.csv, the very files the sampled export cell writes earlier in
# the notebook. Running the notebook top to bottom would replace the sampled tables with the
# full ones, so the full tables get their own file names here — adjust if a consumer expects
# the previous names.
# `activity_local_filtered` is not defined anywhere in the notebook; the filtered local
# activities are stored in `activity_local_filt`.
activity_local_filt.to_csv('data_preprocessed/activity_full.csv', index=False)
trip_local.to_csv('data_preprocessed/trip.csv', index=False)
population_local_merged.to_csv('data_preprocessed/population_full.csv', index=False)