In [233]:
import osmnx as ox
import pandas as pd
import geopandas as gpd
import skmob
from skmob.preprocessing import compression, detection
from datetime import timedelta
from shapely.geometry import Point
from geopy.distance import geodesic 

# Data loading and pre-processing

Replace the paths in the cell below with your local paths to "timediariesanswers-0.parquet" and "locationeventpertime_rd".

In [234]:
# Load the raw time-diary answers and the GPS location events from local parquet files.
# NOTE(review): both paths are absolute and machine-specific; adjust them before running.
df_td = pd.read_parquet(r"C:\Users\ninav\Desktop\Data\SU2_contributions_parquet_rd_anonymized\Answers\timediariesanswers\timediariesanswers-0.parquet")
df_locationeventpertime = pd.read_parquet(r"C:\Users\ninav\Desktop\Data\SU2_position_parquet_rd_anonymized-001\Sensors\locationeventpertime_rd")

## Data loading OSM data

In [None]:
# Method copied from https://github.com/chiarap2/MAT_Builder/blob/cd5c73a9ba3c7bbc0297c31664916444c4fbfc06/core/modules/Enrichment.py#L98
def download_poi_osm(list_pois, place):
    """Download named OSM features for a place, one tag key at a time.

    Parameters
    ----------
    list_pois : list of str
        OSM tag keys (e.g. 'amenity', 'shop'). For each key, every feature
        carrying that key is requested (``tags={key: True}``).
    place : str
        Place name handed to ``ox.features_from_place``.

    Returns
    -------
    geopandas.GeoDataFrame
        Frame created in EPSG:4326 holding (a subset of) the columns 'osmid',
        'element_type', 'name', 'name:en', 'geometry' and 'category'.
        An empty frame with those columns is returned when ``list_pois`` is
        empty or no named feature was found (never ``None``).
    """
    # Final list of the columns that are expected to be found in the POI dataframe.
    list_columns_df_poi = ['osmid', 'element_type', 'name', 'name:en', 'geometry', 'category']

    gdf_ = gpd.GeoDataFrame(columns=list_columns_df_poi, crs="EPSG:4326")
    if not list_pois:
        return gdf_

    print(f"Downloading POIs from OSM for the location {place}...")
    downloaded = []
    for key in list_pois:

        print(f"Downloading {key} POIs from OSM...")
        poi = ox.features_from_place(place, tags={key: True})
        print(f"Download completed!")

        # An empty category must not prevent the remaining categories from being downloaded.
        if poi.empty:
            print(f"No POI found for category {key}!")
            continue

        poi = poi.reset_index()
        # Newer osmnx releases are understood to name the index levels 'element'/'id'
        # instead of 'element_type'/'osmid' -- TODO confirm against the installed version.
        if 'osmid' not in poi.columns and {'element', 'id'}.issubset(poi.columns):
            poi = poi.rename(columns={'element': 'element_type', 'id': 'osmid'})

        # The tag column becomes 'category'; a pre-existing 'category' tag column is discarded first.
        poi = poi.drop(columns='category', errors='ignore')
        poi = poi.rename(columns={key: 'category'})
        poi = poi.drop(columns=poi.columns.difference(list_columns_df_poi))

        # Remove the POIs that do not have a name.
        if 'name' not in poi.columns:
            print(f"No POI found for category {key}!")
            continue
        poi = poi.loc[poi['name'].notna()].copy()

        # A bare 'yes' tag value carries no information: use the tag key itself instead.
        poi['category'] = poi['category'].replace({'yes': key})
        downloaded.append(poi)

    # Concatenate every per-category subset onto the (empty) template frame.
    gdf_ = pd.concat([gdf_, *downloaded])
    gdf_ = gdf_.reset_index(drop=True)
    return gdf_

# OSM tag keys to download; every feature carrying one of these keys is requested.
list_pois = ['amenity', 'shop', 'tourism', 'aeroway', 'building', 'historic', 'healthcare', 'landuse', 'office', 'public_transport']
# Place name handed to the OSM query.
poi_place = "Trento"
df_poi = download_poi_osm(list_pois, poi_place)

## Data loading and pre-processing SU2 timediaries

Load timediaries dataset

In [236]:
# Load the time-diary answers and parse the raw 'answertimestamp' values
# (format: YYYYmmddHHMMSS followed by fractional seconds) into a datetime column.
# NOTE(review): the path is absolute and machine-specific; adjust it before running.
df_td = pd.read_parquet(r"C:\Users\ninav\Desktop\Data\SU2_contributions_parquet_rd_anonymized\Answers\timediariesanswers\timediariesanswers-0.parquet")
df_td['answer_datetime']  = pd.to_datetime(df_td['answertimestamp'], format='%Y%m%d%H%M%S%f')

Translate the A2 answers (where are you?) to English

In [237]:
# Parse 'answertimestamp' into 'answer_datetime'.
# NOTE(review): this repeats the statement of the previous cell verbatim; it is harmless but redundant.
df_td['answer_datetime']  = pd.to_datetime(df_td['answertimestamp'], format='%Y%m%d%H%M%S%f')

# Short category names for the verbose English answer labels.
rep = {
    "Shop, supermarket, etc": "Shop",
    "Pizzeria, pub, bar, restaurant": "Restaurant",
    "House (friends, others)": "Others home",
    "Relatives Home": "Relatives home",
    "Movie Theater, Museum, ...": "Cultural",
    # Label produced by translate_it_en for the Italian answer; without this entry
    # Italian "cultural" answers would keep the verbose label instead of "Cultural".
    "Movie Theater, Theater, Museum, ...": "Cultural",
    "Classroom / Laboratory": "Classroom",
    "Classroom / Study hall": "Study hall"
}

# Italian answer labels -> English answer labels.
# Keys are spelled exactly as written here (e.g. 'universit' without accent);
# presumably this matches the raw answers -- verify against the data.
translate_it_en = {
    'Casa, Appartamento, Stanza': 'Home, Apartment, Room',
    'Aula / Laboratorio': 'Classroom / Laboratory',
    'Casa (amici, altri)': 'House (friends, others)',
    'Casa (Genitori/parenti)': 'Relatives Home',
    'Altro luogo in universit (Corridoi cortili, ecc.)': 'Other university place',
    'Aula / Sala studio': 'Classroom / Study hall',
    'All aperto': 'Outdoors',
    'Cinema, Teatro, Museo, ...': 'Movie Theater, Theater, Museum, ...',
    'Altro luogo': 'Other place',
    'Pizzeria, pub, bar, ristorante': 'Pizzeria, pub, bar, restaurant',
    'Posto di lavoro': 'Work place',
    'Mensa': 'Canteen',
    'Negozio, supermercato, ecc': 'Shop, supermarket, etc',
    'Palestra, struttura sportiva': 'Gym',
    'Biblioteca UNITN': 'UNITN Library',
    'Altra Biblioteca': 'Other Library',
}

def get_place_cat(val):
    """Normalise a place answer to its English (and, where defined, shortened) category.

    The value is first translated with ``translate_it_en`` (if it is a known
    Italian label) and then shortened with ``rep`` (if a short name exists).
    Values unknown to both dictionaries are returned unchanged.
    """
    english = translate_it_en.get(val, val)
    return rep.get(english, english)

# Apply the function to the 'A2' answers; the normalised category is stored in the column 'A1_en'.
# NOTE(review): the column is named 'A1_en' although it is derived from 'A2'.
df_td['A1_en'] = df_td['A2'].apply(get_place_cat)

## Data loading and pre-processing SU2 GPS data

Reassign the coordinate columns to fix a data inconsistency in which some users' values are stored in the wrong columns
    In the original DataFrame, the 'latitude', 'longitude' and 'altitude' columns hold each other's values for some users
    (for example, 'latitude' contains longitude coordinates in some rows).
        - For most users the order is correct => LAT, LNG, ALT
        - For the users with ids [25,32,41,43,56,57,58,59,60,61,62] => LNG, ALT, LAT
        - For users with ids [28,36,82,84,86,87,88,92,93,116,126,132] => LAT, ALT, LNG
        - For users with ids [30,34,37,38,42] => ALT, LNG, LAT
        - The rows of the user with id 31 are removed altogether.

In [None]:
def swap_columns(df):
    """Return a cleaned copy of the GPS frame with the coordinate columns put in order.

    For some users the values of 'latitude', 'longitude' and 'altitude' are
    stored in each other's columns. This function moves them back, renames
    'userid' to 'user_id', parses 'timestamp' and sorts by user and time.
    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'userid', 'latitude', 'longitude', 'altitude' and
        'timestamp' (strings in the format YYYYmmddHHMMSS + fractional seconds).

    Returns
    -------
    pandas.DataFrame
        Copy without the rows of user 31, sorted by 'user_id' and 'timestamp'.
    """
    # User 31 is excluded entirely. Work on an explicit copy so that the
    # assignments below never write into a slice of the caller's frame.
    df = df[df['userid'] != 31].copy()

    coord_cols = ['latitude', 'longitude', 'altitude']

    # (user ids, stored columns that hold the true latitude / longitude / altitude)
    layouts = [
        # stored order LNG, ALT, LAT
        ([25, 32, 41, 43, 56, 57, 58, 59, 60, 61, 62], ['altitude', 'latitude', 'longitude']),
        # stored order LAT, ALT, LNG
        ([28, 36, 82, 84, 86, 87, 88, 92, 93, 116, 126, 132], ['latitude', 'altitude', 'longitude']),
        # stored order ALT, LNG, LAT
        ([30, 34, 37, 38, 42], ['altitude', 'longitude', 'latitude']),
    ]
    for user_ids, source_cols in layouts:
        mask = df['userid'].isin(user_ids)
        if mask.any():
            # .to_numpy() makes the assignment positional; assigning a DataFrame
            # would re-align on the column labels and undo the permutation.
            df.loc[mask, coord_cols] = df.loc[mask, source_cols].to_numpy()

    df = df.rename(columns={"userid": "user_id"})
    df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%d%H%M%S%f')
    df = df.sort_values(by=['user_id', 'timestamp'])

    return df

df = swap_columns(df_locationeventpertime)
# Keep only the rows whose 'accuracy' value is at most 20; values that cannot be
# converted to a number become NaN and are dropped as well.
# .copy() yields an independent frame, so columns added later are not written into a slice.
df = df[pd.to_numeric(df['accuracy'], errors='coerce') <= 20].copy()
df

## Add column traj_id
We add a new column, "traj_id", to the existing DataFrame. A new trajectory ID is started whenever the user_id differs from the previous row or the time gap to the previous row exceeds 1 hour. This segments the GPS data into distinct trajectories.

In [None]:
def generate_traj_id(row):
    """Return the trajectory id of `row`, advancing the module-level counter when needed.

    Reads and updates the module-level variables `current_traj_id`,
    `prev_timestamp` and `prev_user_id`, so rows must be passed in order.
    The counter is incremented when the user differs from the previous row or,
    for the same user, when more than one hour has passed since the previous row.
    """
    global current_traj_id, prev_timestamp, prev_user_id

    user_changed = row['user_id'] != prev_user_id
    # The time gap is only evaluated when the user is unchanged (short-circuit `or`).
    if user_changed or (row['timestamp'] - prev_timestamp) > timedelta(hours=1):
        current_traj_id += 1

    # Remember this row for the comparison with the next one.
    prev_user_id = row['user_id']
    prev_timestamp = row['timestamp']

    return current_traj_id

# Initialize the module-level state that generate_traj_id reads and updates.
# It is reset here so that every run of this cell starts counting from 1 again.
current_traj_id = 1
prev_timestamp = df['timestamp'].iloc[0]
prev_user_id = df['user_id'].iloc[0]

# Apply the generate_traj_id function row by row to create the 'traj_id' column.
# The result depends on the row order of df, because each row is compared with the previous one.
df['traj_id'] = df.apply(generate_traj_id, axis=1)
df

# Stop detection

In [None]:
# Create a TrajDataFrame from the DataFrame, telling skmob which columns hold
# the coordinates, the timestamp, the user id and the trajectory id.
tdf = skmob.TrajDataFrame(df, latitude = 'latitude', longitude = 'longitude',
                            datetime = 'timestamp', user_id = 'user_id', trajectory_id='traj_id')

# Compress the trajectories with a spatial radius of 0.2 km.
# NOTE(review): `ctdf` is only displayed here; the stop detection in the next cell
# is called on `tdf`, not on `ctdf` -- confirm that this is intended.
print("Compressing the trajectories...")
ctdf = compression.compress(tdf, spatial_radius_km = 0.2)
ctdf

In [None]:
# Detect the stops (stay locations) of every user on the uncompressed TrajDataFrame `tdf`.
duration = 5    # value specifying the minimum duration of a stop (passed as minutes_for_a_stop).
radius =  40    # value specifying the maximum radius a stop can have (passed as spatial_radius_km).
# NOTE(review): judging by the parameter name, `radius` is interpreted in kilometres;
# 40 km looks very large for a stop -- confirm whether 0.04 (i.e. 40 m) was intended.
stdf = detection.stay_locations(tdf,
                                stop_radius_factor = 0.5,
                                minutes_for_a_stop = duration,
                                spatial_radius_km = radius,
                                leaving_time = True)
stops = pd.DataFrame(stdf)

# Here stops' index contains the IDs of the stops. We reset the index such
# that the old index becomes a column.
stops.reset_index(inplace=True)
stops.rename(columns={'index': 'stop_id'}, inplace=True)
stops

# Combine GPS dataset (stops) and timediaries dataset

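Find the students' homes (or any other place category reported in the time diaries) by linking the stops to the time-diary answers.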

## Approach 1:
We can get the timediary and then get the GPS location at the moment when the time diary was sent.

## Approach 2
We have all stop locations of a user.
We have all timediaries where the user indicated being at home.
We want to connect these two.

In [242]:
def connect_stops_and_td(td_category, stops_df=None, diaries_df=None, min_freq=10):
    """Find, per user, the stop location most often confirmed by a time-diary category.

    A time-diary answer counts for a stop when it belongs to the same user,
    its 'A1_en' value equals `td_category` and its 'answer_datetime' lies
    within the stop's ['datetime', 'leaving_datetime'] interval. For every
    user the (lat, lng) pair with the highest count is kept, provided the
    count is strictly greater than `min_freq`. Ties are resolved in favour of
    the location that was counted first.

    All stops of a user are pooled, and every user is evaluated -- including
    the last one in the frame.

    Parameters
    ----------
    td_category : str
        Category to look for in the 'A1_en' column (e.g. "Home, Apartment, Room").
    stops_df : pandas.DataFrame, optional
        Stops with the columns 'uid', 'datetime', 'leaving_datetime', 'lat', 'lng'.
        Defaults to the module-level `stops`.
    diaries_df : pandas.DataFrame, optional
        Time diaries with the columns 'userid', 'answer_datetime', 'A1_en'.
        Defaults to the module-level `df_td`.
    min_freq : int, optional
        A location is only accepted when its count exceeds this value (default 10).

    Returns
    -------
    dict
        Maps the user id to ``[lat, lng]`` of the selected stop location.
    """
    if stops_df is None:
        stops_df = stops
    if diaries_df is None:
        diaries_df = df_td

    # Only answers of the requested category can ever be counted: filter once, up front.
    matching = diaries_df.loc[diaries_df['A1_en'] == td_category, ['userid', 'answer_datetime']]
    answer_times_by_user = {user: grp['answer_datetime'] for user, grp in matching.groupby('userid')}

    homes_users_new = {}
    for user, user_stops in stops_df.groupby('uid', sort=False):
        answer_times = answer_times_by_user.get(user)
        if answer_times is None:
            continue

        # (lat, lng) -> number of matching answers given during stops at that location.
        freq = {}
        stop_rows = zip(user_stops['datetime'], user_stops['leaving_datetime'],
                        user_stops['lat'], user_stops['lng'])
        for entry_dt, leaving_dt, stop_lat, stop_lng in stop_rows:
            n_answers = int(((answer_times >= entry_dt) & (answer_times <= leaving_dt)).sum())
            if n_answers:
                location = (stop_lat, stop_lng)
                freq[location] = freq.get(location, 0) + n_answers

        if not freq:
            continue

        # max() returns the first location reaching the highest count.
        best = max(freq, key=freq.get)
        if freq[best] > min_freq:
            homes_users_new[user] = [best[0], best[1]]

    return homes_users_new

## Find students home

In [None]:
# Home location per user: {user_id: [lat, lng]}
homes_users_new = connect_stops_and_td("Home, Apartment, Room")

# One record per user; the shapely Point is built as (x=lng, y=lat).
data_to_append = [
    {'user_id': user, 'home_coords': Point(lng, lat), 'home_lat': lat, 'home_lng': lng}
    for user, (lat, lng) in homes_users_new.items()
]

# Build the frame in one step. Passing `columns` keeps the expected columns even
# when no home was found, and avoids concatenating onto an empty frame.
df_homes = pd.DataFrame(data_to_append, columns=['user_id', 'home_coords', 'home_lat', 'home_lng'])
df_homes

Merge stops

In [244]:
# Left-merge the home columns onto the stops (stops.uid == df_homes.user_id);
# stops of users without a detected home get missing values in these columns.
# NOTE(review): this overwrites `stops`, so re-running the cell merges the same columns a second time.
stops = stops.merge(df_homes[['user_id', 'home_coords', 'home_lat', 'home_lng']], left_on='uid', right_on='user_id', how='left')

# Combine stops and df_homes

Add home column to stops dataframe

In [245]:
# Augments the stops dataframe by adding a 'home' column.
# For every stop whose 'uid' has a home in df_homes, the geodesic distance between
# the stop and the home is computed. If it is smaller than 'max_distance_home'
# meters the 'home' column is set to True; otherwise it keeps the value None.

max_distance_home = 10

# Add a 'home' column to stops
stops['home'] = None

# Home coordinates per user as (lat, lng); the first entry wins if a user appears twice.
home_by_user = {}
for home in df_homes.itertuples(index=False):
    home_by_user.setdefault(home.user_id, (home.home_lat, home.home_lng))

for index, row in stops.iterrows():
    uid = row['uid']

    # NOTE(review): user 131 is explicitly skipped; the reason is not documented here.
    if uid == 131 or uid not in home_by_user:
        continue

    # geopy expects points as (latitude, longitude). Passing shapely's
    # Point.coords[0], which is (lng, lat), would measure the wrong distance.
    distance = geodesic(home_by_user[uid], (row['lat'], row['lng'])).meters

    # Check if distance is smaller than 'max_distance_home' meters
    if distance < max_distance_home:
        stops.at[index, 'home'] = True

In [None]:
# Independent copy of the stops that were flagged as being at the user's home.
stops_at_home = stops.loc[stops['home'].eq(True)].copy()
stops_at_home

# Combine stops with POI dataset


In [247]:
# Method copied from https://github.com/chiarap2/MAT_Builder/blob/cd5c73a9ba3c7bbc0297c31664916444c4fbfc06/core/modules/Enrichment.py#L399

def stop_enrichment_with_pois(df_stops, df_poi, suffix, max_distance):
    """Associate every stop with the named POIs lying within `max_distance` of it.

    Both inputs are projected to EPSG:3857; each stop point is buffered by
    `max_distance` and joined (left join) with the POIs touching that buffer.
    Neither input frame is modified.

    Parameters
    ----------
    df_stops : pandas.DataFrame
        Stops with at least the columns 'stop_id', 'lat' and 'lng' (EPSG:4326 coordinates).
    df_poi : geopandas.GeoDataFrame
        POIs with at least the columns 'name', 'osmid', 'element_type' and 'geometry'.
    suffix : str
        Suffix for the POI columns; the POI geometry is kept as 'geometry_<suffix>'.
    max_distance : float
        Buffer radius around the stop point, in EPSG:3857 units.
        NOTE(review): Web-Mercator units differ from true metres away from the equator.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per (stop, POI) pair plus one row for every stop without a POI,
        with a 'distance' column, sorted by 'stop_id' and 'distance'.
    """
    poi_geometry_col = 'geometry_' + suffix

    # Prepare the stops for the subsequent spatial join. The copy guarantees that
    # the caller's frame does not receive the geometry columns added below.
    stops = gpd.GeoDataFrame(df_stops.copy(),
                             geometry=gpd.points_from_xy(df_stops.lng, df_stops.lat),
                             crs="EPSG:4326")
    stops = stops.to_crs('epsg:3857')
    stops['geometry_stop'] = stops['geometry']
    stops['geometry'] = stops['geometry_stop'].buffer(max_distance)

    pois = df_poi.to_crs('epsg:3857')

    # Filter out the POIs without a name! (.copy() so that new columns can be added safely)
    pois = pois.loc[pois['name'].notna(), :].copy()

    # Duplicate the geometry column because we lose it during the sjoin_nearest.
    pois[poi_geometry_col] = pois['geometry']
    pois['element_type'] = pois['element_type'].astype(str)
    pois['osmid'] = pois['osmid'].astype(str)

    # Execute the spatial left join to associate POIs to the stops. The search radius
    # is already encoded in the buffer, hence the (practically zero) max_distance here.
    enriched_stops = stops.sjoin_nearest(pois, max_distance=0.00001, how='left', rsuffix=suffix)

    # Remove the POIs that have been associated with the same stop multiple times.
    enriched_stops = enriched_stops.drop_duplicates(subset=['stop_id', 'osmid'])

    # Compute the distance between the stop point and the POI geometry.
    enriched_stops['distance'] = enriched_stops['geometry_stop'].distance(enriched_stops[poi_geometry_col])

    enriched_stops = enriched_stops.sort_values(['stop_id', 'distance'])
    enriched_stops = enriched_stops.reset_index(drop=True)
    return enriched_stops

In [248]:
# Maximum distance from the centroid of the stops (in meters):
# NOTE(review): the value is used as a buffer radius after projecting to EPSG:3857,
# so it is expressed in that CRS's units -- confirm that this matches "meters" closely enough.
max_distance = 50

print("Executing stop augmentation POIs...")

# We filter and look only at stops that are outside the student's home because we are not interested in finding POIs close to student's home
stops_visits = stops[~stops['stop_id'].isin(stops_at_home['stop_id'])]

# Associate the remaining stops with the downloaded POIs.
stops_enriched = stop_enrichment_with_pois(stops_visits, df_poi, 'poi', max_distance)

Executing stop augmentation POIs...


In [None]:
# Inspect which columns the enrichment produced.
stops_enriched.columns

In [None]:
# Drop columns
stops_enriched = stops_enriched.dropna(subset=['osmid'])

# Fix ID
stops_enriched['uid'] = 'student_' + stops_enriched['uid'].astype(str)
stops_enriched['osmid_uri'] = stops_enriched.apply(lambda row: f'https://www.openstreetmap.org/{row["element_type"]}/{row["osmid"]}', axis=1)
stops_enriched['stop_id'] = 'stop_' + stops_enriched['stop_id'].astype(str)
stops_enriched['tid'] = 'traj_' + stops_enriched['tid'].astype(str)

stops_enriched = stops_enriched[['stop_id', 'uid', 'datetime', 'lat', 'lng', 'tid', 'leaving_datetime', 'home_lat', 'home_lng', 'osmid_uri' ]]
stops_enriched.to_csv("../generated_data/stops_with_closest_poi.csv", index=False)
stops_enriched