# Downsampling

In [49]:
import pandas as pd
import geopy.distance

In [50]:
# A position is kept only when it lies more than this many meters from the last kept one
MIN_DISTANCE_METERS = 1_000

In [51]:
def traj_downsampling(traj, min_distance_meters=None):
    """Thin out a trajectory so that consecutive kept points are far enough apart.

    The first row is always kept. Every later row is compared with the most
    recently *kept* row (not with its immediate predecessor) and is kept only
    when the geodesic distance between the two is strictly greater than the
    threshold.

    Parameters
    ----------
    traj : pandas.DataFrame
        Rows of one trajectory, in travel order, with ``LATITUDE`` and
        ``LONGITUDE`` columns (handed to geopy as (latitude, longitude) pairs).
    min_distance_meters : float, optional
        Minimum spacing in meters. When omitted, the notebook-level
        ``MIN_DISTANCE_METERS`` is used.

    Returns
    -------
    list of pandas.Series
        The kept rows, in their original order. Empty for an empty input.
    """
    # An empty trajectory has no first point to seed the filter with;
    # previously this raised IndexError on traj.iloc[0].
    if len(traj) == 0:
        return []

    if min_distance_meters is None:
        min_distance_meters = MIN_DISTANCE_METERS

    # Read the coordinates once as arrays. Building a full row Series through
    # .iloc for every point, only to read two floats from it, is very slow on
    # long trajectories.
    latitudes = traj["LATITUDE"].to_numpy()
    longitudes = traj["LONGITUDE"].to_numpy()

    # Positional indices of the rows to keep; the first point is always kept.
    kept_positions = [0]
    last_kept_point = (latitudes[0], longitudes[0])

    for i in range(1, len(traj)):
        current_point = (latitudes[i], longitudes[i])

        # Distance to the last *kept* point, in meters
        distance = geopy.distance.geodesic(last_kept_point, current_point).meters

        if distance > min_distance_meters:
            kept_positions.append(i)
            last_kept_point = current_point

    # Materialize row Series only for the points that survived the filter.
    return [traj.iloc[i] for i in kept_positions]

In [52]:
# Load the raw position records (one row per recorded position)
df = pd.read_csv(filepath_or_buffer="input_data/denmark_positions.csv")
df

Unnamed: 0,POSITION_ID,POSITION_TRIP_ID,LATITUDE,LONGITUDE
0,0,8,55.508922,15.458938
1,1,8,55.504283,15.456170
2,2,8,55.503398,15.455532
3,3,8,55.500105,15.452538
4,4,8,55.500080,15.452507
...,...,...,...,...
8533101,8533101,5000,54.556833,13.914500
8533102,8533102,5000,54.557515,13.913878
8533103,8533103,5000,54.564587,13.906743
8533104,8533104,5000,54.583500,13.887500


In [53]:
# Distinct trip IDs, in order of first appearance; the cell displays how many there are
unique_trip_ids = pd.unique(df["POSITION_TRIP_ID"])
len(unique_trip_ids)

4404

In [54]:
# Downsample every trip and collect the kept rows into one flat list.
#
# groupby(sort=False) partitions the frame in a single pass and yields the
# trips in order of first appearance (the order Series.unique() gives), with
# each group's rows in their original order. Filtering with a boolean mask
# per trip ID instead rescans the whole frame once for every trip.
# Rows whose trip ID is missing are skipped by groupby's default dropna=True.
downsampled_trajectories = []

for trip_id, all_rows_for_traj in df.groupby("POSITION_TRIP_ID", sort=False):
    downsampled_traj = traj_downsampling(all_rows_for_traj)
    downsampled_trajectories.extend(downsampled_traj)

In [57]:
# Assemble the kept rows into a frame; each row keeps its original index label.
downsampled_trajectories_df = pd.DataFrame(downsampled_trajectories)

# Single text column of the form "<latitude>, <longitude>"
downsampled_trajectories_df["LAT_LONG"] = (
    downsampled_trajectories_df["LATITUDE"].astype(str)
    + ", "
    + downsampled_trajectories_df["LONGITUDE"].astype(str)
)

# Restore integer ID columns by casting this frame's own values rather than
# assigning whole columns from `df` and relying on implicit index alignment.
# Presumably the IDs were upcast to float when the rows were pulled out one
# at a time next to the float coordinates; that round trip is exact for IDs
# below 2**53. Confirm if IDs can ever be larger.
downsampled_trajectories_df = downsampled_trajectories_df.astype(
    {"POSITION_ID": int, "POSITION_TRIP_ID": int}
)

downsampled_trajectories_df.to_csv("input_data/denmark_positions_downsampled.csv")
downsampled_trajectories_df

Unnamed: 0,POSITION_ID,POSITION_TRIP_ID,LATITUDE,LONGITUDE,LAT_LONG
0,0,8,55.508922,15.458938,"55.508922, 15.458938"
3,3,8,55.500105,15.452538,"55.500105, 15.452538"
9,9,8,55.492128,15.444578,"55.492128, 15.444578"
14,14,8,55.482085,15.438820,"55.482085, 15.43882"
23,23,8,55.471972,15.432787,"55.471972, 15.432787"
...,...,...,...,...,...
8533070,8533070,5000,54.534398,13.936615,"54.534398, 13.936615"
8533086,8533086,5000,54.542833,13.928500,"54.542833, 13.9285"
8533096,8533096,5000,54.550905,13.920560,"54.550905, 13.92056"
8533103,8533103,5000,54.564587,13.906743,"54.564587, 13.906743"


In [58]:
# TRIPS table: one row per trip ID that is present in the downsampled positions
unique_trip_ids = downsampled_trajectories_df["POSITION_TRIP_ID"].unique()

# Name the single column at construction time
trip_tab = pd.DataFrame(unique_trip_ids, columns=["TRIP_ID"])

trip_tab.to_csv("input_data/denmark_trips_downsampled.csv")
trip_tab

Unnamed: 0,TRIP_ID
0,8
1,9
2,10
3,11
4,24
...,...
4399,4996
4400,4997
4401,4998
4402,4999
