In [13]:
import polars as pl
import pickle
import math

In [14]:
my_path = r"C:\Users\20222612\Downloads\data_AIS_Custom_01062021_30112021_CarFisHigMilPasPleSaiTan_600_99999999_0.pkl"

In [15]:
max_traj = 5000 #TODO Remove

# Flatten data

### Read Pickle file

In [16]:
trajectories = []

with (open(my_path, "rb")) as openfile:
    #TODO Remove max_traj
    while True and len(trajectories) < max_traj:
        try:
            trajectories.append(pickle.load(openfile))
        except EOFError:
            break

### Flatten data

In [17]:
# Flatten the lists and create separate rows for each value
flattened_data = []

for index, entry in enumerate(trajectories, start=1):
    traj_id = index
    mmsi = entry['mmsi']
    shiptype = entry['shiptype']
    track_length = entry['track_length']
    lat_values = entry['lat']
    lon_values = entry['lon']
    speed_values = entry['speed']
    course_values = entry['course']
    timestamp_values = entry['timestamp']
    
    for lat, lon, speed, course, timestamp in zip(lat_values, lon_values, speed_values, course_values, timestamp_values):
        flattened_data.append({
            'trajectory_id': traj_id, 
            'mmsi': mmsi,
            'shiptype': shiptype,
            'track_length': track_length,
            'lat': float(lat),
            'lon': float(lon),
            'speed': float(speed),
            'course': float(course),
            'timestamp': int(timestamp)
        })

In [18]:
df = pl.DataFrame(flattened_data)
df

trajectory_id,mmsi,shiptype,track_length,lat,lon,speed,course,timestamp
i64,i64,i64,i64,f64,f64,f64,f64,i64
1,0,36,32,55.146145,15.110447,0.0,0.0,1626681097
1,0,36,32,55.146145,15.110447,0.0,0.0,1626681279
1,0,36,32,55.146145,15.110447,0.0,0.0,1626681462
1,0,36,32,55.146145,15.110447,0.0,0.0,1626681646
1,0,36,32,55.146145,15.110447,0.0,0.0,1626682008
…,…,…,…,…,…,…,…,…
5000,210095000,60,100,54.556833,13.9145,7.459444,330.0,1636258696
5000,210095000,60,100,54.557515,13.913878,7.562333,330.0,1636258707
5000,210095000,60,100,54.564587,13.906743,7.562333,330.0,1636258827
5000,210095000,60,100,54.5835,13.8875,7.562333,330.0,1636259152


# Filter data based on total distance

In [19]:
def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)
    
    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    
    distance = R * c
    return distance

In [20]:
results = []

for trajectory_id in df["trajectory_id"].unique():
    # Filter the DataFrame for the current trajectory_id
    filtered_df = df.filter(df["trajectory_id"] == trajectory_id)
    
    # Get the first and last row for the trajectory using indexing
    first_row = filtered_df[0]  # First row
    last_row = filtered_df[-1]  # Last row
    
    # Extract lat/lon values
    lat1, lon1 = first_row["lat"][0], first_row["lon"][0]
    lat2, lon2 = last_row["lat"][0], last_row["lon"][0]
    
    # Calculate the distance
    distance = haversine(lat1, lon1, lat2, lon2)
    
    # Append the result
    results.append({"trajectory_id": trajectory_id, "distance_km": distance})

In [21]:
# Convert results to a Polars DataFrame
distance_df = pl.DataFrame(results)

# Show the result where distance_km > 10
filtered_df = distance_df.filter(distance_df["distance_km"] > 10)

print(filtered_df)

shape: (4_404, 2)
┌───────────────┬─────────────┐
│ trajectory_id ┆ distance_km │
│ ---           ┆ ---         │
│ i64           ┆ f64         │
╞═══════════════╪═════════════╡
│ 8             ┆ 53.761694   │
│ 9             ┆ 60.257592   │
│ 10            ┆ 92.405714   │
│ 11            ┆ 160.297479  │
│ 24            ┆ 13.890478   │
│ …             ┆ …           │
│ 4996          ┆ 12.027523   │
│ 4997          ┆ 13.779535   │
│ 4998          ┆ 72.611602   │
│ 4999          ┆ 109.323024  │
│ 5000          ┆ 11.660215   │
└───────────────┴─────────────┘


In [22]:
long_trajectories = df.join(filtered_df, on="trajectory_id", how="inner")
long_trajectories

trajectory_id,mmsi,shiptype,track_length,lat,lon,speed,course,timestamp,distance_km
i64,i64,i64,i64,f64,f64,f64,f64,i64,f64
8,0,36,557,55.508922,15.458938,3.446778,193.199997,1628675992,53.761694
8,0,36,557,55.504283,15.45617,3.446778,207.100006,1628676146,53.761694
8,0,36,557,55.503398,15.455532,3.652555,202.899994,1628676175,53.761694
8,0,36,557,55.500105,15.452538,3.241,215.699997,1628676297,53.761694
8,0,36,557,55.50008,15.452507,3.343889,216.800003,1628676297,53.761694
…,…,…,…,…,…,…,…,…,…
5000,210095000,60,100,54.556833,13.9145,7.459444,330.0,1636258696,11.660215
5000,210095000,60,100,54.557515,13.913878,7.562333,330.0,1636258707,11.660215
5000,210095000,60,100,54.564587,13.906743,7.562333,330.0,1636258827,11.660215
5000,210095000,60,100,54.5835,13.8875,7.562333,330.0,1636259152,11.660215


# Clean data

In [23]:
df_positions_for_export = long_trajectories.with_row_index("POSITION_ID")
df_positions_for_export = df_positions_for_export.rename({"trajectory_id": "POSITION_TRIP_ID", "mmsi": "VESSEL_ID", "shiptype": "SHIP_TYPE", "lat": "LATITUDE", "lon": "LONGITUDE"})
df_positions_for_export = df_positions_for_export.select(["POSITION_ID", "POSITION_TRIP_ID", "LATITUDE", "LONGITUDE"])
df_positions_for_export.write_csv("input_data/denmark_positions.csv")
df_positions_for_export

POSITION_ID,POSITION_TRIP_ID,LATITUDE,LONGITUDE
u32,i64,f64,f64
0,8,55.508922,15.458938
1,8,55.504283,15.45617
2,8,55.503398,15.455532
3,8,55.500105,15.452538
4,8,55.50008,15.452507
…,…,…,…
8533101,5000,54.556833,13.9145
8533102,5000,54.557515,13.913878
8533103,5000,54.564587,13.906743
8533104,5000,54.5835,13.8875


In [24]:
df_first = df_positions_for_export.unique(subset=["POSITION_TRIP_ID"], keep="first")
df_first = df_first.select(["POSITION_TRIP_ID"])
df_first.write_csv("input_data/denmark_trips.csv")
df_first

POSITION_TRIP_ID
i64
277
3421
3555
1325
2108
…
381
643
3138
1295
