In [2]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from sklearn.model_selection import train_test_split
from paths import PROCESSED_DATA_DIR, SPLITS_DIR

In [3]:
# Read the Parquet data found at PROCESSED_DATA_DIR and convert it to a pandas DataFrame.
processed_table = pq.read_table(PROCESSED_DATA_DIR)
df = processed_table.to_pandas()
# Release the intermediate Arrow table; only the pandas frame is used from here on.
del processed_table

In [4]:
# Build a small frame with one row per unique (MMSI, Segment) pair.
# The split below is done on these pairs rather than on individual rows of df.
segment_keys = ['MMSI', 'Segment']
segments = df.loc[:, segment_keys].drop_duplicates()

# Options shared by both splits (fixed seed so the split is repeatable).
split_options = {'random_state': 42, 'shuffle': True}

# First split: 70% of the segments go to train, the remaining 30% are held out.
train_segments, temp_segments = train_test_split(segments, test_size=0.3, **split_options)

# Second split: halve the held-out part into validation and test (15% of the total each).
val_segments, test_segments = train_test_split(temp_segments, test_size=0.5, **split_options)

In [5]:
# Give every segment in each split an integer `segment_id`, numbered from 0 within that split.
# `assign` returns a new frame, so the frames returned by train_test_split are not
# written to in place (in-place column assignment on them can raise
# SettingWithCopyWarning, depending on the pandas / scikit-learn versions in use).
train_segments = train_segments.assign(segment_id=range(len(train_segments)))
val_segments = val_segments.assign(segment_id=range(len(val_segments)))
test_segments = test_segments.assign(segment_id=range(len(test_segments)))

In [6]:
# Attach each split's segment_id to the full rows by joining on (MMSI, Segment).
# The default inner join keeps only the rows whose segment belongs to that split.
merge_keys = ['MMSI', 'Segment']
segment_columns = merge_keys + ['segment_id']

train_df = pd.merge(df, train_segments[segment_columns], on=merge_keys)
val_df = pd.merge(df, val_segments[segment_columns], on=merge_keys)
test_df = pd.merge(df, test_segments[segment_columns], on=merge_keys)

In [7]:
# Report the number of rows in each split.
for length_label, split_frame in (
    ("Train dataframe length:", train_df),
    ("Validation dataframe length:", val_df),
    ("Test dataframe length:", test_df),
):
    print(length_label, len(split_frame))

Train dataframe length: 52135706
Validation dataframe length: 10771374
Test dataframe length: 11205602


In [None]:
# Preprocessing: remove every segment that has a NaN in COG or SOG from train_df, val_df and test_df.
def _drop_segments_with_missing(frame, columns=('COG', 'SOG')):
    """Return a copy of ``frame`` without the segments that have NaN in any of ``columns``.

    A segment is identified by its ``segment_id``. If at least one row of a
    segment has a missing value in one of ``columns``, all rows of that
    segment are removed, not just the affected rows.
    """
    has_missing = frame[list(columns)].isna().any(axis=1)
    bad_segments = frame.loc[has_missing, 'segment_id'].unique()
    return frame[~frame['segment_id'].isin(bad_segments)].copy()


# Rebind each split explicitly instead of looping over globals() with a variable
# named `df`: that name holds the full dataset used by the split and merge cells,
# and overwriting it would make re-running those cells use the wrong data.
train_df = _drop_segments_with_missing(train_df)
val_df = _drop_segments_with_missing(val_df)
test_df = _drop_segments_with_missing(test_df)

In [9]:
# The Segment column is removed from every split; segment_id identifies the segment from here on.
train_df = train_df.drop(labels='Segment', axis='columns')
val_df = val_df.drop(labels='Segment', axis='columns')
test_df = test_df.drop(labels='Segment', axis='columns')

In [10]:
# Order the rows of every split by segment first, then by Timestamp within each segment.
sort_keys = ['segment_id', 'Timestamp']

train_df = train_df.sort_values(by=sort_keys)
val_df = val_df.sort_values(by=sort_keys)
test_df = test_df.sort_values(by=sort_keys)

In [11]:
#After the removal of bad segments, update segment ids to be consecutive
def update_segment_ids(df):
    """Renumber ``segment_id`` so the ids are consecutive integers starting at 0.

    New ids are handed out in the order in which the old ids first appear in
    ``df``. The ``segment_id`` column of ``df`` is overwritten in place, and the
    same frame is returned.
    """
    old_ids = df['segment_id'].unique()
    # Pair every old id with its position in order of first appearance.
    renumbering = dict(zip(old_ids, range(len(old_ids))))
    df['segment_id'] = df['segment_id'].map(renumbering)
    return df
# Renumber the segment ids of all three splits.
train_df, val_df, test_df = map(update_segment_ids, (train_df, val_df, test_df))

In [12]:
# Sanity checks on the final splits; each group of lines is printed for train, validation, test in turn.
labelled_splits = (("Train", train_df), ("Validation", val_df), ("Test", test_df))

# The first ten distinct segment ids of each split
for split_label, split_frame in labelled_splits:
    print(f"{split_label} segment IDs:", split_frame['segment_id'].unique()[:10])

# Row count of each split
for split_label, split_frame in labelled_splits:
    print(f"{split_label} dataframe length:", len(split_frame))

# Smallest and largest segment id of each split
for split_label, split_frame in labelled_splits:
    segment_ids = split_frame['segment_id']
    print(f"{split_label} segment ID range:", segment_ids.min(), "-", segment_ids.max())

# Preview and summary statistics of the training split
print(train_df.head())
print(train_df.describe())

Train segment IDs: [0 1 2 3 4 5 6 7 8 9]
Validation segment IDs: [0 1 2 3 4 5 6 7 8 9]
Test segment IDs: [0 1 2 3 4 5 6 7 8 9]
Train dataframe length: 39850008
Validation dataframe length: 8337407
Test dataframe length: 8522851
Train segment ID range: 0 - 59517
Validation segment ID range: 0 - 12846
Test segment ID range: 0 - 12701
                   Timestamp   Latitude  Longitude       SOG   COG       MMSI  \
35548948 2024-03-18 00:04:59  56.258310   7.193732  6.121884  38.6  235437000   
35548949 2024-03-18 00:05:02  56.258477   7.194003  6.121884  41.4  235437000   
35548950 2024-03-18 00:13:34  56.280622   7.227172  6.173328  40.8  235437000   
35548951 2024-03-18 00:23:54  56.307090   7.267263  6.121884  42.5  235437000   
35548952 2024-03-18 00:25:50  56.312018   7.274668  6.173328  41.0  235437000   

          segment_id  
35548948           0  
35548949           0  
35548950           0  
35548951           0  
35548952           0  
                           Timestamp     

In [13]:
# Write each split to its own Parquet file under SPLITS_DIR, without the DataFrame index.
for split_name, split_frame in (("train", train_df), ("val", val_df), ("test", test_df)):
    split_frame.to_parquet(f"{SPLITS_DIR}/{split_name}.parquet", index=False)