# shortNotebook for RF

In [1]:
import pandas as pd
import numpy as np
from geopy.distance import geodesic
from datetime import timedelta  # Added import for timedelta

from joblib import Parallel, delayed

In [2]:
# Define the path to your file in the bucket
file_path = '../../original_data/ais_train.csv'

# Load the file into a pandas dataframe
ais_train_df = pd.read_csv(file_path, delimiter= '|', encoding= 'utf-8')

# Display the dataframe
ais_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522065 entries, 0 to 1522064
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   time       1522065 non-null  object 
 1   cog        1522065 non-null  float64
 2   sog        1522065 non-null  float64
 3   rot        1522065 non-null  int64  
 4   heading    1522065 non-null  int64  
 5   navstat    1522065 non-null  int64  
 6   etaRaw     1522065 non-null  object 
 7   latitude   1522065 non-null  float64
 8   longitude  1522065 non-null  float64
 9   vesselId   1522065 non-null  object 
 10  portId     1520450 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 127.7+ MB


In [3]:
ais_test_df = pd.read_csv('../../original_data/ais_test.csv')
ais_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51739 entries, 0 to 51738
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              51739 non-null  int64  
 1   vesselId        51739 non-null  object 
 2   time            51739 non-null  object 
 3   scaling_factor  51739 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.6+ MB


In [4]:
def preprocess_ais_train(ais_train_df):
    """
    Preprocess the ais_train_df by converting columns, handling missing or invalid values, 
    merging port information, and mapping NAVSTAT codes to descriptions.

    Parameters:
    - ais_train_df: DataFrame containing the raw AIS train data.
    - ports_df: DataFrame containing port information with portId, latitude, and longitude.

    Returns:
    - ais_train_df_cleaned: A cleaned and preprocessed version of ais_train_df.
    """
    # Step 1: Convert 'time' to datetime and drop 'etaRaw'
    ais_train_df['time'] = pd.to_datetime(ais_train_df['time'], format='%Y-%m-%d %H:%M:%S')
    ais_train_df.drop('etaRaw', axis=1, inplace=True)

    # Step 4: Convert relevant columns to float
    ais_train_df['cog'] = ais_train_df['cog'].astype(float)
    ais_train_df['sog'] = ais_train_df['sog'].astype(float)
    ais_train_df['rot'] = ais_train_df['rot'].astype(float)
    ais_train_df['heading'] = ais_train_df['heading'].astype(float)
    ais_train_df['latitude'] = ais_train_df['latitude'].astype(float)
    ais_train_df['longitude'] = ais_train_df['longitude'].astype(float)
    
    # Step 5: Replace invalid or default values with NaN
    ais_train_df['cog'] = np.where((ais_train_df['cog'] == 360) | (ais_train_df['cog'] > 360) | (ais_train_df['cog'] < 0), np.nan, ais_train_df['cog'])
    ais_train_df['sog'] = np.where((ais_train_df['sog'] == 1023) | (ais_train_df['sog'] < 0), np.nan, ais_train_df['sog'])
    ais_train_df['heading'] = np.where((ais_train_df['heading'] > 360) | (ais_train_df['heading'] == 511) | (ais_train_df['heading'] < 0), np.nan, ais_train_df['heading'])
    
    # Step 6: Drop 'rot' as it is difficult to find outliers and thus is bring more noise than information
    ais_train_df.drop(columns=['rot'], inplace=True)

    # Step 7: Sort by vesselId and time
    ais_train_df = ais_train_df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

    return ais_train_df

baseDataset = preprocess_ais_train(ais_train_df)

baseDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522065 entries, 0 to 1522064
Data columns (total 9 columns):
 #   Column     Non-Null Count    Dtype         
---  ------     --------------    -----         
 0   time       1522065 non-null  datetime64[ns]
 1   cog        1516207 non-null  float64       
 2   sog        1522065 non-null  float64       
 3   heading    1517169 non-null  float64       
 4   navstat    1522065 non-null  int64         
 5   latitude   1522065 non-null  float64       
 6   longitude  1522065 non-null  float64       
 7   vesselId   1522065 non-null  object        
 8   portId     1520450 non-null  object        
dtypes: datetime64[ns](1), float64(5), int64(1), object(2)
memory usage: 104.5+ MB


# Outlier detection and removal

In [5]:
# Get the unique vesselIds from the test set
vessel_ids_test = set(ais_test_df['vesselId'].unique())

# Get the count of records per vesselId in the training set
vessel_record_counts = ais_train_df['vesselId'].value_counts()

# Get the 10 vessels with the lowest number of records
lowest_record_vessels = vessel_record_counts.nsmallest(20)

# Check if these vessels are in the test set
vessels_in_test = lowest_record_vessels.index.isin(vessel_ids_test)

# Combine the results into a dataframe for easy viewing
vessels_with_low_records = pd.DataFrame({
    'vesselId': lowest_record_vessels.index,
    'record_count': lowest_record_vessels.values,
    'in_test_set': vessels_in_test
})

# Display the result
print(vessels_with_low_records)

                    vesselId  record_count  in_test_set
0   61e9f3cbb937134a3c4bff09             1        False
1   61e9f3adb937134a3c4bfe37            31        False
2   61e9f3c6b937134a3c4bfed5           160        False
3   61e9f42cb937134a3c4c00f9           191        False
4   61e9f45cb937134a3c4c022b           196        False
5   61e9f39ab937134a3c4bfdb9           197        False
6   61e9f45eb937134a3c4c0235           250        False
7   61e9f3bcb937134a3c4bfe91           328         True
8   61e9f418b937134a3c4c0077           332        False
9   61e9f408b937134a3c4c0023           355        False
10  61e9f460b937134a3c4c0243           361        False
11  61e9f3f7b937134a3c4bffc5           373        False
12  61e9f409b937134a3c4c0027           391        False
13  620bf33a718775aca4a81900           401        False
14  61e9f423b937134a3c4c00c7           402        False
15  61e9f38eb937134a3c4bfd8b           402        False
16  61e9f456b937134a3c4c0203           408      

In [6]:
# List of vessel IDs to remove
vessels_to_remove = ['61e9f3cbb937134a3c4bff09', '61e9f3adb937134a3c4bfe37']

# Remove vessels from the dataset
baseDataset = baseDataset[~ais_train_df['vesselId'].isin(vessels_to_remove)]

In [7]:
def filter_short_intervals(data, min_time_diff_minutes=2):
    """
    Filters out records with time differences less than the specified threshold.
    
    Parameters:
    - data (pd.DataFrame): DataFrame containing 'vesselId' and 'time' columns.
    - min_time_diff_minutes (int): Minimum time difference in minutes to keep records.
    
    Returns:
    - pd.DataFrame: Filtered DataFrame with records having time differences >= min_time_diff_minutes.
    """
    # Ensure time column is in datetime format
    data['time'] = pd.to_datetime(data['time'])
    
    # Sort by vesselId and time to ensure proper order
    data = data.sort_values(['vesselId', 'time']).reset_index(drop=True)

    # Calculate time differences in minutes
    data['time_diff'] = data.groupby('vesselId')['time'].diff().dt.total_seconds() / 60  # in minutes

    # Filter out records with time differences less than the specified threshold
    filtered_data = data[(data['time_diff'].isna()) | (data['time_diff'] >= min_time_diff_minutes)].copy()

    # Drop the time_diff column after filtering
    filtered_data = filtered_data.drop(columns=['time_diff']).reset_index(drop=True)
    
    return filtered_data

baseDataset = filter_short_intervals(baseDataset)

In [8]:
def haversine(lat1, lon1, lat2, lon2):
    # Haversine formula to calculate the great-circle distance between two points
    R = 3440.065  # Radius of Earth in nautical miles
    lat1_rad = np.radians(lat1.astype(float))
    lon1_rad = np.radians(lon1.astype(float))
    lat2_rad = np.radians(lat2.astype(float))
    lon2_rad = np.radians(lon2.astype(float))
    
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c

def heuristic_clean(data, vthreshold=30):
    """
    Cleans the trajectory data by removing points with unrealistic speeds.
    
    :param data: DataFrame containing raw trajectory data
    :param vthreshold: Speed threshold in knots
    :return: Cleaned trajectory DataFrame
    """
    # Drop duplicates and reset index
    data = data.drop_duplicates().reset_index(drop=True)
    
    # Sort by vessel ID and time to ensure correct order
    data = data.sort_values(by=['vesselId', 'time']).reset_index(drop=True)
    
    # Shift the columns to align with the previous record within each vessel
    data['prev_latitude'] = data.groupby('vesselId')['latitude'].shift(1)
    data['prev_longitude'] = data.groupby('vesselId')['longitude'].shift(1)
    data['prev_time'] = data.groupby('vesselId')['time'].shift(1)
    
    # Calculate time differences in hours
    data['delta_time'] = (data['time'] - data['prev_time']).dt.total_seconds() / 3600.0  # in hours
    
    # Calculate distances using the haversine function where we have valid previous points
    valid_rows = data['prev_latitude'].notna() & data['prev_longitude'].notna()
    data.loc[valid_rows, 'delta_distance'] = haversine(
        data.loc[valid_rows, 'prev_latitude'].to_numpy(),
        data.loc[valid_rows, 'prev_longitude'].to_numpy(),
        data.loc[valid_rows, 'latitude'].to_numpy(),
        data.loc[valid_rows, 'longitude'].to_numpy()
    )
    
    # Calculate speed (distance/time) and filter by threshold
    data['speed'] = data['delta_distance'] / data['delta_time']
    
    # Filter out rows where speed exceeds the threshold
    cleaned_data = data[(data['speed'] <= vthreshold) | data['speed'].isna()].copy()
    
    # Drop intermediate calculation columns
    cleaned_data = cleaned_data.drop(columns=['prev_latitude', 'prev_longitude', 'prev_time', 
                                              'delta_time', 'delta_distance', 'speed'])
    
    return cleaned_data.reset_index(drop=True)

baseDataset = heuristic_clean(baseDataset)

# Historical Movement Feature Engineering

In [9]:
import numpy as np
import pandas as pd

def create_historical_movement_features(df):
    """
    Calculates historical movement features for vessel data:
    delta_latitude, delta_longitude, time_diff, movement_intensity, and directional_stability.
    
    Parameters:
    df (pd.DataFrame): DataFrame containing sorted vessel data with columns:
                       'vesselId', 'time', 'latitude', and 'longitude'.
    
    Returns:
    pd.DataFrame: DataFrame with new features added.
    """
    # Calculate deltas in latitude, longitude, and time
    df['delta_latitude'] = df.groupby('vesselId')['latitude'].diff()
    df['delta_longitude'] = df.groupby('vesselId')['longitude'].diff()
    df['time_diff'] = df.groupby('vesselId')['time'].diff().dt.total_seconds() / 3600  # Time difference in hours

    # Calculate speed as distance over time
    df['speed'] = np.sqrt(df['delta_latitude']**2 + df['delta_longitude']**2) / df['time_diff']

    # Calculate movement intensity as a 5-step exponential moving average of speed
    df['movement_intensity'] = df.groupby('vesselId')['speed'].transform(lambda x: x.ewm(span=5, adjust=False).mean())

    # Calculate bearing as the direction of movement
    df['bearing'] = np.arctan2(df['delta_longitude'], df['delta_latitude'])

    # Calculate directional stability as the 3-step rolling standard deviation of bearing
    df['directional_stability'] = df.groupby('vesselId')['bearing'].transform(lambda x: x.rolling(window=3).std())

    # Drop intermediate columns if not needed
    df.drop(columns=['speed', 'bearing'], inplace=True)

    # Replace NaN values that result from the first differences with 0 (or other logic if preferred)
    df[['delta_latitude', 'delta_longitude', 'time_diff', 'movement_intensity', 'directional_stability']] = \
        df[['delta_latitude', 'delta_longitude', 'time_diff', 'movement_intensity', 'directional_stability']].fillna(0)

    return df


baseDataset = create_historical_movement_features(baseDataset)

In [10]:
baseDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1518801 entries, 0 to 1518800
Data columns (total 14 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   time                   1518801 non-null  datetime64[ns]
 1   cog                    1512956 non-null  float64       
 2   sog                    1518801 non-null  float64       
 3   heading                1513911 non-null  float64       
 4   navstat                1518801 non-null  int64         
 5   latitude               1518801 non-null  float64       
 6   longitude              1518801 non-null  float64       
 7   vesselId               1518801 non-null  object        
 8   portId                 1517195 non-null  object        
 9   delta_latitude         1518801 non-null  float64       
 10  delta_longitude        1518801 non-null  float64       
 11  time_diff              1518801 non-null  float64       
 12  movement_intensity     15188

# We now remove features that have show little prediction power

In [11]:
def drop_unnecessary_columns(df):
    """
    Drops the columns 'sog', 'cog', 'heading', 'navstat', and 'portId' from the DataFrame, if they exist.
    
    Parameters:
    df (pd.DataFrame): The DataFrame from which to drop the columns.
    
    Returns:
    pd.DataFrame: The DataFrame with specified columns removed.
    """
    columns_to_drop = ['sog', 'cog', 'heading', 'navstat', 'portId']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
    return df

baseDataset = drop_unnecessary_columns(baseDataset)

In [12]:
baseDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1518801 entries, 0 to 1518800
Data columns (total 9 columns):
 #   Column                 Non-Null Count    Dtype         
---  ------                 --------------    -----         
 0   time                   1518801 non-null  datetime64[ns]
 1   latitude               1518801 non-null  float64       
 2   longitude              1518801 non-null  float64       
 3   vesselId               1518801 non-null  object        
 4   delta_latitude         1518801 non-null  float64       
 5   delta_longitude        1518801 non-null  float64       
 6   time_diff              1518801 non-null  float64       
 7   movement_intensity     1518801 non-null  float64       
 8   directional_stability  1518801 non-null  float64       
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 104.3+ MB


# We now create the training dataset

In [13]:
# We crete the final features needed for our basedataset
def features_gen(df):
    df['y_lat'] = df['latitude']
    df['y_lon'] = df['longitude']
    return df

baseDataset = features_gen(baseDataset)

In [14]:
def process_vessel_data(vessel_data, batch_size_days, batch_segment):
    # Sort the vessel data by time
    vessel_data = vessel_data.sort_values('time').reset_index(drop=True)
    
    # Check for empty vessel data
    if vessel_data.empty:
        print("Vessel data is empty; skipping processing.")
        return pd.DataFrame()
    
    # Calculate batch interval to control overlap
    batch_interval = batch_size_days / batch_segment
    t_min = vessel_data['time'].min()
    t_max = vessel_data['time'].max()

    # Check if t_min and t_max are valid
    if t_min is pd.NaT or t_max is pd.NaT:
        print(f"Invalid timestamps for vesselId {vessel_data['vesselId'].iloc[0]}; skipping vessel.")
        return pd.DataFrame()

    # Generate batch start times
    batch_starts = pd.date_range(start=t_min, end=t_max - timedelta(days=batch_size_days), freq=f'{batch_interval}D')

    # Check if batch_starts is empty
    if batch_starts.empty:
        return pd.DataFrame()  # Skip this vessel

    # Prepare a list to hold batch data
    batches = []
    
    # Define features to overwrite outside the loop
    features_to_overwrite = vessel_data.columns.difference(['time', 'y_lat', 'y_lon', 'train'])

    for batch_start in batch_starts:
        batch_end = batch_start + timedelta(days=batch_size_days)
        batch_data = vessel_data[(vessel_data['time'] >= batch_start) & (vessel_data['time'] <= batch_end)].copy()

        # Skip if this batch window contains insufficient data points
        if batch_data.empty or len(batch_data) < 2:  # Require at least two points to define a movement
            continue
        
        # Process the batch as usual
        base_row = batch_data.iloc[0]
        batch_data['train'] = 0
        batch_data.at[batch_data.index[0], 'train'] = 1

        # Overwrite features using base row values for all rows
        base_values = base_row[features_to_overwrite].to_dict()
        batch_data.loc[:, features_to_overwrite] = batch_data.apply(lambda row: pd.Series(base_values), axis=1)

        # Calculate 'time_elapsed' in hours
        batch_data['time_elapsed'] = (batch_data['time'] - base_row['time']).dt.total_seconds() / 3600

        # Collect batch data
        batches.append(batch_data)

    # Handle the remaining data dynamically, even if it's shorter than batch_size_days
    remaining_data = vessel_data[vessel_data['time'] > batch_starts[-1]].copy()
    if not remaining_data.empty and len(remaining_data) >= 2:  # Require at least two points
        # Use the first row as the base point
        base_row = remaining_data.iloc[0]
        remaining_data['train'] = 0
        remaining_data.at[remaining_data.index[0], 'train'] = 1

        # Overwrite features using base row values for all rows
        base_values = base_row[features_to_overwrite].to_dict()
        remaining_data.loc[:, features_to_overwrite] = remaining_data.apply(lambda row: pd.Series(base_values), axis=1)

        # Calculate 'time_elapsed' in hours for remaining data
        remaining_data['time_elapsed'] = (remaining_data['time'] - base_row['time']).dt.total_seconds() / 3600

        # Add the final batch with remaining data
        batches.append(remaining_data)

    # Concatenate all batches for this vessel
    return pd.concat(batches, ignore_index=True) if batches else pd.DataFrame()


def create_batches(base_dataset, validation_split_ratio=0.1, batch_size_days=5, batch_segment=1, n_jobs=8):
    # Step 1: Sort by time for initial splitting
    base_dataset = base_dataset.sort_values('time').reset_index(drop=True)

    # Step 2: Determine the split index based on time for the entire dataset
    split_index = int(len(base_dataset) * (1 - validation_split_ratio))
    train_dataset = base_dataset.iloc[:split_index]
    val_dataset = base_dataset.iloc[split_index:]

    # Step 3: Re-sort by vesselId and time within each subset
    train_dataset = train_dataset.sort_values(['vesselId', 'time']).reset_index(drop=True)
    val_dataset = val_dataset.sort_values(['vesselId', 'time']).reset_index(drop=True)

    # Define a wrapper to apply process_vessel_data
    def process_group(vessel_data):
        return process_vessel_data(vessel_data, batch_size_days, batch_segment)

    # Step 4: Process each subset in parallel using joblib
    train_batches = Parallel(n_jobs=n_jobs)(
        delayed(process_group)(vessel_data) for _, vessel_data in train_dataset.groupby('vesselId')
    )
    val_batches = Parallel(n_jobs=n_jobs)(
        delayed(process_group)(vessel_data) for _, vessel_data in val_dataset.groupby('vesselId')
    )

    # Step 5: Concatenate results
    train_batches = pd.concat(train_batches, ignore_index=True) if train_batches else pd.DataFrame()
    val_batches = pd.concat(val_batches, ignore_index=True) if val_batches else pd.DataFrame()

    return train_batches, val_batches


In [15]:
# Parameters for batch creation
batch_size_days = 5            # Size of each batch in days
batch_segment = 3              # Number of segments per batch (controls overlap)
validation_split_ratio = 0.1   # 10% of each vessel's data for validation

# Run the batch creation for training and validation
trainset, valset = create_batches(baseDataset, validation_split_ratio, batch_size_days, batch_segment)

# Training Pipeline

In [16]:
# Drop features which make generalization more difficult and only add noise
def dropNoiseColumns(df):
    # Drop columns that add noise or are no longer needed
    #df = df.drop(columns=['vesselId', 'time', 'eta_port'])
    df = df[df['train'] != 1]
    df = df.drop(columns=['vesselId', 'time', 'train'])
    return df

trainset = dropNoiseColumns(trainset)
valset = dropNoiseColumns(valset)

#### Remove unused dataframes from pipeline

In [17]:
# Find all DataFrames in memory
dataframes_in_memory = {name: obj for name, obj in globals().items() if isinstance(obj, pd.DataFrame)}

# Print the names and memory usage of each DataFrame
for name, df in dataframes_in_memory.items():
    print(f"{name}: {df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB")

ais_train_df: 316.40 MB
ais_test_df: 8.54 MB
baseDataset: 233.23 MB
vessels_with_low_records: 0.00 MB
trainset: 329.63 MB
valset: 26.46 MB


In [18]:
import gc

del ais_train_df 

# Run garbage collection to free up memory
gc.collect()

26

## Random forest training

### Custom metric

In [19]:
def calculate_scaling_factor(time_elapsed):
    """Calculates the scaling factor based on time_elapsed in hours."""
    if time_elapsed < 24:
        return 0.3
    elif 24 <= time_elapsed < 48:
        return 0.25
    elif 48 <= time_elapsed < 72:
        return 0.2
    elif 72 <= time_elapsed < 96:
        return 0.15
    elif 96 <= time_elapsed < 120:
        return 0.1
    else:
        return 0.1
    
import pandas as pd
import numpy as np
from geopy.distance import geodesic

def calculate_scaling_factor(time_elapsed):
    """Calculates the scaling factor based on time_elapsed in hours."""
    if time_elapsed < 24:
        return 0.3
    elif 24 <= time_elapsed < 48:
        return 0.25
    elif 48 <= time_elapsed < 72:
        return 0.2
    elif 72 <= time_elapsed < 96:
        return 0.15
    elif 96 <= time_elapsed < 120:
        return 0.1
    else:
        return 0.1

def calculate_distance(row):
    """Calculates the weighted distance between the actual and predicted lat/long points."""
    if pd.isna(row['y_lat']) or pd.isna(row['latitude_predicted']):
        return np.nan

    # Calculate the geodesic distance in meters
    distance = geodesic(
        (row['y_lat'], row['y_lon']),
        (row['latitude_predicted'], row['longitude_predicted'])
    ).meters

    # Determine scaling factor based on time_elapsed
    scaling_factor = calculate_scaling_factor(row['time_elapsed'])
    
    # Calculate weighted distance
    weighted_distance = distance * scaling_factor
    return weighted_distance

def score(solution: pd.DataFrame, submission: pd.DataFrame) -> float:
    """
    Calculates the weighted distance between the actual and predicted latitude/longitude points.
    """
    # Ensure necessary columns are present
    required_columns = ["longitude_predicted", "latitude_predicted", "time_elapsed"]
    if not all(col in submission.columns for col in required_columns):
        raise ValueError(f'Submission must contain columns: {required_columns}')

    # Merge the predictions with the ground truth data (assuming aligned indexes)
    solution_submission = solution.join(
        submission[['longitude_predicted', 'latitude_predicted', 'time_elapsed']]
    )

    # Calculate weighted distance for each row
    solution_submission['weighted_distance'] = solution_submission.apply(calculate_distance, axis=1)
    
    # Calculate and return the mean weighted distance in kilometers
    weighted_distance = solution_submission['weighted_distance'].mean() / 1000.0
    return weighted_distance


### Hyper parameter tuning

In [20]:
# Retrieve the best hyperparameters from the Optuna study
best_params = study.best_params
print("Best parameters:", best_params)
print("Best score:", study.best_value)

# Train the final model using the best hyperparameters
final_model = MultiOutputRegressor(
    RandomForestRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        max_features=best_params['max_features'],
        random_state=42,
        n_jobs=-1
    )
)

# Train on the entire training set, including 'time_elapsed'
X_train = trainset.drop(columns=['y_lat', 'y_lon'])
y_train = trainset[['y_lat', 'y_lon']]

# Fit the final model on the full training set
final_model.fit(X_train, y_train)

# Evaluate on the validation set with 'time_elapsed' included
X_val = valset.drop(columns=['y_lat', 'y_lon'])
y_val = valset[['y_lat', 'y_lon']]
submission = pd.DataFrame(final_model.predict(X_val), columns=['latitude_predicted', 'longitude_predicted'])
submission['time_elapsed'] = X_val['time_elapsed'].values

# Calculate the final score using the custom metric
final_score = score(y_val, submission)
print("Final validation score:", final_score)


NameError: name 'study' is not defined

In [None]:
testset = ais_test_df.copy()

def configure_testset(base_df, test_df, train_df):
    # Sort by 'vesselId' and 'time'
    test_df = test_df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

    # Drop the scaling factor column
    test_df = test_df.drop(columns=['scaling_factor'])

    # Identify the last known position of each vessel from training data
    last_known_positions = base_df.groupby('vesselId').tail(1).copy()
    last_known_positions['train'] = 1  # Set the 'train' flag to 1 for these rows

    # Prepare an empty list to collect updated test batches
    updated_test_set = []

    # Process each vessel in the test set individually
    for vessel_id in test_df['vesselId'].unique():
        # Select the test batch for this vessel and make a deep copy to avoid SettingWithCopyWarning
        vessel_test_batch = test_df[test_df['vesselId'] == vessel_id].copy()

        # Select the last known position from training for this vessel
        init_row = last_known_positions[last_known_positions['vesselId'] == vessel_id]
        
        # Concatenate the initial row to the start of this vessel's test batch
        vessel_test_batch = pd.concat([init_row, vessel_test_batch], ignore_index=True)
        
        # Set 'train' flag to 0 for all subsequent rows in the test batch
        vessel_test_batch.loc[1:, 'train'] = 0  # Use .loc to avoid chained assignment

        # Add the modified batch to the list
        updated_test_set.append(vessel_test_batch)

    # Concatenate all modified batches into a single test dataframe
    updated_test_df = pd.concat(updated_test_set, ignore_index=True)

    #updated_test_df = updated_test_df.drop(columns=['y_lat', 'y_lon', 'eta_port'])
    updated_test_df = updated_test_df.drop(columns=['y_lat', 'y_lon'])
    return updated_test_df


testset = configure_testset(baseDataset, testset, trainset)

In [None]:
def fillTestSet(testset, trainset):
    """
    Fills the testset by propagating the initial 'train' row values for each vessel and creating time_elapsed.

    Parameters:
    - testset (pd.DataFrame): The test dataframe, which includes columns for vesselId, time, and train indicator.

    Returns:
    - pd.DataFrame: Processed test dataframe with filled values, time_elapsed column, and removed unnecessary columns.
    """
    testset['time'] = pd.to_datetime(testset['time'])

    # Initialize a list to store filled data for each vessel
    filled_test_batches = []

    # Process each vessel individually
    for vessel_id in testset['vesselId'].unique():
        # Select data for the current vessel
        vessel_data = testset[testset['vesselId'] == vessel_id].copy()

        # Identify the first row where train = 1 to use as reference
        initial_row = vessel_data[vessel_data['train'] == 1].iloc[0]

        # Fill all rows with values from the initial row, except for 'time' and 'train'
        for col in vessel_data.columns:
            if col not in ['time', 'train', 'ID']:
                vessel_data.loc[vessel_data['train'] == 0, col] = initial_row[col]

        # Create the time_elapsed column in hours
        initial_time = initial_row['time']
        vessel_data['time_elapsed'] = (vessel_data['time'] - initial_time).dt.total_seconds() / 3600

        # Append the filled vessel data to the list
        filled_test_batches.append(vessel_data)

    # Concatenate all filled batches into a single DataFrame
    filled_testset = pd.concat(filled_test_batches, ignore_index=True)

    filled_testset = filled_testset[filled_testset['train'] == 0].reset_index(drop=True)
    
    filled_testset = filled_testset.sort_values(by=['ID']).reset_index(drop=True)

    test_ids = filled_testset['ID'].copy()

    # Drop 'time' and 'vesselId' columns, and re-order columns to match the training set
    filled_testset = filled_testset.drop(columns=['time', 'vesselId', 'ID', 'train'])
    training_order = [col for col in trainset.columns if col not in ['y_lat', 'y_lon']]
    filled_testset = filled_testset[training_order]

    return filled_testset, test_ids

testset, test_ids = fillTestSet(testset, trainset)

In [None]:
def predict_and_append_coordinates(predictor, testset):
    """
    Predicts longitude and latitude for the test set using a MultiOutputRegressor model
    and appends them as new columns.

    Parameters:
    - predictor (MultiOutputRegressor): The trained model for predicting both latitude and longitude.
    - testset (pd.DataFrame): The test set including all features required for prediction.

    Returns:
    - pd.DataFrame: The test set DataFrame with appended 'latitude_predicted' and 'longitude_predicted' columns.
    """
    # Predict both latitude and longitude on the test set
    predictions = predictor.predict(testset)

    # Extract predictions for latitude and longitude
    latitude_predictions = predictions[:, 0]
    longitude_predictions = predictions[:, 1]

    # Append predictions to the test set DataFrame as new columns
    testset_with_predictions = testset.copy()
    testset_with_predictions['latitude_predicted'] = latitude_predictions
    testset_with_predictions['longitude_predicted'] = longitude_predictions

    return testset_with_predictions

# Example usage:
testset_with_predictions = predict_and_append_coordinates(final_model, testset)
testset_with_predictions.head()  # To verify the appended predictions


In [None]:
def predict_and_prepare_submission(predictor, testset, test_ids, submission_path='submission.csv'):
    """
    Predicts latitude and longitude for the test set using a MultiOutputRegressor model, 
    merges the predictions with test IDs, and creates a submission file in the required Kaggle format.

    Parameters:
    - predictor (MultiOutputRegressor): The trained model for predicting both latitude and longitude.
    - testset (pd.DataFrame): The test set including all features required for prediction.
    - test_ids (pd.Series or list): The IDs for each entry in the test set.
    - submission_path (str): The path to save the submission CSV file.

    Returns:
    - pd.DataFrame: The final submission dataframe.
    """
    # Step 1: Predict both latitude and longitude on the test set
    predictions = predictor.predict(testset)

    # Step 2: Extract latitude and longitude predictions
    latitude_predictions = predictions[:, 0]
    longitude_predictions = predictions[:, 1]

    # Step 3: Combine predictions with test IDs into a single DataFrame
    submission_df = pd.DataFrame({
        'ID': test_ids,
        'longitude_predicted': longitude_predictions,
        'latitude_predicted': latitude_predictions
    })

    # Step 4: Save to CSV with the correct column order
    submission_df.to_csv(submission_path, index=False, columns=['ID', 'longitude_predicted', 'latitude_predicted'])
    print(f"Submission file saved to {submission_path}")

    return submission_dfshortNotebookRF

# Example usage:
submission_df = predict_and_prepare_submission(final_model, testset, test_ids, '../../submissions/submissionRF.csv')
submission_df.head()  # To verify the structure of the submission file
