# Final notebook 

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KDTree

In [2]:
# Define the path to your file in the bucket
file_path = '../../original_data/ais_train.csv'

# Load the file into a pandas dataframe
baseDataset = pd.read_csv(file_path, delimiter= '|', encoding= 'utf-8')

# Display the dataframe
baseDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522065 entries, 0 to 1522064
Data columns (total 11 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   time       1522065 non-null  object 
 1   cog        1522065 non-null  float64
 2   sog        1522065 non-null  float64
 3   rot        1522065 non-null  int64  
 4   heading    1522065 non-null  int64  
 5   navstat    1522065 non-null  int64  
 6   etaRaw     1522065 non-null  object 
 7   latitude   1522065 non-null  float64
 8   longitude  1522065 non-null  float64
 9   vesselId   1522065 non-null  object 
 10  portId     1520450 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 127.7+ MB


In [3]:
testset = pd.read_csv('../../original_data/ais_test.csv')
testset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51739 entries, 0 to 51738
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              51739 non-null  int64  
 1   vesselId        51739 non-null  object 
 2   time            51739 non-null  object 
 3   scaling_factor  51739 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 1.6+ MB


### Initial pre-processing of training set

In [4]:
def preprocess_ais_train(df):

    # Step 1: Convert 'time' to datetime and drop useless columns
    df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S')
    df.drop('etaRaw', axis=1, inplace=True)

    # Step 2: Sort by vesselId and time
    df = df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)
    

    # Step 3: Convert relevant columns to float
    df['cog'] = df['cog'].astype(float)
    df['sog'] = df['sog'].astype(float)
    df['rot'] = df['rot'].astype(float)
    df['heading'] = df['heading'].astype(float)
    df['latitude'] = df['latitude'].astype(float)
    df['longitude'] = df['longitude'].astype(float)
    
    # Step 4: Replace invalid or default values with NaN
    df['cog'] = np.where((df['cog'] == 360) | (df['cog'] > 360) | (df['cog'] < 0), np.nan, df['cog'])
    df['sog'] = np.where((df['sog'] == 1023) | (df['sog'] < 0), np.nan, df['sog'])
    df['heading'] = np.where((df['heading'] > 360) | (df['heading'] == 511) | (df['heading'] < 0), np.nan, df['heading'])
    df['rot'] = np.where(df['rot'].isin([127, 128, -127, -128]), np.nan, df['rot'])

    # Step 5: Normalize 'cog' and 'heading'
    df['cog'] = (df['cog'] / 180) - 1
    df['heading'] = (df['heading'] / 180) - 1

    # Step 6: Remove all moored vessels
    """ 
    Moored vessels give little to no information about movement, and still vessel and be inferred by the model
     due to time elapsed. Hence, we remove all moored instance in light of having more informative data 
     """
    df = df[df['navstat'] != 5]

    # Step 7: Remove all rows with Nan values
    df = df.dropna()

    return df

baseDataset = preprocess_ais_train(baseDataset)

In [5]:
baseDataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 892056 entries, 0 to 1522064
Data columns (total 10 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   time       892056 non-null  datetime64[ns]
 1   cog        892056 non-null  float64       
 2   sog        892056 non-null  float64       
 3   rot        892056 non-null  float64       
 4   heading    892056 non-null  float64       
 5   navstat    892056 non-null  int64         
 6   latitude   892056 non-null  float64       
 7   longitude  892056 non-null  float64       
 8   vesselId   892056 non-null  object        
 9   portId     892056 non-null  object        
dtypes: datetime64[ns](1), float64(6), int64(1), object(2)
memory usage: 74.9+ MB


In [6]:
# Get the unique vesselIds from the test set
vessel_ids_test = set(testset['vesselId'].unique())

# Get the count of records per vesselId in the training set
vessel_record_counts = baseDataset['vesselId'].value_counts()

# Get the 10 vessels with the lowest number of records
lowest_record_vessels = vessel_record_counts.nsmallest(20)

# Check if these vessels are in the test set
vessels_in_test = lowest_record_vessels.index.isin(vessel_ids_test)

# Combine the results into a dataframe for easy viewing
vessels_with_low_records = pd.DataFrame({
    'vesselId': lowest_record_vessels.index,
    'record_count': lowest_record_vessels.values,
    'in_test_set': vessels_in_test
})

# Display the result
print(vessels_with_low_records)

                    vesselId  record_count  in_test_set
0   61e9f3cbb937134a3c4bff09             1        False
1   61e9f43fb937134a3c4c016f            77        False
2   61e9f39ab937134a3c4bfdb9           138        False
3   61e9f45eb937134a3c4c0235           140        False
4   61e9f45cb937134a3c4c022b           155        False
5   61e9f47ab937134a3c4c02f7           169        False
6   61e9f42cb937134a3c4c00f9           172        False
7   61e9f3afb937134a3c4bfe47           219        False
8   61e9f3bcb937134a3c4bfe91           220         True
9   61e9f456b937134a3c4c0203           223        False
10  61e9f408b937134a3c4c0023           229        False
11  61e9f418b937134a3c4c0077           239        False
12  61e9f460b937134a3c4c0243           246        False
13  61e9f3aeb937134a3c4bfe45           252        False
14  61e9f3c3b937134a3c4bfeb7           264        False
15  61e9f3bab937134a3c4bfe8b           267        False
16  61e9f42cb937134a3c4c00fb           292      

In [7]:
# List of vessel IDs to remove
vessels_to_remove = ['61e9f3cbb937134a3c4bff09', '61e9f3adb937134a3c4bfe37']

# Remove vessels from the dataset
baseDataset = baseDataset[~baseDataset['vesselId'].isin(vessels_to_remove)]

In [8]:
def filter_short_intervals(data, min_time_diff_minutes= 8):

    # Ensure time column is in datetime format
    data['time'] = pd.to_datetime(data['time'])
    
    # Sort by vesselId and time to ensure proper order
    data = data.sort_values(['vesselId', 'time']).reset_index(drop=True)

    # Calculate time differences in minutes
    data['time_diff'] = data.groupby('vesselId')['time'].diff().dt.total_seconds() / 60  # in minutes

    # Filter out records with time differences less than the specified threshold
    filtered_data = data[(data['time_diff'].isna()) | (data['time_diff'] >= min_time_diff_minutes)].copy()

    # Drop the time_diff column after filtering
    filtered_data = filtered_data.drop(columns=['time_diff']).reset_index(drop=True)
    
    return filtered_data

baseDataset = filter_short_intervals(baseDataset)

In [9]:
def haversine(lat1, lon1, lat2, lon2):
    # Haversine formula to calculate the great-circle distance between two points
    R = 3440.065  # Radius of Earth in nautical miles
    lat1_rad = np.radians(lat1.astype(float))
    lon1_rad = np.radians(lon1.astype(float))
    lat2_rad = np.radians(lat2.astype(float))
    lon2_rad = np.radians(lon2.astype(float))
    
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c

def heuristic_clean(data, vthreshold=30):
    """
    Cleans the trajectory data by removing points with unrealistic speeds.
    
    :param data: DataFrame containing raw trajectory data
    :param vthreshold: Speed threshold in knots
    :return: Cleaned trajectory DataFrame
    """
    # Drop duplicates and reset index
    data = data.drop_duplicates().reset_index(drop=True)
    
    # Sort by vessel ID and time to ensure correct order
    data = data.sort_values(by=['vesselId', 'time']).reset_index(drop=True)
    
    # Shift the columns to align with the previous record within each vessel
    data['prev_latitude'] = data.groupby('vesselId')['latitude'].shift(1)
    data['prev_longitude'] = data.groupby('vesselId')['longitude'].shift(1)
    data['prev_time'] = data.groupby('vesselId')['time'].shift(1)
    
    # Calculate time differences in hours
    data['delta_time'] = (data['time'] - data['prev_time']).dt.total_seconds() / 3600.0  # in hours
    
    # Calculate distances using the haversine function where we have valid previous points
    valid_rows = data['prev_latitude'].notna() & data['prev_longitude'].notna()
    data.loc[valid_rows, 'delta_distance'] = haversine(
        data.loc[valid_rows, 'prev_latitude'].to_numpy(),
        data.loc[valid_rows, 'prev_longitude'].to_numpy(),
        data.loc[valid_rows, 'latitude'].to_numpy(),
        data.loc[valid_rows, 'longitude'].to_numpy()
    )
    
    # Calculate speed (distance/time) and filter by threshold
    data['speed'] = data['delta_distance'] / data['delta_time']
    
    # Filter out rows where speed exceeds the threshold
    cleaned_data = data[(data['speed'] <= vthreshold) | data['speed'].isna()].copy()
    
    # Drop intermediate calculation columns
    cleaned_data = cleaned_data.drop(columns=['prev_latitude', 'prev_longitude', 'prev_time', 
                                              'delta_time', 'delta_distance', 'speed'])
    
    return cleaned_data.reset_index(drop=True)

baseDataset = heuristic_clean(baseDataset)

### Positional feature engineering

Kanskje legge til etterhvert?

In [10]:
"""
def calculate_movement_deltas(df):

    # Calculate time difference in seconds between consecutive points for each vessel
    df = df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)
    df['time_diff'] = df.groupby('vesselId')['time'].diff().dt.total_seconds()
    
    # Convert cog to radians for trigonometric calculations
    df['cog_rad'] = np.radians(df['cog'])

    # Calculate delta_lat and delta_lon based on COG, ROT, and time difference
    df['delta_lat'] = np.sin(df['cog_rad']) * df['time_diff'] * df['sog']
    df['delta_lon'] = np.cos(df['cog_rad']) * df['time_diff'] * df['sog']

    # Fill NaN values in delta_lat and delta_lon with 0
    df['delta_lat'].fillna(0, inplace=True)
    df['delta_lon'].fillna(0, inplace=True)

    # Drop the unnecessary columns
    df.drop(columns=['cog_rad', 'time_diff'], inplace=True)

    return df

baseDataset = calculate_movement_deltas(baseDataset)
"""

"\ndef calculate_movement_deltas(df):\n\n    # Calculate time difference in seconds between consecutive points for each vessel\n    df = df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)\n    df['time_diff'] = df.groupby('vesselId')['time'].diff().dt.total_seconds()\n    \n    # Convert cog to radians for trigonometric calculations\n    df['cog_rad'] = np.radians(df['cog'])\n\n    # Calculate delta_lat and delta_lon based on COG, ROT, and time difference\n    df['delta_lat'] = np.sin(df['cog_rad']) * df['time_diff'] * df['sog']\n    df['delta_lon'] = np.cos(df['cog_rad']) * df['time_diff'] * df['sog']\n\n    # Fill NaN values in delta_lat and delta_lon with 0\n    df['delta_lat'].fillna(0, inplace=True)\n    df['delta_lon'].fillna(0, inplace=True)\n\n    # Drop the unnecessary columns\n    df.drop(columns=['cog_rad', 'time_diff'], inplace=True)\n\n    return df\n\nbaseDataset = calculate_movement_deltas(baseDataset)\n"

In [11]:
def add_temporal_features(df):
    # Ensure 'time' column is in datetime format
    df['time'] = pd.to_datetime(df['time'])
    
    # Extract day of the week (0=Monday, 6=Sunday)
    df['day_of_week'] = df['time'].dt.dayofweek
    
    # Extract hour of the day
    df['hour_of_day'] = df['time'].dt.hour
    
    # Create a binary feature for weekends (0=Weekday, 1=Weekend)
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    
    # Create a categorical feature for seasons
    df['month'] = df['time'].dt.month
    df['season'] = df['month'].apply(lambda x: (x%12 + 3)//3)
    df['season'] = df['season'].map({1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Fall'})
    
    # One-hot encode the 'season' feature
    df = pd.get_dummies(df, columns=['season'], prefix='season')
    
    # Drop the intermediate 'month' column
    df.drop(columns=['month'], inplace=True)
    
    return df

# Apply the feature engineering function to the baseDataset
baseDataset = add_temporal_features(baseDataset)

 ### Employ Port based feature

In [12]:
# Define the path to your file in the bucket
file_path = '../../original_data/ports.csv'

#Load the file into a pandas dataframe
ports = pd.read_csv(file_path, delimiter= '|', encoding= 'utf-8')

def preprocess_ports_df(ports_df):
    # Renaming the latitude and longitude columns in ports_df to portLatitude and portLongitude
    ports_df = ports_df.rename(columns={'latitude': 'portLatitude', 'longitude': 'portLongitude'})
    # List of columns to keep
    columns_to_keep = ['portId', 'portLatitude', 'portLongitude']

    # Drop all other columns except the ones specified in columns_to_keep
    ports_df = ports_df[columns_to_keep]
    return ports_df

ports = preprocess_ports_df(ports)

In [13]:
def find_close_to_port(df, ports, distance_threshold_nm=3, 
                       vessel_lat_col='latitude', vessel_lon_col='longitude', 
                       port_lat_col='portLatitude', port_lon_col='portLongitude'):
    """
    Determines if each vessel in df is close to a port within a specified distance (in nautical miles).

    Parameters:
    - df (DataFrame): DataFrame containing vessel data with specified latitude and longitude columns.
    - ports (DataFrame): DataFrame containing port data with specified latitude and longitude columns.
    - distance_threshold_nm (float): Distance threshold in nautical miles for identifying proximity to ports.
    - vessel_lat_col (str): Column name for the vessel latitude in df.
    - vessel_lon_col (str): Column name for the vessel longitude in df.
    - port_lat_col (str): Column name for the port latitude in ports.
    - port_lon_col (str): Column name for the port longitude in ports.

    Returns:
    - DataFrame: Updated df with a new binary column 'close_to_port', where 1 indicates proximity to a port within the threshold.
    """

    # Convert latitude and longitude to radians
    df['lat_rad'] = np.radians(df[vessel_lat_col])
    df['lon_rad'] = np.radians(df[vessel_lon_col])
    ports['lat_rad'] = np.radians(ports[port_lat_col])
    ports['lon_rad'] = np.radians(ports[port_lon_col])

    # Build a k-d tree using port coordinates
    port_coords = np.vstack((ports['lat_rad'], ports['lon_rad'])).T
    port_tree = KDTree(port_coords, metric='euclidean')  # Using 'euclidean' since we pre-convert to radians

    # Convert nautical miles to radians (1 nautical mile ≈ 1/3437.75 radians)
    distance_threshold_radians = distance_threshold_nm / 3437.75

    # Query each vessel against the port k-d tree for the closest port within the threshold
    vessel_coords = np.vstack((df['lat_rad'], df['lon_rad'])).T
    distances, indices = port_tree.query(vessel_coords, k=1, return_distance=True)

    # Update df with close_to_port information based on the distance threshold
    df['close_to_port'] = (distances.flatten() < distance_threshold_radians).astype(int)

    # Drop the temporary radian columns
    df.drop(columns=['lat_rad', 'lon_rad'], inplace=True)

    return df


baseDataset = find_close_to_port(
    df=baseDataset, 
    ports=ports, 
    distance_threshold_nm=3, 
    vessel_lat_col='latitude', 
    vessel_lon_col='longitude', 
    port_lat_col='portLatitude', 
    port_lon_col='portLongitude'
)


In [14]:
baseDataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871978 entries, 0 to 871977
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   time           871978 non-null  datetime64[ns]
 1   cog            871978 non-null  float64       
 2   sog            871978 non-null  float64       
 3   rot            871978 non-null  float64       
 4   heading        871978 non-null  float64       
 5   navstat        871978 non-null  int64         
 6   latitude       871978 non-null  float64       
 7   longitude      871978 non-null  float64       
 8   vesselId       871978 non-null  object        
 9   portId         871978 non-null  object        
 10  day_of_week    871978 non-null  int32         
 11  hour_of_day    871978 non-null  int32         
 12  is_weekend     871978 non-null  int64         
 13  season_Spring  871978 non-null  bool          
 14  season_Winter  871978 non-null  bool          
 15  

# We finalize the traning data and fill the test data

In [15]:
le = LabelEncoder()

vessels = baseDataset["vesselId"].unique().tolist()
le.fit(vessels)
baseDataset["vesselId"] = le.transform(baseDataset["vesselId"])

baseDataset = baseDataset.drop(columns=['navstat', 'portId'])

In [16]:
def fill_test_data(training_filepath, test_filepath, ports_df, le, distance_threshold_nm=3):

    # Load and process training data to get the latest known position of each vessel
    ais_train = pd.read_csv(training_filepath, sep='|')
    ais_train['time'] = pd.to_datetime(ais_train['time'])
    ais_train = ais_train.sort_values(by='time')
    final_positions = ais_train.groupby("vesselId").last()

    # Rename columns in final_positions for feature engineering in the test data
    final_positions = final_positions.rename(columns={
        'time': 'time_previous',
        'latitude': 'latitude_previous',
        'longitude': 'longitude_previous',
        'sog': 'sog_previous',
        'cog': 'cog_previous',
        'rot': 'rot_previous',
        'heading': 'heading_previous'
    })

    # Load test data and merge with latest known positions
    ais_test = pd.read_csv(test_filepath)
    ais_test['time'] = pd.to_datetime(ais_test['time'])
    ais_test= ais_test.merge(final_positions, on="vesselId", how="left")

    # Normalize course and heading
    ais_test['cog_previous'] = (ais_test['cog_previous'] / 180) - 1
    ais_test['cog_previous'] = (ais_test['cog_previous'] / 180) - 1

    # Encode vessel IDs using the provided LabelEncoder
    ais_test['vesselId'] = le.transform(ais_test['vesselId'])

    # Calculate time difference in seconds between the test time and last known time
    ais_test['time_gap'] = (ais_test['time'] - ais_test['time_previous']).dt.total_seconds()

    # Determine proximity to ports by using the find_close_to_port function
    testset = find_close_to_port(
        df=ais_test,
        ports=ports_df,
        distance_threshold_nm=distance_threshold_nm,
        vessel_lat_col='latitude_previous',
        vessel_lon_col='longitude_previous',
        port_lat_col='portLatitude',
        port_lon_col='portLongitude'
    )

    testset = testset.drop(columns=['scaling_factor', 'time_previous', 'navstat', 'etaRaw', 'portId'])

    return testset


test_filled = fill_test_data(
    training_filepath='../../original_data/ais_train.csv',
    test_filepath='../../original_data/ais_test.csv',
    ports_df=ports,
    le=le,
    distance_threshold_nm=3,
)


### Keep only vital information for dataset we want to create

# We create the training set!

In [17]:
def lagged_training_data(df, lag_steps):
    """
    Creates a training set with features that include previous values for latitude, longitude,
    speed over ground (sog), and time-related features like hour and day of the week.

    Parameters:
    - df (DataFrame): The input DataFrame containing the AIS data.
    - lag_steps (int): The number of time steps to shift for creating lagged features.

    Returns:
    - DataFrame: A new DataFrame with added lagged features and time-related information.
    """
    # Create a copy of the input DataFrame to work on
    data = df.copy()

    # Generate lagged features for latitude, longitude, and speed over ground (sog)
    data['latitude_previous'] = data.groupby('vesselId')['latitude'].shift(lag_steps)
    data['longitude_previous'] = data.groupby('vesselId')['longitude'].shift(lag_steps)
    data['sog_previous'] = data.groupby('vesselId')['sog'].shift(lag_steps)
    #data['delta_lat_previous'] = data.groupby('vesselId')['delta_lat'].shift(lag_steps)
    #data['delta_lon_previous'] = data.groupby('vesselId')['delta_lon'].shift(lag_steps)
    data['cog_previous'] = data.groupby('vesselId')['cog'].shift(lag_steps)
    data['heading_previous'] = data.groupby('vesselId')['heading'].shift(lag_steps)
    data['rot_previous'] = data.groupby('vesselId')['rot'].shift(lag_steps)
    

    # Calculate time difference between consecutive points in seconds
    data['temp_time_gap'] = data.groupby('vesselId')['time'].diff(lag_steps)
    data['time_gap'] = data['temp_time_gap'].dt.total_seconds()

    # Drop any rows with missing values due to shifting or time differences
    data.dropna(inplace=True)

    data.drop(columns=['temp_time_gap'], inplace=True)

    return data


#### We make a function where we can specify amount of lags! For example, if we want to train multiple models based on different lags, as we want each model to capture different time horizons!

For instance, each vessel should apprxoimately have 240 instances for all the 5 days, i.e. we should train on 240 lags to capture all possibilities, however this reduces amount of data for the first lags (which are more important), thus we could also specify 50 lags, for example to train on the first day, nd get more data for that!

In [18]:
def create_trainingset(df, max_size=12000000, total_lags=69):
    """
    Creates a training dataset by sampling data from specified lags, with sample size per lag 
    dynamically adjusted based on the total number of lags.

    Parameters:
    - make_training_set_func (function): Function that generates the training set for a given lag.
    - max_size (int): Maximum number of total samples to accumulate across all lags.
    - total_lags (int): Total number of lag steps to include in the training dataset.

    Returns:
    - DataFrame: Concatenated training dataset containing sampled instances across specified lags.
    """
    datasets = []
    total_size = 0
    lag = 1
    
    # Dynamically calculate max_samples_per_lag based on total_lags and max_size
    max_samples_per_lag = max_size // total_lags

    while lag <= total_lags and total_size < max_size:
        # Generate dataset for the current lag
        current_set = lagged_training_data(df, lag)
        
        # If the dataset is empty for the current lag, break the loop
        if len(current_set) == 0:
            break

        # Sample the dataset with the dynamically calculated sample size for this lag
        sampled_set = current_set.sample(min(max_samples_per_lag, len(current_set)), random_state=42)
        datasets.append(sampled_set)
        total_size += len(sampled_set)

        print(f"Size after lag {lag}: {total_size:_} samples accumulated with {len(sampled_set)} samples from lag {lag}")

        # Move to the next lag step
        lag += 1

    # Concatenate all sampled datasets into a single training set
    training_data = pd.concat(datasets, ignore_index=True)
    print("Total length of training data:", len(training_data))

    # Clear out memory by deleting intermediate datasets list
    del datasets

    training_data = training_data.drop(columns=['time', 'sog', 'cog', 'heading', 'rot'])

    return training_data


In [19]:
train_data = create_trainingset(baseDataset, max_size=12000000, total_lags=14)

Size after lag 1: 857_142 samples accumulated with 857142 samples from lag 1
Size after lag 2: 1_714_284 samples accumulated with 857142 samples from lag 2
Size after lag 3: 2_571_426 samples accumulated with 857142 samples from lag 3
Size after lag 4: 3_428_568 samples accumulated with 857142 samples from lag 4
Size after lag 5: 4_285_710 samples accumulated with 857142 samples from lag 5
Size after lag 6: 5_142_852 samples accumulated with 857142 samples from lag 6
Size after lag 7: 5_999_994 samples accumulated with 857142 samples from lag 7
Size after lag 8: 6_857_136 samples accumulated with 857142 samples from lag 8
Size after lag 9: 7_714_278 samples accumulated with 857142 samples from lag 9
Size after lag 10: 8_571_420 samples accumulated with 857142 samples from lag 10
Size after lag 11: 9_428_562 samples accumulated with 857142 samples from lag 11
Size after lag 12: 10_285_704 samples accumulated with 857142 samples from lag 12
Size after lag 13: 11_142_846 samples accumulat

# Training pipeline!

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Define features (X) and targets (y)
X = train_data.drop(columns=['latitude', 'longitude'])
y = train_data[['latitude', 'longitude']]


In [21]:
# Find all DataFrames in memory
dataframes_in_memory = {name: obj for name, obj in globals().items() if isinstance(obj, pd.DataFrame)}

# Print the names and memory usage of each DataFrame
for name, df in dataframes_in_memory.items():
    print(f"{name}: {df.memory_usage(deep=True).sum() / (1024 ** 2):.2f} MB")

baseDataset: 74.84 MB
testset: 8.54 MB
vessels_with_low_records: 0.00 MB
ports: 0.14 MB
test_filled: 4.34 MB
train_data: 1213.07 MB
X: 1029.97 MB
y: 183.11 MB


In [22]:
import gc

#del gdf_with_near_land
del train_data
del testset
del baseDataset # Ikke dersom vi vil lage flere modeller for å stacke tid


# Run garbage collection to free up memory
gc.collect()

21

In [23]:
# Initialize and train the MultiOutputRegressor with RandomForest
# Initialize and train the MultiOutputRegressor with an optimized RandomForest
model = MultiOutputRegressor(
    RandomForestRegressor(
        n_estimators=10,           # Adjusted number of trees for balance between performance and training time
        n_jobs=6,                  # Use all CPU cores dynamically
        random_state=42 ,            # For reproducibility
        warm_start=False,
        criterion='squared_error',
        max_depth=30
    )
)

model.fit(X, y)

print("Model training completed on the training dataset.")


Model training completed on the training dataset.


### Preparing test set for prediction!

In [24]:
test_ids = test_filled['ID'].copy()
test_filled = test_filled.drop(columns=['ID'])
# Set testset columns in the same order as the training set X
testset = test_filled[X.columns]

KeyError: "['day_of_week', 'hour_of_day', 'is_weekend', 'season_Spring', 'season_Winter'] not in index"

In [None]:
def predict_and_append_coordinates(predictor, testset):
    """
    Predicts longitude and latitude for the test set using a MultiOutputRegressor model
    and appends them as new columns.

    Parameters:
    - predictor (MultiOutputRegressor): The trained model for predicting both latitude and longitude.
    - testset (pd.DataFrame): The test set including all features required for prediction.

    Returns:
    - pd.DataFrame: The test set DataFrame with appended 'latitude_predicted' and 'longitude_predicted' columns.
    """
    # Predict both latitude and longitude on the test set
    predictions = predictor.predict(testset)

    # Extract predictions for latitude and longitude
    latitude_predictions = predictions[:, 0]
    longitude_predictions = predictions[:, 1]

    # Append predictions to the test set DataFrame as new columns
    testset_with_predictions = testset.copy()
    testset_with_predictions['latitude_predicted'] = latitude_predictions
    testset_with_predictions['longitude_predicted'] = longitude_predictions

    return testset_with_predictions

# Example usage:
testset_with_predictions = predict_and_append_coordinates(model, testset)
testset_with_predictions.head(60)  # To verify the appended predictions

In [None]:
def predict_and_prepare_submission(predictor, testset, test_ids, submission_path='submission.csv'):
    """
    Predicts latitude and longitude for the test set using a MultiOutputRegressor model, 
    merges the predictions with test IDs, and creates a submission file in the required Kaggle format.

    Parameters:
    - predictor (MultiOutputRegressor): The trained model for predicting both latitude and longitude.
    - testset (pd.DataFrame): The test set including all features required for prediction.
    - test_ids (pd.Series or list): The IDs for each entry in the test set.
    - submission_path (str): The path to save the submission CSV file.

    Returns:
    - pd.DataFrame: The final submission dataframe.
    """
    # Step 1: Predict both latitude and longitude on the test set
    predictions = predictor.predict(testset)

    # Step 2: Extract latitude and longitude predictions
    latitude_predictions = predictions[:, 0]
    longitude_predictions = predictions[:, 1]

    # Step 3: Combine predictions with test IDs into a single DataFrame
    submission_df = pd.DataFrame({
        'ID': test_ids,
        'longitude_predicted': longitude_predictions,
        'latitude_predicted': latitude_predictions
    })

    # Step 4: Save to CSV with the correct column order
    submission_df.to_csv(submission_path, index=False, columns=['ID', 'longitude_predicted', 'latitude_predicted'])
    print(f"Submission file saved to {submission_path}")

    return submission_df

# Example usage:
submission_df = predict_and_prepare_submission(model, testset, test_ids, '../../submissions/subfinal.csv')
submission_df.head(50)  # To verify the structure of the submission file