In [27]:
import pandas as pd
import numpy as np

# Read the training and test CSVs in one pass; both have their 'time'
# column parsed into datetimes while loading.
train_df, test_df = (
    pd.read_csv(csv_path, parse_dates=['time'])
    for csv_path in ('new_train.csv', 'ais_test.csv')
)


In [28]:
# Order every vessel's track chronologically so that the final row of each
# vessel is its most recent report.
train_df = train_df.sort_values(by=['vesselId', 'time'])

# Keep the single most recent *row* per vessel that has a timestamp and a
# position. GroupBy.last() is deliberately avoided: it returns the last
# non-null value of each column independently, so one "last known" record
# could combine latitude, longitude, speed and time taken from different
# reports.
last_known_positions = (
    train_df
    .dropna(subset=['vesselId', 'time', 'latitude', 'longitude'])
    .drop_duplicates(subset='vesselId', keep='last')
    .reset_index(drop=True)
)


In [29]:
# Attach each vessel's last known training record to its test rows.
# A left join keeps every test row, even for vessels with no training data;
# columns present in both frames get the '_test' / '_train' suffixes.
merged_df = test_df.merge(
    right=last_known_positions,
    how='left',
    on='vesselId',
    suffixes=('_test', '_train'),
)


In [30]:
# Test rows that found no partner in the left join have no training timestamp.
unmatched_rows = merged_df['time_train'].isna()
missing_vessels = merged_df.loc[unmatched_rows, 'vesselId'].unique()

if len(missing_vessels):
    print("Warning: The following vesselIds are in the test set but not in the training set:")
    print(missing_vessels)

    # Alternative not taken here: drop the rows of these vessels from merged_df.
    # Instead, missing values are replaced by defaults: 0.0 for position,
    # speed and course, and the test timestamp for the training timestamp.
    for column in ('latitude', 'longitude', 'sog', 'cog'):
        merged_df[column] = merged_df[column].fillna(0.0)
    merged_df['time_train'] = merged_df['time_train'].fillna(merged_df['time_test'])


In [31]:
# Speed threshold for movement, in knots
MOVEMENT_THRESHOLD = 0.5

def is_moving(sog):
    """Return whether ``sog`` is strictly greater than ``MOVEMENT_THRESHOLD``."""
    exceeds_threshold = sog > MOVEMENT_THRESHOLD
    return exceeds_threshold

# Flag moving vessels. The predicate is a plain comparison, so it is applied
# to the whole 'sog' column at once instead of element by element via .apply.
merged_df['is_moving'] = is_moving(merged_df['sog'])


In [32]:
# Hours elapsed between the last known training report and the test timestamp.
# Negative gaps are clipped to zero; missing values stay missing.
merged_df['time_diff_hours'] = (
    (merged_df['time_test'] - merged_df['time_train']).dt.total_seconds() / 3600.0
).clip(lower=0)


In [33]:
# Install geopy if not already installed
# !pip install geopy

from geopy import distance
from geopy.point import Point


In [34]:
def _finite_or_none(value):
    """Return ``value`` as a finite float, or ``None`` if it is missing, non-numeric, NaN or infinite."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if np.isfinite(number) else None


def _bearing_or_none(value):
    """Return ``value`` as a bearing in degrees, or ``None`` if it is not a finite number in [0, 360]."""
    bearing = _finite_or_none(value)
    if bearing is None or not 0.0 <= bearing <= 360.0:
        return None
    return bearing


def _last_known_position(row):
    """Build the fallback prediction: the row's last known latitude/longitude, unchanged."""
    return pd.Series({'latitude_predicted': row['latitude'], 'longitude_predicted': row['longitude']})


def predict_position(row):
    """Predict a vessel's position by projecting its last known motion forward.

    A moving vessel is advanced from its last known position along its
    bearing by ``sog * time_diff_hours``. The last known position is returned
    unchanged when the vessel is not moving, when any required input is
    missing or unusable, or when the projection itself fails.

    Parameters
    ----------
    row : mapping-like (e.g. one row of ``merged_df`` as a ``pandas.Series``)
        Reads 'is_moving', 'latitude', 'longitude', 'heading', 'sog' (knots)
        and 'time_diff_hours'. 'cog' is read as a fallback bearing if present,
        and 'vesselId' is used only in the error message.

    Returns
    -------
    pandas.Series
        Entries 'latitude_predicted' and 'longitude_predicted'.
    """
    if not row['is_moving']:
        # Stationary vessels stay where they were last seen.
        return _last_known_position(row)

    lat1 = _finite_or_none(row['latitude'])
    lon1 = _finite_or_none(row['longitude'])
    sog_knots = _finite_or_none(row['sog'])
    time_diff = _finite_or_none(row['time_diff_hours'])

    # The bearing comes from 'heading'; if that is missing or outside 0-360
    # degrees, fall back to course over ground ('cog').
    # NOTE(review): AIS feeds commonly encode "heading not available" as 511;
    # confirm the sentinel values against this dataset's documentation.
    bearing = _bearing_or_none(row['heading'])
    if bearing is None:
        bearing = _bearing_or_none(row.get('cog'))

    if any(value is None for value in (lat1, lon1, sog_knots, time_diff, bearing)):
        # Without a full set of inputs no projection is possible.
        return _last_known_position(row)

    # 1 knot = 1.852 km/h, so this is the distance covered in kilometres.
    distance_traveled = sog_knots * 1.852 * time_diff

    try:
        origin = Point(lat1, lon1)
        # geopy takes the bearing in degrees.
        destination = distance.distance(kilometers=distance_traveled).destination(origin, bearing=bearing)
    except Exception as e:
        # Best effort: report the failure and keep the last known position.
        print(f"Error predicting position for vesselId {row['vesselId']}: {e}")
        return _last_known_position(row)

    # Keep latitude within [-90, 90] and wrap longitude into [-180, 180).
    new_lat = max(min(destination.latitude, 90), -90)
    new_lon = (destination.longitude + 180) % 360 - 180

    return pd.Series({'latitude_predicted': new_lat, 'longitude_predicted': new_lon})


In [35]:
# Run the predictor once per row, then store its two result columns on merged_df.
predicted_columns = ['latitude_predicted', 'longitude_predicted']
predicted_positions = merged_df.apply(predict_position, axis=1)
merged_df[predicted_columns] = predicted_positions


In [36]:
# Final output: the ID followed by predicted longitude and latitude, in that order.
output_columns = ['ID', 'longitude_predicted', 'latitude_predicted']
output_df = merged_df.loc[:, output_columns]

# Write the predictions to disk without the index column.
output_df.to_csv('baseline_predictions.csv', index=False)
