In [2]:
import pandas as pd

# Load the preprocessed dataset
df = pd.read_csv('updated_ships_data_with_loitering.csv')  # Replace with your preprocessed data file path

# Check the first few rows of the data to ensure it's correct
print(df.head())

   MMSI         BaseDateTime       LAT       LON  SOG    COG  Heading  \
0    11  2024-12-31 00:00:44  27.29231 -90.96791  0.0  223.8    511.0   
1    11  2024-12-31 00:03:45  27.29230 -90.96785  0.1  220.8    511.0   
2    11  2024-12-31 00:09:46  27.29234 -90.96789  0.1  200.6    511.0   
3    11  2024-12-31 00:18:44  27.29226 -90.96779  0.1  222.1    511.0   
4    11  2024-12-31 00:21:48  27.29235 -90.96787  0.1  207.0    511.0   

     VesselName         IMO CallSign  ...  Width  Draft  Cargo  \
0  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
1  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
2  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
3  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
4  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   

   TransceiverClass  heading_vs_cog_diff  time_diff_min is_loitering  \
0                 A                287.2            NaN        False   
1                 A 

In [3]:
df.columns
# Target variable: Custom condition for detecting oil spill
df['is_oil_spill'] = (df['SOG'] < 1) & (df['is_loitering'] == 1) & (df['heading_vs_cog_diff'] > 20)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Define features (X) and target (y)
X = df[['LAT', 'LON','COG', 'SOG', 'Heading', 'heading_vs_cog_diff', 'time_diff_min', 'is_loitering', 
        'loiter_time', 'speed_change', 'course_change']]
y = df['is_oil_spill']  # Target: whether an oil spill is detected

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test set
y_pred = rf_model.predict(X_test)

# Evaluate the model performance
print(classification_report(y_test, y_pred))


# Check the shape of the resulting data
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


              precision    recall  f1-score   support

       False       1.00      1.00      1.00    925497
        True       1.00      1.00      1.00     24887

    accuracy                           1.00    950384
   macro avg       1.00      1.00      1.00    950384
weighted avg       1.00      1.00      1.00    950384

Training set size: (2217560, 11)
Testing set size: (950384, 11)


In [5]:
# Get feature importance
feature_importances = rf_model.feature_importances_

# Create a DataFrame to visualize feature importance
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df)

                Feature  Importance
7          is_loitering    0.581529
6         time_diff_min    0.294736
5   heading_vs_cog_diff    0.102560
3                   SOG    0.014037
4               Heading    0.002331
0                   LAT    0.001629
2                   COG    0.001041
9          speed_change    0.001002
1                   LON    0.000761
10        course_change    0.000368
8           loiter_time    0.000006


In [13]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Sample training data (Replace with actual data and model training)
# Assuming X_train is your feature set and rf_model is your trained RandomForestClassifier

# Sample input for oil spill
oil_spill_sample = {
    'LAT': [12.9716],                 # Latitude of the vessel
    'LON': [77.5946],                 # Longitude of the vessel
    'COG': [180],                     # Course Over Ground
    'SOG': [0.5],                     # Speed Over Ground (very low speed, suggests potential oil spill)
    'Heading': [90],                  # Heading of the vessel
    'heading_vs_cog_diff': [30],      # Significant difference between heading and COG
    'time_diff_min': [15],            # Time difference (minutes)
    'is_loitering': [1],              # The vessel is loitering (indicating possible oil spill behavior)
    'loiter_time': [60],              # Time the vessel has been loitering (in minutes)
    'speed_change': [0.1],            # Speed change over time
    'course_change': [0.1]            # Course change over time
}

# Convert the sample input to a DataFrame
oil_spill_df = pd.DataFrame(oil_spill_sample)

# Assuming you already have your trained model (`rf_model`) and scaler (`scaler`)

# Initialize the scaler and fit it on the training data (X_train)
scaler = StandardScaler()

# Fit the scaler using X_train (ensure X_train is available from previous training steps)
scaler.fit(X_train)

# Ensure the new data columns match the training data columns (this is a basic check)
print("Training columns:", X_train.columns)  # Columns from the training data
print("New data columns:", oil_spill_df.columns)  # Columns from the new data

# Check for missing columns in the new data compared to the training data
missing_cols = set(X_train.columns) - set(oil_spill_df.columns)
for col in missing_cols:
    oil_spill_df[col] = 0  # You can fill missing columns with zeros or np.nan as per your use case

# Reorder the new data to match the training data column order
oil_spill_df = oil_spill_df[X_train.columns]

# Apply scaling to the new data using the already fitted scaler
scaled_oil_spill_data = scaler.transform(oil_spill_df)

# Predict anomaly (1 for oil spill, 0 for no oil spill)
prediction = rf_model.predict(scaled_oil_spill_data)

# Print prediction result
if prediction == 1:
    print("Potential oil spill detected!")
else:
    print("No oil spill detected.")



Training columns: Index(['LAT', 'LON', 'COG', 'SOG', 'Heading', 'heading_vs_cog_diff',
       'time_diff_min', 'is_loitering', 'loiter_time', 'speed_change',
       'course_change'],
      dtype='object')
New data columns: Index(['LAT', 'LON', 'COG', 'SOG', 'Heading', 'heading_vs_cog_diff',
       'time_diff_min', 'is_loitering', 'loiter_time', 'speed_change',
       'course_change'],
      dtype='object')
No oil spill detected.


