In [None]:
import pymongo
import pandas as pd
import numpy as np
# MongoDB connection details
client = pymongo.MongoClient("mongodb://localhost:27017/")  # MongoDB URI
db = client["ais_training_data"]  # Database name
collection = db["ais_data_relative_week_summary_simple"]  # Collection name

# Fetch every document of the collection into memory
data = list(collection.find())
if not data:
    # Fail with a clear message instead of a KeyError on '_id' / 'Ship_Type' below.
    raise ValueError("The MongoDB collection returned no documents; nothing to aggregate.")

# Convert the documents to a DataFrame and drop Mongo's internal identifier
df = pd.DataFrame(data)
df = df.drop(columns=['_id'])

# Ship types that are actually known (everything except the 'Undefined' placeholder)
is_undefined = df['Ship_Type'] == 'Undefined'
ship_types = df.loc[~is_undefined, 'Ship_Type'].unique()

# Replace each 'Undefined' with a ship type drawn uniformly from the known types.
# A seeded generator makes the imputation (and everything downstream) reproducible
# across kernel restarts; the draw is done in one vectorised call instead of per row.
IMPUTATION_SEED = 42
rng = np.random.default_rng(IMPUTATION_SEED)
if is_undefined.any() and len(ship_types) > 0:
    df.loc[is_undefined, 'Ship_Type'] = rng.choice(ship_types, size=int(is_undefined.sum()))

# Imputation can create duplicate (week, type, destination) keys, so re-aggregate the counts.
df = df.groupby(['YearWeek', 'Ship_Type', 'Destination'], as_index=False)['TotalCount'].sum()

  YearWeek  Ship_Type  Destination  TotalCount
0  2023-43  Undefined      Rostock          26
1  2023-43  Undefined     Sassnitz          24
2  2023-43  Undefined      Esbjerg          24
3  2023-43  Undefined       Skagen          22
4  2023-43  Undefined  Gotthenburg          21


In [223]:
# Display the frame as the cell's rich output (a long frame is shown abbreviated).
df

Unnamed: 0,YearWeek,Ship_Type,Destination,TotalCount
0,2023-43,Undefined,Rostock,26
1,2023-43,Undefined,Sassnitz,24
2,2023-43,Undefined,Esbjerg,24
3,2023-43,Undefined,Skagen,22
4,2023-43,Undefined,Gotthenburg,21
...,...,...,...,...
40987,2023-01,Dredging,Kerteminde,1
40988,2023-01,Dredging,Klagshamn,1
40989,2023-01,Reserved,Helsingborg,1
40990,2023-01,Dredging,Skagen,1


# RandomForestRegressor

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

def _split_yearweek(frame):
    """Return a copy of ``frame`` with 'YearWeek' replaced by integer 'Year' and 'Week'.

    Parameters:
    - frame (pd.DataFrame): must contain a string column 'YearWeek' holding
      '<year>-<week>' labels.

    Returns:
    - pd.DataFrame: the other columns in their original order, followed by the
      integer columns 'Year' and 'Week'.

    Raises:
    - ValueError: if a label does not split into exactly two '-' separated parts,
      or a part is not an integer.
    """
    parts = frame['YearWeek'].str.split('-', expand=True)
    if parts.shape[1] != 2:
        raise ValueError("YearWeek values must look like 'YYYY-WW'")
    return frame.drop('YearWeek', axis=1).assign(
        Year=parts[0].astype(int),
        Week=parts[1].astype(int),
    )


# Load data and turn the YearWeek label into numeric Year / Week features
data = _split_yearweek(pd.read_csv('shipping_data.csv'))

# Define features and target
X = data.drop('TotalCount', axis=1)
y = data['TotalCount']

# Split data (random 80/20 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Preprocessing: one-hot encode the categoricals, pass the numeric columns through.
# Columns not listed here are dropped by the ColumnTransformer default.
categorical_features = ['Ship_Type', 'Destination']
numerical_features = ['Year', 'Week']

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore' encodes unseen categories as all zeros instead of raising
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ])

# Pipeline with model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Predict on the held-out split
y_pred = model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# Predict on new data, prepared with the same helper as the training frame
new_data = _split_yearweek(pd.DataFrame({
    'YearWeek': ['2024-05'],
    'Ship_Type': ['WIG'],
    'Destination': ['Frederiksvark']
}))

predicted_count = model.predict(new_data)
print(f"Predicted TotalCount: {predicted_count[0]:.0f}")


Mean Absolute Error (MAE): 1.72
Mean Squared Error (MSE): 10.99
Root Mean Squared Error (RMSE): 3.31
R^2 Score: 0.84
Predicted TotalCount: 2


In [245]:
import joblib 
# Persist the object currently bound to `model` to disk with joblib.
# The call is the cell's last expression, so its return value is shown as the cell output.
model_filename = 'frequency_of_deployment_regressor.joblib'
joblib.dump(model, model_filename)

['frequency_of_deployment_regressor.joblib']

In [242]:
# Score one hand-written sample with the model currently bound to `model`
new_data = pd.DataFrame({
    'YearWeek': ['2024-1'],
    'Ship_Type': ['Anti-pollution'],
    'Destination': ['Gedser'],
})

# Pull the label column out of the frame and rebuild it as two integer columns
year_week_parts = new_data['YearWeek'].str.split('-', expand=True)
new_data[['Year', 'Week']] = year_week_parts
for numeric_col in ('Year', 'Week'):
    new_data[numeric_col] = new_data[numeric_col].astype(int)
new_data = new_data.drop(columns='YearWeek')

predicted_count = model.predict(new_data)
print(f"Predicted TotalCount: {predicted_count[0]:.0f}")

Predicted TotalCount: 6


In [None]:
import pandas as pd
import numpy as np
import joblib
# Every ship type label to score (object-dtype array of strings)
ship_types = np.array(
    [
        'Sailing',
        'Undefined',
        'Military',
        'Tug',
        'Fishing',
        'Pilot',
        'Other',
        'Port tender',
        'Cargo',
        'Pleasure',
        'Passenger',
        'Reserved',
        'Tanker',
        'SAR',
        'HSC',
        'Dredging',
        'Not party to conflict',
        'Law enforcement',
        'Towing',
        'Diving',
        'Anti-pollution',
        'Medical',
        'Spare 1',
        'WIG',
        'Towing long/wide',
        'Spare 2',
    ],
    dtype=object,
)

# Restore the previously saved regressor.
# NOTE(review): joblib files are pickle-based; only load files you produced yourself.
model_filename = "frequency_of_deployment_regressor.joblib"
model = joblib.load(model_filename)
print("\nModel loaded successfully.")

# Score every ship type for one week/destination and round the counts up
def generate_predictions(yearweek, destination, model, ship_types):
    """
    Predict a TotalCount for each ship type at a given week and destination.

    One input row is built per entry of `ship_types`; all rows share the same
    `yearweek` and `destination`. The model output is rounded up with a ceiling
    and cast to int.

    Parameters:
    - yearweek (str): week label in 'YYYY-WW' form, e.g. '2025-12'.
    - destination (str): destination name, e.g. 'Aabenraa'.
    - model: object exposing `predict(frame)`; it receives a DataFrame with the
      columns Ship_Type, Destination, Year, Week (in that order).
    - ship_types (array-like): ship type labels to score.

    Returns:
    - pd.DataFrame: columns Ship_Type and Predicted_TotalCount (ceiled ints).

    Raises:
    - Whatever the label parsing or `model.predict` raises; a hint is printed
      first and the original exception is re-raised.
    """
    row_count = len(ship_types)
    frame = pd.DataFrame({
        'Yearweek': [yearweek] * row_count,
        'Ship_Type': ship_types,
        'Destination': [destination] * row_count,
    })

    # Break the label into two integer columns
    try:
        label_parts = frame['Yearweek'].str.split('-', expand=True)
        frame[['Year', 'Week']] = label_parts
        for numeric_col in ('Year', 'Week'):
            frame[numeric_col] = frame[numeric_col].astype(int)
    except Exception as e:
        print("Error processing Yearweek. Ensure it's in 'YYYY-WW' format.")
        raise e

    # The raw label is not a model feature
    frame = frame.drop(columns='Yearweek')

    try:
        raw_counts = model.predict(frame)
    except Exception as e:
        print("Error during prediction. Check if the input features match the model's expected format.")
        raise e

    # Round up so a fractional expected count never reports as fewer whole ships
    frame['Predicted_TotalCount'] = np.ceil(raw_counts).astype(int)

    return frame[['Ship_Type', 'Predicted_TotalCount']]

user_yearweek = '2025-12'      # week label passed to generate_predictions ('YYYY-WW')
user_destination = 'Aabenraa'   # destination to score


# One predicted count per entry of ship_types for the week/destination above
predictions_df = generate_predictions(user_yearweek, user_destination, model, ship_types)

print(predictions_df)



                Ship_Type  Predicted_TotalCount
0                 Sailing                     2
1               Undefined                     8
2                Military                     6
3                     Tug                     4
4                 Fishing                     2
5                   Pilot                     4
6                   Other                     7
7             Port tender                     4
8                   Cargo                     4
9                Pleasure                     4
10              Passenger                     2
11               Reserved                     5
12                 Tanker                     2
13                    SAR                     3
14                    HSC                     4
15               Dredging                     4
16  Not party to conflict                     4
17        Law enforcement                     2
18                 Towing                     3
19                 Diving               

# LSTM

In [232]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# 1. Load Data
# Work on a copy: the steps below add columns to `data`, and aliasing `df` directly
# would silently modify the frame other cells display and reuse.
data = df.copy()

# 2. Convert YearWeek to Date
def yearweek_to_datetime(yearweek_str):
    """Convert a '<year>-<week>' label to the timestamp of that week's Monday.

    The week number is interpreted with strptime's ``%W`` directive (Monday-based
    weeks; days before the year's first Monday belong to week 0).
    NOTE(review): ``%W`` is not the ISO-8601 week number -- confirm it matches how
    the YearWeek labels were produced.

    Parameters:
    - yearweek_str (str): label such as '2023-43'.

    Returns:
    - pd.Timestamp: Monday of the given week.

    Raises:
    - ValueError: if the label does not consist of two integers separated by '-'.
    """
    year_text, week_text = yearweek_str.split('-')
    # int() round-trip strips zero padding, e.g. '01' -> 1
    monday_label = '{}-W{}-1'.format(int(year_text), int(week_text))
    return pd.to_datetime(monday_label, format='%Y-W%W-%w')

# Attach the Monday timestamp of each YearWeek label, then order the rows chronologically.
data['Date'] = data['YearWeek'].apply(yearweek_to_datetime)
data = data.drop('YearWeek', axis=1)
# NOTE(review): rows sharing the same Date keep whatever relative order sort_values
# yields; nothing here orders them by Ship_Type or Destination.
data = data.sort_values('Date')
# A fresh 0..n-1 index is needed so the column-wise concat with encoded_df below
# (which has a default RangeIndex) lines up row by row.
data.reset_index(drop=True, inplace=True)

# 3. Encode Categorical Variables
# Dense one-hot output; categories unseen at fit time encode as all zeros at transform time.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # Updated parameter
encoded_features = encoder.fit_transform(data[['Ship_Type', 'Destination']])
encoded_feature_names = encoder.get_feature_names_out(['Ship_Type', 'Destination'])
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)
# Swap the two raw categorical columns for their one-hot expansion.
data = pd.concat([data.drop(['Ship_Type', 'Destination'], axis=1), encoded_df], axis=1)

# 4. Prepare Features and Target
window_size = 4  # number of consecutive rows fed to the LSTM per sample
# Every column except Date and TotalCount is a feature. Index.difference returns the
# labels sorted, so the feature order is not the encoder's output order.
# NOTE(review): if `data` held only YearWeek/Ship_Type/Destination/TotalCount on entry,
# the features are exactly the one-hot columns and no time information reaches the model.
feature_cols = data.columns.difference(['Date', 'TotalCount'])
data_array = data[feature_cols].values  # Shape: (num_steps, features)
target_array = data['TotalCount'].values  # Shape: (num_steps,)

# 5. Scale Features
# NOTE(review): both scalers are fitted on the full dataset, before the train/test split
# in step 7, so test-set minima/maxima influence the scaling.
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

# Scale the entire feature data
scaled_data = feature_scaler.fit_transform(data_array)  # Shape: (num_steps, features)

# Scale the target
scaled_target = target_scaler.fit_transform(target_array.reshape(-1, 1))  # Shape: (num_steps, 1)

# 6. Create Sequences
# Sliding window: rows i .. i+window_size-1 are the input, the target is the scaled
# TotalCount of row i+window_size (the row right after the window).
X = []
y = []

for i in range(len(scaled_data) - window_size):
    X.append(scaled_data[i:i + window_size])  # Each X is (window_size, features)
    y.append(scaled_target[i + window_size])  # Each y is (1,)

X = np.array(X)  # Shape: (num_samples, window_size, features)
y = np.array(y)  # Shape: (num_samples, 1)

# 7. Split Data
# Positional 80/20 split without shuffling: the first 80% of windows train, the rest test.
split_index = int(0.8 * len(X))
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print(f'Training samples: {X_train.shape[0]}')
print(f'Testing samples: {X_test.shape[0]}')

# 8. Build LSTM Model
# Single LSTM layer returning only its last state, dropout, then a small dense head
# ending in one linear output unit (regression on the scaled target).
model = Sequential()
model.add(LSTM(64, activation='relu', input_shape=(window_size, X_train.shape[2]), return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

# 9. Train the Model
# NOTE(review): patience (10) equals the number of epochs (10), so this callback can
# never stop training early in this configuration.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# 10. Evaluate the Model
# First in scaled units (as seen by the loss), then mapped back to original counts.
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f'Test MAE (scaled): {test_mae:.4f}')

y_pred_scaled = model.predict(X_test)
y_pred = target_scaler.inverse_transform(y_pred_scaled)
# y_test already has shape (num_samples, 1), which inverse_transform accepts directly.
y_test_original = target_scaler.inverse_transform(y_test)

mae = mean_absolute_error(y_test_original, y_pred)
mse = mean_squared_error(y_test_original, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_original, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

# 11. Save the Encoder, Scalers, and Model for Future Use
# The fitted preprocessing objects are saved next to the network so inference can
# reproduce the same encoding and scaling.
joblib.dump(encoder, 'onehot_encoder.joblib')
joblib.dump(feature_scaler, 'feature_scaler.joblib')
joblib.dump(target_scaler, 'target_scaler.joblib')
# NOTE(review): '.h5' selects Keras' HDF5 file format -- confirm this is still the
# preferred format for the installed Keras version.
model.save('lstm_model.h5')

# 12. Predict on New Data
def predict_new_total_count(new_yearweek, new_ship_type, new_destination, scaled_data, window_size=4):
    """
    Predicts the TotalCount for a new data point, using the last `window_size - 1`
    rows of `scaled_data` as context and the new point as the final step.

    Relies on these notebook-level objects fitted in the training cell: `encoder`,
    `encoded_feature_names`, `feature_cols`, `feature_scaler`, `target_scaler`,
    `model`, and the helper `yearweek_to_datetime`.

    Note: `new_yearweek` is validated but is not a model input, because the
    feature columns exclude the date. The context rows are simply the trailing
    rows of `scaled_data`, whatever ship type and destination they belong to.

    Parameters:
    - new_yearweek (str): The YearWeek of the new data point (e.g., '2024-05').
    - new_ship_type (str): The Ship_Type of the new data point (e.g., 'WIG').
    - new_destination (str): The Destination of the new data point (e.g., 'Frederiksvark').
    - scaled_data (np.ndarray): The entire scaled feature data (num_steps, features).
    - window_size (int): The window size used for the LSTM model; must be >= 1 and
      should equal the window size the model was trained with.

    Returns:
    - predicted_count (float): The predicted TotalCount in original (unscaled) units.

    Raises:
    - ValueError: if `window_size` is smaller than 1, if `scaled_data` has fewer
      than `window_size - 1` rows, or if `new_yearweek` is malformed.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}.")

    context_len = window_size - 1
    if len(scaled_data) < context_len:
        raise ValueError(f"Not enough data to create a sequence. Need at least {window_size - 1} data points.")

    # Reject malformed labels up front (the parsed date itself is not a feature).
    yearweek_to_datetime(new_yearweek)

    # Encode Ship_Type and Destination using the previously fitted encoder
    new_point = pd.DataFrame({
        'Ship_Type': [new_ship_type],
        'Destination': [new_destination]
    })
    new_encoded = encoder.transform(new_point[['Ship_Type', 'Destination']])
    new_encoded_df = pd.DataFrame(new_encoded, columns=encoded_feature_names)

    # Put the columns in the order used at training time, then scale
    new_features = new_encoded_df[feature_cols].values  # Shape: (1, features)
    new_features_scaled = feature_scaler.transform(new_features)  # Shape: (1, features)

    # Trailing context rows. Slice from an explicit start index: a negative-stop slice
    # such as scaled_data[-0:] would return the whole array when context_len is 0.
    last_steps = scaled_data[len(scaled_data) - context_len:]  # Shape: (window_size - 1, features)

    # Context rows followed by the new point, as one batch: (1, window_size, features)
    new_sequence_scaled = np.concatenate([last_steps, new_features_scaled], axis=0)
    new_sequence_scaled = new_sequence_scaled.reshape(1, window_size, -1)

    # Predict in scaled space, then map back to original counts
    predicted_scaled = model.predict(new_sequence_scaled)
    predicted = target_scaler.inverse_transform(predicted_scaled)

    # Plain Python float, as documented, rather than a NumPy scalar
    return float(predicted[0][0])

# Example Prediction: any failure is reported as a message instead of a traceback
try:
    predicted_count = predict_new_total_count(
        '2024-05', 'WIG', 'Frederiksvark', scaled_data, window_size
    )
    print(f"Predicted TotalCount: {predicted_count:.0f}")
except Exception as e:
    print(f"An error occurred during prediction: {e}")


Training samples: 32790
Testing samples: 8198


  super().__init__(**kwargs)


Epoch 1/10
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.0017 - mae: 0.0220 - val_loss: 6.0666e-04 - val_mae: 0.0129
Epoch 2/10
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 8.3664e-04 - mae: 0.0134 - val_loss: 4.8542e-04 - val_mae: 0.0109
Epoch 3/10
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 6.2088e-04 - mae: 0.0117 - val_loss: 4.5878e-04 - val_mae: 0.0104
Epoch 4/10
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 5.2975e-04 - mae: 0.0107 - val_loss: 4.6151e-04 - val_mae: 0.0098
Epoch 5/10
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 4.6915e-04 - mae: 0.0102 - val_loss: 4.3854e-04 - val_mae: 0.0095
Epoch 6/10
[1m410/410[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 4.0182e-04 - mae: 0.0097 - val_loss: 4.6977e-04 - val_mae: 0.0102
Epoch 7/10
[1m410/410[0m [32m━━━━━━━━━━━━



Mean Absolute Error (MAE): 1.77
Mean Squared Error (MSE): 20.22
Root Mean Squared Error (RMSE): 4.50
R^2 Score: 0.69
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Predicted TotalCount: 6


In [238]:
# 12. Predict on New Data
# NOTE(review): this redefines the function of the same name from the LSTM training
# cell; keep the two definitions identical or delete one of them.
def predict_new_total_count(new_yearweek, new_ship_type, new_destination, scaled_data, window_size=4):
    """
    Predicts the TotalCount for a new data point, using the last `window_size - 1`
    rows of `scaled_data` as context and the new point as the final step.

    Relies on these notebook-level objects fitted in the training cell: `encoder`,
    `encoded_feature_names`, `feature_cols`, `feature_scaler`, `target_scaler`,
    `model`, and the helper `yearweek_to_datetime`.

    Note: `new_yearweek` is validated but is not a model input, because the
    feature columns exclude the date. The context rows are simply the trailing
    rows of `scaled_data`, whatever ship type and destination they belong to.

    Parameters:
    - new_yearweek (str): The YearWeek of the new data point (e.g., '2024-05').
    - new_ship_type (str): The Ship_Type of the new data point (e.g., 'WIG').
    - new_destination (str): The Destination of the new data point (e.g., 'Frederiksvark').
    - scaled_data (np.ndarray): The entire scaled feature data (num_steps, features).
    - window_size (int): The window size used for the LSTM model; must be >= 1 and
      should equal the window size the model was trained with.

    Returns:
    - predicted_count (float): The predicted TotalCount in original (unscaled) units.

    Raises:
    - ValueError: if `window_size` is smaller than 1, if `scaled_data` has fewer
      than `window_size - 1` rows, or if `new_yearweek` is malformed.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}.")

    context_len = window_size - 1
    if len(scaled_data) < context_len:
        raise ValueError(f"Not enough data to create a sequence. Need at least {window_size - 1} data points.")

    # Reject malformed labels up front (the parsed date itself is not a feature).
    yearweek_to_datetime(new_yearweek)

    # Encode Ship_Type and Destination using the previously fitted encoder
    new_point = pd.DataFrame({
        'Ship_Type': [new_ship_type],
        'Destination': [new_destination]
    })
    new_encoded = encoder.transform(new_point[['Ship_Type', 'Destination']])
    new_encoded_df = pd.DataFrame(new_encoded, columns=encoded_feature_names)

    # Put the columns in the order used at training time, then scale
    new_features = new_encoded_df[feature_cols].values  # Shape: (1, features)
    new_features_scaled = feature_scaler.transform(new_features)  # Shape: (1, features)

    # Trailing context rows. Slice from an explicit start index: a negative-stop slice
    # such as scaled_data[-0:] would return the whole array when context_len is 0.
    last_steps = scaled_data[len(scaled_data) - context_len:]  # Shape: (window_size - 1, features)

    # Context rows followed by the new point, as one batch: (1, window_size, features)
    new_sequence_scaled = np.concatenate([last_steps, new_features_scaled], axis=0)
    new_sequence_scaled = new_sequence_scaled.reshape(1, window_size, -1)

    # Predict in scaled space, then map back to original counts
    predicted_scaled = model.predict(new_sequence_scaled)
    predicted = target_scaler.inverse_transform(predicted_scaled)

    # Plain Python float, as documented, rather than a NumPy scalar
    return float(predicted[0][0])

# Score another sample; any failure is reported as a message instead of a traceback
try:
    predicted_count = predict_new_total_count(
        '2024-43', 'Anti-pollution', 'Gedser', scaled_data, window_size
    )
    print(f"Predicted TotalCount: {predicted_count:.0f}")
except Exception as e:
    print(f"An error occurred during prediction: {e}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
Predicted TotalCount: 7
