# Random Forest Regression

In [1]:
import os
import pandas as pd
import numpy as np
import shutil
import random
import time
from datetime import timedelta
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import glob

In [3]:
# --- Folder layout -----------------------------------------------------------
# Source folder holding the processed per-voyage CSV files.
output_folder_path = 'D:/A TESE/Modelo ML/CSV/voyages_final'
# Destination for the files used to fit the model.
train_folder_path = 'D:/A TESE/Modelo ML/RF/train_RF/'
# Parent folder for the held-out files; it contains two subfolders.
test_folder_path = 'D:/A TESE/Modelo ML/RF/test_RF/'
# First half of every held-out file is written here.
test_subfolder_path = os.path.join(test_folder_path, 'test/')
# Second half of every held-out file is written here.
validate_subfolder_path = os.path.join(test_folder_path, 'validate/')

# Make sure every destination exists; existing folders are left untouched.
for folder in (train_folder_path, test_subfolder_path, validate_subfolder_path):
    os.makedirs(folder, exist_ok=True)

In [5]:
# Split the processed CSV files 70% / 30% at file level.
# Training files are copied unchanged; every held-out file is cut in two and
# its first half is written to test/, its second half to validate/.

# Fixed seed so that Restart & Run All always yields the same split.
split_seed = 42

# Get all CSV files in the output folder. os.listdir() returns names in
# arbitrary order, so sort first; otherwise a seeded shuffle would still not
# be reproducible across machines or runs.
all_files = sorted(f for f in os.listdir(output_folder_path) if f.endswith('.csv'))
if not all_files:
    # Fail before the clean-up below so that an empty or wrong source folder
    # can never wipe the results of a previous split.
    raise FileNotFoundError(f"No CSV files found in {output_folder_path}")

# Shuffle with a private seeded generator (does not disturb the global
# `random` state used elsewhere).
random.Random(split_seed).shuffle(all_files)

# Split files into 70% train and 30% test
split_index = int(0.7 * len(all_files))
train_files = all_files[:split_index]
test_files = all_files[split_index:]

# Remove CSVs left over from an earlier split. Without this, files assigned
# to train in a previous run stay in the train folder even if they are now
# held out, and the training cell (which globs the whole folder) would be
# fitted on held-out data.
for folder in (train_folder_path, test_subfolder_path, validate_subfolder_path):
    for name in os.listdir(folder):
        if name.endswith('.csv'):
            os.remove(os.path.join(folder, name))

# Copy train files to the train folder (the source files are kept in place)
for file in train_files:
    shutil.copy(os.path.join(output_folder_path, file), os.path.join(train_folder_path, file))

# Split test files into test and validation halves
for file in test_files:
    file_path = os.path.join(output_folder_path, file)
    df = pd.read_csv(file_path)

    # Rows before the midpoint go to test/, the rest to validate/.
    # With an odd row count the validation half gets the extra row.
    midpoint = len(df) // 2
    test_half = df.iloc[:midpoint]
    validate_half = df.iloc[midpoint:]

    # Save both halves under the original file name
    test_half.to_csv(os.path.join(test_subfolder_path, file), index=False)
    validate_half.to_csv(os.path.join(validate_subfolder_path, file), index=False)

print("Files have been split into train, test, and validate folders.")

Files have been split into train, test, and validate folders.


In [9]:
# Train a Random Forest on the training files, report validation metrics,
# then write Predicted_RTT and Predicted_ETA into every validation file.

# Start the timer
start_time = time.time()

# Columns fed to the model and the column it learns to predict.
feature_columns = ['RTD', 'SOG', 'COG', 'LAT', 'LON']
target_column = 'RTT'

# Load train data
train_files = glob.glob(os.path.join(train_folder_path, "*.csv"))
if not train_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No training CSV files found in {train_folder_path}")
train_data = pd.concat([pd.read_csv(f) for f in train_files], ignore_index=True)
train_data = train_data.dropna()  # Drop rows with NaN values

# Features and target variable
X = train_data[feature_columns]
y = train_data[target_column]

# Split train data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor model (more complex model to capture non-linear relationships)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Cross-validation to assess model performance.
# cross_val_score fits fresh clones of `model` on each fold; the fitted
# `model` above is left untouched.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"Cross-Validation R^2 Scores: {cv_scores}")
print(f"Mean Cross-Validation R^2: {np.mean(cv_scores)}")

# Validate model on the 20% hold-out of the training rows
y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print(f"Model MAE: {mae}")
print(f"Model RMSE: {rmse}")
print(f"Model R^2: {r2}")

# Predict on the validation files.
# Each prediction is computed from the features of the very row it is stored
# on. Predicting from the rows of the test/ half and pasting the result
# positionally onto the validate/ half would attach every prediction to a
# different observation (and to the wrong BaseDateTime when computing the ETA).
# NOTE(review): the test/ halves are therefore not read by this cell.
validation_files = glob.glob(os.path.join(validate_subfolder_path, "*.csv"))
for validation_file in validation_files:
    validation_data = pd.read_csv(validation_file)

    # Rows with a missing feature cannot be scored; they are kept in the file
    # with an empty prediction instead of being dropped or shifting alignment.
    complete_rows = validation_data[feature_columns].notna().all(axis=1)
    validation_data['Predicted_RTT'] = np.nan
    if complete_rows.any():
        validation_data.loc[complete_rows, 'Predicted_RTT'] = model.predict(
            validation_data.loc[complete_rows, feature_columns]
        )

    # Ensure 'BaseDateTime' is a datetime object (unparseable values become NaT)
    validation_data['BaseDateTime'] = pd.to_datetime(validation_data['BaseDateTime'], errors='coerce')

    # Calculate Predicted_ETA by adding Predicted_RTT (in hours) to BaseDateTime
    predicted_eta = validation_data['BaseDateTime'] + pd.to_timedelta(validation_data['Predicted_RTT'], unit='h')

    # Store Predicted_ETA as a formatted string; missing values stay empty
    validation_data['Predicted_ETA'] = predicted_eta.dt.strftime('%Y-%m-%d %H:%M:%S')

    # Save the updated validation file
    validation_data.to_csv(validation_file, index=False)

# End the timer and print execution time
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time} seconds")
print("ML model for RFR")

Cross-Validation R^2 Scores: [0.94093975 0.97744558 0.9887294  0.98091625 0.92267148]
Mean Cross-Validation R^2: 0.9621404910699287
Model MAE: 0.004533182465111077
Model RMSE: 0.016337689900680418
Model R^2: 0.9999861907615685
Execution Time: 702.0408506393433 seconds
ML model for KNN


In [30]:
# Train a Random Forest on the training files, report validation metrics,
# then write a Predicted_RTT column into every validation file.

# Columns fed to the model and the column it learns to predict.
feature_columns = ['RTD', 'SOG', 'COG', 'LAT', 'LON']
target_column = 'RTT'

# Load train data
train_files = glob.glob(os.path.join(train_folder_path, "*.csv"))
if not train_files:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No training CSV files found in {train_folder_path}")
train_data = pd.concat([pd.read_csv(f) for f in train_files], ignore_index=True)
train_data = train_data.dropna()  # Drop rows with NaN values

# Features and target variable
X = train_data[feature_columns]
y = train_data[target_column]

# Split train data for validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Regressor model (more complex model to capture non-linear relationships)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Cross-validation to assess model performance.
# cross_val_score fits fresh clones of `model` on each fold; the fitted
# `model` above is left untouched.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"Cross-Validation R^2 Scores: {cv_scores}")
print(f"Mean Cross-Validation R^2: {np.mean(cv_scores)}")

# Validate model on the 20% hold-out of the training rows
y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print(f"Model MAE: {mae}")
print(f"Model RMSE: {rmse}")
print(f"Model R^2: {r2}")

# Predict on the validation files.
# Each prediction is computed from the features of the very row it is stored
# on. Predicting from the rows of the test/ half and pasting the result
# positionally onto the validate/ half would attach every prediction to a
# different observation.
# NOTE(review): the test/ halves are therefore not read by this cell.
validation_files = glob.glob(os.path.join(validate_subfolder_path, "*.csv"))
for validation_file in validation_files:
    validation_data = pd.read_csv(validation_file)

    # Rows with a missing feature cannot be scored; they are kept in the file
    # with an empty prediction instead of being dropped or shifting alignment.
    complete_rows = validation_data[feature_columns].notna().all(axis=1)
    validation_data['Predicted_RTT'] = np.nan
    if complete_rows.any():
        validation_data.loc[complete_rows, 'Predicted_RTT'] = model.predict(
            validation_data.loc[complete_rows, feature_columns]
        )

    validation_data.to_csv(validation_file, index=False)
print("ML model for RFR")
print("Model predictions have been saved to the validation files.")

Cross-Validation R^2 Scores: [0.95312965 0.96745517 0.92576949 0.99161241 0.77605567]
Mean Cross-Validation R^2: 0.9228044791563882
Model MAE: 0.006623407779553002
Model RMSE: 0.08621841896339671
Model R^2: 0.9996258127732152
ML model for RFR
Model predictions have been saved to the validation files.
