# K-Nearest Neighbors

In [13]:
# Importing the required libraries

import os
import pandas as pd
import numpy as np
import shutil
import random
import time
from datetime import timedelta
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import glob

In [5]:
# Defining paths to the base directory
output_folder_path = 'D:/A TESE/Modelo ML/CSV/voyages_final'  # Folder containing processed CSV files
train_folder_path = 'D:/A TESE/Modelo ML/KNN/train_KNN/'  # Folder to save train files
test_folder_path = 'D:/A TESE/Modelo ML/KNN/test_KNN/'  # Folder to save test files
test_subfolder_path = os.path.join(test_folder_path, 'test/')  # Subfolder for test half
validate_subfolder_path = os.path.join(test_folder_path, 'validate/')  # Subfolder for validation half

# Create train, test, and validation folders if they don't exist
os.makedirs(train_folder_path, exist_ok=True)
os.makedirs(test_subfolder_path, exist_ok=True)
os.makedirs(validate_subfolder_path, exist_ok=True)

In [43]:
# Splitting into train-test folders (70/30)
# Splitting test folder into test and validation halves

all_files = [f for f in os.listdir(output_folder_path) if f.endswith('.csv')]

random.shuffle(all_files)

train_files = all_files[:int(0.7 * len(all_files))]
test_files = all_files[int(0.7 * len(all_files)):] 

for file in train_files:
    shutil.copy(os.path.join(output_folder_path, file), os.path.join(train_folder_path, file))

for file in test_files:
    file_path = os.path.join(output_folder_path, file)
    df = pd.read_csv(file_path)

    midpoint = len(df) // 2
    test_half = df.iloc[:midpoint]
    validate_half = df.iloc[midpoint:]

    test_half.to_csv(os.path.join(test_subfolder_path, file), index=False)

    validate_half.to_csv(os.path.join(validate_subfolder_path, file), index=False)

print("Files have been split into train, test, and validate folders.")

Files have been split into train, test, and validate folders.


In [35]:
# Verifying the number of CSV files

csv_files = [f for f in os.listdir(train_folder_path) if f.endswith('.csv')]
csv_count = len(csv_files)

print(f"Total CSV files created: {csv_count}")

Total CSV files created: 52


In [45]:
# KNN model with validation and performance metrics

start_time = time.time()

train_files = glob.glob(train_folder_path + "*.csv")
train_data = pd.concat([pd.read_csv(f) for f in train_files], ignore_index=True)
train_data.dropna(inplace=True)  

X = train_data[['RTD', 'SOG', 'COG', 'LAT', 'LON']]  
y = train_data['RTT']  

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train, y_train)

cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"Cross-Validation R^2 Scores: {cv_scores}")
print(f"Mean Cross-Validation R^2: {np.mean(cv_scores)}")

y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print(f"Model MAE: {mae}")
print(f"Model RMSE: {rmse}")
print(f"Model R^2: {r2}")

test_files = glob.glob(test_subfolder_path + "*.csv")
for test_file in test_files:
    test_data = pd.read_csv(test_file)
    test_data.dropna(inplace=True)  
    X_test = test_data[['RTD', 'SOG', 'COG', 'LAT', 'LON']]  
    y_test_pred = model.predict(X_test)

    validation_file = os.path.join(validate_subfolder_path, os.path.basename(test_file))
    validation_data = pd.read_csv(validation_file)
    validation_data = validation_data.iloc[:len(y_test_pred)]  
    validation_data['Predicted_RTT'] = y_test_pred

    validation_data['BaseDateTime'] = pd.to_datetime(validation_data['BaseDateTime'], errors='coerce')

    validation_data['Predicted_ETA'] = validation_data['BaseDateTime'] + pd.to_timedelta(validation_data['Predicted_RTT'], unit='h')

    validation_data['Predicted_ETA'] = validation_data['Predicted_ETA'].dt.strftime('%Y-%m-%d %H:%M:%S')
    
    validation_data.to_csv(validation_file, index=False)

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution Time: {execution_time} seconds")
print("ML model for KNN")

Cross-Validation R^2 Scores: [0.80261324 0.80207725 0.78595119 0.80788404 0.10979262]
Mean Cross-Validation R^2: 0.6616636691063496
Model MAE: 0.13342448084271685
Model RMSE: 0.5075734153250491
Model R^2: 0.9877034021945689
Execution Time: 5.280024528503418 seconds
ML model for KNN


In [37]:
# NOT IN USE IN THIS WORK
# MODEL WITH SCALED FEATURES

# KNN model with validation and performance metrics

train_files = glob.glob(train_folder_path + "*.csv")
train_data = pd.concat([pd.read_csv(f) for f in train_files], ignore_index=True)
train_data.dropna(inplace=True)  

X = train_data[['RTD', 'SOG', 'COG','LAT','LON']]  
y = train_data['RTT']  

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

model = KNeighborsRegressor(n_neighbors=10)  

model.fit(X_train, y_train)

cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"Cross-Validation R^2 Scores: {cv_scores}")
print(f"Mean Cross-Validation R^2: {np.mean(cv_scores)}")

y_pred = model.predict(X_valid)
mae = mean_absolute_error(y_valid, y_pred)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
r2 = r2_score(y_valid, y_pred)

print(f"Model MAE: {mae}")
print(f"Model RMSE: {rmse}")
print(f"Model R^2: {r2}")

test_files = glob.glob(test_subfolder_path + "*.csv")
for test_file in test_files:
    test_data = pd.read_csv(test_file)
    test_data.dropna(inplace=True) 
    X_test = test_data[['RTD', 'SOG', 'COG','LAT','LON']]  
    y_test_pred = model.predict(X_test)
    
    validation_file = os.path.join(validate_subfolder_path, os.path.basename(test_file))
    validation_data = pd.read_csv(validation_file)
    validation_data = validation_data.iloc[:len(y_test_pred)]  
    validation_data['Predicted_RTT'] = y_test_pred
    validation_data.to_csv(validation_file, index=False)

print("ML model for KNN")
print("Model predictions have been saved to the validation files.")

Cross-Validation R^2 Scores: [0.80427564 0.78820333 0.45174086 0.84270795 0.24708238]
Mean Cross-Validation R^2: 0.6268020303102249
Model MAE: 0.1559439199341003
Model RMSE: 0.6094561276713235
Model R^2: 0.980989056748098
ML model for KNN
Model predictions have been saved to the validation files.
