# Import dependencies

In [None]:
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial import distance
from pyxdameraulevenshtein import damerau_levenshtein_distance
from jaro import jaro_winkler_metric, jaro_metric
import gower
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Find optimal number of neighbours with K-Fold cross validation

Read file

In [None]:
# Cross-validation settings: the number of KFold splits and the number of
# neighbours handed to the KNN classifier in the cross-validation loop.
number_of_folds = 4
number_of_neighbours = 5

# Load the '|'-separated file with no header row, keeping every field as a
# string; na_filter=False stops pandas from turning empty fields into NaN.
# The cross-validation loop reads column 0 as the sample string and column 1
# as its class label.
file = pd.read_csv("DATA.txt", dtype=str, header=None, sep='|', na_filter=False)
data = file.to_numpy()
# NOTE(review): KFold is left at its defaults, which per the scikit-learn docs
# do not shuffle, so folds are contiguous slices in file order. Confirm that
# DATA.txt is not sorted by label.
kf = KFold(n_splits=number_of_folds)

Perform K-Fold cross validation

In [None]:
# Evaluate a distance-weighted KNN classifier on every fold. Distances are
# precomputed Damerau-Levenshtein edit distances between lower-cased sample
# strings, so the comparison is case-insensitive.
for fold, (train_index, test_index) in enumerate(kf.split(data)):
  # column 0 holds the sample string, column 1 its class label
  training_data = data[train_index, 0]
  training_labels = data[train_index, 1]
  test_data = data[test_index, 0]
  test_labels = data[test_index, 1]

  # lower-case each string once instead of once per pairwise comparison
  training_lower = [sample.lower() for sample in training_data]
  test_lower = [sample.lower() for sample in test_data]
  n_train = len(training_lower)

  # Pairwise distances between training samples. The edit distance is
  # symmetric and zero on the diagonal, so only the upper triangle is
  # computed and then mirrored.
  train_dist = np.zeros((n_train, n_train))
  for a in range(n_train):
    for b in range(a + 1, n_train):
      pair_dist = damerau_levenshtein_distance(training_lower[a], training_lower[b])
      train_dist[a, b] = pair_dist
      train_dist[b, a] = pair_dist

  # feed training data into model
  neigh = KNeighborsClassifier(n_neighbors=number_of_neighbours, metric='precomputed', weights='distance')
  neigh.fit(train_dist, training_labels)

  # Distances from every test sample (rows) to every training sample
  # (columns), which is the layout a precomputed-metric classifier expects.
  test_dist = np.zeros((len(test_lower), n_train))
  for row, query in enumerate(test_lower):
    for col, reference in enumerate(training_lower):
      test_dist[row, col] = damerau_levenshtein_distance(query, reference)

  # classification of test data; label the output so the per-fold reports
  # can be told apart
  prediction = neigh.predict(test_dist)
  print(f"Fold {fold + 1}")
  print(classification_report(test_labels, prediction))
  print(confusion_matrix(test_labels, prediction))


# KNN classifier

Training the classifier

In [None]:
# Load the training set: column 0 is the sample string, column 1 its label.
# All fields are read as strings and empty fields are kept as '' (no NaN).
file = pd.read_csv("TRAIN.txt", dtype=str, header=None, sep='|', na_filter=False)
data = file.to_numpy()
training_data = data[:, 0]
training_labels = data[:, 1]

# Pairwise Damerau-Levenshtein distances between all training samples,
# compared case-insensitively. Each string is lower-cased once, and because
# the distance is symmetric with a zero diagonal only the upper triangle is
# computed and mirrored.
training_lower = [sample.lower() for sample in training_data]
n_train = len(training_lower)
levenshtein_dist = np.zeros((n_train, n_train))
for a in range(n_train):
  for b in range(a + 1, n_train):
    pair_dist = damerau_levenshtein_distance(training_lower[a], training_lower[b])
    levenshtein_dist[a, b] = pair_dist
    levenshtein_dist[b, a] = pair_dist

# Fit a distance-weighted 3-NN classifier on the precomputed distance matrix.
neigh = KNeighborsClassifier(n_neighbors=3, metric='precomputed', weights='distance')
neigh.fit(levenshtein_dist, training_labels)

# feed the classifier with training data
# NOTE(review): this scores the model on the samples it was fitted on. Every
# sample is at distance 0 from itself, and with weights='distance' a
# zero-distance neighbour dominates the vote, so this report is expected to be
# near-perfect. Treat it as a sanity check, not as a generalisation estimate.
prediction = neigh.predict(levenshtein_dist)
print(classification_report(training_labels, prediction))
# print(confusion_matrix(training_labels, prediction))

Testing the classifier

In [None]:
# Load the held-out set: sample strings in column 0, labels in column 1.
file = pd.read_csv("TEST.txt", dtype=str, header=None, sep='|', na_filter=False)
data = file.to_numpy()
test_data, test_labels = data[:, 0], data[:, 1]

# Case-insensitive Damerau-Levenshtein distance from each test sample (rows)
# to each sample of the original training set (columns).
levenshtein_dist = np.zeros((len(test_data), len(training_data)))
for row, query in enumerate(test_data):
  for col, reference in enumerate(training_data):
    levenshtein_dist[row, col] = damerau_levenshtein_distance(query.lower(), reference.lower())

# Classify the test samples with the fitted model and report the results.
prediction = neigh.predict(levenshtein_dist)
print(classification_report(test_labels, prediction))
print(confusion_matrix(test_labels, prediction))

Printing out incorrect predictions (consider all 3 classes together)

In [None]:
# Collect the positions of misclassified test samples, printing the predicted
# label, the true label and the sample itself for each one, then print the
# collected positions.
wrong_predictions_idx = []
for idx, (predicted, actual) in enumerate(zip(prediction, test_labels)):
  if predicted == actual:
    continue
  wrong_predictions_idx.append(idx)
  print(predicted, actual, test_data[idx])
print(wrong_predictions_idx)

# K-Medoids clustering

Read file

In [None]:
# Load the vessel-arrival table; na_filter=False keeps empty fields as empty
# strings instead of converting them to NaN.
file = pd.read_csv("vessel_arrival.csv", na_filter=False)
data = file.to_numpy()
# Keep only columns 0, 5 and 13 as the clustering input.
# NOTE(review): presumably port, time and tide, going by the variable name;
# confirm against the CSV header.
port_time_tide = data[:, [0, 5, 13]]
print(port_time_tide)

Perform clustering

In [None]:
# Clustering input and the number of medoids to look for.
input_data = port_time_tide
number_of_clusters = 5

# Pairwise Gower distance matrix over the selected columns.
gower_dist = gower.gower_matrix(input_data)

# Run PAM K-Medoids with BUILD initialisation on the precomputed distances
# and keep the cluster label assigned to each row.
medoid_model = KMedoids(
  n_clusters=number_of_clusters,
  metric='precomputed',
  method='pam',
  init='build',
)
medoid_model.fit(gower_dist)
clusters = medoid_model.labels_

# Write one label per line, without header or index column.
df = pd.DataFrame(data=clusters)
df.to_csv("out.csv", header=False, index=False)