# Import dependencies

In [None]:
from sklearn.cluster import KMeans
from sklearn_extra.cluster import KMedoids
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial import distance
from pyxdameraulevenshtein import damerau_levenshtein_distance
import gower
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# KNN classifier (classify as 3 separate classes)

Training the KNN classifier (classify into 3 classes)

In [None]:
# Train a multi-output KNN classifier on TRAIN.txt using a precomputed
# Damerau-Levenshtein distance matrix between the training strings.
# The file is pipe-separated with no header row; column 0 is the text sample
# and columns 1-3 are its three labels. Every field is read as str, and
# na_filter=False leaves empty fields as-is instead of converting them to NaN.
file = pd.read_csv("TRAIN.txt", dtype=str, header=None, sep='|', na_filter=False)
data = file.to_numpy()
training_data = data[:, 0]
training_labels = data[:, [1, 2, 3]]

# calculate Damerau-Levenshtein distance between all samples
# Lower-case each sample once up front rather than inside the inner loop, and
# fill both halves of the matrix from a single call per pair: the edit distance
# is symmetric, and a string's distance to itself is 0 (already set by np.zeros).
training_lower = [sample.lower() for sample in training_data]
num_training = len(training_lower)
levenshtein_dist = np.zeros((num_training, num_training))
for i in range(num_training):
    for j in range(i + 1, num_training):
        pair_dist = damerau_levenshtein_distance(training_lower[i], training_lower[j])
        levenshtein_dist[i, j] = pair_dist
        levenshtein_dist[j, i] = pair_dist

print(levenshtein_dist)
# metric='precomputed': fit() and predict() receive distance matrices instead
# of feature vectors. weights='distance': closer neighbours count for more.
neigh = KNeighborsClassifier(n_neighbors=3, metric='precomputed', weights='distance')
neigh.fit(levenshtein_dist, training_labels)

# feed the classifier with training data
# NOTE(review): each training sample is at distance 0 from itself, so with
# distance weighting these scores are likely optimistic - confirm before
# treating them as a quality measure.
prediction = neigh.predict(levenshtein_dist)
# One report per label column.
for label_col in range(training_labels.shape[1]):
    print(classification_report(training_labels[:, label_col], prediction[:, label_col]))
# print(confusion_matrix(training_labels, prediction))

Testing the KNN classifier (classify into 3 classes)

In [None]:
# Evaluate the multi-output KNN classifier on TEST.txt.
# Same layout as the training file: pipe-separated, no header row, column 0 is
# the text sample and columns 1-3 are its three labels, all read as str.
file = pd.read_csv("TEST.txt", dtype=str, header=None, sep='|', na_filter=False)
data = file.to_numpy()
test_data = data[:, 0]
test_labels = data[:, [1, 2, 3]]

# calculate Damerau-Levenshtein distance between test samples and original training samples
# Rows are test samples and columns are training samples, which is the shape
# predict() needs for a classifier fitted with metric='precomputed'.
# Lower-casing is done once per string here instead of once per pair.
test_lower = [sample.lower() for sample in test_data]
training_lower = [sample.lower() for sample in training_data]
levenshtein_dist = np.zeros((len(test_lower), len(training_lower)))
for i, test_sample in enumerate(test_lower):
    for j, training_sample in enumerate(training_lower):
        levenshtein_dist[i, j] = damerau_levenshtein_distance(test_sample, training_sample)

# feed the classifier with test data
prediction = neigh.predict(levenshtein_dist)
# One report per label column.
for label_col in range(test_labels.shape[1]):
    print(classification_report(test_labels[:, label_col], prediction[:, label_col]))
# print(confusion_matrix(test_labels, prediction))

Printing the incorrect predictions (classify into 3 classes)

In [None]:
# List every test sample whose predicted label differs from the true label,
# one section per label column.
# Each printed line is: predicted label, expected label, sample text.
class_1_labels = test_labels[:, 0]
class_2_labels = test_labels[:, 1]
class_3_labels = test_labels[:, 2]

# (section title, predicted column, expected column) for each label column.
sections = (
    ("CLASS 1", prediction[:, 0], class_1_labels),
    ("CLASS 2", prediction[:, 1], class_2_labels),
    ("CLASS 3", prediction[:, 2], class_3_labels),
)

for title, predicted_column, expected_column in sections:
    print(title)
    for predicted, expected, sample in zip(predicted_column, expected_column, test_data):
        if predicted != expected:
            print(predicted, expected, sample)

# KNN classifier (consider all classes together)

Training the KNN classifier (all 3 classes considered together)

In [None]:
# Train a single-output KNN classifier on TRAIN.txt using a precomputed
# Damerau-Levenshtein distance matrix between the training strings.
# The file is pipe-separated with no header row and every field is read as str;
# na_filter=False leaves empty fields as-is instead of converting them to NaN.
file = pd.read_csv("TRAIN.txt", dtype=str, header=None, sep='|', na_filter=False)
data = file.to_numpy()
training_data = data[:, 0]
# NOTE(review): only column 1 is used as the label here - confirm this is the
# intended target for the "all classes together" variant.
training_labels = data[:, 1]

# calculate Damerau-Levenshtein distance between all samples
# Lower-case each sample once up front rather than inside the inner loop, and
# fill both halves of the matrix from a single call per pair: the edit distance
# is symmetric, and a string's distance to itself is 0 (already set by np.zeros).
training_lower = [sample.lower() for sample in training_data]
num_training = len(training_lower)
levenshtein_dist = np.zeros((num_training, num_training))
for i in range(num_training):
    for j in range(i + 1, num_training):
        pair_dist = damerau_levenshtein_distance(training_lower[i], training_lower[j])
        levenshtein_dist[i, j] = pair_dist
        levenshtein_dist[j, i] = pair_dist

# metric='precomputed': fit() and predict() receive distance matrices instead
# of feature vectors. weights='distance': closer neighbours count for more.
neigh = KNeighborsClassifier(n_neighbors=4, metric='precomputed', weights='distance')
neigh.fit(levenshtein_dist, training_labels)

# feed the classifier with training data
# NOTE(review): each training sample is at distance 0 from itself, so with
# distance weighting these scores are likely optimistic - confirm before
# treating them as a quality measure.
prediction = neigh.predict(levenshtein_dist)
print(classification_report(training_labels, prediction))
# print(confusion_matrix(training_labels, prediction))

Testing the KNN classifier (all 3 classes considered together)

In [None]:
# Evaluate the single-output KNN classifier on TEST.txt.
# Same layout as the training file: pipe-separated, no header row, column 0 is
# the text sample and column 1 is the label used here, all read as str.
file = pd.read_csv("TEST.txt", dtype=str, header=None, sep='|', na_filter=False)
data = file.to_numpy()
test_data = data[:, 0]
test_labels = data[:, 1]

# calculate Damerau-Levenshtein distance between test samples and original training samples
# Rows are test samples and columns are training samples, which is the shape
# predict() needs for a classifier fitted with metric='precomputed'.
# Lower-casing is done once per string here instead of once per pair.
test_lower = [sample.lower() for sample in test_data]
training_lower = [sample.lower() for sample in training_data]
levenshtein_dist = np.zeros((len(test_lower), len(training_lower)))
for i, test_sample in enumerate(test_lower):
    for j, training_sample in enumerate(training_lower):
        levenshtein_dist[i, j] = damerau_levenshtein_distance(test_sample, training_sample)

# feed the classifier with test data
prediction = neigh.predict(levenshtein_dist)
print(classification_report(test_labels, prediction))
# print(confusion_matrix(test_labels, prediction))

Printing the incorrect predictions (all 3 classes considered together)

In [None]:
# Print each misclassified test sample as: predicted label, expected label, sample text.
for predicted, expected, sample in zip(prediction, test_labels, test_data):
    if predicted != expected:
        print(predicted, expected, sample)

# K-Medoids clustering

Read file

In [None]:
# Load the vessel-arrival table. The first row is taken as the header, and
# na_filter=False leaves empty cells as-is instead of converting them to NaN.
file = pd.read_csv("vessel_arrival.csv", na_filter=False)
data = file.to_numpy()

# Keep only columns 0, 5 and 13 as the clustering input.
# NOTE(review): the variable name suggests these are port, time and tide -
# confirm against the CSV header.
selected_columns = [0, 5, 13]
port_time_tide = data[:, selected_columns]
print(port_time_tide)

Perform clustering

In [None]:
# Cluster the selected vessel columns with K-Medoids on a Gower distance matrix
# and write one cluster label per input row to out.csv.
input_data = port_time_tide
number_of_clusters = 5

# Pairwise distance matrix over the input rows.
gower_dist = gower.gower_matrix(input_data)

# metric='precomputed': fit() receives the distance matrix itself.
medoid_model = KMedoids(
    n_clusters=number_of_clusters,
    metric='precomputed',
    method='pam',
    init='build',
)
medoid_model.fit(gower_dist)
clusters = medoid_model.labels_

# Written as a single column with no header row and no index column.
df = pd.DataFrame(data=clusters)
df.to_csv("out.csv", header=False, index=False)