**Imports**

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import umap
from sklearn.decomposition import PCA
from sklearn.manifold import trustworthiness
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
import hdbscan
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.cluster import AgglomerativeClustering
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

In [None]:
# All dataset locations are resolved from the directory the notebook runs in.
current_wd = os.getcwd()

# Both datasets live under <cwd>/Project, each in its own sub-folder.
data_dir = os.path.join(current_wd, 'Project')
iroads_folder, catsvsdogs_folder = (
    os.path.join(data_dir, dataset_name)
    for dataset_name in ('iROADSDataset', 'PetImages')
)

In [None]:
# Image geometry and batch size shared by both dataset loaders
image_height, image_width = 224, 224
batch_size = 32

# iRoads: class labels are inferred from the sub-directory names
iroads_dataset = tf.keras.utils.image_dataset_from_directory(
    iroads_folder,
    labels="inferred",
    batch_size=batch_size,
    image_size=(image_height, image_width),
)

#Get rid of corrupt images from catsvsdogs
# A file is kept only when the bytes returned by peek() contain the b"JFIF"
# marker; every other file is deleted from disk, so this step is destructive.
deleted = 0

#Loop over the two class folders
for pet_folder in ("Cat", "Dog"):
  #Path to the folder
  folder_path = os.path.join(catsvsdogs_folder, pet_folder)
  #Loop through each entry in the folder
  for pet_name in os.listdir(folder_path):
    #Full path for each image
    image_path = os.path.join(folder_path, pet_name)

    #Sub-directories and other non-files cannot be opened as images; leave them alone
    if not os.path.isfile(image_path):
      continue

    #Check if the file is a valid JPEG image.
    #The context manager closes the handle even if peek() raises and never
    #touches an unbound handle when open() itself fails.
    with open(image_path, "rb") as image_file:
      valid_jfif = b"JFIF" in image_file.peek(10)

    if not valid_jfif:
      deleted += 1
      os.remove(image_path)
print(f"Number of delete corrupted images {deleted}")


# Cats vs Dogs: class labels are inferred from the sub-directory names
catsvsdogs_dataset = tf.keras.utils.image_dataset_from_directory(
    catsvsdogs_folder,
    labels="inferred",
    batch_size=batch_size,
    image_size=(image_height, image_width),
)


Found 4656 files belonging to 7 classes.


**Convert to Numpy Array**

In [None]:
# Walk the batched iRoads dataset exactly once so images and labels stay
# paired, then stitch the batches into two NumPy arrays.
iroad_batches = list(iroads_dataset)

iroad_images = tf.concat([images for images, _ in iroad_batches], axis=0).numpy()
iroad_labels = tf.concat([labels for _, labels in iroad_batches], axis=0).numpy()
del iroad_batches  # the per-batch tensors are no longer needed

# Independent copies; the augmentation step overwrites part of them in place
iroad_images_with_augment = iroad_images.copy()
iroad_labels_with_augment = iroad_labels.copy()

NameError: name 'iroads_dataset' is not defined

In [None]:
# Walk the batched Cats vs Dogs dataset exactly once so images and labels stay
# paired, then stitch the batches into two NumPy arrays.
catsvsdogs_batches = list(catsvsdogs_dataset)

catsvsdogs_images = tf.concat([images for images, _ in catsvsdogs_batches], axis=0).numpy()
catsvsdogs_labels = tf.concat([labels for _, labels in catsvsdogs_batches], axis=0).numpy()
del catsvsdogs_batches  # the per-batch tensors are no longer needed

# Independent copies; the augmentation step overwrites part of them in place
catsvsdogs_images_with_augment = catsvsdogs_images.copy()
catsvsdogs_labels_with_augment = catsvsdogs_labels.copy()

**Augmentation**

In [None]:
#ImageDataGenerator - Augmentation

# Fixed seed so that the sampled indices and the generator's randomness are
# repeatable on every Restart & Run All (23 matches the random_state used by
# the train/test splits).
augmentation_seed = 23
augmentation_rng = np.random.default_rng(augmentation_seed)

# Random rotations, shifts, zooms and flips; pixels exposed by a transform are
# filled with the nearest edge value.
augmented_image_maker = ImageDataGenerator(
    rotation_range = 20,
    width_shift_range = 0.3,
    height_shift_range = 0.1,
    zoom_range = 0.3,
    vertical_flip = True,
    horizontal_flip = True,
    fill_mode = "nearest"
)

#Fraction of each dataset that is replaced by an augmented version
percent = 0.4
number_of_images_iroads = int(len(iroad_images) * percent)
number_of_images_catsvsdogs = int(len(catsvsdogs_images)*percent)

print(number_of_images_iroads)
print(number_of_images_catsvsdogs)

#Random selection (without repetition) of the image indices that will be augmented
images_to_augment_iroads = augmentation_rng.choice(len(iroad_images), number_of_images_iroads, replace=False)
images_to_augment_catsvsdogs = augmentation_rng.choice(len(catsvsdogs_images), number_of_images_catsvsdogs, replace=False)

print(images_to_augment_iroads)
print(images_to_augment_catsvsdogs)

#Select those random images, along with their labels
selected_iroads_images = iroad_images[images_to_augment_iroads]
selected_iroads_labels = iroad_labels[images_to_augment_iroads]

selected_catsvsdogs_images = catsvsdogs_images[images_to_augment_catsvsdogs]
selected_catsvsdogs_labels = catsvsdogs_labels[images_to_augment_catsvsdogs]

#Augment the images and labels.
#batch_size equals the selection size, so one next() call yields the whole selection.
augment_iroads = augmented_image_maker.flow(selected_iroads_images, selected_iroads_labels, batch_size=number_of_images_iroads, shuffle=True, seed=augmentation_seed)
augment_catsvsdogs = augmented_image_maker.flow(selected_catsvsdogs_images, selected_catsvsdogs_labels, batch_size=number_of_images_catsvsdogs, shuffle=True, seed=augmentation_seed)

#Return the augmented images and labels
augmented_iroad_image, augmented_iroad_label = next(augment_iroads)
augmented_cd_image, augmented_cd_label = next(augment_catsvsdogs)

#Overwrite the selected positions of the copies with the augmented samples.
#Images and labels come from the same batch, so each written image keeps its
#own label even though the order inside the selection was shuffled.
iroad_images_with_augment[images_to_augment_iroads] = augmented_iroad_image
iroad_labels_with_augment[images_to_augment_iroads] = augmented_iroad_label

catsvsdogs_images_with_augment[images_to_augment_catsvsdogs] = augmented_cd_image
catsvsdogs_labels_with_augment[images_to_augment_catsvsdogs] = augmented_cd_label







**Split (Train/Test) -original**

In [None]:
# 80/20 hold-out split of the original images; the fixed random_state keeps
# the partition identical between runs.
split_options = dict(test_size=0.2, random_state=23)

# iRoads (variables carry the "r" suffix)
x_trainr, x_testr, y_trainr, y_testr = train_test_split(iroad_images, iroad_labels, **split_options)

# Cats vs Dogs
x_train, x_test, y_train, y_test = train_test_split(catsvsdogs_images, catsvsdogs_labels, **split_options)

**Split(Train/Test) - with augmentation**

In [None]:
# The split is taken over the full "*_with_augment" arrays (original samples
# with the selected subset replaced by augmented versions), not over the
# augmented subset alone, so the augmented experiment sees the same number of
# samples as the original one.
# NOTE(review): confirm this is the intended experiment design; splitting only
# augmented_*_image would train and evaluate on the augmented 40% exclusively.

#Split iRoads (with augmentation)

x_trainr_a, x_testr_a, y_trainr_a, y_testr_a = train_test_split(iroad_images_with_augment, iroad_labels_with_augment, test_size=0.2, random_state=23)

#Split CatsvsDogs (with augmentation)

x_train_a, x_test_a, y_train_a, y_test_a = train_test_split(catsvsdogs_images_with_augment, catsvsdogs_labels_with_augment, test_size=0.2, random_state=23)

**Feature Extraction Resnet50 -original**

In [None]:
#Feature Extraction -ResNet50

# ImageNet-pretrained backbone without the classification head
standard_model = ResNet50(
    include_top=False,
    weights="imagenet",
    input_shape=(image_height, image_width, 3),
)

# The backbone is used as a fixed feature extractor, so every layer is frozen
for backbone_layer in standard_model.layers:
  backbone_layer.trainable = False

# Output tensor of the last convolutional block
conv5_block3_out = standard_model.get_layer(name="conv5_block3_out").output

# Feature extractor: image batch in, conv5_block3_out activations out
extractor = Model(outputs=conv5_block3_out, inputs=standard_model.input)

In [None]:
# Apply the ResNet50 input preprocessing to every split.
# NOTE(review): preprocess_input may overwrite a float NumPy input in place;
# confirm before re-running this cell, since a second pass would preprocess
# data that has already been preprocessed.

# iRoads
preprocess_x_train_roads, preprocess_x_test_roads = (
    preprocess_input(image_split) for image_split in (x_trainr, x_testr)
)

# Cats vs Dogs
preprocess_x_train, preprocess_x_test = (
    preprocess_input(image_split) for image_split in (x_train, x_test)
)

In [None]:
# Run the frozen extractor over every preprocessed split.

# iRoads
features_x_train_roads, features_x_test_roads = (
    extractor.predict(image_split)
    for image_split in (preprocess_x_train_roads, preprocess_x_test_roads)
)

# Cats vs Dogs
features_x_train, features_x_test = (
    extractor.predict(image_split)
    for image_split in (preprocess_x_train, preprocess_x_test)
)

In [None]:
# Collapse every feature map into one row per image: (n_images, -1)

# iRoads
road_train_features = features_x_train_roads.reshape(len(features_x_train_roads), -1)
road_test_features = features_x_test_roads.reshape(len(features_x_test_roads), -1)

# Cats vs Dogs
train_features = features_x_train.reshape(len(features_x_train), -1)
test_features = features_x_test.reshape(len(features_x_test), -1)

In [None]:
#Scaling Features

# One scaler per dataset. Each scaler is fitted on the training features only
# and the learned mean/variance is then applied unchanged to the test
# features. Calling fit_transform on the test split would re-estimate the
# statistics from test data and put train and test on different scales.

#Scale iRoads
scaler_roads = StandardScaler()
road_train_scaled = scaler_roads.fit_transform(road_train_features)
road_test_scaled = scaler_roads.transform(road_test_features)

#Scale CatsvsDogs
scaler = StandardScaler()
catsvsdogs_train_scaled = scaler.fit_transform(train_features)
catsvsdogs_test_scaled = scaler.transform(test_features)


In [None]:
# PCA - how many components are needed to retain 95% of the variance
# (a float n_components is interpreted as a variance fraction)

# iRoads
pca_roads = PCA(n_components=0.95).fit(road_train_scaled)
components_used_roads = pca_roads.n_components_
print(components_used_roads)

# Cats vs Dogs
pca_catsvsdogs = PCA(n_components=0.95).fit(catsvsdogs_train_scaled)
components_used_catsvsdogs = pca_catsvsdogs.n_components_
print(components_used_catsvsdogs)



In [None]:
#Umap - grid search over n_components / n_neighbors for iRoads

# Maps (n_components, n_neighbors) -> trustworthiness of the embedding
iroads_score = {}

# n_components in 2..5 crossed with n_neighbors in 15..29
umap_grid = [(n_dims, n_nbrs) for n_dims in range(2, 6) for n_nbrs in range(15, 30)]

for i, j in umap_grid:
  umap_reduction = umap.UMAP(n_neighbors=j, n_components=i)
  umap_roads = umap_reduction.fit_transform(road_train_scaled)
  iroads_score[(i, j)] = trustworthiness(road_train_scaled, umap_roads, n_neighbors=j)



KeyboardInterrupt: 

In [None]:
#Umap - grid search over n_components / n_neighbors for Cats vs Dogs

# Maps (n_components, n_neighbors) -> trustworthiness of the embedding
catsvsdogs_score = {}

# n_components in 2..5 crossed with n_neighbors in 15..29
umap_grid = [(n_dims, n_nbrs) for n_dims in range(2, 6) for n_nbrs in range(15, 30)]

for i, j in umap_grid:
  umap_reduction_2 = umap.UMAP(n_neighbors=j, n_components=i)
  umap_catsvsdogs = umap_reduction_2.fit_transform(catsvsdogs_train_scaled)
  catsvsdogs_score[(i, j)] = trustworthiness(catsvsdogs_train_scaled, umap_catsvsdogs, n_neighbors=j)

In [None]:
# Apply Umap
# random_state is fixed so the embedding is the same on every run; the
# clustering cells further down hard-code hyperparameters that were tuned
# against one particular embedding.
# Each reducer is fitted on the training features and then only applied
# (transform) to the test features.

#iRoads Umap
umap_1 = umap.UMAP(n_components=3, n_neighbors=15, random_state=23)
umap_train_road_results = umap_1.fit_transform(road_train_scaled)
umap_test_road_results = umap_1.transform(road_test_scaled)

#CatsvsDogs Umap
umap_2 = umap.UMAP(n_components=3, n_neighbors=15, random_state=23)
umap_catsvsdogs_train_results = umap_2.fit_transform(catsvsdogs_train_scaled)
umap_catsvsdogs_test_results = umap_2.transform(catsvsdogs_test_scaled)


In [None]:
#Visualize Umap

# Only the first two UMAP dimensions are kept for plotting; every frame also
# carries the ground-truth label of each sample.
umap_columns = ["Umap1", "Umap2"]

#iRoads
df_iroads_train = pd.DataFrame(umap_train_road_results[:, :2], columns=umap_columns).assign(Labels=y_trainr)
df_iroads_test = pd.DataFrame(umap_test_road_results[:, :2], columns=umap_columns).assign(Labels=y_testr)

#CatsvsDogs
df_catsvsdogs_train = pd.DataFrame(umap_catsvsdogs_train_results[:, :2], columns=umap_columns).assign(Labels=y_train)
df_catsvsdogs_test = pd.DataFrame(umap_catsvsdogs_test_results[:, :2], columns=umap_columns).assign(Labels=y_test)


# One scatter plot per dataset split, coloured by ground-truth label
for umap_frame, plot_title in (
    (df_iroads_train, "iRoads Train"),
    (df_iroads_test, "iRoads Test"),
    (df_catsvsdogs_train, "Cats vs Dogs Train"),
    (df_catsvsdogs_test, "Cats vs Dogs Test"),
):
  plt.figure(figsize=(8,6))
  sns.scatterplot(x="Umap1", y="Umap2", hue="Labels", data=umap_frame, palette="viridis", s=100, markers="o")
  plt.title(plot_title)
  plt.xlabel("Umap1")
  plt.ylabel("Umap2")
  plt.show()

**Feature Extraction Resnet50 (with augmentation)**

In [None]:
#Feature Extraction -ResNet50 (augmented data)

# ImageNet-pretrained backbone without the classification head
standard_model_with_augmentation = ResNet50(
    include_top=False,
    weights="imagenet",
    input_shape=(image_height, image_width, 3),
)

# The backbone is used as a fixed feature extractor, so every layer is frozen
for backbone_layer in standard_model_with_augmentation.layers:
  backbone_layer.trainable = False

# Output tensor of the last convolutional block
conv5_block3_out_2 = standard_model_with_augmentation.get_layer(name="conv5_block3_out").output

# Feature extractor: image batch in, conv5_block3_out activations out
extractor_2 = Model(outputs=conv5_block3_out_2, inputs=standard_model_with_augmentation.input)




# Apply the ResNet50 input preprocessing to every augmented split.
# NOTE(review): preprocess_input may overwrite a float NumPy input in place;
# confirm before re-running this cell, since a second pass would preprocess
# data that has already been preprocessed.

# iRoads
preprocess_x_train_roads_with_augmentation, preprocess_x_test_roads_with_augmentation = (
    preprocess_input(image_split) for image_split in (x_trainr_a, x_testr_a)
)

# Cats vs Dogs
preprocess_x_train_with_augmentation, preprocess_x_test_with_augmentation = (
    preprocess_input(image_split) for image_split in (x_train_a, x_test_a)
)



# Run the frozen extractor over every preprocessed augmented split.

# iRoads
features_x_train_roads_with_augmentation, features_x_test_roads_with_augmentation = (
    extractor_2.predict(image_split)
    for image_split in (
        preprocess_x_train_roads_with_augmentation,
        preprocess_x_test_roads_with_augmentation,
    )
)

# Cats vs Dogs
features_x_train_with_augmentation, features_x_test_with_augmentation = (
    extractor_2.predict(image_split)
    for image_split in (
        preprocess_x_train_with_augmentation,
        preprocess_x_test_with_augmentation,
    )
)




# Collapse every feature map into one row per image: (n_images, -1)

# iRoads
road_train_features_with_augmentation = features_x_train_roads_with_augmentation.reshape(len(features_x_train_roads_with_augmentation), -1)
road_test_features_with_augmentation = features_x_test_roads_with_augmentation.reshape(len(features_x_test_roads_with_augmentation), -1)

# Cats vs Dogs
train_features_with_augmentation = features_x_train_with_augmentation.reshape(len(features_x_train_with_augmentation), -1)
test_features_with_augmentation = features_x_test_with_augmentation.reshape(len(features_x_test_with_augmentation), -1)



#Scaling Features

# One scaler per dataset. Each scaler is fitted on the training features only
# and the learned mean/variance is then applied unchanged to the test
# features. Calling fit_transform on the test split would re-estimate the
# statistics from test data and put train and test on different scales.

#Scale iRoads
scaler_roads_2 = StandardScaler()
road_train_scaled_with_augmentation = scaler_roads_2.fit_transform(road_train_features_with_augmentation)
road_test_scaled_with_augmentation = scaler_roads_2.transform(road_test_features_with_augmentation)

#Scale CatsvsDogs
scaler_2 = StandardScaler()
catsvsdogs_train_scaled_with_augmentation = scaler_2.fit_transform(train_features_with_augmentation)
catsvsdogs_test_scaled_with_augmentation = scaler_2.transform(test_features_with_augmentation)



In [None]:
# PCA - how many components are needed to retain 95% of the variance
# (a float n_components is interpreted as a variance fraction)

# iRoads
pca_roads_with_augmentation = PCA(n_components=0.95).fit(road_train_scaled_with_augmentation)
components_used_iroads_aug = pca_roads_with_augmentation.n_components_
print(components_used_iroads_aug)

# Cats vs Dogs
pca_catsvsdogs_with_augmentation = PCA(n_components=0.95).fit(catsvsdogs_train_scaled_with_augmentation)
components_used_catsvsdogs_aug = pca_catsvsdogs_with_augmentation.n_components_
print(components_used_catsvsdogs_aug)


In [None]:
#Umap - grid search over n_components / n_neighbors for iRoads (augmented data)

# Maps (n_components, n_neighbors) -> trustworthiness of the embedding
iroads_score_with_augmentation = {}

# n_components in 2..5 crossed with n_neighbors in 15..29
umap_grid = [(n_dims, n_nbrs) for n_dims in range(2, 6) for n_nbrs in range(15, 30)]

for i, j in umap_grid:
  umap_reduction_2 = umap.UMAP(n_neighbors=j, n_components=i)
  umap_roads_2 = umap_reduction_2.fit_transform(road_train_scaled_with_augmentation)
  iroads_score_with_augmentation[(i, j)] = trustworthiness(road_train_scaled_with_augmentation, umap_roads_2, n_neighbors=j)

In [None]:
#Umap - grid search over n_components / n_neighbors for Cats vs Dogs (augmented data)

# Maps (n_components, n_neighbors) -> trustworthiness of the embedding
catsvsdogs_score_with_augmentation = {}

# n_components in 2..5 crossed with n_neighbors in 15..29
umap_grid = [(n_dims, n_nbrs) for n_dims in range(2, 6) for n_nbrs in range(15, 30)]

for i, j in umap_grid:
  umap_reduction_3 = umap.UMAP(n_neighbors=j, n_components=i)
  umap_catsvsdogs_3 = umap_reduction_3.fit_transform(catsvsdogs_train_scaled_with_augmentation)
  catsvsdogs_score_with_augmentation[(i, j)] = trustworthiness(catsvsdogs_train_scaled_with_augmentation, umap_catsvsdogs_3, n_neighbors=j)

In [None]:
# Apply Umap (data with augmentation)
# random_state is fixed so the embedding is the same on every run; the
# clustering cells further down hard-code hyperparameters that were tuned
# against one particular embedding.
# Each reducer is fitted on the training features and then only applied
# (transform) to the test features.

#iRoads Umap
umap_3 = umap.UMAP(n_components=4, n_neighbors=15, random_state=23)
umap_train_road_results_with_augmentation = umap_3.fit_transform(road_train_scaled_with_augmentation)
umap_test_road_results_with_augmentation = umap_3.transform(road_test_scaled_with_augmentation)

#CatsvsDogs Umap
umap_4 = umap.UMAP(n_components=4, n_neighbors=15, random_state=23)
umap_catsvsdogs_train_results_with_augmentation = umap_4.fit_transform(catsvsdogs_train_scaled_with_augmentation)
umap_catsvsdogs_test_results_with_augmentation = umap_4.transform(catsvsdogs_test_scaled_with_augmentation)


In [None]:
#Visualize Umap (augmented data)

# Only the first two UMAP dimensions are kept for plotting; every frame also
# carries the label of each sample.
umap_columns = ["Umap1", "Umap2"]

#iRoads
df_iroads_train_with_augmentation = pd.DataFrame(umap_train_road_results_with_augmentation[:, :2], columns=umap_columns).assign(Labels=y_trainr_a)
df_iroads_test_with_augmentation = pd.DataFrame(umap_test_road_results_with_augmentation[:, :2], columns=umap_columns).assign(Labels=y_testr_a)

#CatsvsDogs
df_catsvsdogs_train_with_augmentation = pd.DataFrame(umap_catsvsdogs_train_results_with_augmentation[:, :2], columns=umap_columns).assign(Labels=y_train_a)
df_catsvsdogs_test_with_augmentation = pd.DataFrame(umap_catsvsdogs_test_results_with_augmentation[:, :2], columns=umap_columns).assign(Labels=y_test_a)



# One scatter plot per augmented dataset split, coloured by label
for umap_frame, plot_title in (
    (df_iroads_train_with_augmentation, "iRoads Train w Augmentation"),
    (df_iroads_test_with_augmentation, "iRoads Test w Augmentation"),
    (df_catsvsdogs_train_with_augmentation, "Cats vs Dogs Train w Augmentation"),
    (df_catsvsdogs_test_with_augmentation, "Cats vs Dogs Test w Augmentation"),
):
  plt.figure(figsize=(8,6))
  sns.scatterplot(x="Umap1", y="Umap2", hue="Labels", data=umap_frame, palette="viridis", s=100, markers="o")
  plt.title(plot_title)
  plt.xlabel("Umap1")
  plt.ylabel("Umap2")
  plt.show()


**CLUSTERING - (original)**

In [None]:
#Clustering

#iRoads HDBSCAN - grid search for the best silhouette score

#iRoads -Train

# Candidate values (2..30) for both min_cluster_size and min_samples
iroads_train_min_cs = list(range(2, 31))
iroads_train_min_s = list(range(2, 31))
# Collected as (min_cluster_size, min_samples, silhouette score)
iroads_silscore_training = []

for i in iroads_train_min_cs:
  for j in iroads_train_min_s:
    h_dbscan = hdbscan.HDBSCAN(min_samples=j, min_cluster_size=i)
    h_dbscan_results = h_dbscan.fit_predict(umap_train_road_results)
    # Configurations that label every point as noise (-1) are not scored
    if not np.all(h_dbscan_results == -1):
      iroads_train_score = silhouette_score(umap_train_road_results, h_dbscan_results)
      iroads_silscore_training.append((i, j, iroads_train_score))


#iRoads Test

# Candidate values (2..30) for both min_cluster_size and min_samples
iroads_test_min_cs = list(range(2, 31))
iroads_test_min_s = list(range(2, 31))
# Collected as (min_cluster_size, min_samples, silhouette score)
iroads_silscore_test = []

for i in iroads_test_min_cs:
  for j in iroads_test_min_s:
    h_dbscan2 = hdbscan.HDBSCAN(min_cluster_size=i, min_samples=j)
    # Fit the estimator built for this (i, j) pair. Fitting the leftover
    # h_dbscan from the training search would score the same fixed
    # configuration for every grid point.
    h_dbscan2_results = h_dbscan2.fit_predict(umap_test_road_results)
    # Configurations that label every point as noise (-1) are not scored
    if np.any(h_dbscan2_results != -1):
      iroads_test_score = silhouette_score(umap_test_road_results, h_dbscan2_results)
      iroads_silscore_test.append((i, j, iroads_test_score))


In [None]:
# Pick the (min_cluster_size, min_samples, score) triple with the highest
# silhouette score from each grid search and show it
best_train = max(iroads_silscore_training, key=lambda candidate: candidate[2])
best_test = max(iroads_silscore_test, key=lambda candidate: candidate[2])

print(best_train)
print(best_test)

In [None]:
#Clustering

#CatsvsDogs HDBSCAN - grid search for the best silhouette score

#CatsvsDogs -Train

# Candidate values (2..30) for both min_cluster_size and min_samples
cd_train_min_cs = list(range(2, 31))
cd_train_min_s = list(range(2, 31))
# Collected as (min_cluster_size, min_samples, silhouette score)
cd_silscore_training = []

for i in cd_train_min_cs:
  for j in cd_train_min_s:
    h_dbscan3 = hdbscan.HDBSCAN(min_samples=j, min_cluster_size=i)
    h_dbscan3_results = h_dbscan3.fit_predict(umap_catsvsdogs_train_results)
    # Configurations that label every point as noise (-1) are not scored
    if not np.all(h_dbscan3_results == -1):
      cd_train_score = silhouette_score(umap_catsvsdogs_train_results, h_dbscan3_results)
      cd_silscore_training.append((i, j, cd_train_score))


#CatsvsDogs -Test
cd_test_min_cs = list(range(2, 31))
cd_test_min_s = list(range(2, 31))
cd_silscore_test = []

for i in cd_test_min_cs:
  for j in cd_test_min_s:
    h_dbscan4 = hdbscan.HDBSCAN(min_samples=j, min_cluster_size=i)
    h_dbscan4_results = h_dbscan4.fit_predict(umap_catsvsdogs_test_results)
    # Configurations that label every point as noise (-1) are not scored
    if not np.all(h_dbscan4_results == -1):
      cd_test_score = silhouette_score(umap_catsvsdogs_test_results, h_dbscan4_results)
      cd_silscore_test.append((i, j, cd_test_score))


In [None]:
# Pick the (min_cluster_size, min_samples, score) triple with the highest
# silhouette score from each Cats vs Dogs grid search and show it
best_train_1 = max(cd_silscore_training, key=lambda candidate: candidate[2])
best_test_1 = max(cd_silscore_test, key=lambda candidate: candidate[2])

print(best_train_1)
print(best_test_1)

In [None]:
#Apply Clustering HDBSCAN
# Every split is clustered with its own estimator, so the hyperparameters
# written next to each split are the ones actually used. Each frame keeps the
# first two UMAP dimensions plus the assigned cluster id for plotting.

#iRoads

clustering_1 = hdbscan.HDBSCAN(min_cluster_size = 20, min_samples =10)
clustering_1_results = clustering_1.fit_predict(umap_train_road_results)

df_clustering_1 = pd.DataFrame({
    "Umap1": umap_train_road_results[:,0],
    "Umap2": umap_train_road_results[:,1],
    "Cluster": clustering_1_results
})


clustering_2 = hdbscan.HDBSCAN(min_cluster_size = 2, min_samples =2)
clustering_2_results = clustering_2.fit_predict(umap_test_road_results)


df_clustering_2 = pd.DataFrame({
    "Umap1":umap_test_road_results[:,0],
    "Umap2":umap_test_road_results[:,1],
    "Cluster":clustering_2_results
})


#Cats vs Dogs
clustering_3 = hdbscan.HDBSCAN(min_cluster_size =2 , min_samples =30)
clustering_3_results = clustering_3.fit_predict(umap_catsvsdogs_train_results)

df_clustering_3 = pd.DataFrame({
    "Umap1":umap_catsvsdogs_train_results[:,0],
    "Umap2":umap_catsvsdogs_train_results[:,1],
    "Cluster":clustering_3_results
})


clustering_4 = hdbscan.HDBSCAN(min_cluster_size =28 , min_samples =2)
clustering_4_results = clustering_4.fit_predict(umap_catsvsdogs_test_results)


df_clustering_4 = pd.DataFrame({
    "Umap1":umap_catsvsdogs_test_results[:,0],
    "Umap2":umap_catsvsdogs_test_results[:,1],
    "Cluster":clustering_4_results
})

In [None]:
#Visualize Clusters

# One scatter plot per dataset split, coloured by HDBSCAN cluster id
for cluster_frame, plot_title in (
    (df_clustering_1, "iRoads Train Clusters"),
    (df_clustering_2, "iRoads Test Clusters"),
    (df_clustering_3, "Cats vs Dogs Train Clusters"),
    (df_clustering_4, "Cats vs Dogs Test Clusters"),
):
  plt.figure(figsize=(6,4))
  sns.scatterplot(x="Umap1", y="Umap2", hue="Cluster", data=cluster_frame, palette="viridis", s=100, markers="o")
  plt.title(plot_title)
  plt.xlabel("Umap1")
  plt.ylabel("Umap2")
  plt.show()

In [None]:
#Clustering

#iRoads AggClus - search for the n_clusters with the best silhouette score

#iRoads Train

# Candidate cluster counts 2..20
n_cluster_train = list(range(2, 21))
# Collected as (n_clusters, silhouette score)
n_cluster_train_silscore = []

for i in n_cluster_train:
  aggcluster = AgglomerativeClustering(n_clusters=i)
  # labels_ holds the assignment produced by fit()
  aggcluster_results = aggcluster.fit(umap_train_road_results).labels_
  aggcluster_score = silhouette_score(umap_train_road_results, aggcluster_results)
  n_cluster_train_silscore.append((i, aggcluster_score))


#iRoads Test

# Candidate cluster counts 2..20
n_cluster_test = list(range(2, 21))
# Collected as (n_clusters, silhouette score)
n_cluster_test_silscore = []

for i in n_cluster_test:
  aggcluster_1 = AgglomerativeClustering(n_clusters=i)
  # Fit the estimator built for this n_clusters. Fitting the leftover
  # aggcluster from the training search would score one fixed cluster count
  # for every candidate.
  aggcluster_results_1 = aggcluster_1.fit_predict(umap_test_road_results)
  aggcluster_score_1 = silhouette_score(umap_test_road_results, aggcluster_results_1)
  n_cluster_test_silscore.append((i, aggcluster_score_1))


In [None]:
# Pick the (n_clusters, score) pair with the highest silhouette score from
# each iRoads search and show it
top_cluster_train = max(n_cluster_train_silscore, key=lambda candidate: candidate[1])
top_cluster_test = max(n_cluster_test_silscore, key=lambda candidate: candidate[1])

print(top_cluster_train)
print(top_cluster_test)

In [None]:
#Clustering

#CatsvsDogs AggClus - search for the n_clusters with the best silhouette score
# Each iteration fits the estimator created for that n_clusters and walks this
# cell's own candidate list, so the cell does not depend on leftovers from the
# iRoads search.

#CatsvsDogs Train

# Candidate cluster counts 2..20
n_cluster_train_1 = list(range(2, 21))
# Collected as (n_clusters, silhouette score)
n_cluster_train_silscore_1 = []

for i in n_cluster_train_1:
  aggcluster_2 = AgglomerativeClustering(n_clusters=i)
  aggcluster_results_2 = aggcluster_2.fit_predict(umap_catsvsdogs_train_results)
  aggcluster_score_2 = silhouette_score(umap_catsvsdogs_train_results, aggcluster_results_2)
  n_cluster_train_silscore_1.append((i, aggcluster_score_2))

#CatsvsDogs Test

# Candidate cluster counts 2..20
n_cluster_test_1 = list(range(2, 21))
# Collected as (n_clusters, silhouette score)
n_cluster_test_silscore_1 = []

for i in n_cluster_test_1:
  aggcluster_3 = AgglomerativeClustering(n_clusters=i)
  aggcluster_results_3 = aggcluster_3.fit_predict(umap_catsvsdogs_test_results)
  aggcluster_score_3 = silhouette_score(umap_catsvsdogs_test_results, aggcluster_results_3)
  n_cluster_test_silscore_1.append((i, aggcluster_score_3))



In [None]:
# Pick the (n_clusters, score) pair with the highest silhouette score from
# each Cats vs Dogs search and show it
top_cluster_train_1 = max(n_cluster_train_silscore_1, key=lambda candidate: candidate[1])
top_cluster_test_1 = max(n_cluster_test_silscore_1, key=lambda candidate: candidate[1])

print(top_cluster_train_1)
print(top_cluster_test_1)

In [None]:
#Apply Clustering - Agglomerative Clustering

# Every frame keeps the first two UMAP dimensions plus the assigned cluster id
cluster_plot_columns = ["Umap1", "Umap2"]

#iRoads

algo_1 = AgglomerativeClustering(n_clusters=14)
algo_1_results = algo_1.fit_predict(umap_train_road_results)
df_algo_1 = pd.DataFrame(umap_train_road_results[:, :2], columns=cluster_plot_columns).assign(Cluster=algo_1_results)

algo_2 = AgglomerativeClustering(n_clusters=2)
algo_2_results = algo_2.fit_predict(umap_test_road_results)
df_algo_2 = pd.DataFrame(umap_test_road_results[:, :2], columns=cluster_plot_columns).assign(Cluster=algo_2_results)

#Cats Vs Dogs

algo_3 = AgglomerativeClustering(n_clusters=2)
algo_3_results = algo_3.fit_predict(umap_catsvsdogs_train_results)
df_algo_3 = pd.DataFrame(umap_catsvsdogs_train_results[:, :2], columns=cluster_plot_columns).assign(Cluster=algo_3_results)

algo_4 = AgglomerativeClustering(n_clusters=2)
algo_4_results = algo_4.fit_predict(umap_catsvsdogs_test_results)
df_algo_4 = pd.DataFrame(umap_catsvsdogs_test_results[:, :2], columns=cluster_plot_columns).assign(Cluster=algo_4_results)

In [None]:
#Visualize Clusters

# One scatter plot per dataset split, coloured by agglomerative cluster id
for cluster_frame, plot_title in (
    (df_algo_1, "iRoads Train Clusters"),
    (df_algo_2, "iRoads Test Clusters"),
    (df_algo_3, "Cats vs Dogs Train Clusters"),
    (df_algo_4, "Cats vs Dogs Test Clusters"),
):
  plt.figure(figsize=(6,4))
  sns.scatterplot(x="Umap1", y="Umap2", hue="Cluster", data=cluster_frame, palette="viridis", s=100, markers="o")
  plt.title(plot_title)
  plt.xlabel("Umap1")
  plt.ylabel("Umap2")
  plt.show()


In [None]:
#Evaluate the quality of Clusters - Davies-Bouldin
# db_1..db_4 score the HDBSCAN assignments, db_5..db_8 the agglomerative ones;
# within each group the order is iRoads train/test, then Cats vs Dogs train/test.

#iRoads HDBSCAN
db_1 = davies_bouldin_score(X=umap_train_road_results, labels=clustering_1_results)
db_2 = davies_bouldin_score(X=umap_test_road_results, labels=clustering_2_results)

#Cats vs Dogs HDBSCAN
db_3 = davies_bouldin_score(X=umap_catsvsdogs_train_results, labels=clustering_3_results)
db_4 = davies_bouldin_score(X=umap_catsvsdogs_test_results, labels=clustering_4_results)

#iRoads Agglomerative
db_5 = davies_bouldin_score(X=umap_train_road_results, labels=algo_1_results)
db_6 = davies_bouldin_score(X=umap_test_road_results, labels=algo_2_results)

#Cats vs Dogs Agglomerative
db_7 = davies_bouldin_score(X=umap_catsvsdogs_train_results, labels=algo_3_results)
db_8 = davies_bouldin_score(X=umap_catsvsdogs_test_results, labels=algo_4_results)



In [None]:
#Print Davies-Bouldin values, one line per clustering algorithm
print(
    f"HDBSCAN: iRoads train {db_1}, test {db_2},  CatsvsDogs train {db_3}, test {db_4}",
    f"Agglomerative : iRoads train {db_5}, test {db_6}, CatsvsDogs train {db_7}, test {db_8}",
    sep="\n",
)


**CLUSTERING - (with augmentation)**

In [None]:
#Clustering (augmented data)

#iRoads HDBSCAN - grid search for the best silhouette score

#iRoads -Train

# Candidate values (2..30) for both min_cluster_size and min_samples
iroads_train_min_cs_with_augmentation = list(range(2, 31))
iroads_train_min_s_with_augmentation = list(range(2, 31))
# Collected as (min_cluster_size, min_samples, silhouette score)
iroads_silscore_training_with_augmentation = []

for i in iroads_train_min_cs_with_augmentation:
  for j in iroads_train_min_s_with_augmentation:
    h_dbscan_with_augmentation = hdbscan.HDBSCAN(min_samples=j, min_cluster_size=i)
    h_dbscan_results_with_augmentation = h_dbscan_with_augmentation.fit_predict(umap_train_road_results_with_augmentation)
    # Configurations that label every point as noise (-1) are not scored
    if not np.all(h_dbscan_results_with_augmentation == -1):
      iroads_train_score_with_augmentation = silhouette_score(umap_train_road_results_with_augmentation, h_dbscan_results_with_augmentation)
      iroads_silscore_training_with_augmentation.append((i, j, iroads_train_score_with_augmentation))


#iRoads Test
iroads_test_min_cs_with_augmentation = list(range(2, 31))
iroads_test_min_s_with_augmentation = list(range(2, 31))
iroads_silscore_test_with_augmentation = []

for i in iroads_test_min_cs_with_augmentation:
  for j in iroads_test_min_s_with_augmentation:
    h_dbscan2_with_augmentation = hdbscan.HDBSCAN(min_samples=j, min_cluster_size=i)
    h_dbscan2_results_with_augmentation = h_dbscan2_with_augmentation.fit_predict(umap_test_road_results_with_augmentation)
    # Configurations that label every point as noise (-1) are not scored
    if not np.all(h_dbscan2_results_with_augmentation == -1):
      iroads_test_score_with_augmentation = silhouette_score(umap_test_road_results_with_augmentation, h_dbscan2_results_with_augmentation)
      iroads_silscore_test_with_augmentation.append((i, j, iroads_test_score_with_augmentation))


# Highest-scoring (min_cluster_size, min_samples, score) triple of each search
best_train_with_augmentation = max(iroads_silscore_training_with_augmentation, key=lambda candidate: candidate[2])
best_test_with_augmentation = max(iroads_silscore_test_with_augmentation, key=lambda candidate: candidate[2])

print(best_train_with_augmentation)
print(best_test_with_augmentation)

In [None]:
#Clustering (with augmentation)

#CatsvsDogs HDBSCAN - grid search for the best silhouette score

#CatsvsDogs -Train

# Candidate values (2..30) for both min_cluster_size and min_samples
cd_train_min_cs_with_augmentation = list(range(2, 31))
cd_train_min_s_with_augmentation = list(range(2, 31))
# Collected as (min_cluster_size, min_samples, silhouette score)
cd_silscore_training_with_augmentation = []

for i in cd_train_min_cs_with_augmentation:
  for j in cd_train_min_s_with_augmentation:
    h_dbscan3_with_augmentation = hdbscan.HDBSCAN(min_samples=j, min_cluster_size=i)
    h_dbscan3_results_with_augmentation = h_dbscan3_with_augmentation.fit_predict(umap_catsvsdogs_train_results_with_augmentation)
    # Configurations that label every point as noise (-1) are not scored
    if not np.all(h_dbscan3_results_with_augmentation == -1):
      cd_train_score_with_augmentation = silhouette_score(umap_catsvsdogs_train_results_with_augmentation, h_dbscan3_results_with_augmentation)
      cd_silscore_training_with_augmentation.append((i, j, cd_train_score_with_augmentation))


#CatsvsDogs -Test
cd_test_min_cs_with_augmentation = list(range(2, 31))
cd_test_min_s_with_augmentation = list(range(2, 31))
cd_silscore_test_with_augmentation = []

for i in cd_test_min_cs_with_augmentation:
  for j in cd_test_min_s_with_augmentation:
    h_dbscan4_with_augmentation = hdbscan.HDBSCAN(min_samples=j, min_cluster_size=i)
    h_dbscan4_results_with_augmentation = h_dbscan4_with_augmentation.fit_predict(umap_catsvsdogs_test_results_with_augmentation)
    # Configurations that label every point as noise (-1) are not scored
    if not np.all(h_dbscan4_results_with_augmentation == -1):
      cd_test_score_with_augmentation = silhouette_score(umap_catsvsdogs_test_results_with_augmentation, h_dbscan4_results_with_augmentation)
      cd_silscore_test_with_augmentation.append((i, j, cd_test_score_with_augmentation))

# Highest-scoring (min_cluster_size, min_samples, score) triple of each search
best_train_1_with_augmentation = max(cd_silscore_training_with_augmentation, key=lambda candidate: candidate[2])
best_test_1_with_augmentation = max(cd_silscore_test_with_augmentation, key=lambda candidate: candidate[2])

print(best_train_1_with_augmentation)
print(best_test_1_with_augmentation)


In [None]:
# Apply HDBSCAN (with augmentation)
#
# Each of the four UMAP embeddings is clustered with a fixed
# (min_cluster_size, min_samples) pair. The first two UMAP columns and the
# resulting cluster label of every sample are collected in a DataFrame.
# NOTE(review): the parameter pairs are hard-coded - presumably copied from the
# output of the silhouette sweeps; confirm they still match after a re-run.

# --- iRoads: train ---
clustering_1_with_augmentation = hdbscan.HDBSCAN(min_samples=2, min_cluster_size=2)
clustering_1_results_with_augmentation = clustering_1_with_augmentation.fit_predict(
    umap_train_road_results_with_augmentation
)
df_clustering_1_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_train_road_results_with_augmentation[:, 0],
    Umap2=umap_train_road_results_with_augmentation[:, 1],
    Cluster=clustering_1_results_with_augmentation,
))

# --- iRoads: test ---
clustering_2_with_augmentation = hdbscan.HDBSCAN(min_samples=2, min_cluster_size=11)
clustering_2_results_with_augmentation = clustering_2_with_augmentation.fit_predict(
    umap_test_road_results_with_augmentation
)
df_clustering_2_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_test_road_results_with_augmentation[:, 0],
    Umap2=umap_test_road_results_with_augmentation[:, 1],
    Cluster=clustering_2_results_with_augmentation,
))

# --- Cats vs Dogs: train ---
clustering_3_with_augmentation = hdbscan.HDBSCAN(min_samples=30, min_cluster_size=2)
clustering_3_results_with_augmentation = clustering_3_with_augmentation.fit_predict(
    umap_catsvsdogs_train_results_with_augmentation
)
df_clustering_3_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_catsvsdogs_train_results_with_augmentation[:, 0],
    Umap2=umap_catsvsdogs_train_results_with_augmentation[:, 1],
    Cluster=clustering_3_results_with_augmentation,
))

# --- Cats vs Dogs: test ---
clustering_4_with_augmentation = hdbscan.HDBSCAN(min_samples=2, min_cluster_size=28)
clustering_4_results_with_augmentation = clustering_4_with_augmentation.fit_predict(
    umap_catsvsdogs_test_results_with_augmentation
)
df_clustering_4_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_catsvsdogs_test_results_with_augmentation[:, 0],
    Umap2=umap_catsvsdogs_test_results_with_augmentation[:, 1],
    Cluster=clustering_4_results_with_augmentation,
))

In [None]:
# Visualize the HDBSCAN clusters (with augmentation)
#
# One scatter plot per clustered embedding: Umap1 vs Umap2, coloured by the
# "Cluster" column. The four figures share every setting except the data and
# the title, so they are drawn in a single loop.
# The Cats vs Dogs train title previously read "wa Agmentation"; it now uses
# the same "w Augmentation" wording as the other three figures.
for cluster_frame, plot_title in (
    (df_clustering_1_with_augmentation, "iRoads Train Clusters w Augmentation"),
    (df_clustering_2_with_augmentation, "iRoads Test Clusters w Augmentation"),
    (df_clustering_3_with_augmentation, "Cats vs Dogs Train Clusters w Augmentation"),
    (df_clustering_4_with_augmentation, "Cats vs Dogs Test Clusters w Augmentation"),
):
  plt.figure(figsize=(6,4))
  sns.scatterplot(x="Umap1", y="Umap2", hue="Cluster", data=cluster_frame, palette="viridis", s=100, markers="o")
  plt.title(plot_title)
  plt.xlabel("Umap1")
  plt.ylabel("Umap2")
  plt.show()

In [None]:
# Clustering (with augmentation)
#
# iRoads, agglomerative clustering: try n_clusters = 2..20 on the train and the
# test UMAP embeddings and record the silhouette score of each run as
# (n_clusters, silhouette score).

# --- iRoads: train ---
n_cluster_train_with_augmentation = list(range(2, 21))
n_cluster_train_silscore_with_augmentation = []

for i in n_cluster_train_with_augmentation:
  aggcluster_with_augmentation = AgglomerativeClustering(n_clusters=i)
  aggcluster_results_with_augmentation = aggcluster_with_augmentation.fit_predict(umap_train_road_results_with_augmentation)
  aggcluster_score_with_augmentation = silhouette_score(umap_train_road_results_with_augmentation, aggcluster_results_with_augmentation)
  n_cluster_train_silscore_with_augmentation.append((i, aggcluster_score_with_augmentation))


# --- iRoads: test ---
n_cluster_test_with_augmentation = list(range(2, 21))
n_cluster_test_silscore_with_augmentation = []

for i in n_cluster_test_with_augmentation:
  aggcluster_1_with_augmentation = AgglomerativeClustering(n_clusters=i)
  # Fit the estimator built for this n_clusters. The train-loop estimator
  # (aggcluster_with_augmentation) is left at n_clusters=20, so fitting it here
  # would score the same 20-cluster solution for every i.
  aggcluster_results_1_with_augmentation = aggcluster_1_with_augmentation.fit_predict(umap_test_road_results_with_augmentation)
  aggcluster_score_1_with_augmentation = silhouette_score(umap_test_road_results_with_augmentation, aggcluster_results_1_with_augmentation)
  n_cluster_test_silscore_with_augmentation.append((i, aggcluster_score_1_with_augmentation))


# Best n_clusters per split = the tuple with the largest silhouette score (index 1).
top_cluster_train_with_augmentation = max(n_cluster_train_silscore_with_augmentation, key=lambda entry: entry[1])
top_cluster_test_with_augmentation = max(n_cluster_test_silscore_with_augmentation, key=lambda entry: entry[1])
print(top_cluster_train_with_augmentation)
print(top_cluster_test_with_augmentation)


In [None]:
# Clustering (with augmentation)
#
# Cats vs Dogs, agglomerative clustering: try n_clusters = 2..20 on the train
# and the test UMAP embeddings and record the silhouette score of each run as
# (n_clusters, silhouette score).

# --- Cats vs Dogs: train ---
n_cluster_train_1_with_augmentation = list(range(2, 21))
n_cluster_train_silscore_1_with_augmentation = []

# Iterate this cell's own candidate list rather than the iRoads one.
for i in n_cluster_train_1_with_augmentation:
  aggcluster_2_with_augmentation = AgglomerativeClustering(n_clusters=i)
  # Fit the estimator built for this n_clusters, not the estimator left over
  # from the iRoads cell (whose n_clusters no longer changes with i).
  aggcluster_results_2_with_augmentation = aggcluster_2_with_augmentation.fit_predict(umap_catsvsdogs_train_results_with_augmentation)
  aggcluster_score_2_with_augmentation = silhouette_score(umap_catsvsdogs_train_results_with_augmentation, aggcluster_results_2_with_augmentation)
  n_cluster_train_silscore_1_with_augmentation.append((i, aggcluster_score_2_with_augmentation))

# --- Cats vs Dogs: test ---
n_cluster_test_1_with_augmentation = list(range(2, 21))
n_cluster_test_silscore_1_with_augmentation = []

for i in n_cluster_test_1_with_augmentation:
  aggcluster_3_with_augmentation = AgglomerativeClustering(n_clusters=i)
  # Same fix as above: use the estimator created in this iteration.
  aggcluster_results_3_with_augmentation = aggcluster_3_with_augmentation.fit_predict(umap_catsvsdogs_test_results_with_augmentation)
  aggcluster_score_3_with_augmentation = silhouette_score(umap_catsvsdogs_test_results_with_augmentation, aggcluster_results_3_with_augmentation)
  n_cluster_test_silscore_1_with_augmentation.append((i, aggcluster_score_3_with_augmentation))


# Best n_clusters per split = the tuple with the largest silhouette score (index 1).
top_cluster_train_1_with_augmentation = max(n_cluster_train_silscore_1_with_augmentation, key=lambda entry: entry[1])
top_cluster_test_1_with_augmentation = max(n_cluster_test_silscore_1_with_augmentation, key=lambda entry: entry[1])
print(top_cluster_train_1_with_augmentation)
print(top_cluster_test_1_with_augmentation)



In [None]:
# Apply agglomerative clustering (with augmentation)
#
# Each of the four UMAP embeddings is split into two clusters. The first two
# UMAP columns and the cluster label of every sample are collected in a
# DataFrame.
# NOTE(review): n_clusters=2 is hard-coded for all four runs - presumably taken
# from the output of the silhouette sweeps; confirm it still matches.

# --- iRoads: train ---
algo_1_with_augmentation = AgglomerativeClustering(n_clusters=2)
algo_1_results_with_augmentation = algo_1_with_augmentation.fit_predict(
    umap_train_road_results_with_augmentation
)
df_algo_1_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_train_road_results_with_augmentation[:, 0],
    Umap2=umap_train_road_results_with_augmentation[:, 1],
    Cluster=algo_1_results_with_augmentation,
))

# --- iRoads: test ---
algo_2_with_augmentation = AgglomerativeClustering(n_clusters=2)
algo_2_results_with_augmentation = algo_2_with_augmentation.fit_predict(
    umap_test_road_results_with_augmentation
)
df_algo_2_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_test_road_results_with_augmentation[:, 0],
    Umap2=umap_test_road_results_with_augmentation[:, 1],
    Cluster=algo_2_results_with_augmentation,
))

# --- Cats vs Dogs: train ---
algo_3_with_augmentation = AgglomerativeClustering(n_clusters=2)
algo_3_results_with_augmentation = algo_3_with_augmentation.fit_predict(
    umap_catsvsdogs_train_results_with_augmentation
)
df_algo_3_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_catsvsdogs_train_results_with_augmentation[:, 0],
    Umap2=umap_catsvsdogs_train_results_with_augmentation[:, 1],
    Cluster=algo_3_results_with_augmentation,
))

# --- Cats vs Dogs: test ---
algo_4_with_augmentation = AgglomerativeClustering(n_clusters=2)
algo_4_results_with_augmentation = algo_4_with_augmentation.fit_predict(
    umap_catsvsdogs_test_results_with_augmentation
)
df_algo_4_with_augmentation = pd.DataFrame(dict(
    Umap1=umap_catsvsdogs_test_results_with_augmentation[:, 0],
    Umap2=umap_catsvsdogs_test_results_with_augmentation[:, 1],
    Cluster=algo_4_results_with_augmentation,
))

In [None]:
# Visualize the agglomerative clusters (with augmentation)
#
# One scatter plot per clustered embedding, in the order iRoads train,
# iRoads test, Cats vs Dogs train, Cats vs Dogs test. Each figure plots
# Umap1 vs Umap2 and colours the points by the "Cluster" column.
for cluster_frame, plot_title in (
    (df_algo_1_with_augmentation, "iRoads Train Clusters w Augmentation"),
    (df_algo_2_with_augmentation, "iRoads Test Clusters w Augmentation"),
    (df_algo_3_with_augmentation, "Cats vs Dogs Train Clusters w Augmentation"),
    (df_algo_4_with_augmentation, "Cats vs Dogs Test Clusters w Augmentation"),
):
  plt.figure(figsize=(6,4))
  sns.scatterplot(x="Umap1", y="Umap2", hue="Cluster", data=cluster_frame, palette="viridis", s=100, markers="o")
  plt.title(plot_title)
  plt.xlabel("Umap1")
  plt.ylabel("Umap2")
  plt.show()


In [None]:
# Evaluate the quality of the clusters - Davies-Bouldin score
#
# Every score is computed from a UMAP embedding and the labels produced for
# that same embedding. Order within each group: iRoads train, iRoads test,
# Cats vs Dogs train, Cats vs Dogs test.
# NOTE(review): the HDBSCAN label arrays are passed unfiltered; if they contain
# -1 entries those are scored like any other cluster label - confirm intended.

# HDBSCAN labels -> db_1 .. db_4
(
    db_1_with_augmentation,
    db_2_with_augmentation,
    db_3_with_augmentation,
    db_4_with_augmentation,
) = [
    davies_bouldin_score(embedding, labels)
    for embedding, labels in (
        (umap_train_road_results_with_augmentation, clustering_1_results_with_augmentation),
        (umap_test_road_results_with_augmentation, clustering_2_results_with_augmentation),
        (umap_catsvsdogs_train_results_with_augmentation, clustering_3_results_with_augmentation),
        (umap_catsvsdogs_test_results_with_augmentation, clustering_4_results_with_augmentation),
    )
]

# Agglomerative labels -> db_5 .. db_8
(
    db_5_with_augmentation,
    db_6_with_augmentation,
    db_7_with_augmentation,
    db_8_with_augmentation,
) = [
    davies_bouldin_score(embedding, labels)
    for embedding, labels in (
        (umap_train_road_results_with_augmentation, algo_1_results_with_augmentation),
        (umap_test_road_results_with_augmentation, algo_2_results_with_augmentation),
        (umap_catsvsdogs_train_results_with_augmentation, algo_3_results_with_augmentation),
        (umap_catsvsdogs_test_results_with_augmentation, algo_4_results_with_augmentation),
    )
]



In [None]:
# Print the Davies-Bouldin values computed in the previous cell.
# First line: scores for the HDBSCAN labels (db_1..db_4).
# Second line: scores for the agglomerative labels (db_5..db_8).
# Within each line the order is iRoads train, iRoads test, CatsvsDogs train, CatsvsDogs test.

print(f"HDBSCAN: iRoads train {db_1_with_augmentation}, test {db_2_with_augmentation},  CatsvsDogs train {db_3_with_augmentation}, test {db_4_with_augmentation}")
print(f"Agglomerative : iRoads train {db_5_with_augmentation}, test {db_6_with_augmentation}, CatsvsDogs train {db_7_with_augmentation}, test {db_8_with_augmentation}")


CLASSIFICATION

In [None]:
# iRoads classification
#
# Three classifiers are fitted on road_train_scaled / y_trainr and evaluated on
# road_test_scaled / y_testr. For each one the classification report, the
# confusion matrix and the accuracy are kept for the printing cell.

# --- XGBoost ---
modelboost = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
modelboost.fit(road_train_scaled, y_trainr)
boost_predict = modelboost.predict(road_test_scaled)

accuracyboost = accuracy_score(y_testr, boost_predict)
matrixboost = confusion_matrix(y_testr, boost_predict)
reportboost = classification_report(y_testr, boost_predict)

# --- SVM (RBF kernel) ---
model1 = SVC(random_state=23, kernel="rbf")
model1.fit(road_train_scaled, y_trainr)
prediction1 = model1.predict(road_test_scaled)

accuracy = accuracy_score(y_testr, prediction1)
matrix = confusion_matrix(y_testr, prediction1)
report = classification_report(y_testr, prediction1)

# --- MLP classifier ---
model2 = MLPClassifier(max_iter=300, random_state=23)
model2.fit(road_train_scaled, y_trainr)
prediction2 = model2.predict(road_test_scaled)

accuracy2 = accuracy_score(y_testr, prediction2)
matrix2 = confusion_matrix(y_testr, prediction2)
report2 = classification_report(y_testr, prediction2)


In [None]:
# Print the iRoads classifier evaluations
#
# For each model, in the order XGBoost, SVM, MLP: print its classification
# report and accuracy, then draw its confusion matrix as an annotated heatmap.
# Each model is paired with its own report/accuracy/matrix variables. The SVM
# section previously printed the XGBoost report and accuracy, and the XGBoost
# section printed the raw matrix instead of its accuracy.

# Class labels used for both heatmap axes.
class_labels = np.unique(y_testr)

for model_report, model_accuracy, model_matrix, color_map, plot_title in (
    (reportboost, accuracyboost, matrixboost, "Greens", "XGBoost Confusion Matrix"),
    (report, accuracy, matrix, "Reds", "SVM Confusion Matrix"),
    (report2, accuracy2, matrix2, "Blues", "MLP Confusion Matrix"),
):
  print(model_report)
  print(model_accuracy)

  plt.figure(figsize=(6,4))
  sns.heatmap(model_matrix, annot=True, fmt="g", cmap=color_map, xticklabels=class_labels, yticklabels=class_labels)
  plt.title(plot_title)
  plt.xlabel("Prediction")
  plt.ylabel("Actual")
  plt.show()



In [None]:
# Cats vs Dogs classification
#
# Two classifiers are fitted on catsvsdogs_train_scaled / y_train and evaluated
# on catsvsdogs_test_scaled / y_test. For each one the classification report,
# the confusion matrix and the accuracy are kept for the printing cell.

# --- Decision tree ---
model3 = DecisionTreeClassifier(splitter="random", max_features="sqrt", max_depth=10, random_state=23)
model3.fit(catsvsdogs_train_scaled, y_train)
prediction3 = model3.predict(catsvsdogs_test_scaled)

accuracy3 = accuracy_score(y_test, prediction3)
matrix3 = confusion_matrix(y_test, prediction3)
report3 = classification_report(y_test, prediction3)

# --- MLP classifier ---
model4 = MLPClassifier(max_iter=300, random_state=23)
model4.fit(catsvsdogs_train_scaled, y_train)
prediction4 = model4.predict(catsvsdogs_test_scaled)

accuracy4 = accuracy_score(y_test, prediction4)
matrix4 = confusion_matrix(y_test, prediction4)
report4 = classification_report(y_test, prediction4)


In [None]:
# Print the Cats vs Dogs classifier evaluations
#
# For each model, in the order Decision Tree, MLP: print its classification
# report and accuracy, then draw its confusion matrix as an annotated heatmap.
# The figure titles now spell "Confusion" correctly.

# Class labels used for both heatmap axes.
class_labels = np.unique(y_test)

for model_report, model_accuracy, model_matrix, color_map, plot_title in (
    (report3, accuracy3, matrix3, "Reds", "Decision Tree Confusion Matrix"),
    (report4, accuracy4, matrix4, "Blues", "MLP Confusion Matrix"),
):
  print(model_report)
  print(model_accuracy)

  plt.figure(figsize=(6,4))
  sns.heatmap(model_matrix, annot=True, fmt="g", cmap=color_map, xticklabels=class_labels, yticklabels=class_labels)
  plt.title(plot_title)
  plt.xlabel("Prediction")
  plt.ylabel("Actual")
  plt.show()



Classification - augmentation included

In [None]:
# iRoads classification (with augmentation)
#
# Three classifiers are fitted on road_train_scaled_with_augmentation /
# y_trainr_a and evaluated on road_test_scaled_with_augmentation / y_testr_a.
# For each one the classification report, the confusion matrix and the accuracy
# are kept for the printing cell.

# --- XGBoost ---
modelboost2 = xgb.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
modelboost2.fit(road_train_scaled_with_augmentation, y_trainr_a)
boost_predict2 = modelboost2.predict(road_test_scaled_with_augmentation)

accuracyboost2 = accuracy_score(y_testr_a, boost_predict2)
matrixboost2 = confusion_matrix(y_testr_a, boost_predict2)
reportboost2 = classification_report(y_testr_a, boost_predict2)

# --- SVM (RBF kernel) ---
model1_with_augmentation = SVC(random_state=23, kernel="rbf")
model1_with_augmentation.fit(road_train_scaled_with_augmentation, y_trainr_a)
prediction1_with_augmentation = model1_with_augmentation.predict(road_test_scaled_with_augmentation)

accuracy_with_augmentation = accuracy_score(y_testr_a, prediction1_with_augmentation)
matrix_with_augmentation = confusion_matrix(y_testr_a, prediction1_with_augmentation)
report_with_augmentation = classification_report(y_testr_a, prediction1_with_augmentation)

# --- MLP classifier ---
model2_with_augmentation = MLPClassifier(max_iter=300, random_state=23)
model2_with_augmentation.fit(road_train_scaled_with_augmentation, y_trainr_a)
prediction2_with_augmentation = model2_with_augmentation.predict(road_test_scaled_with_augmentation)

accuracy2_with_augmentation = accuracy_score(y_testr_a, prediction2_with_augmentation)
matrix2_with_augmentation = confusion_matrix(y_testr_a, prediction2_with_augmentation)
report2_with_augmentation = classification_report(y_testr_a, prediction2_with_augmentation)


In [None]:
# Print the iRoads classifier evaluations (with augmentation)
#
# For each model, in the order XGBoost, SVM, MLP: print its classification
# report and accuracy, then draw its confusion matrix as an annotated heatmap.
# The figure titles now spell "Confusion" correctly.

# Class labels used for both heatmap axes.
class_labels_with_augmentation = np.unique(y_testr_a)

for model_report, model_accuracy, model_matrix, color_map, plot_title in (
    (reportboost2, accuracyboost2, matrixboost2, "Greens", "XGBoost Confusion Matrix"),
    (report_with_augmentation, accuracy_with_augmentation, matrix_with_augmentation, "Reds", "SVM Classifier Confusion Matrix"),
    (report2_with_augmentation, accuracy2_with_augmentation, matrix2_with_augmentation, "Blues", "MLP Confusion Matrix"),
):
  print(model_report)
  print(model_accuracy)

  plt.figure(figsize=(6,4))
  sns.heatmap(model_matrix, annot=True, fmt="g", cmap=color_map, xticklabels=class_labels_with_augmentation, yticklabels=class_labels_with_augmentation)
  plt.title(plot_title)
  plt.xlabel("Prediction")
  plt.ylabel("Actual")
  plt.show()


In [None]:
# Cats vs Dogs classification (with augmentation)
#
# Two classifiers are fitted on catsvsdogs_train_scaled_with_augmentation /
# y_train_a and evaluated on catsvsdogs_test_scaled_with_augmentation /
# y_test_a. For each one the classification report, the confusion matrix and
# the accuracy are kept for the printing cell.

# --- Decision tree ---
model3_with_augmentation = DecisionTreeClassifier(splitter="random", max_features="sqrt", max_depth=10, random_state=23)
model3_with_augmentation.fit(catsvsdogs_train_scaled_with_augmentation, y_train_a)
prediction3_with_augmentation = model3_with_augmentation.predict(catsvsdogs_test_scaled_with_augmentation)

accuracy3_with_augmentation = accuracy_score(y_test_a, prediction3_with_augmentation)
matrix3_with_augmentation = confusion_matrix(y_test_a, prediction3_with_augmentation)
report3_with_augmentation = classification_report(y_test_a, prediction3_with_augmentation)

# --- MLP classifier ---
model4_with_augmentation = MLPClassifier(max_iter=300, random_state=23)
model4_with_augmentation.fit(catsvsdogs_train_scaled_with_augmentation, y_train_a)
prediction4_with_augmentation = model4_with_augmentation.predict(catsvsdogs_test_scaled_with_augmentation)

accuracy4_with_augmentation = accuracy_score(y_test_a, prediction4_with_augmentation)
matrix4_with_augmentation = confusion_matrix(y_test_a, prediction4_with_augmentation)
report4_with_augmentation = classification_report(y_test_a, prediction4_with_augmentation)


In [None]:
# Print the Cats vs Dogs classifier evaluations (with augmentation)
#
# For each model, in the order Decision Tree, MLP: print its classification
# report and accuracy, then draw its confusion matrix as an annotated heatmap.
# Both figures previously shared the same misspelled title, so they could not
# be told apart; each title now names its model.

# Class labels used for both heatmap axes.
class_labels_with_augmentation = np.unique(y_test_a)

for model_report, model_accuracy, model_matrix, color_map, plot_title in (
    (report3_with_augmentation, accuracy3_with_augmentation, matrix3_with_augmentation, "Reds", "Decision Tree Confusion Matrix"),
    (report4_with_augmentation, accuracy4_with_augmentation, matrix4_with_augmentation, "Blues", "MLP Confusion Matrix"),
):
  print(model_report)
  print(model_accuracy)

  plt.figure(figsize=(6,4))
  sns.heatmap(model_matrix, annot=True, fmt="g", cmap=color_map, xticklabels=class_labels_with_augmentation, yticklabels=class_labels_with_augmentation)
  plt.title(plot_title)
  plt.xlabel("Prediction")
  plt.ylabel("Actual")
  plt.show()
