In [None]:
import importlib
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf

import ucl_v3
import utilities # this file must be in the same folder as the notebook

In [None]:
# Run configuration; the values are read by the checkpoint-resume and training cells.
training_params = dict(
    # 'lof' (usable with kmeans or spectral), 'auto', or a float (kmeans only)
    elambda='lof',
    # 'kmeans' or 'spectral'; kmeans is recommended above ~50 000 images
    clustering_method='spectral',
    # dimensionality reduction is recommended by default
    use_dim_red=True,
    # number of iterations that use UMAP before switching to the autoencoder
    # (only when dimensionality reduction is enabled); set it above stop_iter
    # to use UMAP for all iterations
    umap_iterations=1,
    # if set above 1, the model saved at iteration start_iter-1 is loaded
    start_iter=1,
    stop_iter=10,
)

In [None]:
# Classes included in this run; comment/uncomment entries to change the subset.
labels = [
    'airplane',
    #'automobile',
    #'bird',
    #'cat',
    #'deer',
    #'dog',
    #'frog',
    'horse',
    #'ship',
    'truck',
]

# One text file per class and per split, e.g. data_txt/airplane_train.txt.
train_txt, valid_txt, test_txt = (
    [f'data_txt/{class_name}_{split}.txt' for class_name in labels]
    for split in ('train', 'valid', 'test')
)

train_txt

In [None]:
# Report how many GPU devices TensorFlow lists.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

In [None]:
# Re-import ucl_v3 so edits made to the module on disk are picked up
# without restarting the kernel.
from importlib import reload

ucl_v3 = reload(ucl_v3)  # reload() returns the re-initialised module object

# Bare expression: shows the function's repr as the cell output.
ucl_v3.create_model

In [None]:
# Reset Keras' global state before a new model is built. The original line only
# referenced clear_session without calling it, so nothing was ever cleared.
tf.keras.backend.clear_session()

# The GPU id to use, usually either "0" or "1".
# NOTE(review): CUDA_VISIBLE_DEVICES is normally read when the GPU runtime is
# first initialised, and an earlier cell already lists the GPU devices — confirm
# this assignment still takes effect, or set it before TensorFlow is imported.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Constants used for learning
WORK_IMG_SIZE = 112,112 # Note: (224, 224) is default for most pretrained models
INPUT_SHAPE = WORK_IMG_SIZE + (3,) # 3 for rgb, since model is pretrained for color images
BATCH_SIZE = 32

# dt = datetime.now()
# timestamp = str(dt)[:str(dt).find(".")].replace("-", "").replace(":", "").replace(" ", "_")  # use for log file name

# Output locations. STORE_Path has no trailing separator; the derived paths
# below add one, so they can be used with plain string concatenation.
STORE_Path = "Results_v2/Res_F2"
CHECKPOINTS_Path = STORE_Path+"/checkpoints/"
LOGS_Path = STORE_Path+"/"
PLOTS_Path = STORE_Path+"/plots/"

# Create the output directories. exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
for output_dir in (CHECKPOINTS_Path, PLOTS_Path):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Per-sample standardisation shared by both generators.
standardize_kwargs = dict(
    samplewise_center=True,
    samplewise_std_normalization=True,
)

# Generator for feature extraction: standardisation only, no randomness.
ext_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**standardize_kwargs)

# Generator for training: same standardisation plus randomised augmentations.
augmentation_kwargs = dict(
    rotation_range=30,
    brightness_range=None,
    shear_range=0.1,
    zoom_range=0.1,
    channel_shift_range=0.0,
    fill_mode='nearest',
    cval=0.0,
    horizontal_flip=True,
    vertical_flip=True,
)
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    **standardize_kwargs,
    **augmentation_kwargs,
)

In [None]:
# Build one path/label table per split from the per-class text files.
train_paths_df = utilities.make_path_df(train_txt)
valid_paths_df = utilities.make_path_df(valid_txt)
test_paths_df = utilities.make_path_df(test_txt)

# The validation split is merged into the training table.
train_paths_df = pd.concat([train_paths_df, valid_paths_df], ignore_index=True)

# Replace labels with integers. Keep names for future use.
label_names = train_paths_df["label"].unique()
label_dict = {name: idx for idx, name in enumerate(label_names)}

# Assign the result back instead of calling replace(..., inplace=True) on the
# selected column: the chained in-place form is deprecated and has no effect
# under pandas Copy-on-Write.
train_paths_df["label"] = train_paths_df["label"].replace(label_dict)
test_paths_df["label"] = test_paths_df["label"].replace(label_dict)

# Checks...
if train_paths_df.duplicated().any(): #should not happen...
    print("WARNING, there are duplicates in the dataset.")

print(f"Training data: {len(train_paths_df)}\nTesting data: {len(test_paths_df)}")
print("Unique labels: ",label_names, len(label_names))
print(train_paths_df["label"].value_counts())
print(test_paths_df["label"].value_counts())

In [None]:
# Show the combined training table as the cell output.
train_paths_df

In [None]:
# Show a 3x3 preview of training images for every integer class id.
for class_id in range(len(labels)):
    class_rows = train_paths_df[train_paths_df["label"] == class_id]
    preview = utilities.view_images(
        class_rows,
        n_images=9,
        cmap="Greys",
        size=(3, 3),
    )
    # view_images returns an object exposing tight_layout() (presumably a
    # matplotlib figure — TODO confirm).
    preview.tight_layout()


In [None]:
# Number of clusters equals the number of selected classes.
n_clusters = len(labels)

In [None]:
# Create model. Remember to rerun this before running the next cell
vgg16model = ucl_v3.create_model(tf.keras.applications.vgg16.VGG16, weights="imagenet", input_shape=INPUT_SHAPE,
                              #core_output_layer="block3_pool", # here you can set a different output layer
                              n_clusters=n_clusters,
                              learning_rate=0.00000001, momentum=0, # never actually used, model is recompiled later
                              random_seed=101)
vgg16model.save(CHECKPOINTS_Path + '0.ckpt')

In [None]:
# When resuming (start_iter above 1), replace the fresh model with the
# checkpoint saved at the previous iteration.
previous_iter = training_params['start_iter'] - 1
if previous_iter > 0:
    vgg16model = tf.keras.models.load_model(f"{CHECKPOINTS_Path}{previous_iter}.ckpt")

In [None]:
# Show the active run configuration as the cell output.
training_params

In [None]:
# Re-import ucl_v3 so the training run uses the module's current source.
from importlib import reload
reload(ucl_v3)

#vgg16model = tf.keras.models.load_model(CHECKPOINTS_Path + '16.ckpt')

# Keyword arguments for ucl_v3.make_ucl, grouped by purpose.
ucl_kwargs = dict(
    # data and generators
    data=train_paths_df["path"],
    true_labels=train_paths_df["label"],
    ext_datagen=ext_datagen,
    train_datagen=train_datagen,
    img_size=WORK_IMG_SIZE,
    # cluster centers
    starting_centers=None,
    freeze_centers=False,
    use_previous_centers=False,
    # optimisation
    random_seed=101,
    batch_size=BATCH_SIZE,
    freeze_layers=11, # first 3 conv blocks
    learning_rate=0.0001,
    momentum=0.9,
    elambda=training_params['elambda'],
    n_clusters=n_clusters,
    # iteration schedule
    start_iter=training_params['start_iter'],
    stop_iter=training_params['stop_iter'],
    n_epochs=50,
    use_validation=True,
    # outputs
    log_path=LOGS_Path,
    checkpoint_path=CHECKPOINTS_Path,
    plots_path=PLOTS_Path,
    show_images=False,
    show_figs=True,
    # clustering / dimensionality reduction
    clustering_method=training_params['clustering_method'],
    use_dim_red=training_params['use_dim_red'],
    umap_iterations=training_params['umap_iterations'],
    comment="",
)
mod, i_metrics, pl, sims, hists = ucl_v3.make_ucl(vgg16model, **ucl_kwargs)

# First rows of the per-iteration metrics as the cell output.
i_metrics.head(12)

In [None]:
# Show the full per-iteration metrics table as the cell output.
i_metrics

In [None]:
# One small line plot per metric column, saved under PLOTS_Path and shown inline.
for metric in i_metrics.columns:
    fig, ax = plt.subplots(figsize=(6, 3))
    i_metrics[metric].plot(ax=ax, title=metric)
    # Keep at least the 0..1.1 range visible; extend it when the metric goes higher.
    y_top = max(1.1, max(i_metrics[metric]) + 0.2)
    ax.set_ylim(bottom=0, top=y_top)
    fig.tight_layout()
    fig.savefig(PLOTS_Path + f"histories_{metric}.jpg") # for some reason, tight layout doesn't work with png
    plt.show()

In [None]:
# Evaluate every checkpoint from start_iter to stop_iter on the test set.
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
importlib.reload(ucl_v3)

from sklearn.metrics import silhouette_score
from sklearn.metrics import confusion_matrix
for i in range(training_params['start_iter'],training_params['stop_iter']+1):
    print("*****************************************")
    print("******** ITERATION "+str(i)+" ************")
    eval_model = tf.keras.models.load_model(CHECKPOINTS_Path+str(i)+".ckpt")

    preds_df = ucl_v3.predict_model(eval_model,test_paths_df["path"], labels=test_paths_df["label"], ext_datagen=ext_datagen, random_seed=101)
    print(preds_df.value_counts("correct_p"))
    # print(preds_df.value_counts("correct_c"))

    # Build the report once and reuse it for the console and the log file.
    report = utilities.make_classification_report(preds_df["true"], preds_df["p_pred"], labels=label_names)
    print(report)
    # The context manager closes the file even if the write raises.
    with open(LOGS_Path+"rep_p_it_"+str(i)+".txt", "w") as report_file:
        report_file.write(report)

    print(confusion_matrix(preds_df["true"], preds_df["p_pred"]))
    # print(sklearn.metrics.cluster.contingency_matrix(preds_df["true"], preds_df["p_pred"]))
    print("*****************************************")

In [None]:
# Evaluation using the base model

# Evaluation. Load the model checkpoint saved before any training (0.ckpt).
eval_model = tf.keras.models.load_model(CHECKPOINTS_Path+"0.ckpt")

preds_df = ucl_v3.predict_model(eval_model,test_paths_df["path"], labels=test_paths_df["label"], ext_datagen=ext_datagen, random_seed=101)

print(preds_df.value_counts("correct_p"))

# Build the report once and reuse it for the console and the log file.
report = utilities.make_classification_report(preds_df["true"], preds_df["p_pred"], labels=label_names)
print(report)
# The file name carries the checkpoint number (0). The previous name,
# rep_p_it3.txt, did not match the checkpoint and was shared with another
# evaluation cell, so one report overwrote the other.
with open(LOGS_Path+"rep_p_it_0.txt", "w") as report_file:
    report_file.write(report)


In [None]:
# Evaluation using the last trained model

# The last trained iteration is stop_iter; a hard-coded checkpoint number goes
# stale as soon as the run configuration changes.
last_iter = training_params['stop_iter']

# Evaluation. Load the model checkpoint
eval_model = tf.keras.models.load_model(CHECKPOINTS_Path+str(last_iter)+".ckpt")

preds_df = ucl_v3.predict_model(eval_model,test_paths_df["path"], labels=test_paths_df["label"], ext_datagen=ext_datagen, random_seed=101)

print(preds_df.value_counts("correct_p"))

# Build the report once and reuse it for the console and the log file.
report = utilities.make_classification_report(preds_df["true"], preds_df["p_pred"], labels=label_names)
print(report)
# Name the file after the evaluated iteration instead of the fixed
# rep_p_it3.txt, which did not match the checkpoint being evaluated.
with open(LOGS_Path+"rep_p_it_"+str(last_iter)+".txt", "w") as report_file:
    report_file.write(report)

In [None]:
# Integer id -> class name lookup. It is built from label_names (whose position
# is the integer id assigned during label encoding) rather than by inverting
# label_dict in place, so re-running this cell does not flip the mapping back.
label_dict = dict(enumerate(label_names))
label_dict

In [None]:
# Translate integer ids back to class names (label_dict maps id -> name here).
preds_df["true"] = [label_dict[x] for x in preds_df["true"]]
preds_df["p_pred"] = [label_dict[x] for x in preds_df["p_pred"]]

# Recompute correctness on the named labels and keep the wrong predictions.
preds_df["correct_p"] = preds_df["p_pred"] == preds_df["true"]
misclassified = preds_df[~preds_df["correct_p"]]

# STORE_Path has no trailing separator, so plain concatenation wrote the files
# next to the results folder ("Results_v2/Res_F2UCL_..."). Join the path instead.
misclassified.to_csv(os.path.join(STORE_Path, 'UCL_misclassified.csv'))
preds_df.to_csv(os.path.join(STORE_Path, 'UCL_all_pred.csv'))


In [None]:
# Cross-tabulate true vs. predicted labels and draw the counts as an annotated heatmap.
cm = pd.crosstab(index=preds_df["true"], columns=preds_df["p_pred"])
heatmap_style = dict(annot=True, fmt='d', cmap='Blues', linewidth=0.5)
f = sns.heatmap(cm, **heatmap_style)

In [None]:
# Show the rows whose predicted label differs from the true label.
misclassified

In [None]:
# Re-import utilities so edits to the helper module are picked up.
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
importlib.reload(utilities)

# Image grid of misclassified samples, each captioned with predicted and true label.
_ = utilities.view_images(misclassified["path"], labels='Pred: ' + misclassified["p_pred"] + '\nTrue: ' + misclassified["true"], cmap="Greys", n_images=64).tight_layout()

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim — consider deleting one.
# importlib.reload replaces imp.reload: the imp module is deprecated and was
# removed in Python 3.12.
importlib.reload(utilities)

# Image grid of misclassified samples, each captioned with predicted and true label.
_ = utilities.view_images(misclassified["path"], labels='Pred: ' + misclassified["p_pred"] + '\nTrue: ' + misclassified["true"], cmap="Greys", n_images=64).tight_layout()