In [1]:
import numpy as np 
import matplotlib as mpl 
import matplotlib.pyplot as plt 
import pickle
import random

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

import os, shutil, glob, os.path

# Read the train and test triplet files (train first, then test).
# Presumably each row holds three image indices; downstream code casts the
# entries with int() before using them as indices — confirm the file format.
data_trainX, data_testX = (
    np.loadtxt(triplet_path)
    for triplet_path in ('handout/train_triplets.txt', 'handout/test_triplets.txt')
)

2022-04-30 21:20:49.453644: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [5]:
# Load the ImageNet-pretrained VGG16 network without its classification head,
# so that predict() returns the convolutional feature maps.
# NOTE(review): Pillow reads LOAD_TRUNCATED_IMAGES from PIL.ImageFile; setting
# the attribute on the Keras `image` module probably has no effect — confirm.
image.LOAD_TRUNCATED_IMAGES = True
model = VGG16(weights='imagenet', include_top=False)

# Sorted list of every .jpg in the food directory; the sort fixes the order of
# the feature rows.
imdir = 'handout/food/'
filelist = sorted(glob.glob(os.path.join(imdir, '*.jpg')))


def _vgg16_features(path):
    """Return the flattened VGG16 output for the image stored at ``path``."""
    pixels = image.img_to_array(image.load_img(path, target_size=(224, 224)))
    # The network expects a leading batch axis and VGG16-specific preprocessing.
    batch = preprocess_input(np.expand_dims(pixels, axis=0))
    return np.asarray(model.predict(batch)).flatten()


# One feature vector per image, in the same order as `filelist`.
featurelist = [_vgg16_features(path) for path in filelist]

# optional: cache the extracted features (not the model) in a pickle file
with open('featurelist_vgg16.pkl', 'wb') as fp:
    pickle.dump(featurelist, fp)

2022-04-30 21:22:42.104561: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-04-30 21:22:42.105429: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2294660000 Hz


In [6]:
def similarity_numbers(itemlist, triplets, triplets_test, ncat):
    '''
    Predict triplet orderings from category-level co-occurrence scores.

    A (ncat x ncat) score grid is built from ``triplets``: for every row
    (first, second, third) the items are mapped to categories through
    ``itemlist``; the (first, second) pair gains +1 and the (first, third)
    pair loses 1, both applied symmetrically.

    Each row of ``triplets_test`` (same layout as ``triplets``) is then
    predicted as 1 when the (first, second) score exceeds the (first, third)
    score, 0 when it is lower, and a random 0/1 draw when both are equal.

    Parameters
    ----------
    itemlist : indexable
        Maps an item index to its category label; labels index the grid.
    triplets : sequence of rows
        Training triplets; the first three entries of a row are cast to int.
    triplets_test : sequence of rows
        Triplets to predict, same layout as ``triplets``.
    ncat : int
        Number of categories, i.e. the side length of the score grid.

    Returns
    -------
    numpy.ndarray
        One 0/1 prediction per row of ``triplets_test``.

    Notes
    -----
    Reseeds NumPy's *global* random state with 4 on every call so that the
    tie-breaking draws are repeatable.
    '''
    np.random.seed(4)

    def to_categories(row):
        # Look up the category of the first, second and third item, in that order.
        return itemlist[int(row[0])], itemlist[int(row[1])], itemlist[int(row[2])]

    score = np.zeros((ncat, ncat))
    for row in triplets:
        anchor, closer, farther = to_categories(row)
        # The similar pair is rewarded, the dissimilar pair is penalised.
        score[anchor, closer] += 1
        score[closer, anchor] += 1
        score[anchor, farther] -= 1
        score[farther, anchor] -= 1

    votes = []
    for row in triplets_test:
        anchor, option_a, option_b = to_categories(row)
        score_a = score[anchor, option_a]
        score_b = score[anchor, option_b]

        if score_a == score_b:
            # No evidence either way: fall back to a seeded coin flip.
            votes.append(np.random.randint(2))
        else:
            votes.append(1 if score_a > score_b else 0)

    return np.array(votes)

In [7]:
# optional: load the cached features instead of recomputing them.
# The extraction cell writes 'featurelist_vgg16.pkl', so the extension is needed.
# with open('featurelist_vgg16.pkl', 'rb') as fp:
#     featurelist = pickle.load(fp)

# Why an ensemble:
# Most fixed choices of PCA dimension and KMeans cluster count performed
# comparably well in cross-validation, yet their predictions differ quite a lot
# from one another. We exploit this diversity by averaging: two nested loops
# (over the PCA dimension and the number of clusters) generate many candidate
# predictions, and a candidate is kept only if it differs from every already
# kept prediction by at least `threshold` entries (threshold = 20000 was chosen
# using cv). The averaged vote is then mapped to 1 where it is > 0.5 and to 0
# otherwise. This raised the cv score from 0.60 to 0.64.

SEED = 0           # fixes the shuffle order and the PCA solver so reruns reproduce the ensemble
threshold = 20000  # minimum number of differing test predictions for a new ensemble member

# Dedicated generator: reproducible without touching the global `random` state.
shuffler = random.Random(SEED)

M = []  # accepted, mutually diverse prediction vectors
for components in range(10, 110, 10):
    print('PCA: ', components)
    pca = PCA(n_components=components, random_state=SEED)
    images_pca = pca.fit_transform(featurelist)

    # Visit the cluster counts in random order: acceptance depends on what is
    # already in M, so the order influences which candidates are kept.
    cluster_range = list(range(10, 110))
    shuffler.shuffle(cluster_range)

    for clusters in cluster_range:
        print('cluster: ', clusters)
        kmeans = KMeans(n_clusters=clusters, random_state=0).fit(np.array(images_pca))
        pred = similarity_numbers(kmeans.labels_, data_trainX, data_testX, clusters)

        # For 0/1 vectors the squared Euclidean distance is the number of
        # differing entries; any() stops at the first member that is too close.
        # An empty M accepts the first candidate without a special case.
        too_close = any(np.count_nonzero(kept != pred) < threshold for kept in M)
        if not too_close:
            M.append(pred)

counter = len(M)  # number of ensemble members
M = np.array(M)

# Average the integer votes in one step (exact sum, single division) so an
# exact 50/50 split cannot drift above 0.5 through accumulated rounding.
final = M.mean(axis=0)
predictions = (final > 0.5).astype(float)

np.savetxt("predictions.txt", predictions, fmt="%i")

PCA:  10
cluster:  63
cluster:  55
cluster:  68
cluster:  98
cluster:  19
cluster:  53
cluster:  79
cluster:  88
cluster:  11
cluster:  28
cluster:  48
cluster:  103
cluster:  12
cluster:  21
cluster:  109
cluster:  56
cluster:  42
cluster:  49
cluster:  62
cluster:  27
cluster:  76
cluster:  90
cluster:  73
cluster:  72
cluster:  39
cluster:  23
cluster:  13
cluster:  81
cluster:  100
cluster:  24
cluster:  54
cluster:  29
cluster:  15
cluster:  108
cluster:  89
cluster:  43
cluster:  20
cluster:  97
cluster:  41
cluster:  66
cluster:  80
cluster:  60
cluster:  99
cluster:  86
cluster:  87
cluster:  92
cluster:  10
cluster:  16
cluster:  52
cluster:  94
cluster:  64
cluster:  106
cluster:  47
cluster:  96
cluster:  93
cluster:  51
cluster:  50
cluster:  35
cluster:  74
cluster:  26
cluster:  85
cluster:  22
cluster:  59
cluster:  105
cluster:  69
cluster:  83
cluster:  57
cluster:  61
cluster:  14
cluster:  67
cluster:  32
cluster:  17
cluster:  84
cluster:  38
cluster:  36
cluster:  