In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input, decode_predictions

#set the random seeds
np.random.seed(42)
tf.random.set_seed(42)

In [6]:
#read the triplet pairs of images
train_triplets = pd.read_csv('train_triplets.txt', delimiter = ' ', names = ['A', 'B', 'C'], dtype=str) + '.jpg'
test_triplets = pd.read_csv('test_triplets.txt', delimiter = ' ', names = ['A', 'B', 'C'], dtype=str) + '.jpg'

In [7]:
#add labels to the training dataframe
len = train_triplets.shape[0]
labels = np.ones(len)
train_triplets['labels'] = labels

In [8]:
#for the training data switch B and C random with 0.5 chance to balance labels
df_sample = train_triplets.sample(round(len/2))
indicies = df_sample.index
train_triplets.loc[indicies,'labels'] = 0

#dreiecks tausch:
df_B = train_triplets.loc[indicies, 'B']
train_triplets.loc[indicies,'B'] = train_triplets.loc[indicies,'C']
train_triplets.loc[indicies,'C'] = df_B

In [78]:
def feature_extractor(image_name):
    """
        for a given image name this function returns the feature vector,
        which was computed using the pretrained convoulutional layers of
        ResNet50. The final feautre is obtained by averageing the pooling
        layer obtained from RasNet50
        
        Args: 
            image_name: string of the image name e.g. '02345.jpg'
            
        returns:
            feature_average: tf.tensor storing the features obtained
                                from RasNet50
    """
    
    img = image.load_img(path+image_name, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    
    #predict whats in the image
    features = model.predict(x)
    
    global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
    feature_average = global_average_layer(features)
    
    return feature_average


In [None]:
"""
    We want to create a new dataframe storing the features for all 5000 unique training images
    and all 5000 unique test images as we hope to save computation time this way
"""

In [92]:
#therfore we first need to extract the unique image names for training and testing
unique_train = np.unique(train_triplets.to_numpy()[:,0:3].flatten())
unique_test = np.unique(test_triplets.to_numpy()[:,0:3].flatten())

In [93]:
#create new dataframe that stores the image names
train_images = pd.DataFrame({'images': unique_train})
test_images = pd.DataFrame({'images': unique_test})

In [105]:
#for the training images store all the corresponding features in the dataframe

model = ResNet50(weights='imagenet', include_top=False)

train_feature_0 = feature_extractor(train_images.iloc[0,0]).numpy()
train_features = pd.DataFrame(train_feature_0)

len_train = unique_train.shape[0]#number of unique images in the training set
for i in tqdm(range(1,len_train)):
    
    feature_i = feature_extractor(train_images.iloc[i,0]).numpy()
    df_feature_i = pd.DataFrame(feature_i)
    train_features = pd.concat([train_features, df_feature_i], ignore_index = True)
    
#concat image names and features
train_features = pd.concat([train_images,train_features],axis=1,ignore_index = True)
#save the dataframe
train_features.to_csv('train_features.csv',index = False)

2022-05-10 13:37:38.650990: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
100%|███████████████████████████████████████| 4999/4999 [02:58<00:00, 28.02it/s]


In [118]:
#for the test images store all the corresponding features in the dataframe
test_feature_0 = feature_extractor(test_images.iloc[0,0]).numpy()
test_features = pd.DataFrame(test_feature_0)

len_test = unique_test.shape[0]#number of unique images in the training set
for i in tqdm(range(1,len_test)):
    
    feature_i = feature_extractor(test_images.iloc[i,0]).numpy()
    df_feature_i = pd.DataFrame(feature_i)
    test_features = pd.concat([test_features, df_feature_i], ignore_index = True)
    
#concat image names and features
test_features = pd.concat([test_images,test_features],axis=1, ignore_index = True)
#save the dataframe
test_features.to_csv('test_features.csv',index = False)

100%|███████████████████████████████████████| 4999/4999 [03:27<00:00, 24.07it/s]
