# 图像相似度比较

## 载入套件

In [1]:
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input
import numpy as np

## 载入VGG 16 模型

In [2]:
# Load VGG16 with ImageNet weights, leaving out the top three layers
# (the classification head) so the model outputs convolutional feature maps
model = VGG16(weights='imagenet', include_top=False)
# Print the layer-by-layer architecture
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)  

In [3]:
# Pick any image (here a side view of an elephant) and extract its features
img_path = './images_test/elephant.jpg'

# Load the image, resized to 224 x 224
img = image.load_img(img_path, target_size=(224, 224))

# Convert to an array, prepend a batch axis -> shape (1, 224, 224, 3),
# then apply the VGG16 input preprocessing
x = preprocess_input(image.img_to_array(img)[np.newaxis, ...])

# Run the model (built with include_top=False) and show the first sample's
# feature map
features = model.predict(x)
print(features[0])

[[[ 0.         0.         0.        ...  0.         0.         0.       ]
  [ 0.         0.        42.547764  ...  0.         0.         0.       ]
  [ 1.075339   0.        23.495638  ...  0.         0.         0.       ]
  ...
  [ 0.         0.         0.        ...  0.         0.         0.       ]
  [ 0.         0.         0.        ...  0.         0.         0.       ]
  [ 0.         0.         0.        ...  0.         0.         0.       ]]

 [[ 0.         0.        36.338856  ...  0.         0.         3.4028761]
  [ 0.         0.        80.23629   ...  7.871895   0.         0.       ]
  [ 0.         0.        48.75136   ...  0.         0.         0.       ]
  ...
  [ 0.         0.         0.        ...  4.581372   0.         0.       ]
  [ 0.         0.         0.        ...  0.         0.         0.       ]
  [ 0.         0.         0.        ...  0.         0.         0.       ]]

 [[ 0.         0.         9.853486  ...  0.         0.         2.4919674]
  [ 0.         0.     

In [4]:
# Show the dimensions of the extracted feature tensor
print(features.shape)

(1, 7, 7, 512)


## 使用 cosine_similarity 比较特征向量

### 步骤 1. 取得 images_test 目录下所有 .jpg 档案名称

In [5]:
from os import listdir
from os.path import isfile, join

# Collect the names of all .jpg files in the images_test directory.
# os.listdir returns entries in arbitrary order, so the names are sorted:
# later cells select images by position (e.g. index -2), which is only
# meaningful with a stable ordering.
img_path = './images_test/'
image_files = np.array(sorted(
    f for f in listdir(img_path)
    # Match the real ".jpg" extension, not just a trailing "jpg"
    if isfile(join(img_path, f)) and f.endswith('.jpg')
))
image_files

array(['bird.jpg', 'bird2.jpg', 'daisy1.jpg', 'daisy2.jpg', 'deer.jpg',
       'elephant.jpg', 'elephant2.jpg', 'lion1.jpg', 'lion2.jpg',
       'panda1.jpg', 'panda2.jpg', 'panda3.jpg', 'rose2.jpg',
       'tiger1.jpg', 'tiger2.jpg', 'tiger3.jpg'], dtype='<U13')

### 步骤 2. 取得 images_test 目录下所有 .jpg 档案的像素

In [6]:
import numpy as np

# Load the pixels of every image and combine them into one batch.
# The arrays are collected in a list and stacked once at the end; calling
# np.concatenate inside the loop would re-copy the whole batch every iteration.
batch = []
for f in image_files:
    image_file = join(img_path, f)
    # Load the image, resized to 224 x 224
    img = image.load_img(image_file, target_size=(224, 224))
    batch.append(image.img_to_array(img))

# Stack along a new leading (batch) axis
X = np.stack(batch)

# Apply the VGG16 input preprocessing to the whole batch
X = preprocess_input(X)

### 步骤 3. 取得所有图档的特征向量

In [7]:
# Extract the features of all images with a single predict call on the batch
features = model.predict(X)

# Display the shapes of the feature tensor and of the input batch
features.shape, X.shape

((16, 7, 7, 512), (16, 224, 224, 3))

### 步骤 4. 使用 cosine_similarity 函数比较特征向量

In [8]:
# 使用 cosine_similarity 比较特征向量
from sklearn.metrics.pairwise import cosine_similarity


# Compare one image (index -2, expected to be tiger2.jpg) with all the others
no = -2
print(image_files[no])

# Flatten each feature map into a single row vector, like a Flatten layer
features2 = features.reshape((features.shape[0], -1))

# Feature rows of every image except the query image.
# np.delete handles negative indices correctly, whereas slicing with
# [:no] / [no+1:] breaks for no = -1 (no + 1 == 0 selects the whole array).
other_features = np.delete(features2, no, axis=0)

# Cosine similarity between the query row and all other rows;
# list indexing keeps the query two-dimensional for any valid index
similar_list = cosine_similarity(features2[[no]], other_features)

# Show the similarities in descending order
order = np.argsort(similar_list[0])[::-1]
print(similar_list[0][order])

# Show the file names in the same descending-similarity order
image_files2 = np.delete(image_files, no)
image_files2[order]

tiger2.jpg
[0.350051   0.26541096 0.19276574 0.19042632 0.16894677 0.14176077
 0.10579197 0.10556371 0.09763556 0.0929383  0.08532473 0.08095936
 0.07598996 0.06710661 0.03114463]


array(['tiger1.jpg', 'tiger3.jpg', 'lion1.jpg', 'elephant.jpg',
       'elephant2.jpg', 'lion2.jpg', 'panda2.jpg', 'panda3.jpg',
       'bird.jpg', 'panda1.jpg', 'bird2.jpg', 'deer.jpg', 'rose2.jpg',
       'daisy2.jpg', 'daisy1.jpg'], dtype='<U13')

### 其他图档比较

In [9]:
# Query image: index 0 (expected to be bird.jpg)
no = 0
print(image_files[no])


# Cosine similarity between the query image and every other image.
# np.delete and list indexing stay correct for any valid index, including
# negative ones, unlike slicing with [:no] / [no+1:] / [no:no+1].
other_features = np.delete(features2, no, axis=0)
similar_list = cosine_similarity(features2[[no]], other_features)

# Show the similarities in descending order
order = np.argsort(similar_list[0])[::-1]
print(similar_list[0][order])

# Show the file names in the same descending-similarity order
image_files2 = np.delete(image_files, no)
image_files2[order]

bird.jpg
[0.18169087 0.12461188 0.12236218 0.11822931 0.10887703 0.1053412
 0.09763555 0.09459615 0.07993933 0.06803741 0.06442314 0.06261782
 0.05864661 0.0449845  0.01628361]


array(['bird2.jpg', 'deer.jpg', 'panda3.jpg', 'lion2.jpg', 'panda1.jpg',
       'rose2.jpg', 'tiger2.jpg', 'tiger1.jpg', 'elephant2.jpg',
       'panda2.jpg', 'elephant.jpg', 'lion1.jpg', 'tiger3.jpg',
       'daisy2.jpg', 'daisy1.jpg'], dtype='<U13')