<a href="https://colab.research.google.com/github/matthewshawnkehoe/Data-Analysis/blob/main/efficientnet_fine_tunning_ImageSearch_FAISS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Reference links:
- [1]https://www.pyimagesearch.com/2014/12/01/complete-guide-building-image-search-engine-python-opencv/

- [2]https://www.pinecone.io/learn/faiss-tutorial/#:~:text=Faiss%20is%20a%20library%20%E2%80%94%20developed,similar%20vectors%20within%20the%20index.

# IMAGE SEARCH

In [1]:
# install faiss library
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [2]:
import os
import time
import imutils
import math
import faiss
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from IPython.display import clear_output
from google.colab import files
from imutils import paths
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, GlobalAveragePooling2D

## Download files

In [3]:
def download_from_gdrive(file_id, file_name):
    !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget \
    --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate \
    'https://docs.google.com/uc?export=download&id=$file_id' -O- | \
    sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$file_id" \
    -O $file_name && rm -rf /tmp/cookies.txt
    clear_output(wait=True)

    print("{} downloaded!".format(file_name))

In [4]:
# download dogs.zip file from google drive
download_from_gdrive("1zIgbo84K8TLh2SGg8wVlSEHLMv1O8Bgm", "dogs_subset.zip")
# unzig the dogs file
!unzip -qq dogs_subset.zip

dogs_subset.zip downloaded!


## Set Training Parameters

In [5]:
IMAGES = "output"
TRAIN_SAMPLES = 5214
NUM_CLASSES = 5648 # One additional label for unknown class
IMG_WIDTH, IMG_HEIGHT = 224, 224

In [6]:
# get all image paths
image_paths = list(paths.list_images(IMAGES))


In [15]:
print(image_paths)
print(len(image_paths))

['output/8135804/00001785.png', 'output/1971290/00000654.png', 'output/8564441/00001919.png', 'output/26217740/00004005.png', 'output/1485445/00000056.png', 'output/1485445/00000173.png', 'output/1511482/00000425.png', 'output/5057418/00001454.png', 'output/11810200/00002131.png', 'output/14561036/00002330.png', 'output/3220072/00001136.png', 'output/19679621/00003057.png', 'output/20948498/00003200.png', 'output/1608227/00000157.png', 'output/1987392/00000758.png', 'output/1795361/00000456.png', 'output/2803869/00001011.png', 'output/1689242/00000292.png', 'output/17262907/00002770.png', 'output/3250317/00001172.png', 'output/1782422/00000431.png', 'output/1664355/00000261.png', 'output/19474505/00003019.png', 'output/27445174/00004198.png', 'output/10259415/00002069.png', 'output/15038781/00002510.png', 'output/1964887/00000565.png', 'output/1504067/00000343.png', 'output/2229881/00000801.png', 'output/6898604/00001528.png', 'output/27411342/00004189.png', 'output/7206279/00001588.pn

## Transfer Learning: Fine-Tuning on custom dataset

In [8]:
def get_model(img_width, img_height):
    base_model = EfficientNetB0(include_top=False,
                           input_shape=(img_width, img_height, 3))
    for layer in base_model.layers[:]:
        layer.trainable = False
    input = Input(shape=(img_width, img_height, 3))
    custom_model = base_model(input)
    custom_model = GlobalAveragePooling2D()(custom_model)
    custom_model = Dense(64, activation='relu')(custom_model)
    custom_model = Dropout(0.5)(custom_model)
    predictions = Dense(NUM_CLASSES, activation='softmax')(custom_model)
    return Model(inputs=input, outputs=predictions)

In [9]:
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input,
                                   rotation_range=20,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   zoom_range=0.2)

In [10]:
batch_size = 64
root_dir = 'output/'

In [11]:
train_generator = train_datagen.flow_from_directory(root_dir,
                                                    target_size=(IMG_WIDTH,
                                                                 IMG_HEIGHT),
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    seed=12345,
                                                    class_mode='categorical')

Found 5214 images belonging to 5648 classes.


In [12]:
model_finetuned = get_model(IMG_WIDTH, IMG_HEIGHT)
model_finetuned.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(0.001),
              metrics=['acc'])
history = model_finetuned.fit(
                              train_generator,
                              steps_per_epoch=100,
                              epochs=25)

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
Epoch 1/25
  4/100 [>.............................] - ETA: 2:16 - loss: 8.6477 - acc: 0.0039

UnknownError: ignored

## Create FAISS index

In [None]:
# load the features data
df = pd.read_csv("features_subset", header=None)

In [None]:
# exclude AnimalInter-ID (first column)
features = df.iloc[:, 1:]
embeddings = features.values
# get embeddings shape
dim = embeddings.shape[1]
print("Dimension:", dim)

# convert to float32
embeddings = embeddings.astype(np.float32)

# initialize IndexFlatL2 index with our vector dimensionality
index = faiss.IndexFlatL2(dim)
# check whether an index needs to be trained using the is_trained method
print("is_trained?", index.is_trained)
# This will handle => ValueError: array is not C-contiguous
embeddings_fixed = embeddings.copy(order='C')

# load embeddings
index.add(embeddings_fixed)
print("total indices:",index.ntotal)

In [None]:
faiss.write_index(index, 'faiss_index')

## Helper functions

In [None]:
def get_features(image_path):
    """ Load and preprocess image."""
    query = cv2.imread(image_path)
    query = cv2.resize(query, (224, 224), interpolation=cv2.INTER_AREA)
    query = cv2.cvtColor(query, cv2.COLOR_BGR2RGB)
    features = fe.extract_features(query)
    return features, query

def load_image(imgpath):
    image = cv2.imread(imgpath)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = imutils.resize(image, width=200)
    return image

In [None]:
index = faiss.read_index('faiss_index')

In [None]:
new_df = df.rename(columns={0: 'dog_id'}).set_index('dog_id')
new_df.head()

In [None]:
new_df.loc[3979432]

In [None]:
new_df = new_df.astype(np.float32)

In [None]:
new_df.to_csv('features_subset_with_id.csv')

In [None]:
test = pd.read_csv('features_subset_with_id.csv')
test.head()

In [None]:
df.head()

In [None]:
def get_similar_ids(query_id, k):
    # load image
    xq = new_df.loc[query_id].astype(np.float32)
    # get query image id
    # queryID = query_image_path.split(os.path.sep)[1]
    # start tracking time
    start = time.time()
    D, I = index.search(np.array([xq]), k)
    print("Time taken to search: {:.2f}s".format(time.time() - start))
    # similar dogs ids
    similar_dog_ids = new_df.iloc[I[0]].values
    return similar_dog_ids

similar_dog_ids = get_similar_ids(25641005, 5)

In [None]:
img_paths = list(paths.list_images(os.path.join(IMAGES, str(25641005))))
image = load_image(img_paths[0])
plt.imshow(image)
plt.axis('off')
plt.title(str(id))

In [None]:
similar_dog_ids

In [None]:
 """ # create a figure object
fig = plt.figure(figsize=(10, 6))
# loop over the results and display the similar images
for i, id in enumerate(similar_dog_ids):
    ax = fig.add_subplot(2, 5, i+1)
    img_paths = list(paths.list_images(os.path.join(IMAGES, str(id))))
    image = load_image(img_paths[0])
    plt.imshow(image)
    plt.axis('off')
    plt.title(str(id))
plt.tight_layout()
plt.suptitle("Similar Dogs\n", fontsize=16)
plt.show() """

In [None]:
def show_similar_images(query_image_path, k):
    # load image
    xq, query_image = get_features(query_image_path)
    # get query image id
    # queryID = query_image_path.split(os.path.sep)[1]
    # start tracking time
    start = time.time()
    D, I = index.search(np.array([xq]), k)
    print("Time taken to search: {:.2f}s".format(time.time() - start))
    # similar dogs ids
    similar_dog_ids = df[0].iloc[I[0]].values

    # display query image
    plt.imshow(query_image)
    plt.axis("off")
    plt.title(f"Query Image", fontsize=18)
    plt.show()
    # create a figure object
    fig = plt.figure(figsize=(10, 6))
    # loop over the results and display the similar images
    for i, id in enumerate(similar_dog_ids):
        ax = fig.add_subplot(2, 5, i+1)
        img_paths = list(paths.list_images(os.path.join(IMAGES, str(id))))
        image = load_image(img_paths[0])
        plt.imshow(image)
        plt.axis('off')
        plt.title(str(id))
    plt.tight_layout()
    plt.suptitle("Similar Dogs\n", fontsize=16)
    plt.show()

## Upload images from local system and get similar dog images

In [None]:
# upload option
uploaded = files.upload()
show_similar_images(list(uploaded.keys())[0], k=10)