# Comparing Image Retrival with and without Generalizer

### Imports

In [2]:
import os
import json
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.applications import DenseNet201
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from textwrap import wrap

nltk.download('punkt')
nltk.download('stopwords')

2024-01-05 23:36:26.072438: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-05 23:36:26.099416: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-05 23:36:26.099452: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-05 23:36:26.100740: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-05 23:36:26.106563: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-05 23:36:26.107244: I tensorflow/core/platform/cpu_feature_guard.cc:1

True

In [3]:
image_path = "Images"
model_path = "image_caption_model_data"

### Load Image Captioning Model

#### Load Models and Tokenizer

In [4]:
caption_model = tf.keras.models.load_model(model_path+'/caption_model.keras')
with open(model_path+'/tokenizer.json') as f:
    data = json.load(f)
    tokenizer = tokenizer_from_json(data)
dense_net_201_model = DenseNet201()
feature_extraction_model = Model(inputs=dense_net_201_model.input, outputs=dense_net_201_model.layers[-2].output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet201_weights_tf_dim_ordering_tf_kernels.h5


In [5]:
def extract_features(images: list):
    img_size = 224
    features = {}
    for image in tqdm(images):
        img = load_img(os.path.join(image_path,image),target_size=(img_size,img_size))
        img = img_to_array(img)
        img = img/255.
        img = np.expand_dims(img,axis=0)
        feature = feature_extraction_model.predict(img, verbose=0)
        features[image] = feature
    return features

In [6]:
def idx_to_word(integer,tokenizer):
    
    for word, index in tokenizer.word_index.items():
        if index==integer:
            return word
    return None

In [7]:
def predict_caption(model, image, tokenizer, max_length, features):
    feature = features[image]
    in_text = "startseq"
    for i in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], max_length)

        y_pred = model.predict([feature,sequence])
        y_pred = np.argmax(y_pred)
        
        word = idx_to_word(y_pred, tokenizer)
        
        if word is None:
            break
            
        in_text+= " " + word
        
        if word == 'endseq':
            break
            
    return in_text 

#### Generate Captions

In [8]:
images = os.listdir(image_path)
features = extract_features(images)

100%|██████████| 300/300 [00:30<00:00,  9.90it/s]


In [35]:
def generalize_query(query):
    from transformers import GPT2LMHeadModel , GPT2Tokenizer
    import torch

    #checks for GPU to choose
    if torch.cuda.is_available():  
        dev = "cuda:0" 
    else:  
        dev = "cpu"  
    device = torch.device(dev)

    query = "Generalize: " + "\"" + query + "\""

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-xl') 
    model = GPT2LMHeadModel.from_pretrained('gpt2-xl', pad_token_id = tokenizer.eos_token_id)
    model = model.to(device)

    input_ids = tokenizer.encode(query, return_tensors='pt').to(device)

    output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.8)

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

In [9]:
image_data = []

for image in images:
    img = load_img(os.path.join(image_path, image),target_size=(224,224))
    img = img_to_array(img)
    img = img/255.
    
    caption = predict_caption(caption_model, image, tokenizer, 34, features)
    generalized_caption = caption #Todo: Add Generalizer here!
    image_data.append((image, caption, generalized_caption))

image_data_df = pd.DataFrame(image_data, columns=['image','caption','generalized_caption'])



### Indexing Images

In [10]:
class Preprocessor:
    def __init__(self):
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.stemmer = PorterStemmer()

    def preprocess_string(self, data: str) -> list:
        # remove stopwords
        removed_stoppwords = []
        caption = data.lower()
        for word in caption.split():
            if word not in self.stopwords:
                removed_stoppwords.append(word)
        removed_stoppwords = " ".join(removed_stoppwords)

        # remove numbers and punctuation
        text_only = ""
        for character in removed_stoppwords:
            if character.isalpha() or character == " ":
                text_only += character
            else:
                text_only += " "

        # Tokenization
        tokenized = nltk.word_tokenize(text_only)

        # Stemming
        stemmed = [self.stemmer.stem(token) for token in tokenized]
        return stemmed

    def preprocess_dataset(self, df: pd.DataFrame) -> pd.DataFrame:
        df["tokens"] = df["caption"].astype(object)
        df["tokens_generalized"] = df["generalized_caption"].astype(object)
        for index, row in df.iterrows():
            df["tokens"][index] = self.preprocess_string(str(row["caption"]))
            df["tokens_generalized"][index] = self.preprocess_string(str(row["generalized_caption"]))
        return df
    
    def create_lookup_table(self, df: pd.DataFrame) -> dict[str,int]:
        all_tokens = df['tokens'].apply(pd.Series).stack().unique().tolist()
        all_tokens += df['tokens_generalized'].apply(pd.Series).stack().unique().tolist()
        all_tokens = set(all_tokens)
        lookup_table = {}
        i = 0
        for word in all_tokens:
            lookup_table[word] = i
            i+=1
        return lookup_table
    
    def create_reduced_document_index(self, df: pd.DataFrame, column: str, lookup_table: dict) -> list:
        document_counts = []
        for _, row in df.iterrows():
            binary_token_count = np.zeros(len(lookup_table))
            for token in row[column]:
                binary_token_count[lookup_table[token]] = 1
            document_counts.append((row["image"], binary_token_count))
        return document_counts

In [11]:
preprocessor = Preprocessor()
processed_df = preprocessor.preprocess_dataset(image_data_df)
lookup_table = preprocessor.create_lookup_table(processed_df)
searchable_images = preprocessor.create_reduced_document_index(processed_df,'tokens',lookup_table)
searchable_images_generalized = preprocessor.create_reduced_document_index(processed_df,'tokens_generalized',lookup_table)

### Searching

In [12]:
def search(lookup_table: dict, query_tokenized : list, documents: list) -> list:
    results = []
    compareable = np.zeros(len(lookup_table))
    for token in query_tokenized:
        if token in lookup_table:
            compareable[lookup_table[token]] = 1
    query_sum = np.sum(compareable.astype(int))

    if query_sum == 0:
        return []

    for document in documents:
        comparision = compareable.astype(bool) & document[1].astype(bool)
        comparison_sum = np.sum(comparision)
        if query_sum == comparison_sum:
            results.append(document[0])
    return results

In [13]:
def display_images(images):
    plt.figure(figsize = (20 , 20))
    n = 0
    for i in range(min(25,len(images))):
        n+=1
        plt.subplot(5 , 5, n)
        plt.subplots_adjust(hspace = 0.7, wspace = 0.3)
        image = load_img(f"Images/{images[i]}",color_mode='rgb',target_size=(224,224))
        image = img_to_array(image)
        image = image/255.
        plt.imshow(image)
        plt.axis("off")

In [14]:
query = "climbing"
query_tokenized = preprocessor.preprocess_string(query)

results = search(lookup_table, query_tokenized, searchable_images)
results_generalized = search(lookup_table, query_tokenized, searchable_images_generalized)

display_images(results)
display_images(results_generalized)

<Figure size 2000x2000 with 0 Axes>

<Figure size 2000x2000 with 0 Axes>