In [None]:
import os
import pandas as pd
from tqdm.auto import tqdm

from img2vec_keras import Img2Vec

In [23]:
from tensorflow import keras

from tensorflow.keras.applications import InceptionV3
from keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
import numpy as np


# Size passed as `target_size` to `image.load_img`: every image is resized to
# this before being fed to the InceptionV3-based model.
_IMAGE_NET_TARGET_SIZE = (299, 299)

class Img2Vec:
    """Image embedder built on an InceptionV3 model with ImageNet weights.

    The network is cut at its ``avg_pool`` layer, so an image is mapped to
    that layer's activations rather than to class scores.

    NOTE(review): this definition shadows the ``Img2Vec`` imported from
    ``img2vec_keras`` in the first cell of the notebook.
    """

    def __init__(self):
        """Build the truncated model ending at the ``avg_pool`` layer."""
        model = InceptionV3(weights='imagenet')
        layer_name = 'avg_pool'
        self.intermediate_layer_model = Model(
            inputs=model.input,
            outputs=model.get_layer(layer_name).output
        )

    def get_vec(self, image_path):
        """Get a vector embedding from an image.

        :param image_path: path to image on filesystem
        :returns: numpy ndarray holding the ``avg_pool`` activations for the
            image (presumably 2048 values for InceptionV3 -- confirm)
        """
        img = image.load_img(image_path, target_size=_IMAGE_NET_TARGET_SIZE)
        x = image.img_to_array(img)
        # The model works on batches: add a leading axis for a batch of one.
        x = np.expand_dims(x, axis=0)
        x = preprocess_input(x)
        # verbose=0: this method is called once per image from a progress-bar
        # loop; without it recent Keras versions print a progress line per call.
        intermediate_output = self.intermediate_layer_model.predict(
            x, verbose=0
        )

        # Drop the batch axis again and return the single embedding.
        return intermediate_output[0]

In [24]:
# Build the embedder once and reuse it for every image below; the constructor
# instantiates InceptionV3 with ImageNet weights.
img2vec = Img2Vec()

In [30]:
# Compute one CSV of image embeddings per listing folder under the train
# images directory. The loop is resumable: listings whose CSV already exists
# are skipped, so an interrupted run can simply be restarted.
TRAIN_IMAGES_DIR = "../data/reduced_images/train"
EMBEDDINGS_DIR = "../embeddings/images_embedding_train_inception"

# Create the output folder up front so the first write cannot fail on a
# missing directory after embeddings have already been computed.
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

for id_annonce in tqdm(os.listdir(TRAIN_IMAGES_DIR)):
    if "DS_Store" in id_annonce:
        continue

    # Output file is named after the folder with its "ann_" prefix removed.
    embedding_path = f"{EMBEDDINGS_DIR}/{id_annonce.replace('ann_', '')}.csv"
    if os.path.exists(embedding_path):
        continue

    image_embeddings = list()

    # NOTE: rows are written in `os.listdir` order, which is left untouched on
    # purpose in case other cells rely on the same ordering.
    for image_path in os.listdir(f"{TRAIN_IMAGES_DIR}/{id_annonce}"):
        if not image_path.endswith((".jpg", ".png")):
            continue
        vec = img2vec.get_vec(f"{TRAIN_IMAGES_DIR}/{id_annonce}/{image_path}")
        image_embeddings.append(vec)

    # Write to a temporary file and rename it into place: if the run is
    # interrupted mid-write, no partial CSV is left behind for the
    # "already exists" check above to mistake for a finished listing.
    tmp_path = embedding_path + ".tmp"
    pd.DataFrame(image_embeddings).to_csv(tmp_path)
    os.replace(tmp_path, embedding_path)


HBox(children=(FloatProgress(value=0.0, max=37369.0), HTML(value='')))




In [None]:
# Folder scanned below for per-listing sub-folders (names are expected to look
# like "ann_<id>", since the prefix is stripped and the rest parsed as int).
# NOTE(review): the embedding cell reads "../data/reduced_images/train"; this
# relative path differs -- confirm the intended working directory.
TRAIN_FOLDER = "reduced_images/train/"

# Label table; it is merged on its `id_annonce` column further down.
y_train = pd.read_csv("y_train.csv")

In [None]:
# Collect one (listing id, photo file name) pair for every file found in the
# listing sub-folders of TRAIN_FOLDER, ignoring macOS ".DS_Store" entries.
vals = list()

for img_folder in tqdm(os.listdir(TRAIN_FOLDER)):
    if "DS_Store" in img_folder:
        continue
    # os.path.join instead of string concatenation, so the path is correct
    # whether or not TRAIN_FOLDER ends with a path separator.
    for img_path in os.listdir(os.path.join(TRAIN_FOLDER, img_folder)):
        if "DS_Store" in img_path:
            continue
        # Folder names carry an "ann_" prefix; the remainder is the integer id.
        vals.append((int(img_folder.replace("ann_", "")), img_path))

In [None]:
# Turn the collected (id, photo) pairs into a table and attach the label
# columns of each listing. The merge uses pandas' default inner join, so
# photos whose `id_annonce` has no row in `y_train` are left out.
photos_match = pd.DataFrame(
    vals, columns=["id_annonce", "photo_id"]
).merge(y_train, on="id_annonce")

photos_match

In [None]:
# Save the photo/label table; index=False keeps the row index out of the CSV.
photos_match.to_csv("photos_match.csv", index=False)