In [None]:
import numpy as np
from tensorflow import keras
import tensorflow as tf
import pandas as pd


from google.colab import drive
drive.mount('/content/drive')

In [None]:
#!pip install patool
#import patoolib
#patoolib.extract_archive("/content/drive/MyDrive/museum/museum_exhibits_cv_hack.rar", outdir="/content/drive/MyDrive/museum/")

In [None]:
df_train = pd.read_csv("/content/drive/MyDrive/museum/train_dataset_train/train.csv")

In [None]:
df_train.head()

In [None]:
df_train.index = df_train['id']

In [None]:
df_train.loc[[520, 1817]]

In [None]:
df_train['filename'] = df_train['object_img'].apply(lambda x: f"/content/drive/MyDrive/museum/train_dataset_train/train/{x}.png")

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import Sequence

In [None]:
# Image pipeline for training: scale pixels to [0, 1] and apply random
# shear / zoom / horizontal-flip augmentation.
train_datagen = ImageDataGenerator(
    horizontal_flip=True,
    zoom_range=0.2,
    shear_range=0.2,
    rescale=1. / 255,
)

# Batches of 32x32 images read from the `filename` column (the Keras default
# for `x_col`, spelled out here); with class_mode='raw' the second element of
# each batch is the untouched `id` column.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    x_col='filename',
    y_col='id',
    class_mode='raw',
    target_size=(32, 32),
    batch_size=32,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectoriser for the item descriptions. A float `min_df` is a
# document-frequency threshold: terms present in fewer than 7% of the
# descriptions are left out of the vocabulary. A later cell uses
# len(cnt.vocabulary_) as the width of the model's output layer.
cnt = CountVectorizer(min_df=0.07)
# Learn the vocabulary from the training descriptions. The returned
# document-term matrix is only shown as the cell output; it is not stored.
cnt.fit_transform(df_train['description'])

In [None]:
class TrainDataGenerator(Sequence):
    """Keras ``Sequence`` yielding (image batch, bag-of-words target) pairs.

    Images are loaded and augmented through an ``ImageDataGenerator`` whose
    ``flow_from_dataframe`` iterator returns the ``id`` of every image in the
    batch. Those ids are then used to look up the matching descriptions in
    ``df`` and to vectorise them with ``vectorizer``.

    Requirements on ``df``:
      * a ``filename`` column holding the image paths,
      * an ``id`` column, and the same ids as the row index (rows are fetched
        with ``df.loc[ids]``),
      * a ``description`` column with the text to vectorise.

    :param df: metadata frame described above
    :param vectorizer: fitted vectoriser exposing ``transform`` that returns a
        scipy sparse matrix (e.g. ``CountVectorizer``)
    :param to_fit: accepted for interface compatibility; not used
    :param batch_size: number of images per batch
    :param dim: accepted for interface compatibility; NOT applied. Images are
        always resized to 32x32, because wiring ``dim`` through would change
        the effective output size of existing callers from 32x32 to the
        150x150 default.
    :param n_channels: accepted for interface compatibility; not used
    :param shuffle: whether the underlying iterator shuffles the sample order
    """
    def __init__(self, df, vectorizer, to_fit=True, batch_size=32, dim=(150, 150),
                 n_channels=3, shuffle=True):
        # Let the Keras base class initialise its own state (a no-op on older
        # versions, expected by newer ones).
        super().__init__()
        self.train_datagen = ImageDataGenerator(
                                  rescale=1./255,
                                  shear_range=0.2,
                                  zoom_range=0.2,
                                  horizontal_flip=True)
        # Build the iterator from this instance's own augmenter and from the
        # `df` argument, not from notebook-level globals, so the class works
        # for any frame it is given and honours `batch_size` / `shuffle`.
        self.train_generator = self.train_datagen.flow_from_dataframe(
                                  df,
                                  x_col='filename',
                                  y_col='id',
                                  target_size=(32, 32),
                                  batch_size=batch_size,
                                  class_mode='raw',
                                  shuffle=shuffle)
        self.vectorizer = vectorizer
        self.df = df


    def __len__(self):
        """Denotes the number of batches per epoch
        :return: number of batches per epoch
        """
        return len(self.train_generator)

    def __getitem__(self, index):
        """Return batch number ``index``.

        :param index: batch position within the epoch
        :return: tuple ``(X, y)``: ``X`` is the image batch produced by the
            underlying iterator, ``y`` a dense 2-D ``numpy.ndarray`` with one
            vectorised description per image
        """
        X, y_idx = self.train_generator[index]
        # y_idx holds the `id` values of the images in this batch.
        rows = self.df.loc[y_idx]
        # toarray() gives a plain ndarray; todense() would give np.matrix.
        y = self.vectorizer.transform(rows['description']).toarray()
        return X, y


    def on_epoch_end(self):
        """Forward the end-of-epoch hook so the wrapped iterator can reshuffle."""
        self.train_generator.on_epoch_end()


In [None]:
# Training batches: images from df_train paired with their descriptions
# vectorised by the fitted CountVectorizer `cnt` (all other options default).
data_generator = TrainDataGenerator(df_train, cnt)

In [None]:
# 32x32 RGB images in, one output unit per vocabulary term out.
input_shape = (32, 32, 3)
num_features = len(cnt.vocabulary_)
keras_tf_layers = tf.keras.layers

# Small CNN assembled layer by layer: two conv + max-pool stages, then a
# dropout-regularised dense head.
model = keras.Sequential()
model.add(keras.Input(shape=input_shape))
model.add(keras_tf_layers.Conv2D(32, kernel_size=(4, 4), activation="relu"))
model.add(keras_tf_layers.MaxPooling2D(pool_size=(2, 2)))
model.add(keras_tf_layers.Conv2D(64, kernel_size=(3, 3), activation="relu"))
model.add(keras_tf_layers.MaxPooling2D(pool_size=(2, 2)))
model.add(keras_tf_layers.Flatten())
model.add(keras_tf_layers.Dropout(0.5))
# NOTE(review): sigmoid keeps every output in (0, 1), while the targets are
# CountVectorizer term counts -- confirm counts above 1 are not expected.
model.add(keras_tf_layers.Dense(num_features, activation="sigmoid"))

In [None]:
# Same import and mount call as in the notebook's first cell.
# NOTE(review): presumably kept to re-mount Drive after a runtime reconnect --
# confirm, otherwise this cell is redundant.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Regress the bag-of-words vector: mean-squared-error loss, Adam optimiser,
# mean absolute error reported as an extra metric.
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mae"])

In [None]:
# Train for 100 epochs on the batches produced by `data_generator`.
model.fit(data_generator, epochs=100)

In [None]:
# Persist the trained model to Drive (HDF5 file, per the .h5 extension).
model.save('/content/drive/MyDrive/museum/model2.h5')

In [None]:
# Rebind `model` to the copy stored on Drive; the cells below use this object.
model = keras.models.load_model('/content/drive/MyDrive/museum/model2.h5')

In [None]:
df_test = pd.read_csv("/content/drive/MyDrive/museum/test_dataset_test/test.csv")

In [None]:
df_test.head()

In [None]:
import os
filenames = os.listdir("/content/drive/MyDrive/museum/test_dataset_test/test")

test_df_img = pd.DataFrame({'id': list(range(len(filenames))),
                            'filename': list(map(lambda x: f"/content/drive/MyDrive/museum/test_dataset_test/test/{x}", filenames))})

In [None]:
# Inference pipeline: only rescale pixels to [0, 1], no augmentation.
test_datagen = ImageDataGenerator(rescale=1. / 255)

# Unshuffled 32x32 batches so that prediction row k belongs to
# test_df_img row k. `x_col` is the Keras default, spelled out here.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df_img,
    x_col='filename',
    y_col='id',
    shuffle=False,
    class_mode='raw',
    target_size=(32, 32),
)


In [None]:
# Predicted bag-of-words vector for every test image, in generator order
# (the generator was created with shuffle=False).
raw_predictions = model.predict(test_generator)

In [None]:
# Inspect the prediction for the first test image.
raw_predictions[0]

In [None]:
test_vectors = cnt.transform(df_test['description']).todense()

In [None]:
res_map = {}

for i in range(len(raw_predictions)):
  dist = np.inf
  txt_n = None
  for j in range(len(test_vectors)):
    if j in res_map.values():
      continue
    
    dist_ = np.sum(np.abs(raw_predictions[i] - test_vectors[j]))
    if dist_ < dist:
      txt_n = j
      dist = dist_

  res_map[i] = txt_n

In [None]:
res_imgs = [filenames[y[0]] for y in sorted(res_map.items(), key=lambda x: x[1])]

In [None]:
submission = pd.DataFrame({'id': df_test['id'], 'object_img': list(map(lambda x: x.split('.')[0], res_imgs))})

In [None]:
submission

In [None]:
submission.to_csv('/content/drive/MyDrive/museum/submission3.csv', index=False)

In [None]:
# Display train and test rows together, ordered by `object_img`.
# The `filename` column was added to df_train only, so it is NaN on the
# df_test rows of this view.
pd.concat([df_train, df_test]).sort_values(by='object_img')

In [None]:
train_vectors = cnt.transform(df_train['description']).todense()
test_vectors = cnt.transform(df_test['description']).todense()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarities = cosine_similarity(train_vectors, test_vectors)

In [None]:
similar_train_idx = np.argmax(similarities, axis=0)

In [None]:
similar_train_idx

In [None]:
import os
test_labels = [int(x.split('.')[0]) for x in os.listdir("/content/drive/MyDrive/museum/test_dataset_test/test")]

In [None]:
train_labels = [int(x) for x in df_train['object_img']]

In [None]:
# Exclusive upper bound for the neighbour search: offsets 1..499 around the
# matched train label are tried.
MAX_LABEL_OFFSET = 500

# mapping: guessed image number -> position of the test description.
# Each test description takes the nearest still-unused number next to the
# image number of its most similar train description, trying +j before -j
# for growing j. The train label itself (offset 0) is never used.
# NOTE(review): candidates are not checked against `test_labels`, so a number
# without a test image (or a non-positive one) can be chosen -- confirm intent.
mapping = {}

for i, sim in enumerate(similar_train_idx):
  target_train = train_labels[sim]
  for j in range(1, MAX_LABEL_OFFSET):
    if target_train + j not in mapping:
      mapping[target_train + j] = i
      break
    if target_train - j not in mapping:
      mapping[target_train - j] = i
      break
  else:
    # Without this, the item would be dropped silently and the submission
    # would later fail with an unrelated length-mismatch error.
    raise RuntimeError(
        f"no free image number within {MAX_LABEL_OFFSET - 1} of {target_train} "
        f"for test description {i}")



In [None]:
result = [x[1] for x in sorted({v: k for k, v in mapping.items()}.items())]

In [None]:
submission = pd.DataFrame({'id': df_test['id'], 'object_img': result})

In [None]:
submission.to_csv('/content/drive/MyDrive/museum/submission_02.csv', index=False)