In [None]:
# Note: restart runtime after this import before running the augmentations
!pip install -U augly
!sudo apt-get install python3-magic

PART = 0 # 10 parts
DEBUG = True

In [None]:
# Free-form working notes kept in a string literal: a listing of the competition's
# S3 prefixes (query / reference / training images), an example `aws s3 cp`
# command, and two Kaggle links. `s` is not read anywhere else in the visible
# notebook.
s = """
dataset | n | size | s3_uri 
---------------------------------------------------------------------------------------------------- 
query images 
| 50,000 | 7GB | s3://drivendata-competition-fb-isc-data/all/query_images/ reference images 
| 1,000,000 | 178GB | s3://drivendata-competition-fb-isc-data/all/reference_images/ training images 
| 1,000,000 | 175GB | s3://drivendata-competition-fb-isc-data/all/training_images/
aws s3 cp s3://drivendata-competition-fb-isc-data/all/query_images/ ./ --recursive --exclude="" --include="Q00" --no-sign-request

# https://www.kaggle.com/philculliton/landmark-retrieval-2020-shared-scoring-script?scriptVersionId=38364319
# https://www.kaggle.com/camaskew/baseline-landmark-retrieval-model
"""
import os

# Expose only GPU index 0 to CUDA-aware libraries loaded later in this kernel.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})



In [None]:
from IPython.display import clear_output

# make sure your kaggle api "kaggle.json" file in your drive
!mkdir -p /root/.kaggle
!echo '{"username":"tarobxl","key":"a4b1e63bbff7f44f713dce2525191ba1"}' > /root/.kaggle/kaggle.json
#! cp '/content/drive/My Drive/kaggle.json' /root/.kaggle # <---- path for kaggle.json file
!chmod 400 /root/.kaggle/kaggle.json
!cat /root/.kaggle/kaggle.json

!pip uninstall -y kaggle >> quit
!pip install --upgrade pip >> quit
!pip install kaggle==1.5.6 >> quit
!kaggle -v >> quit

clear_output()

In [None]:
url_suffix = "camaskew/baseline-landmark-retrieval-model"
!kaggle datasets download $url_suffix -p /content/ --unzip

In [None]:
from IPython.display import clear_output
!pip install awscli
clear_output()
print("aws is ready!")

temp_dir = "/content/data/data_origin"
!mkdir -p $temp_dir
!rm -rf $temp_dir/*
%cd $temp_dir

driven_data_s3 = "s3://drivendata-competition-fb-isc-data/all/training_images/"

# Q00002.jpg => first 1K
# R000000.jpg => 
#!aws s3 cp $driven_data_s3 ./ --recursive --exclude="*" --include="Q00*" --no-sign-request
#!aws s3 cp $driven_data_s3 ./ --recursive --exclude="*" --include="R00000*" --no-sign-request
#            R000000.jpg
part_ref = f"T{PART}*"
if DEBUG:
    part_ref = f"T000{PART}*"

!aws s3 cp $driven_data_s3 ./ --recursive --exclude="*" --include=$part_ref --no-sign-request

clear_output()
!ls -alh *.jpg | wc
!ls -alh * | head -5

In [None]:
import os
import numpy as np
from pathlib import Path
import tensorflow as tf
from PIL import Image
from IPython.display import clear_output

# Key of the tensor that is read from the signature's output dictionary.
REQUIRED_OUTPUT = "global_descriptor"
# Name of the SavedModel signature used to compute embeddings.
REQUIRED_SIGNATURE = "serving_default"
    
def load(saved_model_proto_filename):
    """Load the TensorFlow SavedModel that contains ``saved_model_proto_filename``.

    Args:
        saved_model_proto_filename: Path to the ``saved_model.pb`` file; the
            model is loaded from the directory that holds it.

    Returns:
        A ``(model, embedding_fn)`` tuple, where ``embedding_fn`` is the
        ``REQUIRED_SIGNATURE`` signature of the loaded model.

    Raises:
        ValueError: If the model lacks the ``REQUIRED_SIGNATURE`` signature or
            that signature does not expose ``REQUIRED_OUTPUT``. Previously a
            bare ``None`` was returned, which made tuple-unpacking callers
            fail with an unrelated ``TypeError``.
    """
    saved_model_path = Path(saved_model_proto_filename).parent
    print (saved_model_path, saved_model_proto_filename)

    model = tf.saved_model.load(str(saved_model_path))

    found_signatures = list(model.signatures.keys())
    if REQUIRED_SIGNATURE not in found_signatures:
        raise ValueError(
            f"SavedModel at {saved_model_path} has no {REQUIRED_SIGNATURE!r} "
            f"signature (found: {found_signatures})"
        )

    embedding_fn = model.signatures[REQUIRED_SIGNATURE]
    if REQUIRED_OUTPUT not in embedding_fn.structured_outputs:
        raise ValueError(
            f"Signature {REQUIRED_SIGNATURE!r} of {saved_model_path} does not "
            f"provide the {REQUIRED_OUTPUT!r} output"
        )

    return model, embedding_fn
    
# The model archive is unpacked under /content/, but the image-download cell
# changes the working directory to its own data folder. An absolute path keeps
# this cell working regardless of the current directory.
# NOTE(review): assumes the archive unpacks to
# /content/baseline_landmark_retrieval_model/ — confirm.
model, embedding_fn = load("/content/baseline_landmark_retrieval_model/saved_model.pb")
print("-"*80)
# Drop the loader's log output; only the objects below are shown.
clear_output()
model, embedding_fn

In [None]:
import os
import numpy as np
from IPython.display import display
from matplotlib import pyplot as plt

import augly.image as imaugs
import augly.utils as utils

# Pipeline 1: perspective warp followed by an emoji overlay.
_aug1_steps = [
    imaugs.PerspectiveTransform(sigma=20),
    imaugs.OverlayEmoji(),
]
aug1_compose = imaugs.Compose(_aug1_steps)

# Pipeline 2: saturation change, paste onto the "mobile.png" screenshot
# template shipped with AugLy, then scale by 0.6.
_mobile_template = os.path.join(utils.SCREENSHOT_TEMPLATES_DIR, "mobile.png")
_aug2_steps = [
    imaugs.Saturation(factor=2.0),
    imaugs.OverlayOntoScreenshot(template_filepath=_mobile_template),
    imaugs.Scale(factor=0.6),
]
aug2_compose = imaugs.Compose(_aug2_steps)

def aug1_function(input_img):
    """Run ``input_img`` through ``aug1_compose`` and return the result."""
    augmented = aug1_compose(input_img)
    return augmented

def aug2_function(input_img):
    """Run ``input_img`` through ``aug2_compose`` and return the result."""
    augmented = aug2_compose(input_img)
    return augmented

if DEBUG:
    %cd /content/
    
    temp_dir = "/content/data/data_origin"
    image_path = f"{temp_dir}/T000000.jpg"

    input_img = Image.open(image_path) # imaugs.scale(input_img_path, factor=0.2)
    display(aug2_function(input_img))

In [None]:
from tqdm import tqdm

def get_embeddings(image_root_dir: str):
    """Embed every ``*.jpg`` under ``image_root_dir``, original and augmented.

    Each image is opened, augmented with ``aug1_function``, and both versions
    are passed through ``embedding_fn``.

    Args:
        image_root_dir: Directory searched recursively for ``*.jpg`` files.

    Returns:
        A list with one ``[image_id, original_embedding, augmented_embedding]``
        entry per image, where ``image_id`` is the file name up to its first dot.
    """
    def get_id(image_path: Path) -> str:
        return str(image_path.name).split(".")[0]

    def get_emb(img) -> np.ndarray:
        # The model is fed the RGB pixel array of the image.
        image_data = np.array(img.convert('RGB'))
        image_tensor = tf.convert_to_tensor(image_data)
        return embedding_fn(image_tensor)[REQUIRED_OUTPUT].numpy()

    def get_embedding_single(image_path: Path) -> list:
        # Close each file as soon as its embeddings are computed, so handles
        # do not pile up over a large directory.
        with Image.open(str(image_path)) as image_input:
            image_aug1 = aug1_function(image_input)
            #image_aug2 = aug2_function(image_input)
            image_id = get_id(image_path)
            return [image_id, get_emb(image_input), get_emb(image_aug1)]

    image_paths = list(Path(image_root_dir).rglob('*.jpg'))
    print(len(image_paths))

    # Iterating the list directly lets tqdm show the total and an ETA.
    embeddings = [get_embedding_single(image_path)
                  for image_path in tqdm(image_paths)]

    return embeddings

if True:
    import datetime
    print("{date:%Y%m%d-%H%M%S}".format(date=datetime.datetime.now()))

    %cd /content/
    embeddings = get_embeddings(temp_dir)
    print(len(embeddings))

    import pickle
    data_features = {"embeddings": embeddings}
    file_to_store = open("data_features.pickle", "wb")
    pickle.dump(data_features, file_to_store)
    file_to_store.close()

    print("{date:%Y%m%d-%H%M%S}".format(date=datetime.datetime.now()))

In [None]:
# Sanity check: print each field of the first record on its own line.
first_record = embeddings[0]
for field in first_record:
    print(field)

In [None]:
import datetime
now_str = "{date:%Y%m%d-%H%M%S}".format(date=datetime.datetime.now())

title = f"k200-tf-training-{PART}-{now_str}"
if DEBUG:
    title = f"k200-tf-training-debug-{PART}-{now_str}"

print(title)

#------------------------------
!mkdir -p dataset
!rm -rf dataset/*
!cp *.pickle dataset

data = '''{
  "title": "__title__",
  "id": "tarobxl/__title__",
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ]
}
'''.replace("__title__", title)
text_file = open("dataset/dataset-metadata.json", 'w+')
n = text_file.write(data)
text_file.close()

#!kaggle datasets create -p "dataset"

!kaggle datasets create -p "dataset" --dir-mode zip