In [None]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import efficientnet.tfkeras as efn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tensorflow.keras import backend as K
import tensorflow_addons as tfa
from scipy import spatial
from tqdm.notebook import tqdm
# Let tf.data choose parallelism / prefetch buffer sizes at runtime
# (passed as num_parallel_calls and as the prefetch size in the dataset builders)
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# Number of epochs passed to model.fit
EPOCHS = 5
# Batch size used by both the training and the validation tf.data pipelines
BATCH_SIZE = 32
# Target size every decoded image is resized to (also the model input size)
IMAGE_SIZE = [384, 384]
# Seed shared by seed_everything, the train/validation split and the
# checkpoint file name
SEED = 42
# Learning rate handed to the Adam optimizer in get_model
# NOTE(review): model.fit is also given the LearningRateScheduler built by
# get_lr_callback, which presumably overrides this value - confirm
LR = 0.001
# Verbosity level for model.fit and the ModelCheckpoint callback
VERBOSE = 2
# Function to get our f1 score
def f1_score(y_true, y_pred):
    """Compute the row-wise F1 score between two columns of id strings.

    Each element of ``y_true`` / ``y_pred`` is a whitespace-separated string of
    ids. Both are turned into sets and, row by row (paired by position, not by
    index label), the score is ``2 * |true & pred| / (|true| + |pred|)``.

    Args:
        y_true: iterable of strings (e.g. a pandas Series) with the expected ids.
        y_pred: iterable of strings with the predicted ids, same length.

    Returns:
        1-D float numpy array with one F1 value per row. Rows where both sets
        are empty score 0.0 instead of producing NaN from a 0/0 division.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    intersection = np.array(
        [len(t & p) for t, p in zip(true_sets, pred_sets)], dtype = float)
    len_y_true = np.array([len(t) for t in true_sets], dtype = float)
    len_y_pred = np.array([len(p) for p in pred_sets], dtype = float)
    denominator = len_y_pred + len_y_true
    # Only divide where the denominator is non-zero; the other rows keep the
    # 0.0 they were initialised with
    f1 = np.zeros_like(denominator)
    np.divide(2 * intersection, denominator, out = f1, where = denominator > 0)
    return f1

# Function to read and preprocess our data
def preprocess():
    """Load the train/test csv files and build labels plus ground truth.

    Reads ``train.csv`` and ``test.csv`` from ``./shopee-product-matching/``,
    drops rows with a duplicated ``image`` value, re-encodes ``label_group`` as
    contiguous integer class ids and adds a ``matches`` column holding the
    space-separated ``posting_id`` values that share the row's label group.
    Also prints the mean F1 obtained when each posting predicts only itself.

    Returns:
        Tuple ``(train, test, label_mapper, label_mapper_inv, N_CLASSES,
        ground_truth)`` where ``label_mapper`` maps original label_group ->
        class id, ``label_mapper_inv`` is the reverse mapping, ``N_CLASSES`` is
        the number of distinct classes, and ``ground_truth`` is an independent
        frame with columns ``posting_id``, ``matches`` and ``f1``.
    """
    # Read train and test csv
    train = pd.read_csv('./shopee-product-matching/train.csv')
    test = pd.read_csv('./shopee-product-matching/test.csv')
    # Drop duplicated images to avoid leakage
    # NOTE(review): the original author was unsure whether this is correct
    train = train.drop_duplicates(subset = ['image']).reset_index(drop = True)
    # Encode label groups as 0..N-1 (order of first appearance) and keep both
    # directions of the mapping
    unique_groups = train['label_group'].unique()
    class_ids = np.arange(len(unique_groups))
    label_mapper = dict(zip(unique_groups, class_ids))
    label_mapper_inv = dict(zip(class_ids, unique_groups))
    train['label_group'] = train['label_group'].map(label_mapper)
    # Number of classes
    N_CLASSES = train['label_group'].nunique()
    # Get ground truth labels format
    tmp = train.groupby(['label_group'])['posting_id'].unique().to_dict()
    train['matches'] = train['label_group'].map(tmp).apply(' '.join)
    # Copy so that adding the f1 column below writes to an independent frame
    # rather than to a slice of train (avoids pandas' SettingWithCopyWarning)
    ground_truth = train[['posting_id', 'matches']].copy()
    # Calculate naive score using self-post
    ground_truth['f1'] = f1_score(ground_truth['matches'], ground_truth['posting_id'])
    score = ground_truth['f1'].mean()
    print(f'Using the same posting id as prediction our f1 score is {score}')
    return train, test, label_mapper, label_mapper_inv, N_CLASSES, ground_truth

# Function to seed everything
def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow RNGs and export PYTHONHASHSEED.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seed for each library-level random generator
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)

# Function to decode our images
def decode_image(image_data):
    """Decode JPEG bytes into a 3-channel float32 tensor resized to IMAGE_SIZE.

    Args:
        image_data: scalar string tensor with the encoded JPEG content.

    Returns:
        The decoded image, resized to IMAGE_SIZE, cast to float32 and divided
        by 255.
    """
    rgb = tf.image.decode_jpeg(image_data, channels = 3)
    resized = tf.image.resize(rgb, IMAGE_SIZE)
    return tf.cast(resized, tf.float32) / 255.0

# Function to read our image and return image, label_group
def read_image(image, label_group):
    """Load the file at path ``image`` and decode it, passing the label through.

    Args:
        image: path of the image file to read.
        label_group: label paired with the image; returned unchanged.

    Returns:
        Tuple ``(decoded_image, label_group)``.
    """
    raw_bytes = tf.io.read_file(image)
    return decode_image(raw_bytes), label_group

# Function to get our training dataset
def get_training_dataset(image, label_group):
    """Build the infinite, shuffled, batched training pipeline.

    Args:
        image: sequence of image file paths (must support ``len``).
        label_group: sequence of labels aligned with ``image``.

    Returns:
        A repeating ``tf.data.Dataset`` yielding ``(images, labels)`` batches of
        BATCH_SIZE elements.
    """
    dataset = tf.data.Dataset.from_tensor_slices((image, label_group))
    # Shuffle the lightweight (path, label) pairs BEFORE decoding: the buffer
    # then holds strings instead of decoded float32 images (a 2048-image buffer
    # of 384x384x3 float32 tensors would need about 3.6 GB), so a buffer
    # covering the whole split is cheap.
    dataset = dataset.shuffle(max(len(image), 1))
    # Repeat AFTER shuffling so every pass over the data is a complete
    # permutation and samples are not mixed across epoch boundaries
    dataset = dataset.repeat()
    dataset = dataset.map(read_image, num_parallel_calls = AUTO)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)
    return dataset

# Function to get our validation dataset
def get_validation_dataset(image, label_group):
    """Build the ordered, single-pass, batched evaluation pipeline.

    No shuffling or repeating is applied, so elements come out in input order.

    Args:
        image: sequence of image file paths.
        label_group: sequence of labels aligned with ``image``.

    Returns:
        A ``tf.data.Dataset`` yielding ``(images, labels)`` batches of
        BATCH_SIZE elements.
    """
    return (
        tf.data.Dataset.from_tensor_slices((image, label_group))
        .map(read_image, num_parallel_calls = AUTO)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )
    
# Function to split our data into train and validation
def train_and_eval_split(image, label_group):
    """Randomly split images and labels into training and validation parts.

    The split is shuffled with ``random_state = SEED``; no explicit split size
    is given, so scikit-learn's default proportions apply.

    Args:
        image: image paths.
        label_group: labels aligned with ``image``.

    Returns:
        Tuple ``(trn_image, val_image, trn_labels, val_labels)``.
    """
    return tuple(
        train_test_split(image, label_group, random_state = SEED, shuffle = True)
    )

# Function to create our EfficientNetB0 model
def get_model():
    """Build and compile the EfficientNetB0 classifier.

    Architecture: ImageNet-initialised EfficientNetB0 without its top ->
    global average pooling -> batch normalisation -> Dense(512, relu) ->
    Dense(N_CLASSES, softmax). Reads the module-level IMAGE_SIZE, N_CLASSES and
    LR. The layer order matters: get_cv_score takes ``model.layers[-2]`` (the
    512-unit dense layer) as the embedding output.

    Returns:
        A compiled ``tf.keras`` model using Adam, sparse categorical
        cross-entropy and sparse categorical accuracy.
    """
    layers = tf.keras.layers

    inp = layers.Input(shape = (*IMAGE_SIZE, 3))
    backbone = efn.EfficientNetB0(include_top = False, weights = 'imagenet')
    features = backbone(inp)
    pooled = layers.GlobalAveragePooling2D()(features)
    normed = layers.BatchNormalization()(pooled)
    embedding = layers.Dense(512, activation = 'relu')(normed)
    output = layers.Dense(N_CLASSES, activation = 'softmax')(embedding)

    model = tf.keras.models.Model(inputs = [inp], outputs = [output])
    model.compile(
        optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
        loss = [tf.keras.losses.SparseCategoricalCrossentropy()],
        metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    )
    return model

# Function for a custom learning rate scheduler with warmup and decay
def get_lr_callback():
    """Create a LearningRateScheduler with linear warm-up, hold and decay.

    Schedule as a function of the epoch number:
      * epoch < 5: linear ramp from 1e-6 towards ``5e-6 * BATCH_SIZE``;
      * next 0 epochs: hold at the maximum (the hold phase is empty);
      * afterwards: exponential decay (factor 0.8 per epoch) towards 1e-6.

    NOTE(review): the warm-up lasts 5 epochs and the module sets EPOCHS = 5, so
    if Keras passes 0-based epoch numbers only the warm-up branch is ever
    reached - confirm this is intended.

    Returns:
        A ``tf.keras.callbacks.LearningRateScheduler`` instance.
    """
    start_lr, peak_lr, floor_lr = 0.000001, 0.000005 * BATCH_SIZE, 0.000001
    ramp_epochs, sustain_epochs, decay_rate = 5, 0, 0.8

    def lrfn(epoch):
        # Warm-up phase
        if epoch < ramp_epochs:
            return (peak_lr - start_lr) / ramp_epochs * epoch + start_lr
        # Hold phase
        if epoch < ramp_epochs + sustain_epochs:
            return peak_lr
        # Decay phase
        return (peak_lr - floor_lr) * decay_rate**(epoch - ramp_epochs - sustain_epochs) + floor_lr

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = False)

# Function to train and evaluate our model
def train_and_evaluate(image, label_group):
    """Train the classifier on a train/validation split of the given data.

    Seeds all RNGs, builds a fresh model, prefixes the image file names with
    the training image directory, splits the data, and fits for EPOCHS epochs.
    The weights of the epoch with the lowest validation loss are saved to
    ``EfficientNetB0_{IMAGE_SIZE[0]}_{SEED}.h5``.

    Args:
        image: pandas Series of image file names (relative to the train image
            directory).
        label_group: labels aligned with ``image``.

    Returns:
        Tuple ``(model, val_image)``: the trained model (final-epoch weights)
        and the validation split of the prefixed image paths.
    """
    print('\n')
    print('-'*50)
    # Seed everything
    seed_everything(SEED)
    K.clear_session()
    model = get_model()
    image = './shopee-product-matching/train_images/' + image
    trn_image, val_image, trn_labels, val_labels = train_and_eval_split(image, label_group)
    # The training dataset repeats forever, so Keras needs the number of
    # batches per epoch. Derive it from the TRAINING split (not from the full
    # dataset) so that one epoch is one pass over the training data.
    steps_per_epoch = max(len(trn_image) // BATCH_SIZE, 1)
    train_dataset = get_training_dataset(trn_image, trn_labels)
    val_dataset = get_validation_dataset(val_image, val_labels)
    checkpoint = tf.keras.callbacks.ModelCheckpoint(f'EfficientNetB0_{IMAGE_SIZE[0]}_{SEED}.h5',
                                                    monitor = 'val_loss',
                                                    verbose = VERBOSE,
                                                    save_best_only = True,
                                                    save_weights_only = True,
                                                    mode = 'min')
    model.fit(train_dataset,
              steps_per_epoch = steps_per_epoch,
              epochs = EPOCHS,
              callbacks = [checkpoint, get_lr_callback()],
              validation_data = val_dataset,
              verbose = VERBOSE)

    print('\n')
    print('-'*50)
    print('Training Complete...')

    return model, val_image

def get_cv_score(image, label_group, model, val_image):
    """Score embedding-based matching on the validation rows at several thresholds.

    Loads the checkpointed weights, truncates the model at its second-to-last
    layer to obtain embeddings for every image, and, for each validation row,
    predicts as matches all postings whose cosine distance to it is at most a
    threshold. This is repeated for thresholds 0.08 .. 0.16 and the mean F1 of
    each threshold is printed.

    Relies on the module-level ``train`` (for ``posting_id``) and
    ``ground_truth`` frames. The labels in ``val_image.index`` are used as row
    positions into the embedding matrix, which is valid as long as ``image``
    carries a 0..n-1 index (preprocess resets the index of ``train``).

    Args:
        image: pandas Series of image file names for the whole dataset.
        label_group: labels aligned with ``image`` (only needed to build the
            dataset; dropped before prediction).
        model: classifier built by get_model.
        val_image: validation split returned by train_and_evaluate; only its
            index is used.

    Returns:
        Copy of the validation rows of ``ground_truth`` extended with
        ``predictions_XX`` and ``f1_XX`` columns, XX being the two decimals of
        each threshold (``08`` .. ``16``).
    """
    # Find the best threshold (lazy optimization)
    thresholds = [0.08, 0.09, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16]
    # Column suffixes: '08', '09', '10', ...
    tags = [f'{threshold:.2f}'[2:] for threshold in thresholds]

    model.load_weights(f'EfficientNetB0_{IMAGE_SIZE[0]}_{SEED}.h5')
    model = tf.keras.models.Model(inputs = model.input, outputs = model.layers[-2].output)

    # Respect order
    image = './shopee-product-matching/train_images/' + image
    dataset_images = get_validation_dataset(image, label_group)
    dataset_images = dataset_images.map(lambda image, label_group: image)
    # Predict the entire dataset
    embeddings = model.predict(dataset_images)

    posting_ids = train['posting_id'].values
    predictions = {tag: [] for tag in tags}
    # Iterate over each validation image and use cosine distance to find similar images
    for val_index in tqdm(val_image.index):
        distances = spatial.distance.cdist(
            embeddings[np.newaxis, val_index, :], embeddings, 'cosine')[0]
        # Sort once per row; every threshold takes a prefix of this ordering
        nearest_first = np.argsort(distances)
        for tag, threshold in zip(tags, thresholds):
            # Only get small distances
            n_close = np.count_nonzero(distances <= threshold)
            predictions[tag].append(' '.join(posting_ids[nearest_first[:n_close]]))

    # Copy so the new columns are written to an independent frame
    val_predictions = ground_truth.loc[val_image.index].copy()
    for tag in tags:
        val_predictions[f'predictions_{tag}'] = predictions[tag]
    for tag, threshold in zip(tags, thresholds):
        val_predictions[f'f1_{tag}'] = f1_score(
            val_predictions['matches'], val_predictions[f'predictions_{tag}'])
        score = val_predictions[f'f1_{tag}'].mean()
        print(f'Our f1 score with threshold {threshold:.2f} for the validation set is {score}')
    return val_predictions

# Run the whole pipeline. The names bound here must stay module-level:
# get_model reads N_CLASSES, and get_cv_score reads train and ground_truth.
train, test, label_mapper, label_mapper_inv, N_CLASSES, ground_truth = preprocess()
# Train the classifier; val_image is the validation split of the image paths
model, val_image = train_and_evaluate(train['image'], train['label_group'])
# Evaluate embedding-based matching on the validation rows at several thresholds
val_predictions = get_cv_score(train['image'], train['label_group'], model, val_image)
# Notebook display of the first rows
val_predictions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ground_truth['f1'] = f1_score(ground_truth['matches'], ground_truth['posting_id'])


Using the same posting id as prediction our f1 score is 0.4831068224713004


--------------------------------------------------
Downloading data from https://github.com/Callidior/keras-applications/releases/download/efficientnet/efficientnet-b0_weights_tf_dim_ordering_tf_kernels_autoaugment_notop.h5
Epoch 1/5
