<a href="https://colab.research.google.com/gist/nazmi/65867c9d99fc81fe1f63803c507cb74d/tsp-cv-best.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import sys
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import zipfile

In [None]:
# Configure TensorFlow through environment variables: log level, automatic
# mixed precision and the GPU thread mode.
# NOTE(review): these are assigned after `import tensorflow` already ran in the
# imports cell; TF_CPP_MIN_LOG_LEVEL is usually only honoured when it is set
# before the import — confirm that these settings actually take effect.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ['TF_ENABLE_AUTO_MIXED_PRECISION'] = '1'
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
# Seed TensorFlow's global random generator so runs are repeatable.
tf.random.set_seed(42)
# Explicitly keep `tf.function`s running as graphs rather than eagerly.
tf.config.run_functions_eagerly(False)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# Report the library versions and the detected GPU hardware for this run.
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tf.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
if gpus:
    for gpu in gpus:
        print(tf.config.experimental.get_device_details(gpu))
else:
    print("GPU is NOT AVAILABLE")



# Fetch dataset

In [None]:
IN_COLAB = 'COLAB_GPU' in os.environ

if IN_COLAB:
    PATH = "/content/"
else:
    PATH = "."

PATH_INPUT = os.path.join(PATH,"input/")
PATH_DATASET = os.path.join(PATH_INPUT, "dataset/")   
PATH_TRAIN = os.path.join(PATH_INPUT,"train.pkl")
PATH_TEST = os.path.join(PATH_INPUT,"test.pkl")

if not os.path.exists(PATH_DATASET):
    ! kaggle competitions download -c tsp-cv
    with zipfile.ZipFile("tsp-cv.zip", 'r') as zip_ref:
        zip_ref.extractall(path=PATH_DATASET)
        zip_ref.close()

if not os.path.exists(PATH_TRAIN):
    if IN_COLAB:
        ! python /content/src/dataset_parser.py
    else:
        ! python /src/dataset_parser.py

In [None]:
# Load the pre-parsed train and test frames from their pickle files.
train_df, test_df = (pd.read_pickle(pkl_path) for pkl_path in (PATH_TRAIN, PATH_TEST))
# Show the first rows of the training frame as the cell output.
train_df.head()

## Split dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the labelled rows for validation; the fixed random_state
# keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_df["image_hist"].to_list(),
    train_df["distance"].values,
    train_size=0.85,
    test_size=0.15,
    random_state=42,
)

# tf.data pipeline

In [None]:
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model

def prepare(ds, shuffle=False, repeat=False, cache=False, batch_size=32,
            shuffle_buffer_size=1000):
    """Turn a dataset of single examples into a batched, prefetched pipeline.

    Stages are applied in the order cache -> shuffle -> repeat -> batch ->
    prefetch. Caching comes first so that only the raw examples are stored:
    a cache placed after `shuffle` replays the first epoch's order on every
    later epoch, and a cache placed after `repeat` would try to store an
    endless stream.

    Args:
      ds: a `tf.data.Dataset` yielding individual (unbatched) elements.
      shuffle: shuffle the elements, reshuffling on every pass over the data.
      repeat: repeat the dataset indefinitely.
      cache: keep the source elements in memory after the first pass.
      batch_size: number of elements per batch.
      shuffle_buffer_size: size of the shuffle buffer used when `shuffle` is set.

    Returns:
      The transformed dataset, batched and prefetching with `tf.data.AUTOTUNE`.
    """
    if cache:
        ds = ds.cache()

    if shuffle:
        ds = ds.shuffle(buffer_size=shuffle_buffer_size)

    if repeat:
        ds = ds.repeat()

    ds = ds.batch(batch_size, num_parallel_calls=tf.data.AUTOTUNE)

    return ds.prefetch(buffer_size=tf.data.AUTOTUNE)


In [None]:
# Number of examples per batch for training, validation and test pipelines.
BATCH_SIZE = 24

# Training pipeline: shuffled and cached.
train_data = prepare(
    tf.data.Dataset.from_tensor_slices((X_train, y_train)),
    shuffle=True,
    cache=True,
    batch_size=BATCH_SIZE,
)

# Validation pipeline: cached, left in its original order.
valid_data = prepare(
    tf.data.Dataset.from_tensor_slices((X_test, y_test)),
    cache=True,
    batch_size=BATCH_SIZE,
)

In [None]:
# Sanity check: pull one batch and print the feature/target shapes, plus the
# row at index 1 of the first example in that batch.
for batch_x, batch_y in train_data.take(1):
    print(batch_x.shape, batch_y.shape)
    print(batch_x[0, 1])

# Create Model

In [None]:
from tensorflow.keras.metrics import RootMeanSquaredError
# Width of the first hidden layer; the second hidden layer is fc_layer // ratio wide.
fc_layer = 1504
ratio = 2
# Each example is a (3, 256) array.
input_shape = (3, 256)

inputs = layers.Input(shape=input_shape, dtype=tf.float32, name="input_layer")

# Flatten the input and pass it through the two ReLU hidden layers.
x = layers.Flatten()(inputs)
for units in (fc_layer, fc_layer // ratio):
    x = layers.Dense(units, activation="relu")(x)

# Single linear output unit for the regression target, kept in float32.
outputs = layers.Dense(
    1,
    activation="linear",
    dtype=tf.float32,
    name="output_layer",
)(x)
model = models.Model(inputs, outputs)



In [None]:
# Render the layer graph with tensor shapes as the cell output.
plot_model(model, show_shapes=True)

In [None]:
import datetime
# Training hyper-parameters.
LEARNING_RATE = 0.00022186103371585797
EPOCHS = 5  # EDIT THIS

# Stop when val_loss has not improved by at least 1e-4 for 25 epochs and
# restore the best weights seen so far.
stop_callback = callbacks.EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=1e-4,
    patience=25,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 2 epochs without improvement in the
# training loss, never going below 1e-10.
lr_callback = callbacks.ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=2,
    min_lr=1e-10,
)

# Save weights only, and only when val_loss improves.
checkpoint_callback = callbacks.ModelCheckpoint(
    "models/best/",
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
)

# Optimise mean squared error with Adam and report RMSE as "rmse".
model.compile(
    loss="mse",
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=[RootMeanSquaredError(name="rmse")],
)

In [None]:
# Train the model, validating after every epoch; `history` keeps the
# per-epoch metrics for plotting.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=EPOCHS,
    callbacks=[stop_callback, lr_callback, checkpoint_callback],
)

In [None]:
def plot_loss_curves(history):
    """
    Plots separate training/validation curves for the loss and the RMSE metric.

    The loss curves are drawn on the current figure and the RMSE curves on a
    new one. Nothing is returned.

    Args:
      history: TensorFlow model History object (see: https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/History)
        whose `history` dict has the keys 'loss', 'val_loss', 'rmse' and 'val_rmse'.
    """
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    # The model is compiled with an RMSE metric named "rmse"; these are error
    # values, not accuracies, so they are labelled as RMSE below.
    rmse = history.history['rmse']
    val_rmse = history.history['val_rmse']

    epochs = range(len(loss))

    # Plot loss
    plt.plot(epochs, loss, label='training_loss')
    plt.plot(epochs, val_loss, label='val_loss')
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.legend()

    # Plot RMSE on its own figure
    plt.figure()
    plt.plot(epochs, rmse, label='training_rmse')
    plt.plot(epochs, val_rmse, label='val_rmse')
    plt.title('RMSE')
    plt.xlabel('Epochs')
    plt.legend()

    
# Draw the curves for the training run recorded in `history`.
plot_loss_curves(history)

# Evaluate

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error


def calculate_results(y_true, y_pred):
    """
    Computes regression error metrics for a set of predictions.

    Args:
      y_true: ground-truth target values.
      y_pred: predicted values, one per entry of y_true.

    Returns:
      Dict with the keys "Mean Absolute Error", "Mean Square Error",
      "Root Mean Square Error" and "Max Error".
    """
    mean_sq_err = mean_squared_error(y_true, y_pred)

    return {
        "Mean Absolute Error": mean_absolute_error(y_true, y_pred),
        "Mean Square Error": mean_sq_err,
        # RMSE is derived from the MSE rather than recomputed from the data.
        "Root Mean Square Error": np.sqrt(mean_sq_err),
        "Max Error": max_error(y_true, y_pred),
    }

def calculate_results_scaled(y_true, y_pred, scaler=None):
    """
    Computes regression metrics after mapping scaled values back to original units.

    Args:
      y_true: scaled ground-truth values (array-like).
      y_pred: scaled predictions (array-like), one per entry of y_true.
      scaler: fitted scaler exposing `inverse_transform`. When omitted, the
        notebook-level `minmax_scaler` is used for backward compatibility.

    Returns:
      The dict produced by `calculate_results` on the inverse-transformed values.

    Raises:
      NameError: if no `scaler` is passed and `minmax_scaler` is not defined.
    """
    if scaler is None:
        # No `minmax_scaler` is created anywhere in the visible notebook, so
        # fail with an actionable message instead of a bare NameError.
        try:
            scaler = minmax_scaler
        except NameError:
            raise NameError(
                "calculate_results_scaled needs a fitted scaler: pass `scaler=` "
                "or define a global `minmax_scaler` first"
            ) from None

    # Reshape to a single column before calling inverse_transform; np.asarray
    # lets plain lists through as well.
    inv_y_true = scaler.inverse_transform(np.asarray(y_true).reshape(-1, 1))
    inv_y_pred = scaler.inverse_transform(np.asarray(y_pred).reshape(-1, 1))

    return calculate_results(inv_y_true, inv_y_pred)


In [None]:
# Predict on the validation pipeline. valid_data is built without shuffling,
# so the predictions line up with y_test row for row.
model_pred_probs = model.predict(valid_data)
# Score the validation predictions against the held-out targets.
model_results = calculate_results(y_test, model_pred_probs)

In [None]:
# Display the validation metrics dict as the cell output.
model_results

# Top 10% Wrong

In [None]:
# Drop the size-1 axes from the predictions so they can be compared with y_test.
y_pred = np.squeeze(model_pred_probs)
# Signed residual (true - predicted) and its square for every validation row.
error = np.squeeze(y_test - y_pred)
square_error = error ** 2

# One row per validation example for error inspection.
validation_df = pd.DataFrame(
    {
        'true': y_test,
        'pred': y_pred,
        'error': error,
        'square_error': square_error,
    }
)



In [None]:
# Rank by squared error so that large over-predictions and large
# under-predictions both count as "most wrong". Sorting by the signed `error`
# column would only surface rows where the true value exceeds the prediction.
top_100_wrong = validation_df.sort_values("square_error", ascending=False).head(100)
# Display the 100 worst validation predictions as the cell output.
top_100_wrong

# Prediction

In [None]:
# Test pipeline: the inputs in file order, batched and prefetched (no
# shuffling, so predictions stay aligned with test_df rows).
test_data = (
    tf.data.Dataset.from_tensor_slices(test_df["image_hist"].to_list())
    .batch(BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [None]:
# Run the trained model over the test pipeline.
model_predictions = model.predict(test_data)

In [None]:
# Display the raw test predictions as the cell output.
model_predictions

In [None]:
# Pair every test id with its predicted distance (size-1 axes squeezed out).
submission_df = pd.DataFrame(
    {
        "id": test_df["id"],
        "distance": model_predictions.squeeze(),
    }
)

# Write the submission file without the DataFrame index column.
submission_df.to_csv("submission-best.csv", index=False)