In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

In [None]:
# Quick first look at the label file (the walkthrough below loads it again as `labels_csv`)
labels = pd.read_csv("../input/dog-breed-identification/labels.csv")
# A bare `labels.columns` in the middle of a cell is never displayed, so print it explicitly
print(labels.columns)
# Preview a few rows instead of rendering the whole table
labels.head()

# End-to-End multi-class Dog breed classification

This notebook builds an end-to-end multi-class image classifier using TensorFlow 2.0 and TensorFlow Hub.

## 1.Problem

Identifying the breed of a dog given an image of a dog.

When I'm sitting at the cafe and I take a photo of a dog, I want to know what breed of dog it is.

## 2.Data

The data we're using is from Kaggle's dog breed identification competition: https://www.kaggle.com/c/dog-breed-identification/data

## 3. Evaluation
 
The evaluation is a file with prediction probabilities for each dog breed of each test image
https://www.kaggle.com/c/dog-breed-identification/overview/evaluation

## 4. Features

Some information about the data:
* We're dealing with images (unstructured data), so it's probably best we use deep learning / transfer learning.
* There are 120 breeds of dogs(this means there are 120 different classes).
* There are around 10,000+ images in the training set(these images have labels)
* There are around 10,000+ images in the test set (these images have no labels, because we'll want to predict them).





## Get our workspace ready

1. Import TensorFlow 2.x
2. Import TensorFlow Hub
3. Make sure we're using a GPU

In [None]:
# Import Tensorflow into Colab
import tensorflow as tf
import tensorflow_hub as hub
# Report the library versions in use
print("TF Version", tf.__version__)
print("TF Hub version", hub.__version__)

# Check for GPU availability: an empty device list means no GPU is visible to TensorFlow
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_status = "available(YESS!!!)" if gpu_devices else "not available"
print("GPU", gpu_status)

## Getting our Data ready Turning into Tensors

With all machine learning models, our data has to be in numerical format, so that's what we'll be doing first: turning our images into Tensors (numerical representations).

Let's start by accessing our data and checking out the labels.


In [None]:
# Checkout the labels of our data
import pandas as pd

labels_csv = pd.read_csv("../input/dog-breed-identification/labels.csv")
# Leave the summary as the cell's last expression so Jupyter renders it as a table
# instead of the plain-text dump that print() produces
labels_csv.describe()

In [None]:
# Preview the first rows of the labels table
labels_csv.head()

In [None]:
# How many images are there of each breed?
# Count the rows per breed, then draw the counts as a bar chart
breed_counts = labels_csv["breed"].value_counts()
breed_counts.plot.bar(figsize=(20, 30))

In [None]:
# Get the mean number of images per breed

labels_csv["breed"].value_counts().mean()

In [None]:
# Get the median number of images per breed (less affected than the mean by unusually large or small breeds)
labels_csv["breed"].value_counts().median()

In [None]:
# Let's view one training image inline using IPython's Image display helper
from IPython.display import Image
Image("../input/dog-breed-identification/train/001513dfcb2ffafc82cccf4d8bbaba97.jpg")

### Getting images and their labels

let's get a list of all of our image file pathnames.

In [None]:
# Reminder of the table layout: the `id` column is used below to build the image file paths
labels_csv.head()

In [None]:
# Create pathnames from image ID's

filenames = ["../input/dog-breed-identification/train/"+fname+".jpg" for fname in labels_csv["id"]]
# Show only the first few paths; echoing the whole list floods the cell output
filenames[:10]

In [None]:
# Check wheather number of filenames matches number of actual image files

import os
# Compare the number of files on disk with the number of paths built from the CSV
n_files_on_disk = len(os.listdir("../input/dog-breed-identification/train/"))
if n_files_on_disk != len(filenames):
    print("Filename do not match actual amount of files check the target directoru")
else:
    print("Filenames match actual amount of files !! proceed")


In [None]:
# One more check: display the training image at position 9000 of the path list
Image(filenames[9000])

In [None]:
# ...and the breed recorded for that same row, to compare against the picture above
labels_csv["breed"][9000]

Since we've now got our training image filepaths in a list,
let's prepare our labels.


In [None]:
import numpy as np
# Pull the breed column out as a NumPy array of labels.
# NOTE: this rebinds `labels`, which an earlier cell used for the raw DataFrame.
labels = labels_csv["breed"].to_numpy()
# labels = np.array(labels) # does same thing as above
labels

In [None]:
# Number of labels (one per row of labels_csv)
len(labels)

In [None]:
# See if the number of labels matches the number of filenames
labels_match_files = len(labels) == len(filenames)
if not labels_match_files:
    print("Number of labels does not match number of fileames, check data directories")
else:
    print("Number of labels matches number of filenames")

In [None]:
# Find the unique label values (np.unique returns them sorted) and count them
unique_breeds = np.unique(labels)
len(unique_breeds)

In [None]:
# Turn a single label into an array of booleans: True only at the position of that breed in unique_breeds
print(labels[0])
labels[0] == unique_breeds

In [None]:
# Turn every label into a boolean array (one array per image, one entry per unique breed)
boolean_labels = [label == unique_breeds for label in labels]
boolean_labels[:2]

In [None]:
# Should equal the number of labels/images
len(boolean_labels)

In [None]:
# Example: turning a boolean array into integers
print(labels[0]) # original label
print(np.where(unique_breeds == labels[0])) # index where the label occurs in unique_breeds
print(boolean_labels[0].argmax()) # index where the label occurs in the boolean array
print(boolean_labels[0].astype(int)) # there will be a 1 where the sample's label occurs

### Creating our own validation set

Since the dataset from kaggle doesn't come with validation set that's why we're creating a new validation set from the data.

In [None]:
# Setup X and y variables: X holds the image file paths, y the matching boolean label arrays

X = filenames
y = boolean_labels


In [None]:
# Total number of training image paths available
len(filenames)

We're going to start off experimenting with ~1000 images and increase as needed.

In [None]:
# Set number of images to use for experimenting (the first NUM_IMAGES entries of X and y are used below)
NUM_IMAGES = 1000 #@param {type:"slider", min:1000, max:10000,step:1000}
NUM_IMAGES

In [None]:
# Let's split our data into train and validation sets

from sklearn.model_selection import train_test_split

# Take the first NUM_IMAGES samples, then hold out 20% of them for validation
# (fixed random_state so the split is the same on every run)
X_subset, y_subset = X[:NUM_IMAGES], y[:NUM_IMAGES]
X_train, X_val, y_train, y_val = train_test_split(
    X_subset,
    y_subset,
    test_size=0.2,
    random_state=42,
)

len(X_train), len(y_train), len(X_val), len(y_val)

In [None]:
# Let's have a look at the training data: the first five paths and the first two label arrays
X_train[:5], y_train[:2]


# Preprocessing images (Turning images into Tensors)

To preprocess our images into Tensors we're going to write a function which does a few things:

* Take an image filepath as input
* Use Tensorflow to read the file and save it to a variable `image`

* Turn `image` (a jpg) into Tensors
* Normalize our image (Convert color channel values from 0-255 to 0-1)
* Resize the `image` to be a shape of (224, 224)
* Return the modified `image`

Before we do, let's see what importing an image looks like.

In [None]:
# Convert an image to a NumPy array and inspect its shape
from matplotlib.pyplot import imread
image = imread(filenames[42])
image.shape

In [None]:
# The pixel values of the image array
image

In [None]:
# Largest value in the image array
image.max()

In [None]:
# First two entries along the first axis of the image array
image[:2]

In [None]:
# Turn the NumPy image into a Tensor
tf.constant(image)

Now we've seen what an image looks like as a Tensor, let's make a function to preprocess them.

In [None]:
# Side length (in pixels) that every image is resized to
IMG_SIZE = 224

# Image preprocessing: file path in, resized float Tensor out
def process_image(image_path, img_size = IMG_SIZE):
    """
    Read the image file at `image_path` and return it as a float32 Tensor
    resized to (IMG_SIZE, IMG_SIZE) with 3 colour channels.

    NOTE(review): `img_size` is accepted but not used; the resize always uses
    the module-level IMG_SIZE. The demo cell further down passes a label in
    this position, so honouring the argument needs that call fixed first.
    """
    # Raw file contents
    raw_bytes = tf.io.read_file(image_path)
    # Decode the JPEG into a numerical Tensor with 3 colour channels (red, green, blue)
    decoded = tf.image.decode_jpeg(raw_bytes, channels = 3)
    # Convert the colour channel values from 0-255 integers to 0-1 floats
    scaled = tf.image.convert_image_dtype(decoded, tf.float32)
    # Resize to the desired square size (224, 224)
    return tf.image.resize(scaled, size = [IMG_SIZE, IMG_SIZE])
    

In [None]:
# Read the raw contents of one image file with TensorFlow (not yet decoded)
tensor = tf.io.read_file(filenames[26])
# tensor

In [None]:
# tf.image.decode_jpeg(tensor, channels = 3)

## Turning our Data into batches

why to turn our data into batches?

Let's say you're trying to process 10,000+ images in one go.. They all might not fit into memory.

That's why we process about 32 images at a time (this is the batch size; you can manually adjust it if need be).

In order to use TensorFlow effectively, we need our data in the form of Tensor tuples which look like this:

`(image, label)`

In [None]:
# Create a simple function to retur a tuple (image, label)
def get_image_label(image_path, label):
    """
    Pair a preprocessed image with its label.

    Runs `process_image` on `image_path` and returns the tuple
    (image, label); `label` is passed through unchanged.
    """
    return process_image(image_path), label

In [None]:
# Demo of the above: get_image_label returns an (image, label) tuple.
# The previous version called process_image and passed the label as its second argument by mistake.
get_image_label(X[42], tf.constant(y[42]))



Now we've got a way to turn our data into tuples of Tensors
in the form of `(image, label)`, let's make a function to turn all of our data (`X` and `y`) into batches.

In [None]:
# Define the batch size, 32 is a good start
BATCH_SIZE = 32

# Create a function to turn data into batches
def create_data_batches(X, y = None, batch_size = BATCH_SIZE, valid_data = False, test_data = False):
    """
    Create batches of data out of image (X) and label (y) pairs.

    Args:
        X: image file paths.
        y: labels matching X; not needed (and ignored) when test_data is True.
        batch_size: number of samples per batch (defaults to BATCH_SIZE).
        valid_data: True for validation data, which is batched without shuffling.
        test_data: True for test data, which has no labels; takes precedence
            over valid_data.

    Returns:
        A batched tf.data.Dataset: images only for test data, otherwise
        (image, label) pairs. Training data (neither flag set) is shuffled.

    Raises:
        ValueError: if labels are missing for training or validation data.
    """
    # If the data is a test dataset, we don't have labels
    if test_data:
        print("Creating test data batches...")
        data = tf.data.Dataset.from_tensor_slices(tf.constant(X)) # only filepaths (no labels)
        return data.map(process_image).batch(batch_size)

    if y is None:
        raise ValueError("y (labels) must be provided unless test_data=True")

    if valid_data:
        print("Creating validation data batches...")
    else:
        print("Creating training data batches....")

    # Turn filepaths and labels into a dataset of (path, label) pairs
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X),
                                               tf.constant(y)))

    # Only training data is shuffled. Shuffling the paths and labels before
    # mapping the image processor is cheaper than shuffling decoded images.
    if not valid_data:
        data = data.shuffle(buffer_size = len(X))

    # Create (image, label) tuples (this also turns each path into a preprocessed image),
    # then group them into batches of the requested size
    return data.map(get_image_label).batch(batch_size)

In [None]:
# Create training (shuffled) and validation (not shuffled) data batches
train_data = create_data_batches(X_train, y_train)
val_data = create_data_batches(X_val,y_val, valid_data = True)

In [None]:
# Check out the element specs (shapes and dtypes) of our data batches
train_data.element_spec, val_data.element_spec

In [None]:
# The boolean label array of the first sample, for comparison with the label spec above
y[0]

### Visualizing Data Batches

Our data is now in batches, however these can be a little hard to understand/comprehend , let's visualize the batches

In [None]:
import matplotlib.pyplot as plt

# Create a function for viewing images in a data batch

def show_25_images(images, labels):
    """
    Display up to 25 images and their labels from a data batch in a 5x5 grid.

    Args:
        images: indexable batch of images accepted by `plt.imshow`.
        labels: indexable batch of label arrays; the position of each array's
            maximum selects the breed name from `unique_breeds`.
    """
    # Setup the figure
    plt.figure(figsize = (10,10))

    # A batch may hold fewer than 25 images (e.g. a small final batch),
    # so never index past the end of it
    for i in range(min(25, len(images))):
        plt.subplot(5, 5, i+1)
        # Display an image
        plt.imshow(images[i])
        # Add the image label as the title
        plt.title(unique_breeds[labels[i].argmax()])
        # Turn the grid lines off
        plt.axis("off")
        

In [None]:
# Pull the first batch out of the training dataset as NumPy arrays
train_images, train_labels = next(train_data.as_numpy_iterator())
len(train_images), len(train_labels)

In [None]:
# Now let's visualize the images in that training batch
show_25_images(train_images, train_labels)

In [None]:
# Visualize the first batch of the validation set
val_images, val_labels =  next(val_data.as_numpy_iterator())
show_25_images(val_images, val_labels)

## Building the models

Before we build a model, there are a few things we need to define:

* The input shape (our images shape, in the form of Tensors) to our model.
* The output shape(image labels, in the form of Tensors) of our Model.
* The URL of the model we want to use. from tensorflow hub

https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/5


In [None]:
# Setup input shape to the model: [batch, height, width, colour channels]; None leaves the batch size open

INPUT_SHAPE = [None, IMG_SIZE, IMG_SIZE, 3]


# Setup output shape of our model: one output per unique breed
OUTPUT_SHAPE = len(unique_breeds)

# Setup Model URL from TensorFlow Hub
MODEL_URL = "https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/5"

Now we've got our inputs, outputs and model ready to go.
Let's put them together into a Keras deep learning model.


Knowing this, let's create a function which:
* Takes the input shape, output shape and the model we've chosen as parameters.
* Defines the layers in a Keras model in sequential fashion (do this first, then this, then that).
* Compiles the model (says how it should be evaluated and improved).
* Build the model (tells the model the input shape it'll be getting)
* Finally return the model.


In [None]:
# Create a function which builds a Keras model

def create_model(input_shape = INPUT_SHAPE, output_shape = OUTPUT_SHAPE, model_url = MODEL_URL):
    """
    Build and compile a Keras Sequential model: a TensorFlow Hub layer
    followed by a softmax Dense output layer.

    Args:
        input_shape: shape passed to `model.build()` (defaults to INPUT_SHAPE).
        output_shape: number of units in the output layer (defaults to OUTPUT_SHAPE).
        model_url: TensorFlow Hub URL loaded by `hub.KerasLayer` (defaults to MODEL_URL).

    Returns:
        The compiled and built model.
    """
    # Use the arguments rather than the module-level constants so callers can override them
    print("Building model with: ", model_url)

    # Setup the model layers
    model = tf.keras.Sequential([hub.KerasLayer(model_url), # Layer 1 (input layer)
                                 tf.keras.layers.Dense(units = output_shape,
                                                       activation = "softmax")]) # Layer 2 (output layer)

    # Compile the model
    model.compile(loss = tf.keras.losses.CategoricalCrossentropy(),
                  optimizer = tf.keras.optimizers.Adam(),
                  metrics = ["accuracy"])

    # Build the model (tell it the input shape it will receive)
    model.build(input_shape)

    return model

In [None]:
# Build a model with the default settings and print its layer summary
model = create_model()
model.summary()

## Creating callbacks

Callbacks are helper functions a model can use during training to do such things as save its progress, check its progress or stop training early if the model stops improving.


we'll create two callbacks, one for TensorBoard which helps track our models progress and another for early stopping which prevents our model from training for too long.

### TensorBoard CallBack

To set up a TensorBoard callback, we need to do 3 things:
1. Load The TensorBoard Notebook extension
2. Create a TensorBoard callback which is able to save logs to a directory and pass it to our models `fit()` functions
3. Visualize our models training logs with the `%tensorboard` magic function (we'll do this after model training).

In [None]:
# Load the TensorBoard notebook extension (provides the `%tensorboard` magic used later)
%load_ext tensorboard



In [None]:
! mkdir logs
! ls


In [None]:
import datetime

# Create a function to buiild a TensorBoard callback
def create_tensorboard_callback():
    """
    Return a TensorBoard callback that writes to a new timestamped
    sub-directory of `logs`, so each call gets its own log folder.
    """
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    run_logdir = os.path.join('logs', run_stamp)
    return tf.keras.callbacks.TensorBoard(run_logdir)
    

## Early Stopping Callback

Early stopping helps stop our model from overfitting by stopping training if a certain evaluation metric stops improving.


In [None]:
# Create early stopping callback: monitors validation accuracy with a patience of 3 epochs

early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                 patience = 3)

## Training a model (on subset of data)

our first model is only going to train on 1000 images, to make sure everything is working

In [None]:
# Maximum number of epochs to train for (the early stopping callback may end training sooner)
NUM_EPOCHS = 100  #@param {type:"slider", min:10, max:100, step:10}

In [None]:
# Check to make sure we're still running on a GPU before starting training
print("GPU", "avilable (yess)" if tf.config.list_physical_devices("GPU") else "not avilable")

Let's create a function which trains a model.

* Create a model using `create_model()`
* Setup a TensorBoard callback using `create_tensorboard_callback()`
* Call the `fit()` function on our model, passing it the training data, validation data, number of epochs to train for (`NUM_EPOCHS`) and the callbacks we'd like to use
* Return the model

In [None]:
# Build a function to train and return a trained model

def train_model():
    """
    Create a new model with `create_model()`, fit it on `train_data`
    (validating on `val_data` every epoch) and return the fitted model.
    """
    model = create_model()

    # A fresh TensorBoard callback per run, plus the shared early stopping callback
    fit_callbacks = [create_tensorboard_callback(), early_stopping]

    model.fit(
        x = train_data,
        epochs = NUM_EPOCHS,
        validation_data = val_data,
        validation_freq = 1,
        callbacks = fit_callbacks,
    )
    return model
    

In [None]:
# Fit the model to the data
model = train_model()


In [None]:
# List the working directory after training
! ls


In [None]:
! cd logs

In [None]:
# List the current working directory
! ls

### Checking the TensorBoard Logs
The TensorBoard magic function (`%tensorboard`) will access the logs directory we created earlier and visualize its contents

In [None]:
! kill 6484
%tensorboard --logdir logs


## Making and evaluating prediction using trained model

In [None]:
# The batched validation dataset we will predict on
val_data

In [None]:
# Make predictions on the validation data (not used to train on)
prediction = model.predict(val_data, verbose = 1)
prediction

In [None]:
# Shape of the prediction array
prediction.shape

In [None]:
# Number of validation labels, to compare with the first dimension of `prediction`
len(y_val)

In [None]:
# Number of breeds, to compare with the second dimension of `prediction`
len(unique_breeds)

In [None]:
# Inspect the prediction for one validation sample
index = 69
# Print the same sample that the statistics below describe (previously this printed prediction[0])
print(prediction[index])
print(f"Max value (probability of prediction):{np.max(prediction[index])}")
print(f"Sum:{np.sum(prediction[index])}")
print(f"Max index: {np.argmax(prediction[index])}")
print(f"Predicted label: {unique_breeds[np.argmax(prediction[index])]}")

Having the above functionality is great but we want to be able to do it at scale.

It would be even better if we could see the image the prediction is being made on!

**Note:** Predicition probabilities are also known as `confidence level`

In [None]:
# Turn probabilities into their respective label (Easier to understand)

def get_pred_label(prediction_probabilities):
    """
    Map an array of prediction probabilities to a breed name: the entry of
    `unique_breeds` at the position of the largest probability.
    """
    best_index = np.argmax(prediction_probabilities)
    return unique_breeds[best_index]

# Get a predicted label based on an array of prediction probabilities (validation sample 69)

pred_label = get_pred_label(prediction[69])
pred_label

Since our validation data is still in a batch dataset,
we'll have to unbatchify it to make predictions on the validation images and then compare those predictions to the validation labels (truth labels).

In [None]:
# Collect every validation image and its label array from the batched dataset
images_ = []
labels_ = []

# loop through unbatched data
for image, label in val_data.unbatch().as_numpy_iterator():
    images_.append(image)
    labels_.append(label)
    
labels_[0], images_[0]

In [None]:
# Create a function to unbatch a batch dataset
def unbatchify(data):
    """
    Split a batched dataset of (image, label) pairs into two lists.

    Returns (images, labels), where each label has been converted to the
    breed name in `unique_breeds` at the position of the label array's maximum.
    """
    # Materialise the individual (image, label) pairs once, then split them
    pairs = list(data.unbatch().as_numpy_iterator())
    images = [image for image, _ in pairs]
    labels = [unique_breeds[np.argmax(label_array)] for _, label_array in pairs]
    return images, labels


# Unbatchify the validation data.
# NOTE: this rebinds val_images/val_labels; val_labels now holds breed names rather than label arrays.

val_images , val_labels = unbatchify(val_data)
val_images[0], val_labels[0]


In [None]:
# get_pred_label expects an array of prediction probabilities, but val_labels[0] is already a breed name
# (see unbatchify). Show the predicted label for the first validation image next to its true label instead.
get_pred_label(prediction[0]), val_labels[0]

Now we've got ways to get :

* Prediction labels
* validation labels (truth labels)
* validation images

Let's make some functions to make these all a bit more visual.

we'll create a function which:
* Takes an array of prediction probabilities, an array of truth labels, an array of images and an integer.
* Convert the prediction probabilities to a predicted label.
* plot the predicted label, its predicted probability, the truth label and the target image on a single plot.

In [None]:
def plot_pred(prediction_probabilities, labels, images, n = 1):
    """
    Show the image of sample n with a title made of the predicted label,
    the prediction's top probability (as a percentage) and the true label.

    The title is green when the predicted label equals the true label and
    red otherwise.
    """
    pred_prob = prediction_probabilities[n]
    true_label = labels[n]
    sample_image = images[n]

    # Label the model considers most likely for this sample
    predicted = get_pred_label(pred_prob)

    # Draw the image without axis ticks
    plt.imshow(sample_image)
    plt.xticks([])
    plt.yticks([])

    # Colour encodes whether the prediction is right or wrong
    title_color = "green" if predicted == true_label else "red"

    title_text = "{} {:2.0f}% {}".format(predicted, np.max(pred_prob)*100, true_label)
    plt.title(title_text, color = title_color)
    
    

In [None]:
# Plot the prediction, its confidence and the true label for validation sample 77
plot_pred(prediction_probabilities=prediction,
         labels = val_labels,
         images = val_images,
         n = 77)


Now we've got one function to visualize our model's top prediction, let's make another to view our model's top 10 predictions.

This functions will:
* Take an input of prediction probabilities array and a ground truth array and an integer.
* Find the prediction using `get_pred_label()`
* Find the top 10:
    * Prediction probabilities indexes
    * Prediction probabilities values
    * Predicition labels
* Plot the top 10 prediction probability values and labels, coloring the true label green

In [None]:
def plot_pred_conf(prediction_probabilities, labels, n=1):
    """
    Plot the 10 highest prediction confidences for sample n as a bar chart.

    Bars are grey; if the true label of sample n is among the top 10
    predicted labels, its bar is coloured green.

    Args:
        prediction_probabilities: indexable collection of probability arrays, one per sample.
        labels: indexable collection of true labels (breed names), one per sample.
        n: index of the sample to plot.
    """
    pred_prob, true_label = prediction_probabilities[n], labels[n]

    # Indexes of the 10 largest probabilities, highest first
    top_10_pred_indexes = pred_prob.argsort()[-10:][::-1]

    # The matching confidence values and breed names
    top_10_pred_values = pred_prob[top_10_pred_indexes]
    top_10_pred_labels = unique_breeds[top_10_pred_indexes]

    # Setup plot
    bar_positions = np.arange(len(top_10_pred_labels))
    top_plot = plt.bar(bar_positions, top_10_pred_values, color ="grey")

    plt.xticks(bar_positions,
               labels = top_10_pred_labels,
               rotation = "vertical")

    # Highlight the true label's bar when it made the top 10
    if np.isin(true_label, top_10_pred_labels):
        top_plot[np.argmax(top_10_pred_labels == true_label)].set_color('green')
    
    

In [None]:
# Plot the top 10 prediction confidences for validation sample 9
plot_pred_conf(prediction_probabilities = prediction, labels = val_labels, n = 9)

Now we've got some functions to help us visualize our predictions and evaluate our model, let's check out a few predictions.

In [None]:
# Let's check out a few predictions and their different values
i_multiplier = 10   # index of the first sample to show
num_rows = 3
num_cols = 2
num_images = num_rows* num_cols

# Each sample takes two side-by-side slots: its image, then its top-10 confidence chart
plt.figure(figsize =(10*num_cols, 5*num_rows))
for i in range(num_images):
    sample_index = i + i_multiplier
    image_slot = 2*i + 1

    plt.subplot(num_rows, 2*num_cols, image_slot)
    plot_pred(prediction_probabilities = prediction,
              labels = val_labels,
              images = val_images,
              n = sample_index)

    plt.subplot(num_rows, 2*num_cols, image_slot + 1)
    plot_pred_conf(prediction_probabilities = prediction,
                   labels = val_labels,
                   n = sample_index)
plt.tight_layout(h_pad =1.0)
plt.show()
    

 Possible next step: create a confusion matrix from the model's predictions and the true labels?
 
 

## Save and Reload Models

In [None]:
# Create a function to save a model
def save_model(model , suffix = None):
    """
    Save a model as an HDF5 file in the `models` directory and return its path.

    The file name is the current timestamp (YYYYmmdd-HHMMSS), followed by
    `-<suffix>` when a suffix is given, followed by `.h5`.

    Args:
        model: object with a `save(path)` method (e.g. a Keras model).
        suffix: optional string appended to the file name.

    Returns:
        The path the model was saved to.
    """
    # `%S` is seconds; the previous `%s` is not a standard strftime directive and is platform-dependent
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    # Only append the suffix when one was given ('-' + None would raise TypeError)
    filename = f"{timestamp}-{suffix}.h5" if suffix else f"{timestamp}.h5"

    # Make sure the target directory exists before saving
    os.makedirs("models", exist_ok = True)
    model_path = os.path.join("models", filename)

    print(f"saving model to: {model_path}...")
    model.save(model_path)
    return model_path


In [None]:
# Create a function to load a trained model
def load_model(model_path):
    """
    Load and return the Keras model stored at `model_path`.
    """
    print(f"Loading saved model from: {model_path}")
    # Pass hub.KerasLayer as a custom object so Keras can resolve the "KerasLayer" name when loading
    custom_layers = {"KerasLayer": hub.KerasLayer}
    return tf.keras.models.load_model(model_path, custom_objects = custom_layers)

Now we've created the functions to save and load a trained model, let's make sure they work.


In [None]:
# Save the model trained on the 1000-image subset (save_model returns the path it wrote to)
save_model(model, suffix = "1000-images-mobilenetv2-Adam")

In [None]:
# Load the trained model.
# Look the file up by its suffix instead of a hard-coded timestamped path, which only exists in the
# session that created it. File names start with a timestamp, so the last name in sorted order
# is taken as the most recent save.
saved_1000_image_models = sorted(fname for fname in os.listdir("models")
                                 if fname.endswith("-1000-images-mobilenetv2-Adam.h5"))
loaded_1000_image_model = load_model(os.path.join("models", saved_1000_image_models[-1]))

In [None]:
# Evaluate the in-memory model (before saving/loading) on the validation data
model.evaluate(val_data)

In [None]:
# Evaluate the loaded model on the same data; the results should match the cell above
loaded_1000_image_model.evaluate(val_data)

## Training a big dog model on the full data

In [None]:
# Size of the full dataset (paths and labels)
len(X), len(y)

In [None]:
# Size of the training subset used so far, for comparison
len(X_train)

In [None]:
# Create data batches from the full dataset (training mode, so the data is shuffled)
full_data = create_data_batches(X, y)


In [None]:
# The batched full dataset
full_data

In [None]:
# Create a fresh model to train on the full dataset
full_model = create_model()

In [None]:
# Create full model callbacks
full_model_tensorboard = create_tensorboard_callback()

# No validation set when training on all the data, so we monitor training accuracy instead of validation accuracy
full_model_early_stopping = tf.keras.callbacks.EarlyStopping(monitor="accuracy",
                                                            patience = 3)


In [None]:
# Fit the full model to the full data (no validation data is passed here)
full_model.fit(x= full_data,
              epochs = NUM_EPOCHS,
              callbacks = [full_model_tensorboard, full_model_early_stopping])

In [None]:
# Save the model trained on the full image set
save_model(full_model, suffix ="full-image-set-mobilenetv2-Adam")

In [None]:
# Look the saved full model up by its suffix instead of a hard-coded timestamped path, which only
# exists in the session that created it. File names start with a timestamp, so the last name in
# sorted order is taken as the most recent save.
saved_full_models = sorted(fname for fname in os.listdir("models")
                           if fname.endswith("-full-image-set-mobilenetv2-Adam.h5"))
loaded_full_model = load_model(os.path.join("models", saved_full_models[-1]))


In [None]:
# Number of images the full model was trained on
len(X)

# Making predictions on the test datasets

since our model has been trained on images in the form of Tensor batches
to make predictions on the test data, we'll have to get it into the same format.

We created `create_data_batches()` earlier, which can take a list of filenames as input and convert them into Tensor batches.

To make predictions on the test data we'll:
* Get the test image filenames
* Convert the filenames into test data batches `create_data_batches` and setting the `test_data` parameter to `True` (since the test data doesn't have labels).
* Make a predictions array by passing the test batches to the `predict()` method called on our model.


In [None]:
# Build the full path of every file in the test image directory
test_path = '../input/dog-breed-identification/test/'

test_filenames = [os.path.join(test_path, fname) for fname in os.listdir(test_path)]
test_filenames[:10]

In [None]:
# Number of test images found
len(test_filenames)

In [None]:
# Create test data batches (file paths only, no labels, not shuffled)
test_data = create_data_batches(test_filenames, test_data = True)

In [None]:
# The batched test dataset
test_data

**Note:**  Calling `predict()` on our full model and passing it the test data batch will take a long time to run

In [None]:
# Make predictions on the test data batches using the loaded full model

test_predictions= loaded_full_model.predict(test_data,verbose=1)

In [None]:
# Save the predictions NumPy array to a CSV file for access later
np.savetxt("preds_array.csv", test_predictions, delimiter = ",")

In [None]:
# Reload the saved predictions from the CSV file written above
test_predictions = np.loadtxt("preds_array.csv", delimiter = ",")

In [None]:
# First ten rows of prediction probabilities
test_predictions[:10]

In [None]:
# Shape of the test prediction array
test_predictions.shape

## Preparing test dataset predictions for kaggle

To get the data into the format Kaggle expects for submission, we'll:
* create a pandas dataframe with an ID column as well as a column for each dog breed
* add data to the ID column by extracting the test image ID's from their filepaths.
* add the prediction probabilities to each of the dog breed columns
* Export the dataframe as a csv to submit it to kaggle

In [None]:
# Create a pandas DataFrame with empty columns: `id` plus one column per breed
preds_df = pd.DataFrame(columns =['id']+ list(unique_breeds))

preds_df.head()

In [None]:
# Append test image ID's to prediction dataframe.
# Derive the IDs from `test_filenames` (the list the predictions were made from) rather than from a
# second os.listdir() call, whose ordering is not guaranteed to match the order of the predictions.
test_ids = [os.path.splitext(os.path.basename(path))[0] for path in test_filenames]
preds_df["id"] = test_ids

In [None]:
# Check that the id column has been filled in
preds_df.head()

In [None]:
# Add the prediction probabilities to each dog breed column
preds_df[list(unique_breeds)] =  test_predictions
preds_df.head()

In [None]:
# Save our prediction dataframe to CSV for submission (without the index column)
preds_df.to_csv("full_model_prediction_submission_1_mobilenetV2.csv",
               index= False)