In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Vision Workshop - Model Experimentation

## Overview

[Vision Workshop](https://github.com/mblanc/vision-workshop) is a series of labs on how to build an image classification system on Google Cloud. Throughout the Vision Workshop labs, you will learn how to read image data stored in a data lake, perform exploratory data analysis (EDA), train a model, register your model in a model registry, evaluate your model, deploy your model to an endpoint, and run real-time inference against your model.

### Objective

This notebook shows how to pull features from Feature Store for training, run exploratory data analysis on features, build a machine learning model locally, experiment with various hyperparameters, evaluate the model and deploy it to a Vertex AI endpoint.

This lab uses the following Google Cloud services and resources:

- [Vertex AI](https://cloud.google.com/vertex-ai/)

Steps performed in this notebook:

- Use a Feature Store to pull training data
- Do some exploratory analysis on the extracted data
- Train the model and track the results using Vertex AI Experiments

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI
pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing
Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

### Load configuration settings from the setup notebook

Set the constants used in this notebook and load the config settings from the `00_environment_setup.ipynb` notebook.

In [None]:
# Ask gcloud for the active project; `!cmd` captures the command's output lines in a list-like object.
GCP_PROJECTS = !gcloud config get-value project
PROJECT_ID = GCP_PROJECTS[0]
BUCKET_NAME = f"{PROJECT_ID}-vision-workshop"
# Read the shared settings file from the workshop bucket.
config = !gsutil cat gs://{BUCKET_NAME}/config/notebook_env.py
print(config.n)
# NOTE(review): exec() runs the downloaded text as Python in this kernel, so the bucket object must be trusted.
# Later cells read REGION, which this notebook never assigns — presumably it is defined by this exec; confirm.
exec(config.n)

### Mount Google Cloud Storage with gcsfuse

What if I told you there is no need to `gsutil cp -r `?

If you’ve developed machine learning models before, you know that data quality and governance issues are predominant. When developing models, you’ll spin up a Vertex AI Workbench Jupyter Notebook and copy some data from Cloud Storage. If the dataset is large, then you’ll wait some time while all data is copied to the notebook. Now you have two copies of the data. Multiply this X times the number of data scientists in your organization and now you have a data reconciliation problem.

Now, with Cloud Storage FUSE, you can mount Cloud Storage buckets as file systems on Vertex AI Workbench Notebooks and Vertex AI training jobs. This way you can keep all your data in a single repository (Cloud Storage) and make it available across multiple teams as a single source of truth.

#### Cloud Storage FUSE

Cloud Storage FUSE is a File System in User Space mounted on Vertex AI systems. It provides 3 benefits over the traditional ways of accessing Cloud Storage:

- Jobs can start quickly without downloading any data.
- Jobs can perform I/O easily at scale, without the friction of calling the Cloud Storage APIs, handling the responses, or integrating with client-side libraries.
- Jobs can leverage the optimized performance of Cloud Storage FUSE.

In all custom training jobs, Vertex AI mounts Cloud Storage buckets that you have access to in the /gcs/ directory of each training node’s filesystem. You can read and write directly to the local filesystem in order to read data from Cloud Storage or write data to Cloud Storage.

For Vertex AI Workbench Notebooks, Cloud Storage FUSE is supported with just a few steps and next we’ll go through how to do this. Let’s get started!

In [None]:
!fusermount -u /home/jupyter/gcs/{BUCKET_NAME}
!rm -rf ~/gcs

In [None]:
!mkdir -p ~/gcs/{BUCKET_NAME}

In [None]:
BUCKET_NAME

In [None]:
# Mount the bucket on the local directory created above so its objects can be read as ordinary files.
# NOTE(review): --implicit-dirs presumably makes folder-like object prefixes appear as directories;
# the remaining flags tune renames and HTTP connections — confirm against the gcsfuse documentation.
!gcsfuse --implicit-dirs \
--rename-dir-limit=100 \
--disable-http2 \
--max-conns-per-host=100 \
{BUCKET_NAME} /home/jupyter/gcs/{BUCKET_NAME}

In [None]:
import pathlib

# Image folder inside the mounted bucket.
data_dir = pathlib.Path("/home/jupyter/gcs", BUCKET_NAME, "flowers")

In [None]:
# Count the .jpg files located exactly one directory below data_dir.
image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))
print(image_count)

### Import libraries

In [None]:
import numpy as np
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import PIL
import PIL.Image
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import copy

import timm
import time
from datetime import datetime, timedelta
from google.cloud import aiplatform as vertex_ai

In [None]:
print(torchvision.__version__)

### Define constants

In [None]:
# Current Unix time in whole seconds, as text; appended to names below to make them unique per session.
TIMESTAMP = f"{int(time.time())}"

## Experiment
EXPERIMENT_NAME = f"vision-experiment-{TIMESTAMP}"

### Initialize clients

In [None]:
# Configure the Vertex AI SDK defaults for this session.
# REGION is not assigned in this notebook; it is expected to come from the loaded config.
vertex_ai.init(
    project=PROJECT_ID,
    location=REGION,
    staging_bucket=BUCKET_NAME,
    experiment=EXPERIMENT_NAME,
)

## Load data using a torchvision utility

Next, load these images off disk using the helpful `torchvision.datasets.ImageFolder` utility. This will take you from a directory of images on disk (one sub-directory per class) to a PyTorch `Dataset` in just a couple of lines of code. If you like, you can also write your own data loading code from scratch by visiting the [Writing Custom Datasets, DataLoaders and Transforms](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html) tutorial.

In [None]:
import pathlib

# Training images for this notebook, read through the mounted bucket.
data_dir = pathlib.Path("/home/jupyter/gcs", BUCKET_NAME, "aiornot", "train")

In [None]:
# Count the .jpg files located exactly one directory below data_dir.
image_count = sum(1 for _ in data_dir.glob('*/*.jpg'))
print(image_count)

### Create a dataset

Define some parameters for the loader:

In [None]:
# Architecture name handed to timm.create_model further down in this notebook
model_name = "maxxvit_rmlp_small_rw_256"
# Number of classes in the dataset
num_classes = 2
# Batch size for training (change depending on how much memory you have)
batch_size = 32
# Number of epochs to train for
num_epochs = 10
# Flag for feature extracting. When False, we finetune the whole model,
#   when True we only update the reshaped layer params
feature_extract = True

# Side length, in pixels, used by the resize/crop transforms defined below
input_size=256

It's good practice to use a validation split when developing your model. Use 80% of the images for training and 20% for validation.

In [None]:
# Preprocessing pipeline given to the ImageFolder dataset below: resize, random crop and flip,
# tensor conversion, then per-channel normalization.
data_transforms = transforms.Compose(
    [
        transforms.Resize(size=(input_size, input_size)),
        transforms.RandomResizedCrop(size=input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

In [None]:
!ls -l {data_dir}

In [None]:
# Remove the `models` entry from the training directory before the dataset is built from its sub-directories.
# NOTE(review): data_dir is inside the gcsfuse mount, so this presumably deletes the objects from the bucket itself — confirm that is intended.
!rm -rf {data_dir}/models

In [None]:
# Build the labelled image dataset from data_dir, applying data_transforms to every image.
ds = torchvision.datasets.ImageFolder(root=data_dir, transform=data_transforms)

In [None]:
# Random 80% / 20% split of the dataset into training and validation subsets.
train_ds, val_ds = torch.utils.data.random_split(
    ds,
    [round(len(ds) * 0.8), round(len(ds) * 0.2)],
)
image_datasets = dict(train=train_ds, val=val_ds)

In [None]:
# One DataLoader per split; both shuffle and use 4 worker processes.
dataloaders_dict = {
    split: torch.utils.data.DataLoader(
        image_datasets[split],
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
    )
    for split in ('train', 'val')
}

In [None]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

### Data exploration
Here we use a subset of data for data exploration and better understanding of the data.

You can find the class names in the `classes` attribute of the `ImageFolder` dataset. These correspond to the directory names in alphabetical order.

In [None]:
# ImageFolder exposes the class (sub-directory) names as `classes`;
# `targets` is the list of label indices with one entry per image, not the class names.
class_names = ds.classes
print(class_names)
num_classes = len(class_names)
num_classes

Here are the first nine images from one batch of the validation loader:

In [None]:
# Display names for the two label indices of this dataset.
# (The earlier flower-class list was overwritten immediately and has been dropped.)
class_names = ['0', '1']

In [None]:
# Walk through every validation batch and print its label tensor.
for i, (images, labels) in enumerate(dataloaders_dict['val']):
    print(labels)

In [None]:
images, labels = next(iter(dataloaders_dict['val']))

In [None]:
# Show up to nine images from one validation batch in a 3x3 grid, each titled with its own label.
# These are the statistics used by transforms.Normalize in data_transforms; undo them for display.
display_mean = np.array([0.485, 0.456, 0.406])
display_std = np.array([0.229, 0.224, 0.225])

plt.figure(figsize=(10, 10))
images, labels = next(iter(dataloaders_dict['val']))
# Pair each image with its own label: subplot positions are 1-based, tensor indices are 0-based.
for position, (image, label) in enumerate(zip(images[:9], labels[:9]), start=1):
    plt.subplot(3, 3, position)
    # (channels, height, width) -> (height, width, channels), as imshow expects.
    pixels = image.numpy().transpose((1, 2, 0))
    plt.imshow(np.clip(pixels * display_std + display_mean, 0.0, 1.0))
    plt.axis('off')
    plt.title(class_names[label.item()])
plt.show()

Overfitting generally occurs when there are a small number of training examples. Data augmentation takes the approach of generating additional training data from your existing examples by augmenting them using random transformations that yield believable-looking images. This helps expose the model to more aspects of the data and generalize better.

You will implement data augmentation using the following Keras preprocessing layers: tf.keras.layers.RandomFlip, tf.keras.layers.RandomRotation, and tf.keras.layers.RandomZoom. These can be included inside your model like other layers, and run on the GPU.

Visualize a few augmented examples by applying data augmentation to the same image several times:

In [None]:
# import numpy as np

# for images, labels in train_ds.take(1):
#     plt.figure(figsize=(10, 10))
#     first_image = images[0]
#     for i in range(9):
#         ax = plt.subplot(3, 3, i + 1)
#         augmented_image = data_augmentation(
#             tf.expand_dims(first_image, 0), training=True
#         )
#         plt.imshow(augmented_image[0].numpy().astype("int32"))
#         plt.title(train_ds.class_names[int(labels[0])])
#         plt.axis("off")

## Building a custom model

Make sure to use buffered prefetching, so you can yield data from disk without having I/O become blocking. These are two important methods you should use when loading data:

Dataset.cache keeps the images in memory after they're loaded off disk during the first epoch. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache.
Dataset.prefetch overlaps data preprocessing and model execution while training.
Interested readers can learn more about both methods, as well as how to cache data to disk in the Prefetching section of the [Better performance with the tf.data API](https://www.tensorflow.org/guide/data_performance) guide.

### Training

In this section, we will train a model using PyTorch. Typically, to perform training, you might want to use a Vertex AI training pipeline; however, as we are experimenting here, we simply use PyTorch interactively to train our model in this notebook.

We will test two different architectures, and will log our experiments in Vertex AI Experiments.

Let's start with a basic Keras model :

The Keras Sequential model consists of three convolution blocks (tf.keras.layers.Conv2D) with a max pooling layer (tf.keras.layers.MaxPooling2D) in each of them. There's a fully-connected layer (tf.keras.layers.Dense) with 128 units on top of it that is activated by a ReLU activation function ('relu'). This model has not been tuned for high accuracy; the goal of this tutorial is to show a standard approach.

In [None]:
from torchvision import models

#model_ft = models.efficientnet_v2_s(weights=torchvision.models.EfficientNet_V2_S_Weights.DEFAULT)
# Ask timm for a classifier sized to this dataset. With num_classes=0 timm builds the model
# without a classification layer, so it would emit pooled features rather than class logits.
model_ft = timm.create_model(model_name, num_classes=num_classes, pretrained=True)
# Freeze every parameter; the classifier head is unfrozen again in a following cell.
for param in model_ft.parameters():
    param.requires_grad = False

In [None]:

#self.model.fc = nn.Linear(n_features, 2)

In [None]:
# Re-enable gradients on whichever classifier attribute(s) this architecture exposes.
for head_attr in ('head', 'fc', 'classifier'):
    if not hasattr(model_ft, head_attr):
        continue
    for param in getattr(model_ft, head_attr).parameters():
        param.requires_grad = True

In [None]:
# timm models publish the width of the features feeding the classifier as `num_features`.
# Unlike `head.fc.in_features`, it is also available when the model has no classifier layer.
n_features = model_ft.num_features







In [None]:
# num_ftrs = model_ft.classifier[1].in_features
# model_ft.classifier[1] = nn.Linear(num_ftrs,num_classes)

#### Training the model
Before running the training loop, we can set some hyperparameters, which have a strong impact on performance. As a best practice, you can use Vertex AI Hyperparameter Tuning to automatically find the best parameters. However, in this notebook, for the sake of simplicity and expedience, we specify these hyperparameters manually and somewhat arbitrarily.

In [None]:
from tqdm import tqdm
def train_model(model, dataloaders, criterion, optimizer, num_epochs=25):
    """Train ``model`` and restore the weights of the epoch with the best validation accuracy.

    Every epoch runs a 'train' pass (gradients enabled, one optimizer step per batch)
    followed by a 'val' pass (gradients disabled). Loss and accuracy are averaged over
    the number of samples in each loader's dataset.

    Args:
        model: the torch module to train; it is updated in place.
        dataloaders: mapping with 'train' and 'val' entries. Each entry yields
            (inputs, labels) batches and exposes a sized ``.dataset``.
        criterion: callable taking (outputs, labels) and returning a scalar loss tensor.
        optimizer: optimizer stepped after each training batch.
        num_epochs: number of train+val epochs to run.

    Returns:
        A ``(model, history)`` tuple. ``model`` carries the best-validation-accuracy
        weights; ``history`` maps 'loss', 'accuracy', 'val_loss' and 'val_accuracy'
        to per-epoch lists (losses are floats, accuracies are 0-dim tensors).
    """
    since = time.time()

    # Move batches to wherever the model's weights already live, instead of relying on a
    # notebook-level `device` variable being defined and in sync with the model.
    run_device = next(model.parameters()).device

    history = {
        'accuracy': [],
        'val_accuracy': [],
        'loss': [],
        'val_loss': []
    }

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in tqdm(dataloaders[phase]):
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward; gradients are tracked only in the training phase
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)

                    # predicted class = index of the largest output per sample
                    _, preds = torch.max(outputs, 1)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics: weight the batch-mean loss by the batch size
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val':
                # keep a copy of the weights whenever validation accuracy improves
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())
                history['val_loss'].append(epoch_loss)
                history['val_accuracy'].append(epoch_acc)
            else:
                history['loss'].append(epoch_loss)
                history['accuracy'].append(epoch_acc)

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, history

In [None]:
model_ft = model_ft.to(device)

# Gather the parameters to be optimized/updated in this run. If we are
#  finetuning we will be updating all parameters. However, if we are
#  doing feature extract method, we will only update the parameters
#  whose requires_grad flag is True.
params_to_update = model_ft.parameters()
print("Params to learn:")
if feature_extract:
    params_to_update = []
    for name, param in model_ft.named_parameters():
        if param.requires_grad:
            params_to_update.append(param)
            print("\t", name)
else:
    for name, param in model_ft.named_parameters():
        if param.requires_grad:
            print("\t", name)

# Hand the optimizer exactly the parameters selected above, so that what is printed
# as "Params to learn" is what actually gets optimized.
optimizer_ft = optim.Adam(params_to_update, lr=0.003)

In [None]:
# Setup the loss fxn
criterion = nn.CrossEntropyLoss()

In [None]:
# epochs=5

# run_name=f"pytorch"
# vertex_ai.start_run(run=run_name)
# vertex_ai.log_params({"type": model_name})
# vertex_ai.log_params({"lr": 0.003})
# vertex_ai.log_params({"epochs": epochs})

Train the model for `num_epochs` epochs with the `train_model` function defined above:

In [None]:
from torchsummary import summary

# torchsummary places its dummy input on CUDA unless told otherwise; pass the device type
# actually in use so the summary also works on CPU-only machines.
summary(model_ft, input_size=(3, input_size, input_size), device=device.type)

In [None]:
model_ft

In [None]:
# Train and evaluate
# Run the training loop; `hist` receives the per-epoch loss/accuracy lists.
model_ft, hist = train_model(
    model=model_ft,
    dataloaders=dataloaders_dict,
    criterion=criterion,
    optimizer=optimizer_ft,
    num_epochs=num_epochs,
)

Create plots of the loss and accuracy on the training and validation sets:

In [None]:
def plot_loss_accuracy(history):
    """Plot training vs. validation accuracy and loss, one point per recorded epoch.

    Args:
        history: mapping with 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
            lists of equal length. Accuracy entries may be single-element tensors
            (on any device) or plain numbers.
    """
    # float() handles both single-element tensors and plain numbers.
    acc = [float(h) for h in history['accuracy']]
    val_acc = [float(h) for h in history['val_accuracy']]

    loss = history['loss']
    val_loss = history['val_loss']

    # Take the x-axis from the data itself rather than from a notebook-level epoch count,
    # so the plot still works when the history was produced with a different setting.
    epochs_range = range(len(acc))

    plt.figure(figsize=(8, 8))
    plt.subplot(2, 1, 1)
    plt.plot(epochs_range, acc, label='Training Accuracy')
    plt.plot(epochs_range, val_acc, label='Validation Accuracy')
    plt.legend(loc='lower right')
    plt.title('Training and Validation Accuracy')

    plt.subplot(2, 1, 2)
    plt.plot(epochs_range, loss, label='Training Loss')
    plt.plot(epochs_range, val_loss, label='Validation Loss')
    plt.legend(loc='upper right')
    plt.title('Training and Validation Loss')
    plt.show()
    
plot_loss_accuracy(hist)

In [None]:
# model_ft is a PyTorch module and has no Keras-style `.evaluate()`. Report the validation
# metrics that train_model recorded for the epoch whose weights it restored
# (the first epoch reaching the highest validation accuracy).
val_accuracies = [float(acc) for acc in hist['val_accuracy']]
best_epoch = max(range(len(val_accuracies)), key=val_accuracies.__getitem__)
loss = hist['val_loss'][best_epoch]
accuracy = val_accuracies[best_epoch]

# Parameters and metrics are logged against an experiment run, so open one first;
# the run is closed by the `vertex_ai.end_run()` cell that follows.
vertex_ai.start_run(run=f"pytorch-{TIMESTAMP}")
vertex_ai.log_params({"type": model_name, "lr": 0.003, "epochs": num_epochs})
vertex_ai.log_metrics({"loss": loss, "accuracy": accuracy})


In [None]:
vertex_ai.end_run()

In a second experiment we will try to fine tune an EfficientNetV2 architecture :

We can also extract all parameters and metrics associated with any experiment into a dataframe for further analysis.

In [None]:
experiment_df = vertex_ai.get_experiment_df()
experiment_df

Also we can visualize experiments in Cloud Console. Run the following to get the URL of Vertex AI Experiments for your project and click on that URL to see those results on the Cloud Console.

In [None]:
print("Vertex AI Experiments:")
print(
    f"https://console.cloud.google.com/ai/platform/experiments/experiments?folder=&organizationId=&project={PROJECT_ID}"
)

Let's test our last model by making a prediction on a new image

In [None]:
model_path = os.path.join("trained_model", "model.pth")

# torch.save does not create missing parent directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model_ft.state_dict(), model_path)

In [None]:
model_ft.eval()

In [None]:
model_checkpoint = "google/vit-base-patch16-224" # pre-trained model from which to fine-tune
batch_size = 32 # batch size for training and evaluation

In [None]:
import urllib.request

from transformers import pipeline

# The sample image must exist before it can be classified; fetch it here so this cell
# runs in a top-to-bottom execution of the notebook.
sunflower_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/592px-Red_sunflower.jpg"
sunflower_path = "592px-Red_sunflower.jpg"
if not os.path.exists(sunflower_path):
    urllib.request.urlretrieve(sunflower_url, sunflower_path)

clf = pipeline("image-classification", model=model_checkpoint)
clf(sunflower_path)

In [None]:
from datasets import load_dataset

In [None]:
ds = load_dataset("imagefolder", data_dir=f"/home/jupyter/gcs/{BUCKET_NAME}/sample", drop_labels=False)

In [None]:
ds['train'][0]

In [None]:
from datasets import load_metric

# Accuracy metric object; compute_metrics (defined in a later cell) calls metric.compute(...).
# NOTE(review): load_metric is reported as deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version.
metric = load_metric("accuracy")

In [None]:
ds["train"].features

In [None]:
# Build name -> index and index -> name lookups from the dataset's label feature.
labels = ds["train"].features["label"].names
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

id2label[2]

In [None]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor

In [None]:
feature_extractor.size.values()

In [None]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# Normalize with the mean/std stored on the checkpoint's feature extractor.
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
# Training pipeline: random crop + random horizontal flip, then tensor conversion and normalization.
# The target size is the tuple of values of feature_extractor.size
# (presumably height and width, in that order — confirm).
train_transforms = Compose(
        [
            RandomResizedCrop(tuple(feature_extractor.size.values())),
            RandomHorizontalFlip(),
            ToTensor(),
            normalize,
        ]
    )

# Validation pipeline: resize + center crop to the same size, with no random steps.
val_transforms = Compose(
        [
            Resize(tuple(feature_extractor.size.values())),
            CenterCrop(tuple(feature_extractor.size.values())),
            ToTensor(),
            normalize,
        ]
    )

def preprocess_train(example_batch):
    """Add a "pixel_values" entry holding train_transforms applied to each RGB-converted image."""
    transformed = []
    for image in example_batch["image"]:
        transformed.append(train_transforms(image.convert("RGB")))
    example_batch["pixel_values"] = transformed
    return example_batch

def preprocess_val(example_batch):
    """Add a "pixel_values" entry holding val_transforms applied to each RGB-converted image."""
    transformed = []
    for image in example_batch["image"]:
        transformed.append(val_transforms(image.convert("RGB")))
    example_batch["pixel_values"] = transformed
    return example_batch

In [None]:
# split up training into training + validation
# (10% is held out; train_test_split returns the held-out part under the 'test' key)
# NOTE(review): train_ds / val_ds reuse the names of the torchvision splits created earlier
# in this notebook and replace them from here on.
splits = ds["train"].train_test_split(test_size=0.1)
train_ds = splits['train']
val_ds = splits['test']

In [None]:
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)

In [None]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint, 
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

In [None]:
# Short checkpoint name (text after the last "/"), used only to name the output directory.
# Kept in its own variable so the notebook-level `model_name` (the timm architecture name) is not overwritten.
checkpoint_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{checkpoint_name}-finetuned",
    # keep the "image" column: the dataset transforms read it to build pixel_values
    remove_unused_columns=False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)

In [None]:
import numpy as np

# the compute_metrics function takes a Named Tuple as input:
# predictions, which are the logits of the model as Numpy arrays,
# and label_ids, which are the ground-truth labels as Numpy arrays.
def compute_metrics(eval_pred):
    """Return the accuracy metric for the arg-max class of each row of eval_pred.predictions."""
    logits = eval_pred.predictions
    predicted_ids = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_ids, references=eval_pred.label_ids)

In [None]:
import torch

def collate_fn(examples):
    """Stack the per-example "pixel_values" tensors and collect the "label" values into one batch dict."""
    pixel_tensors = []
    targets = []
    for example in examples:
        pixel_tensors.append(example["pixel_values"])
        targets.append(example["label"])
    return {
        "pixel_values": torch.stack(pixel_tensors),
        "labels": torch.tensor(targets),
    }

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

In [None]:
train_results = trainer.train()
# rest is optional but nice to have
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()

In [None]:
import urllib.request


sunflower_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/592px-Red_sunflower.jpg"
sunflower_path = "592px-Red_sunflower.jpg"
urllib.request.urlretrieve(sunflower_url, sunflower_path)

# Force three channels: the Normalize step below is given three means/stds, so a
# grayscale, palette or RGBA file would otherwise fail at that step.
img = PIL.Image.open(sunflower_path).convert("RGB")

# Inference-time preprocessing: deterministic resize only, no random augmentation.
data_transforms = transforms.Compose([
    transforms.Resize((input_size, input_size)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

img = data_transforms(img)


# (channels, height, width) -> (height, width, channels) for display
plt.imshow(np.transpose(img.squeeze(), (1, 2, 0)))

In [None]:
# Load the saved weights onto the device in use (also works when they were saved from a GPU).
checkpoint = torch.load("trained_model/model.pth", map_location=device)
# Rebuild the same architecture that was trained and saved in this notebook. A state dict can
# only be loaded into a model with matching parameter names and shapes, so the classifier
# width is taken from the checkpoint itself (0 = saved without a classifier layer).
head_weight = checkpoint.get("head.fc.weight")
restored_classes = head_weight.shape[0] if head_weight is not None else 0
model_ft = timm.create_model("maxxvit_rmlp_small_rw_256", num_classes=restored_classes)
model_ft.load_state_dict(checkpoint)
model_ft.to(device)
model_ft.eval()

In [None]:
# Inference only: no gradients are needed.
with torch.no_grad():
    # add a leading batch dimension of size 1 before the forward pass
    predictions = model_ft(torch.unsqueeze(img, 0).to(device))
_, index = torch.max(predictions, 1)
# predictions[0] is the 1-D output vector for the single image; name the softmax axis
# explicitly instead of relying on the deprecated implicit choice.
percentage = torch.nn.functional.softmax(predictions[0], dim=0) * 100

# print(
#     "This image most likely belongs to {} with a {:.2f} percent confidence."
#     .format(class_names[index[0]], percentage[index[0]].item())
# )
predictions