## Overview
## Outline
1. Import Library
2. Setup pre-requisites
3. Extract dataset to images
4. Upload images to Azure data store
4. Setting up Azure ML Infrastructure

### 1. Import library

In [5]:
## Install needed packages
%pip install opencv-python torch torchvision torchaudio azureml-core

Collecting azureml-core
  Obtaining dependency information for azureml-core from https://files.pythonhosted.org/packages/e5/da/ed67a11a1baa8dff268ce71965fb936cd39e244ddd82134c3325f89fa7a8/azureml_core-1.53.0-py3-none-any.whl.metadata
  Using cached azureml_core-1.53.0-py3-none-any.whl.metadata (3.2 kB)
Collecting backports.tempfile (from azureml-core)
  Using cached backports.tempfile-1.0-py2.py3-none-any.whl (4.4 kB)
Collecting pathspec<1.0.0 (from azureml-core)
  Obtaining dependency information for pathspec<1.0.0 from https://files.pythonhosted.org/packages/b4/2a/9b1be29146139ef459188f5e420a66e835dda921208db600b7037093891f/pathspec-0.11.2-py3-none-any.whl.metadata
  Using cached pathspec-0.11.2-py3-none-any.whl.metadata (19 kB)
Collecting msal<2.0.0,>=1.15.0 (from azureml-core)
  Obtaining dependency information for msal<2.0.0,>=1.15.0 from https://files.pythonhosted.org/packages/35/33/0fd933b627879a9855d02a83a57929b45d0bdbeb050ddd63109cc404fbf6/msal-1.24.1-py2.py3-none-any.whl.meta

In [1]:
import os
import cv2
from azureml.core import Workspace, Dataset, Datastore

## Using torchvision to create a dataset
from torchvision.datasets import ImageFolder
from torchvision import transforms
import torchvision
import torch


### 2. Setup pre-requisites

In [4]:
## Setup Azure ML Workspace
ws = Workspace.from_config()
## From workspace, get/create the default datastore
ds = ws.get_default_datastore()
ws, ds

(Workspace.create(name='ws-vunn-iusai-sea-sp6k7', subscription_id='d554f489-6933-4c33-8722-a536b3682bd7', resource_group='rg-vunn-iusai-sea-sp6k7'),
 {
   "name": "workspaceblobstore",
   "container_name": "azureml-blobstore-5fddeec9-0fbc-4996-b95f-26996d3f3bcd",
   "account_name": "wsvunniusaisea7140421596",
   "protocol": "https",
   "endpoint": "core.windows.net"
 })

### 3. Extract dataset to images

In [5]:
## define videos location + images output location
video_path = 'videos/'
images_path = 'images/'
print('video_path: ', video_path)
print('images_path: ', images_path)

video_path:  videos/
images_path:  images/


In [4]:
from v2i import extract_images_from_videos
def extract_images_from_videos_collection(video_path, images_path):
    ## get all videos file in video_path
    video_list_location_collection = os.listdir(video_path)

    ## for each video file
    for video_list_location in video_list_location_collection:
        ## check if video_location is not a directory (i.e. is a file), then skip
        if not os.path.isdir(video_path + video_list_location):
            continue
        ## list videos in video_location
        extract_images_from_videos(video_path + video_list_location, images_path)

    ## check images in images_path
    images_list_location_collection = os.listdir(images_path)
    print('images_list_location_collection: ', images_list_location_collection)

video_subject_6_path = video_path + '/subject6/'
## list videos in video_location
label_dict = extract_images_from_videos(video_subject_6_path, images_path)


Image prefix: subject6
Label dict:[['0', 0], ['2', 0], ['1', 0], ['5', 0], ['6', 0], ['3', 0]]
Label dict:[['0', 21650], ['2', 100], ['1', 2300], ['5', 225], ['6', 175], ['3', 3000]]


In [8]:
## show label_dict
print('label_dict: ', label_dict)
## show label_dict keys
label_keys = []
for label in label_dict:
    label_keys.append(label[0])
label_keys.sort()
print('label_dict.keys(): ', label_keys)
## check images in images_path
images_list_location_collection = os.listdir(images_path)
print('images_list_location_collection: ', images_list_location_collection)

NameError: name 'label_dict' is not defined

### 4. Upload images to Azure data store

In [6]:
## upload images to data asset
ds.upload(src_dir=images_path, target_path='images-small', overwrite=True, show_progress=True)

"Datastore.upload" is deprecated after version 1.0.69. Please use "Dataset.File.upload_directory" to upload your files             from a local directory and create FileDataset in single method call. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 27450 files
Uploading images\0\subject6_0.jpg
Uploaded images\0\subject6_0.jpg, 1 files out of an estimated total of 27450
Uploading images\0\subject6_1.jpg
Uploaded images\0\subject6_1.jpg, 2 files out of an estimated total of 27450
Uploading images\0\subject6_10.jpg
Uploaded images\0\subject6_10.jpg, 3 files out of an estimated total of 27450
Uploading images\0\subject6_1000.jpg
Uploaded images\0\subject6_1000.jpg, 4 files out of an estimated total of 27450
Uploading images\0\subject6_10000.jpg
Uploaded images\0\subject6_10000.jpg, 5 files out of an estimated total of 27450
Uploading images\0\subject6_10001.jpg
Uploaded images\0\subject6_10001.jpg, 6 files out of an estimated total of 27450
Uploading images\0\subject6_10002.jpg
Uploaded images\0\subject6_10002.jpg, 7 files out of an estimated total of 27450
Uploading images\0\subject6_10003.jpg
Uploaded images\0\subject6_10003.jpg, 8 files out of an estimated total of 27450
Uploading images\0\subject6_10004.

$AZUREML_DATAREFERENCE_7aca042e1418437a8ccdcd8d930d2b31

In [14]:
## create dataset from datastore
images_ds = Dataset.File.from_files(path=(ds, 'images'))
## register dataset
images_ds = images_ds.register(workspace=ws, name='images_ds', description='images dataset')

### 5. Setup public workspace endpoint

## Define and Train models
1. ResNET18

### Setup device + load dataset

In [3]:
## detect if machine has GPU available
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
# download data asset to local if needed
images_path = 'images'
images_ds_path = "images-small"
if not os.path.exists(images_path):
    os.mkdir(images_path)
# download data asset to local if images_path is empty
if len(os.listdir(images_path)) == 0:
    print('images_path is empty, download images_ds to images_path')
    images_ds = Dataset.get_by_name(ws, 'images-small')
    images_ds.download(target_path=images_path,
                       overwrite=True, show_progress=True)

### 1. Resnes18

Using `torchvision` to load `CIFAR10`

In [4]:
from importlib import reload
from torch.utils.data import DataLoader
import data_set as ds
reload(ds)

transform = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor()])
testDs = ds.ImageDataset('images', transform=transform)
## define batch_size
batch_size = 64

dataloader = DataLoader(testDs, batch_size=batch_size, shuffle=True, num_workers=0)

In [5]:
image, label = testDs.get_image(0)
# image.show()
labels = testDs.labels()
## show labels in Interger
print('labels: ',  [int(l) for l in labels])

labels:  [0, 1, 2, 3, 5, 6]


In [6]:
## split dataset into train and test dataset using random_split
from torch.utils.data import random_split
train_size = int(0.8 * len(testDs))
test_size = len(testDs) - train_size
train_ds, val_ds = random_split(testDs, [train_size, test_size])
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
test_dataloader = DataLoader(val_ds, batch_size=batch_size, shuffle=True, num_workers=0)
print('train_ds: ', len(train_ds))
print('val_ds: ', len(val_ds))


train_ds:  21960
val_ds:  5490


In [7]:
## get cnn model of image classification from torchvision
model = torchvision.models.resnet18(pretrained=True)

## define optimizer using Adam and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer, loss_fn



(Adam (
 Parameter Group 0
     amsgrad: False
     betas: (0.9, 0.999)
     capturable: False
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.001
     maximize: False
     weight_decay: 0
 ),
 CrossEntropyLoss())

In [8]:
import train as t
reload(t)
from train import train

## train model
train(model, optimizer, loss_fn, train_dataloader, test_dataloader, epochs=1, device=device)

Epoch 1/1, Train Loss: 0.1780, Test Loss: 0.3703


In [9]:
## test model using device (CPU or GPU) 1 epoch
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        images, labels = batch
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        print('predicted: ', predicted)
        print('labels: ', labels)
        break

predicted:  tensor([0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
        0, 0, 0, 3, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0], device='cuda:0')
labels:  tensor([5, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0,
        0, 0, 0, 3, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 1], device='cuda:0')


In [13]:
## train model using device (CPU or GPU) 10 epoch
train(model, optimizer, loss_fn, train_dataloader, test_dataloader, epochs=10, device=device)

KeyboardInterrupt: 

In [None]:
## test model using device (CPU or GPU) 10 epoch
model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        images, labels = batch
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        print('predicted: ', predicted)
        print('labels: ', labels)
        break

In [None]:
## save model
torch.save(model.state_dict(), 'model_resnes18.pth')