### Libraries

In [2]:
import os
import time
import glob
import numpy as np
import pandas as pd

### Input Parameters

In [4]:
# Mount the "officehours" blob container on DBFS unless the mount point already exists.
if not os.path.exists("/dbfs/mnt/officehours"):
  # The storage account key is read from the Databricks secret scope, not from the notebook.
  storage_conf = {
    "fs.azure.account.key.adlscovid19.blob.core.windows.net": dbutils.secrets.get("covid19", "storage_key")
  }
  dbutils.fs.mount(
    source="wasbs://officehours@adlscovid19.blob.core.windows.net/",
    mount_point="/mnt/officehours",
    extra_configs=storage_conf,
  )

In [5]:
# Root folders (through the DBFS mount) of the training and test images.
# Later cells look inside them for one sub-folder per tag name holding *.png files.
_TRAIN = '/dbfs/mnt/officehours/train/'
_TEST = '/dbfs/mnt/officehours/test/'

In [6]:
# Name of the Custom Vision project to reuse, or to create when it does not exist yet
CUSTOM_VISION_PROJECT_NAME = "KAREN-DEMO"
# Endpoint handed to both the training client and the prediction client
ENDPOINT = "https://covid-cv.cognitiveservices.azure.com/"
# Keys are read from the "covid19" secret scope instead of being hardcoded here
training_key = dbutils.secrets.get("covid19", "cv_training_key")
prediction_key = dbutils.secrets.get("covid19", "cv_prediction_key")
# Resource ID passed to `publish_iteration` when a trained iteration is published
prediction_resource_id = "/subscriptions/5cf319e7-8fcb-4a24-bd7e-251785cc9e4f/resourceGroups/rg-covid19/providers/Microsoft.CognitiveServices/accounts/covidcv-Prediction"

### Step 1: Loading the Custom Vision Client
To install the Custom Vision service SDK for Python, you just need to install it from PyPI:
- `pip install azure-cognitiveservices-vision-customvision`

In [8]:
from azure.cognitiveservices.vision.customvision.training import CustomVisionTrainingClient

# Client used for every training-side call (projects, tags, uploads, iterations).
trainer = CustomVisionTrainingClient(training_key, endpoint=ENDPOINT)

# Index the projects visible to this key by name so one can be looked up later.
PROJECTS_CUSTOM_VISION = dict((proj.name, proj.id) for proj in trainer.get_projects())

Try to find a project named `CUSTOM_VISION_PROJECT_NAME` in the Custom Vision environment; if it is not found, a new project with this name is created.

In [10]:
# Reuse the project when its name is already listed in PROJECTS_CUSTOM_VISION; otherwise
# create it. An explicit lookup replaces the former bare `except:`, which also caught
# unrelated failures (authentication, network, ...) and fell through to creating a project.
existing_project_id = PROJECTS_CUSTOM_VISION.get(CUSTOM_VISION_PROJECT_NAME)
if existing_project_id is not None:
  project = trainer.get_project(existing_project_id)
  print("Importing the project: "+CUSTOM_VISION_PROJECT_NAME)
else:
  print("Creating a new project named: "+CUSTOM_VISION_PROJECT_NAME)
  project = trainer.create_project(CUSTOM_VISION_PROJECT_NAME, classification_type="Multiclass")
  # Create two tags in the new project (for healthy and unhealthy images)
  unhealthy_tag = trainer.create_tag(project.id, "unhealthy")
  healthy_tag = trainer.create_tag(project.id, "healthy")


In [11]:
# Name -> id lookup for every tag currently defined in the project.
DICT_TAGS = dict((tag.name, tag.id) for tag in trainer.get_tags(project.id))

### Step 2: Defining our Training Dataset
Creating a DataFrame with the `filepath` of each image from the training dataset

In [13]:
# One record per training image; the tag name is both the sub-folder name and the label.
# The frame is built once from a list of records instead of calling `DataFrame.append`
# in a loop: `append` was removed in pandas 2.0 and re-copied the whole frame on every
# iteration. Rows now also get a unique 0..N-1 index rather than one restarting per tag.
train_records = [
  {"filepath": path, "name": os.path.basename(path).split('.')[0], "label": tag}
  for tag in DICT_TAGS
  for path in glob.glob(os.path.join(_TRAIN, tag, '*.png'))
]
# Explicit columns keep the schema stable even when no image is found.
model_df = pd.DataFrame(train_records, columns=["filepath", "name", "label"])

In [14]:
# Preview the first three rows to check that filepath, name and label were filled in
model_df.head(3)

Unnamed: 0,filepath,name,label
0,/dbfs/mnt/officehours/train/healthy/train-neg-...,train-neg-0003,healthy
1,/dbfs/mnt/officehours/train/healthy/train-neg-...,train-neg-0010,healthy
2,/dbfs/mnt/officehours/train/healthy/train-neg-...,train-neg-0011,healthy


In [15]:
# Number of training images available for each label
model_df.groupby("label")["name"].count()

### Step 3: Sending image files to Custom Vision
Create the Custom Vision image entries:

In [17]:
from azure.cognitiveservices.vision.customvision.training.models import ImageFileCreateEntry

def custom_vision_entry(x):
  """Build the Custom Vision upload entry for one row of the image DataFrame.

  Args:
    x: row (pandas Series) holding the `filepath`, `name` and `label` columns.

  Returns:
    ImageFileCreateEntry carrying the image bytes, named after the `name` column
    and tagged with the id that `DICT_TAGS` maps the row's label to.

  Raises:
    OSError: if the image file cannot be opened.
    KeyError: if the row's label has no entry in `DICT_TAGS`.
  """
  # Bracket access is required: on a row Series, `x.name` is the Series' own name
  # (the row's index label), not the value stored in the "name" column.
  with open(x["filepath"], "rb") as image_contents:
    image_bytes = image_contents.read()
  return ImageFileCreateEntry(name=x["name"], contents=image_bytes, tag_ids=[DICT_TAGS[x["label"]]])

In [18]:
# Turn every training row into an upload entry (this reads each image into memory).
# Iterating the rows explicitly, instead of `model_df.apply(...).to_list()`, also works
# when `model_df` has no rows: `apply` then returns a DataFrame, which has no `to_list`.
custom_vision_image_list = [custom_vision_entry(row) for _, row in model_df.iterrows()]
len(custom_vision_image_list)

Split the images into smaller chunks of 64 entries, since the Custom Vision batch upload feature has a limit of 64 files at a time:

In [20]:
# Slice the entry list into consecutive groups of at most 64 items.
batch_starts = range(0, len(custom_vision_image_list), 64)
chunks = [custom_vision_image_list[start:start + 64] for start in batch_starts]

In [21]:
# Upload the chunks one by one; for a failed batch, list every image whose status is not 'OK'.
for batch in chunks:
  upload_result = trainer.create_images_from_files(project.id, images=batch)
  if upload_result.is_batch_successful:
    continue
  print("Image batch upload failed.")
  rejected = (image for image in upload_result.images if image.status != 'OK')
  for image in rejected:
    print(image.source_url, "Image status: ", image.status)

### Step 4: Model Training

In [23]:
# Start a training run, then poll every 10 seconds until it completes.
iteration = trainer.train_project(project.id)
print("Iteration ID:",iteration.id)
print ("Training status: " + iteration.status)
while iteration.status != "Completed":
  # A failed run never reaches "Completed"; stop instead of polling forever.
  if iteration.status == "Failed":
    raise RuntimeError("Custom Vision training failed for iteration " + str(iteration.id))
  time.sleep(10)
  iteration = trainer.get_iteration(project.id, iteration.id)
  print ("Training status: " + iteration.status)

### Step 5: Model Publishing

In [25]:
# Training has finished: publish this iteration on the prediction resource as "officehours".
trainer.publish_iteration(
  project.id,
  iteration.id,
  "officehours",
  prediction_resource_id,
)
print ("Done!")

### Step 6: Defining our Test Dataset
Creating a DataFrame with the `filepath` of each image from the test dataset

In [27]:
# One record per test image; the tag name is both the sub-folder name and the label.
# The frame is built once from a list of records instead of calling `DataFrame.append`
# in a loop: `append` was removed in pandas 2.0 and re-copied the whole frame on every
# iteration. Rows now also get a unique 0..N-1 index rather than one restarting per tag.
test_records = [
  {"filepath": path, "name": os.path.basename(path).split('.')[0], "label": tag}
  for tag in DICT_TAGS
  for path in glob.glob(os.path.join(_TEST, tag, '*.png'))
]
# Explicit columns keep the schema stable even when no image is found.
test_df = pd.DataFrame(test_records, columns=["filepath", "name", "label"])

In [28]:
# Preview the first three rows to check that filepath, name and label were filled in
test_df.head(3)

Unnamed: 0,filepath,name,label
0,/dbfs/mnt/officehours/test/unhealthy/train-pos...,train-pos-0019,unhealthy
1,/dbfs/mnt/officehours/test/unhealthy/train-pos...,train-pos-0022,unhealthy
2,/dbfs/mnt/officehours/test/unhealthy/train-pos...,train-pos-0024,unhealthy


In [29]:
# Number of test images available for each label
test_df.groupby("label")["name"].count()

### Step 7: Calculating performance metrics from `test_df`
Test the published endpoint:

In [31]:
from azure.cognitiveservices.vision.customvision.prediction import CustomVisionPredictionClient

# Now there is a trained endpoint that can be used to make a prediction
predictor = CustomVisionPredictionClient(prediction_key, endpoint=ENDPOINT)
# Spot-check the first five test images. `itertuples` reads each row once, instead of
# converting whole columns to lists on every iteration.
for row in test_df.head(5).itertuples(index=False):
  with open(row.filepath, "rb") as image_contents:
    results = predictor.classify_image(project.id, "officehours", image_contents.read())
  # Display the results.
  print("Image: "+row.name)
  print("True label: "+row.label)
  for prediction in results.predictions:
    print("  Score " + prediction.tag_name + ": {0:.2f}%".format(prediction.probability * 100))
  print("")


Create a Python function to iterate over the whole `test_df`, getting the predictions from the published model:

In [33]:
def get_prediction(x, publish_name='officehours'):
  """Classify one test image with a published iteration and return its top prediction.

  Args:
    x: row (pandas Series) exposing the image path as `filepath`.
    publish_name: name under which the iteration to query was published.

  Returns:
    Tuple `(tag_name, probability)` of the prediction with the highest probability.
  """
  with open(x.filepath, "rb") as image_contents:
    results = predictor.classify_image(project.id, publish_name, image_contents.read())
  # Select the best prediction explicitly rather than relying on the order of the response.
  prediction = max(results.predictions, key=lambda p: p.probability)
  return (prediction.tag_name, prediction.probability)

Get the predictions for all the images on `test_df`:

In [35]:
# Score every test image; each element of `aux` is a (tag_name, probability) pair.
aux = test_df.apply(get_prediction, axis=1)
scored_pairs = aux.to_list()
test_df["pred"] = [tag for tag, _ in scored_pairs]
test_df["prob"] = [probability for _, probability in scored_pairs]

Finally, we can calculate some model performance metrics:

In [37]:
from sklearn import metrics

# Metrics of the first published model; 'unhealthy' is the positive class for
# recall, precision and F1.
y_true, y_pred = test_df.label, test_df.pred
print("Accuracy:", "{:.2%}".format(metrics.accuracy_score(y_true, y_pred)))
for metric_title, metric_fn in (
  ("Recall:", metrics.recall_score),
  ("Precision:", metrics.precision_score),
  ("F1 Score:", metrics.f1_score),
):
  print(metric_title, "{:.2%}".format(metric_fn(y_true, y_pred, pos_label='unhealthy')))

### Step 8: Advanced training (OPTIONAL)

In [39]:
# Start an advanced training run with a one-hour budget, then poll every two minutes.
iteration = trainer.train_project(project.id, training_type = 'Advanced', reserved_budget_in_hours = 1)
print("Iteration ID:",iteration.id)
print ("Training status: " + iteration.status)
while iteration.status != "Completed":
  # A failed run never reaches "Completed"; stop instead of polling forever.
  if iteration.status == "Failed":
    raise RuntimeError("Custom Vision advanced training failed for iteration " + str(iteration.id))
  time.sleep(120)
  iteration = trainer.get_iteration(project.id, iteration.id)
  print ("Training status: " + iteration.status)

Publishing the model from the advanced training iteration with a different `publish_name`:

In [41]:
# Advanced training has finished: publish this iteration as "officehoursadv".
trainer.publish_iteration(
  project.id,
  iteration.id,
  "officehoursadv",
  prediction_resource_id,
)
print ("Done!")

Testing the new model on `test_df`:

In [43]:
# Score every test image with the "officehoursadv" model; `apply` forwards the extra
# keyword argument to `get_prediction`.
aux = test_df.apply(get_prediction, axis=1, publish_name="officehoursadv")
scored_pairs_adv = aux.to_list()
test_df["pred_adv"] = [tag for tag, _ in scored_pairs_adv]
test_df["prob_adv"] = [probability for _, probability in scored_pairs_adv]

Updated metrics:

In [45]:
from sklearn import metrics

# Metrics of the advanced-training model; 'unhealthy' is the positive class for
# recall, precision and F1.
y_true, y_pred_adv = test_df.label, test_df.pred_adv
print("Accuracy:", "{:.2%}".format(metrics.accuracy_score(y_true, y_pred_adv)))
for metric_title, metric_fn in (
  ("Recall:", metrics.recall_score),
  ("Precision:", metrics.precision_score),
  ("F1 Score:", metrics.f1_score),
):
  print(metric_title, "{:.2%}".format(metric_fn(y_true, y_pred_adv, pos_label='unhealthy')))

### Step 9: Deleting the Custom Vision project (OPTIONAL)

In [47]:
#for iteration in trainer.get_iterations(project.id):
#  trainer.unpublish_iteration(project.id, iteration.id)
#trainer.delete_project(project.id)

In [48]:
#dbutils.fs.unmount(mount_point = "/mnt/officehours")