## Automated Snow Leopard Detection with Microsoft ML for Apache Spark

<img src="https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/SLTrust.PNG" width="900" style="float: right;"/>

In [2]:
import os
# Bing Image Search subscription key, read from the environment so it is not
# stored in the notebook. "BING_IMAGE_SEARCH_KEY" is the expected variable;
# the misspelled "BIN_IMAGE_SEARCH_KEY" is still accepted as a fallback for
# environments that were configured with that name.
BING_IMAGE_SEARCH_KEY = os.environ.get("BING_IMAGE_SEARCH_KEY") or os.environ.get("BIN_IMAGE_SEARCH_KEY")
if not BING_IMAGE_SEARCH_KEY:
  # Same exception type a plain os.environ[...] lookup raises when the key is missing.
  raise KeyError("BING_IMAGE_SEARCH_KEY") #please add your key here

In [3]:
from mmlspark import *
from mmlspark import FluentAPI
import os
from pyspark.sql.functions import lit

def bingPhotoSearch(name, queries, pages, count=10, limit=200):
  """Search Bing for photos and return their URLs tagged with a label.

  One search request is described per (query, page) pair and run through
  ``BingImageSearch``; the result is passed through the transformer returned
  by ``BingImageSearch.getUrlTransformer("images", "urls")``.

  Args:
    name: Value written to the ``labels`` column of every returned row.
    queries: Sequence of search strings (it is iterated once per page, so it
      must be re-iterable, e.g. a list).
    pages: Number of result pages to request for each query.
    count: Number of images requested per page; also used as the stride
      between page offsets. Defaults to 10.
    limit: Maximum number of rows kept in the returned DataFrame.
      Defaults to 200.

  Returns:
    The transformed Spark DataFrame with an added ``labels`` column,
    truncated to at most ``limit`` rows.
  """
  # Page p starts at result index p * count, so the stride must match the
  # page size passed to setCount below.
  offsets = [page * count for page in range(pages)]
  parameters = [(query, offset) for offset in offsets for query in queries]

  search = (
    BingImageSearch()                             # Apply Bing Image Search
      .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)  # Set the API Key
      .setOffsetCol("offsets")                    # Specify a column containing the offsets
      .setQueryCol("queries")                     # Specify a column containing the query words
      .setCount(count)                            # Specify the number of images to return per offset
      .setImageType("photo")                      # Specify a filter to ensure we get photos
      .setOutputCol("images")
  )

  return (
    spark.createDataFrame(parameters, ("queries", "offsets"))
      .mlTransform(search)
      .mlTransform(BingImageSearch.getUrlTransformer("images", "urls"))
      .withColumn("labels", lit(name))
      .limit(limit)
  )


<img src="https://mmlspark.blob.core.windows.net/graphics/SparkSummit2/cog_services.png" width="800" style="float: right;"/>

In [5]:
def displayDF(df, n=5, image_cols=frozenset(["urls"])):
  """Render the first ``n`` rows of ``df`` as an HTML table.

  Args:
    df: Object exposing ``take(n)`` (returning rows indexable by column
      name) and ``columns`` (the column names), e.g. a Spark DataFrame.
    n: Number of rows to show. Defaults to 5.
    image_cols: Names of columns whose values are rendered as ``<img>``
      tags (the value is used as the image source) instead of as text.
      The default is an immutable set so it cannot be altered between calls.

  Returns:
    None. The table is handed to ``displayHTML``, which is not defined in
    this file (presumably a notebook-provided helper); when that name is
    unavailable the call is skipped silently.
  """
  import html      # local imports keep this cell self-contained
  import warnings

  rows = df.take(n)
  cols = df.columns
  header = "".join("<th>" + html.escape(str(c)) + "</th>" for c in cols)
  
  style = """
<!DOCTYPE html>
<html>
<head>
<style>
table {
    font-family: arial, sans-serif;
    border-collapse: collapse;
    width: 300;
}

td, th {
    border: 1px solid #dddddd;
    text-align: left;
    padding: 8px;
}

tr:nth-child(even) {
    background-color: #dddddd;
}
</style>
</head>"""
  
  table = []
  for row in rows:
    table.append("<tr>")
    for col in cols:
      # Cell values originate from external data (e.g. search results), so
      # escape them before placing them in markup or in an attribute value.
      value = html.escape(str(row[col]), quote=True)
      if col in image_cols:
        rep = '<img src="{}" width="100">'.format(value)
      else:
        rep = value
      table.append("<td>{}</td>".format(rep))
    table.append("</tr>")
  tableHTML = "".join(table)
  
  body = """
<body>
<table>
  <tr>
    {} 
  </tr>
  {}
</table>
</body>
</html>
  """.format(header, tableHTML)
  try:
    displayHTML(style + body)
  except NameError:
    # displayHTML does not exist in this environment: rendering is
    # best-effort, so there is nothing to do.
    pass
  except Exception as err:
    # Stay best-effort (do not abort the notebook) but report the failure
    # instead of hiding it.
    warnings.warn("displayDF could not render the table: {}".format(err))

In [6]:
# Collect snow leopard photo URLs (100 pages for a single query), labelled
# "snow leopard", and preview the first rows.
snowLeopardQueries = ["snow leopard"]
snowLeopardUrls = bingPhotoSearch(
  name="snow leopard",
  queries=snowLeopardQueries,
  pages=100,
)
displayDF(snowLeopardUrls)

In [7]:
# Read the parquet file of random words from blob storage, redistribute it
# over 20 partitions, and print a sample.
randomWordsPath = "wasb://publicwasb@mmlspark.blob.core.windows.net/random_words.parquet"
randomWords = spark.read.parquet(randomWordsPath).repartition(20)
randomWords.show()

In [8]:
# Search Bing with each random word to build the "other" class.
# The label column is named "labels" so that it matches the column produced
# by bingPhotoSearch; the two frames are later combined with a positional
# union, which would otherwise hide a naming mismatch.
randomSearch = (
  BingImageSearch()
    .setSubscriptionKey(BING_IMAGE_SEARCH_KEY)
    .setCount(10)
    .setQueryCol("words")
    .setOutputCol("images")
)

randomLinks = (
  randomWords
    .mlTransform(randomSearch)
    .mlTransform(BingImageSearch.getUrlTransformer("images", "urls"))
    .withColumn("labels", lit("other"))
    .limit(200)
)

displayDF(randomLinks)

In [9]:
# Stack both URL sets, download each URL into the "image" column, discard
# rows containing nulls, and reduce to 50 partitions.
imageDownloader = BingImageSearch.downloadFromUrls("urls", "image", concurrency=5, timeout=5000)
images = (
  snowLeopardUrls.union(randomLinks)
    .mlTransform(imageDownloader)
    .dropna()
    .coalesce(50)
)

# 70/30 train/test split with a fixed seed.
train, test = images.randomSplit([.7, .3], seed=1)

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression

def getIndex(row):
  """Return the element at position 1 of ``row`` converted to ``float``."""
  second = row[1]
  return float(second)

# `udf` and `DoubleType` are needed below; import them explicitly instead of
# depending on them being pulled in by a star import.
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# Download the ResNet50 network used by the featurizer stage.
network = ModelDownloader(spark, "dbfs:/Models/").downloadByName("ResNet50")

# Copies element 1 of the "probability" vector into "leopard_prob".
# NOTE(review): which label ends up at index 1 is decided by the
# StringIndexer stage - confirm that it is "snow leopard".
leopardProbability = UDFTransformer()\
    .setUDF(udf(getIndex, DoubleType()))\
    .setInputCol("probability")\
    .setOutputCol("leopard_prob")

# Pipeline: index the labels, featurize the images with the downloaded
# network, classify with logistic regression, then extract the probability.
model = Pipeline(stages=[
  StringIndexer(inputCol = "labels", outputCol="index"),
  ImageFeaturizer(inputCol="image", outputCol="features", cutOutputLayers=2).setModel(network),
  LogisticRegression(maxIter=20, labelCol="index"),
  leopardProbability
])

fitModel = model.fit(train)

<img src="https://mmlspark.blob.core.windows.net/graphics/SnowLeopardAD/SLPipeline.PNG" width="900" style="float: right;"/>

In [12]:
def plotConfusionMatrix(df, label, prediction, classLabels):
  """Draw a confusion matrix for ``df`` on a 4.5 x 4.5 figure and display it.

  The arguments are forwarded unchanged to ``mmlspark.plot.confusionMatrix``;
  the figure created here is then passed to ``display``.
  """
  import matplotlib.pyplot as plt
  from mmlspark.plot import confusionMatrix

  figure = plt.figure(figsize=(4.5, 4.5))
  confusionMatrix(df, label, prediction, classLabels)
  display(figure)

# Score the test split with the fitted pipeline and plot the indexed label
# against the prediction, using the label names held by the first stage.
scoredTest = fitModel.transform(test)
indexedLabels = fitModel.stages[0].labels
plotConfusionMatrix(scoredTest, "index", "prediction", indexedLabels)

In [13]:
# Explain the fitted pipeline's "leopard_prob" output for one test image.
test_subsample = test.limit(1).cache()

lime = (
  ImageLIME()
    .setModel(fitModel)          # the fitted pipeline (was the undefined name `fit_model`)
    .setLabelCol("leopard_prob")
    .setOutputCol("weights")
    .setInputCol("image")
    .setCellSize(100.0)
    .setModifier(50.0)
    .setNSamples(200)
)

result = lime.transform(test_subsample)

In [14]:
import matplotlib.pyplot as plt
import PIL
import io

def plot_superpixels(sp_list, bytesRow):
  """Highlight the given superpixels on an image and display the result.

  Args:
    sp_list: Iterable of pairs whose second element is an iterable of
      pixels. Each pixel is indexed as ``pixel[0]`` / ``pixel[1]``, used
      here as column / row of the image array (presumably x / y - confirm
      against the producer of the clusters).
    bytesRow: Encoded image bytes that PIL can open.

  Returns:
    None. The figure is passed to ``display``.
  """
  import numpy as np   # imported here so the function does not rely on a global `np`
  import PIL.Image     # make sure the Image submodule is actually loaded

  img = PIL.Image.open(io.BytesIO(bytesRow)).convert('RGBA')
  f = np.asarray(img).copy()  # work on a copy so the pixels can be overwritten
  fig, ax = plt.subplots()

  highlight = (0, 255, 0, 200)  # R, G, B, opacity
  for sp in sp_list:
    for pixel in sp[1]: #access the pixel info
      f[pixel[1], pixel[0]] = highlight

  # Draw once, after all superpixels are painted; this also shows the
  # unmodified image when sp_list is empty.
  ax.imshow(f)
  display(fig)

import numpy as np  # explicit import: `np` is used below

# Gets first row from the LIME-transformed data frame
topRow = result.take(1)[0]
urlRow = topRow['urls']
bytesRow = topRow['image']
superpixelsRow = topRow['superpixels']
weightsRow = topRow['weights']

# Pair every weight with its superpixel cluster. Sort on the weight only so
# that the cluster payloads are never compared when two weights are equal.
z = sorted(zip(weightsRow, superpixelsRow['clusters']), key=lambda pair: pair[0])

# Keep the superpixels whose weight lies within (max - mean) / hideAmount of
# the maximum weight.
maxWeight = np.max(weightsRow)
meanMaxDifference = maxWeight - np.mean(weightsRow)
hideAmount = 3
threshold = maxWeight - (meanMaxDifference / hideAmount)
plot_superpixels([i for i in z if i[0] >= threshold], bytesRow)