<a href="https://colab.research.google.com/github/ml2-picme/PicMe/blob/master/Image%20Download%20and%20Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Helper Functions

In [31]:
# contact: tai.truong@software-developer.org

import nltk
# regular expression
import re

# Fetch the NLTK data packages used by this cell (the call only reports
# "already up-to-date" when a package is present).
nltk.download('stopwords')
# NOTE(review): 'porter_test' looks like test data for the Porter stemmer rather than
# something nltk.PorterStemmer() needs at runtime -- confirm whether this download is required.
nltk.download('porter_test')
# English stop words such as 'the', 'is', 'are', 'over'; normalizeWords() drops these.
stopWords = nltk.corpus.stopwords.words('english')

# returns a list of filtered words
def normalizeWords(input):
  """Turn arbitrary text into a list of lowercase ASCII words without stop words.

  Adapted from
  https://www.kdnuggets.com/2018/03/simple-text-classifier-google-colaboratory.html

  Args:
    input: Value to normalize; it is converted with str() first.

  Returns:
    list of str: The remaining words in their original order. Words contained
    in the module-level stopWords list are left out.
  """
  # Everything that is not an ASCII letter becomes a blank: 'hello#!world' => 'hello  world'.
  text = re.sub('[^a-zA-Z]', ' ', str(input))
  # Kept from the original pipeline: blank out characters that are neither
  # word characters nor whitespace.
  text = re.sub(r'[^\w\d\s]', ' ', text)
  # Collapse every run of whitespace into a single blank.
  text = re.sub(r'\s+', ' ', text)
  # Lowercase first, then trim leading and trailing whitespace.
  text = re.sub(r'^\s+|\s+?$', '', text.lower())

  return [token for token in text.split() if token not in stopWords]

def stem(list):
  """Reduce every word to its Porter stem.

  Args:
    list: Iterable of words.

  Returns:
    A new list holding one stem per input word, in input order.
  """
  porter = nltk.PorterStemmer()
  return [porter.stem(token) for token in list]

def match(inputText, searchTermList):
  """Return the words of inputText whose stem equals the stem of a search term.

  Args:
    inputText: Text to search; it is normalized with normalizeWords().
    searchTermList: Iterable of search terms; they are stemmed with stem().

  Returns:
    list of str: The matching words in their normalized (unstemmed) form, in
    text order. A word that occurs several times is reported once per
    occurrence, each time in the form it had at that position.
  """
  # Use the parameter and the helper defined above; the previous body called an
  # undefined filterWords() on `input`, which only worked with leftover kernel state.
  normalized = normalizeWords(inputText)
  stemmed = stem(normalized)
  wantedStems = set(stem(searchTermList))
  # normalized and stemmed are index-aligned (stem() yields one stem per word), so
  # pairing them avoids list.index(), which always pointed at the first duplicate.
  return [word for word, wordStem in zip(normalized, stemmed) if wordStem in wantedStems]

# test code
# The sample lives in its own name so that the builtin input() is not shadowed
# for the rest of the notebook.
sampleText = 'the quick brown foxes 123 jumped ❤☀ over äääßßß the lAzy Dog!'
print('input:', sampleText)
# normalizeWords() is the helper defined in this cell (filterWords does not exist).
normalized = normalizeWords(sampleText)
print('normalized: ', normalized)
stemmed = stem(normalized)
print('stemmed: ', stemmed)
searchTermList = ['fox', 'dogs', 'cat']
stemmedSearchTermList = stem(searchTermList)
print('stemmed search terms: ', stemmedSearchTermList)
print('match: ', match(sampleText, searchTermList))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package porter_test to /root/nltk_data...
[nltk_data]   Package porter_test is already up-to-date!
input: the quick brown foxes 123 jumped ❤☀ over äääßßß the lAzy Dog!
normalized:  ['quick', 'brown', 'foxes', 'jumped', 'lazy', 'dog']
stemmed:  ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']
stemmed search terms:  ['fox', 'dog', 'cat']
match:  ['foxes', 'dog']


In [0]:
import sys

!rm -r PicMe
!git clone https://github.com/ml2-picme/PicMe.git

sys.path.append("/content/PicMe")

from imagenet_processing_script import *

In [0]:
# Build both lookup tables once; expandResultsByImageNetTreeSearch() passes them to getWords().
# Presumably they map ImageNet synset ids to their child / parent synsets -- TODO confirm
# against imagenet_processing_script, which is not part of this notebook.
parentToChildrenDictionary = getParentToChildrenDictionary()
childToParentsDictionary = getChildToParentsDictionary()

In [0]:
def expandResultsByImageNetTreeSearch(synsetId, localPath, modelName, predicationProbability):
  """Store one additional result row per word that getWords() returns for a synset.

  Every word is inserted into the `results` table with the same local path,
  model name and probability as the prediction it was derived from.

  Args:
    synsetId: Synset id handed to getWords().
    localPath: Value for the local_path column.
    modelName: Value for the model column.
    predicationProbability: Probability of the original prediction; converted
      with float() before it is stored.
  """
  results = getWords(synsetId, parentToChildrenDictionary, childToParentsDictionary)

  # NOTE(review): the database credentials are hard-coded (here and in the other DB helpers
  # of this notebook). They should be rotated and read from the environment or a secrets
  # store instead of living in a shared notebook.
  cnx = mysql.connector.connect(user='ml2', password='ml2@hsOg#2019!',
                              host='192.52.33.218',
                              database='ml2',
                              autocommit=True)
  # try/finally: cursor and connection are released even when an insert fails.
  try:
    cursor = cnx.cursor()
    try:
      add_result = ("insert ignore into results (local_path, model, prediction_class, prediction_probability) values (%s, %s, %s, %s)")

      for result in results:
        data_result = (localPath, modelName, result, float(predicationProbability))
        cursor.execute(add_result, data_result)
        result_id = cursor.lastrowid

        print(result_id, " | ", localPath, " | ", modelName, " | ", result, " | ", predicationProbability)

        cnx.commit()
    finally:
      cursor.close()
  finally:
    cnx.close()
  
  

In [0]:
from keras.preprocessing.image import load_img, img_to_array
import urllib.request

def downloadFileFromUrl(URL, localPath):
  """Download URL into the file localPath and open the result with load_img().

  Args:
    URL: Address to fetch with urllib.
    localPath: Destination file; it is opened in binary write mode.
  """
  print("Lade ", URL, " nach ", localPath + " herunter")
  with urllib.request.urlopen(URL) as response, open(localPath, 'wb') as target:
    target.write(response.read())
  # The loaded image is discarded. NOTE(review): presumably this call only serves to
  # fail early when the download is not a readable image -- confirm.
  load_img(localPath)

In [0]:
def getFileNameFromPath(path):
  """Return the part of path after the last "/" (the whole string if there is none)."""
  return path.split("/")[-1]

In [0]:
import os

def createLocalDirectory(dir):
  """Create the directory dir (including missing parents) and print the outcome.

  An already existing directory counts as success, so the setup cell can be
  re-run without deleting the tree first. Any other OSError (for example a
  regular file in the way) is reported as " - Failed" and not raised.

  Args:
    dir: Path of the directory to create.
  """
  print("Erstelle lokales Verzeichnis:", dir, end='\t')

  try:
    # exist_ok: without it, every re-run reported existing directories as failures.
    os.makedirs(dir, exist_ok=True)
  except OSError:
    print (" - Failed")
  else:
    print(" - OK")

In [0]:
import os
import shutil

def deleteLocalFiles(dir):
  """Remove the directory tree dir and print whether that worked.

  An OSError from shutil.rmtree (this includes a missing directory) is not
  raised; it is reported as " - Failed".

  Args:
    dir: Root of the tree to delete.
  """
  outcome = " - OK"
  try:
    print("Delete local files", end='')
    shutil.rmtree(dir)
  except OSError:
    outcome = " - Failed"
  print(outcome)

In [0]:
import os

def findFilesInPath(dir, extensions):
  """Collect all files below dir whose name ends with one of the extensions.

  The comparison ignores case on both sides.

  Args:
    dir: Directory to walk recursively.
    extensions: Iterable of suffixes such as ".jpg".

  Returns:
    list of str: Paths (directory joined with file name) in os.walk order; each
    file appears at most once.
  """
  matches = []

  for root, _subdirs, names in os.walk(dir):
    for name in names:
      lowered = name.lower()
      # any() stops at the first suffix that fits, like the former break.
      if any(lowered.endswith(suffix.lower()) for suffix in extensions):
        matches.append(os.path.join(root, name))

  return matches

In [0]:
from PIL import Image
import numpy as np

def prepareImagesForClassification(files, sizeX, sizeY):
  """Load images, resize them to sizeX x sizeY, convert to RGB and stack them.

  Args:
    files: Iterable of image sources accepted by PIL's Image.open (the notebook
      passes file paths).
    sizeX: Target width in pixels.
    sizeY: Target height in pixels.

  Returns:
    numpy.ndarray: np.asarray() of the list of per-image arrays. Sources that
    raise OSError while being loaded are reported and left out, so the result
    can be shorter than `files` and positions may then no longer line up.
  """
  images = []

  for file in files:
    try:
      # The context manager releases the image's file handle once the pixels
      # have been copied into the resized/converted image.
      with Image.open(file) as image:
        resized = image.resize((sizeX, sizeY), Image.LANCZOS)
        images.append(np.asarray(resized.convert("RGB")))
    except OSError as error:
      # Still best effort, but no longer silent: say which source was dropped.
      print("Skipping unreadable image", file, "-", error)

  return np.asarray(images)

In [0]:
def classifyImages(preparedImages, preprocess_input_function, decode_predictions_function, model):
  """Run one model over a batch of images and decode its predictions.

  Args:
    preparedImages: Batch handed to preprocess_input_function.
    preprocess_input_function: Callable that adapts the batch to the model.
    decode_predictions_function: Callable that turns the raw model output into
      the returned prediction structure.
    model: Object with a predict(batch, verbose=...) method.

  Returns:
    Whatever decode_predictions_function returns for the model output.
  """
  # Pipeline: preprocess for the model -> classify the images -> decode.
  modelInput = preprocess_input_function(preparedImages)
  rawPredictions = model.predict(modelInput, verbose=1)
  return decode_predictions_function(rawPredictions)

In [0]:
import matplotlib.pyplot as plt

def compareResults(fileNames, resulsList, modelList, threshold, images):
  """Show every image and print the first five predictions of each model for it.

  Args:
    fileNames: File names; position i belongs to images[i].
    resulsList: Predictions indexed as [model][file][rank]; each entry is
      either None or a sequence whose item [2] is compared with threshold.
    modelList: Model names; position k belongs to resulsList[k].
    threshold: A prediction is printed only if its item [2] is greater than
      this value, otherwise a placeholder line is printed.
    images: Images handed to plt.imshow, one per file name.
  """
  # The parameter name carries a typo that is kept for keyword callers. The body
  # used to read a *global* resultsList instead of this argument.
  resultsList = resulsList

  for i in range(len(fileNames)):
    print("Comparing the Results for File: " + fileNames[i])
    figure = plt.figure()
    plt.imshow(images[i])
    plt.axis('off')
    plt.show()
    # Close this figure instead of plt.clf(): clf() acts on the *current* figure and
    # creates an empty one if show() has already discarded the figure.
    plt.close(figure)

    for j in range(5):
      for k in range(len(modelList)):
        resultToPrint = resultsList[k][i][j]

        # Only predictions above the caller-supplied threshold are printed.
        if(resultToPrint is not None and resultToPrint[2] > threshold):
          print(" > ", (j+1), ". Platz @ ", modelList[k], " : ", resultToPrint)
        else:
          print(" > ", (j+1), ". Platz @ ", modelList[k], " : ", "--- Threshold-Filter ---")

      print("=================================================================")

    print("")

In [0]:
def generateCsvForModelComparison(fileNames, resulsList, modelList, filesDict):
  """Build one ";"-separated CSV text per model with its first five class names per file.

  Layout of each CSV text (every row ends with a line break):
    URL;<=HYPERLINK("...") per file>
    <model name>;<file name per file>
    1.Platz;<class name per file>
    ...
    5.Platz;<class name per file>

  Args:
    fileNames: Local file paths; each must be a key of filesDict.
    resulsList: Predictions indexed as [model][file][rank]; item [1] of an
      entry is written to the CSV.
    modelList: Model names; position k belongs to resulsList[k].
    filesDict: Maps a local file path to the text placed inside HYPERLINK().

  Returns:
    list of str: One CSV text per entry of modelList, in the same order.
  """
  # The parameter name carries a typo that is kept for keyword callers. The body
  # used to read a *global* resultsList instead of this argument.
  resultsList = resulsList
  allModelCSVs = []

  for k, modelName in enumerate(modelList):
    rows = [
        ";".join(["URL"] + ["=HYPERLINK(\"" + filesDict[name] + "\")" for name in fileNames]),
        ";".join([modelName] + [getFileNameFromPath(name) for name in fileNames]),
    ]

    for j in range(5):
      classNames = [str(resultsList[k][i][j][1]) for i in range(len(fileNames))]
      rows.append(";".join([str(j+1) + ".Platz"] + classNames))

    # Joining once avoids repeated string concatenation.
    allModelCSVs.append("\n".join(rows) + "\n")

  return allModelCSVs

In [0]:
# MySQL Part -> Writing Image Classification Results to DB
!pip install mysql-connector-python-rf

import mysql.connector

def storeResultsToDB(fileNames, resultsList, modelList):
  """Insert the first five predictions of every model/file pair into the results table.

  After each insert, expandResultsByImageNetTreeSearch() is called with the
  prediction's first field so that the words it derives are stored as well.

  Args:
    fileNames: Local image paths; position i belongs to resultsList[k][i].
    resultsList: Predictions indexed as [model][file][rank]; every entry is
      read as (synset id, class name, probability).
    modelList: Model names; position k belongs to resultsList[k].
  """
  # NOTE(review): the database credentials are hard-coded (here and in the other DB helpers
  # of this notebook). They should be rotated and read from the environment or a secrets
  # store instead of living in a shared notebook.
  cnx = mysql.connector.connect(user='ml2', password='ml2@hsOg#2019!',
                              host='192.52.33.218',
                              database='ml2',
                              autocommit=True)
  # try/finally: cursor and connection are released even when a step below fails.
  try:
    cursor = cnx.cursor()
    try:
      add_result = ("insert ignore into results (local_path, model, prediction_class, prediction_probability) values (%s, %s, %s, %s)")

      for k in range(len(modelList)):

        print("==== other model =====")

        for i in range(len(fileNames)):

          print("==== other file =====")

          for j in range(5):

            print("Counter:")
            print("Model", (k+1), "von", len(modelList))
            print("File", (i+1), "von", len(fileNames))
            print("Platz", (j+1), "von", 5)

            fileName = fileNames[i]
            modelName = modelList[k]
            predictedClassSynsetId, predictedClass, predictedProbability = resultsList[k][i][j][:3]

            data_result = (fileName, modelName, predictedClass, float(predictedProbability))
            cursor.execute(add_result, data_result)
            result_id = cursor.lastrowid

            print(result_id, " | ", fileName, " | ", modelName, " | ", predictedClass, " | ", predictedProbability)

            cnx.commit()

            print("Now searching for similar words in ImageNet tree (parent / child search)")

            expandResultsByImageNetTreeSearch(predictedClassSynsetId, fileName, modelName, predictedProbability)
    finally:
      cursor.close()
  finally:
    cnx.close()

In [0]:
!pip install mysql-connector-python-rf

import mysql.connector
import matplotlib.pyplot as plt

def querySearchWord(searchWord):
  """Display every stored image whose prediction_class equals searchWord.

  The distinct (local_path, prediction_class) rows are fetched first and the
  database connection is closed before any image is rendered. Images that
  cannot be loaded from local_path are reported and skipped.

  Args:
    searchWord: Class name to look up in the results table.
  """
  # NOTE(review): the database credentials are hard-coded (here and in the other DB helpers
  # of this notebook). They should be rotated and read from the environment or a secrets
  # store instead of living in a shared notebook.
  cnx = mysql.connector.connect(user='ml2', password='ml2@hsOg#2019!',
                              host='192.52.33.218',
                              database='ml2',
                              autocommit=True)
  # try/finally: cursor and connection are released even when the query fails.
  try:
    cursor = cnx.cursor()
    try:
      query = ("select distinct local_path, prediction_class from results where prediction_class = %s")
      cursor.execute(query, (searchWord,))
      rows = cursor.fetchall()
    finally:
      cursor.close()
  finally:
    cnx.close()

  print("Found following files for your search word \"" + searchWord + "\":")

  for (local_path, prediction_class) in rows:
    # Pass the path itself instead of an opened file handle that nobody closes.
    prepared = prepareImagesForClassification([local_path], 224, 224)
    if len(prepared) == 0:
      # The stored path may not exist on this machine, or may not be a readable image.
      print("Skipping", local_path, "- image could not be loaded")
      continue

    figure = plt.figure()
    plt.imshow(prepared[0])
    plt.title("{}".format(local_path))
    plt.axis('off')
    plt.show()
    # Close this figure instead of plt.clf(), which can leave an empty figure behind.
    plt.close(figure)

## Logic

In [0]:
# Parameters
# Root directory for the downloaded images; the preparation cells delete and re-create it.
path = "/tmp/image_classification"
# Number of hash buckets: downloads are spread over the two-digit directories "00" .. "19".
hashrange = 20

In [0]:
# Preparation: remove the whole download directory left over from an earlier run
# (prints " - Failed" when there is nothing to delete).
deleteLocalFiles(path)

In [0]:
# Preparation: Create local directory structure
# Layout: <path>/<bucket // 10>/<two-digit bucket>, e.g. <path>/1/17.
for i in range(hashrange):
  if i % 10 == 0:
    # Every tenth bucket starts a new parent directory.
    parentPath = path + "/" + str(i // 10)
    createLocalDirectory(parentPath)
  normalizedI = "{:02d}".format(i)  # Normalization, pad zeroes
  filePath = parentPath + "/" + normalizedI
  createLocalDirectory(filePath)

In [0]:
import zlib
from urllib.request import urlopen

# Maps the local path of every downloaded image to the URL it came from.
filesDict = {}

# The context manager closes the HTTP response once the list has been read.
with urlopen("https://raw.githubusercontent.com/ml2-picme/PicMe/master/input/images.txt") as data:
  for line in data:
    if line.startswith(b'#'):  # Ignore lines that begin with a comment (#)
      continue

    line = line.decode("utf-8").strip()  # Normalization: drop the line break (also "\r\n")
    if not line:
      continue  # blank line, nothing to download

    fields = line.split(";")
    if len(fields) < 2:
      print("Skipping malformed line (expected url;label):", line)
      continue
    url = fields[0]
    label = fields[1]

    filename = getFileNameFromPath(url)

    # hash() of a str is randomised per Python process, so the same file landed in a
    # different bucket after every kernel restart; crc32 is stable across runs.
    bucket = zlib.crc32(filename.encode("utf-8")) % hashrange
    parent_dir = bucket // 10
    hashvalue = '%02d' % bucket  # Normalization, pad zeroes

    filetype = filename.split(".")[-1]
    newFilename = label + "." + filetype

    print(url, " -> ", hashvalue, " -> ", label, " -> ", parent_dir, " -> ", filename)

    localPath = path + "/" + str(parent_dir) + "/" + hashvalue + "/" + newFilename

    downloadFileFromUrl(url, localPath)

    filesDict[localPath] = url

for x, y in filesDict.items():
  print(x, "->", y)

In [0]:
# Only files with these extensions (compared case-insensitively) are picked up for classification.
extensionsToCheck = [".jpg", ".png", ".bmp"]
foundFiles = findFilesInPath(path, extensionsToCheck)

for foundFile in foundFiles:
  print(foundFile)

# Two resolutions are prepared because the classification cell below feeds 224x224 inputs
# to some models and 299x299 inputs to the others.
# NOTE(review): prepareImagesForClassification() leaves out files it cannot open, so these
# arrays can end up shorter than foundFiles, while later cells index both by the same position.
preparedImages224x224 = prepareImagesForClassification(foundFiles, 224, 224)
preparedImages299x299 = prepareImagesForClassification(foundFiles, 299, 299)

In [0]:
from keras.applications import *

# Note: functions are passed as arguments here:
# 1) the preprocess_input function
# 2) the decode_predictions function
# => This lets the whole classification step live in classifyImages() and be invoked dynamically per model.

predictedClassesVGG16 = classifyImages(preparedImages224x224, vgg16.preprocess_input, vgg16.decode_predictions, vgg16.VGG16(input_shape=(224, 224, 3)))
predictedClassesVGG19 = classifyImages(preparedImages224x224, vgg19.preprocess_input, vgg19.decode_predictions, vgg19.VGG19(input_shape=(224, 224, 3)))
predictedClassesMobileNetV2 = classifyImages(preparedImages224x224, mobilenet_v2.preprocess_input, mobilenet_v2.decode_predictions, mobilenet_v2.MobileNetV2(input_shape=(224, 224, 3)))
predictedClassesResNet50 = classifyImages(preparedImages224x224, resnet50.preprocess_input, resnet50.decode_predictions, resnet50.ResNet50(input_shape=(224, 224, 3)))
predictedClassesDenseNet201 = classifyImages(preparedImages224x224, densenet.preprocess_input, densenet.decode_predictions, densenet.DenseNet201(input_shape=(224, 224, 3)))
predictedClassesInceptionV3 = classifyImages(preparedImages299x299, inception_v3.preprocess_input, inception_v3.decode_predictions, inception_v3.InceptionV3(input_shape=(299, 299, 3)))
predictedClassesXception = classifyImages(preparedImages299x299, xception.preprocess_input, xception.decode_predictions, xception.Xception(input_shape=(299, 299, 3)))
predictedClassesInceptionResNet = classifyImages(preparedImages299x299, inception_resnet_v2.preprocess_input, inception_resnet_v2.decode_predictions,inception_resnet_v2.InceptionResNetV2(input_shape=(299, 299, 3)))

In [0]:
resultsList = [predictedClassesVGG16, predictedClassesVGG19, predictedClassesMobileNetV2, predictedClassesResNet50, predictedClassesDenseNet201, predictedClassesInceptionV3, predictedClassesXception, predictedClassesInceptionResNet]
modelList = ['VGG16', 'VGG19', 'MobileNetV2', 'ResNet50', 'DenseNet201', 'InceptionV3', 'Xception', 'InceptionResNet']

# compareResults takes five arguments:
# 1. the list of file names
# 2. the results of the individual models, collected in one list (same order as modelList)
# 3. the names of the models, as a plain list of strings
# 4. threshold (minimum confidence of a model prediction) => set to 0.0 (= no filter) after the meeting on 17.04.
# 5. the prepared images that are displayed for each file
compareResults(foundFiles, resultsList, modelList, 0.00, preparedImages299x299)

In [0]:
# One ";"-separated CSV text per model, printed one after the other.
allResultsCsv = generateCsvForModelComparison(foundFiles, resultsList, modelList, filesDict)

for result in allResultsCsv:
  print(result)

In [0]:
# Write the first five predictions per model and file to the results table; the function also
# stores the related words found by expandResultsByImageNetTreeSearch().
storeResultsToDB(foundFiles, resultsList, modelList)

In [0]:
# Show every stored image whose prediction_class equals the search word.
querySearchWord("bucket")