In [1]:
import cv2
import numpy as np
# import numpy
import scipy
from scipy.misc import imread
import pickle
# import random
import os
from sklearn.externals import joblib
import matplotlib.pyplot as plt
import sparkpickle
from io import BytesIO
import time
from pyspark import SparkContext
# Obtain the notebook-wide SparkContext via getOrCreate(); later cells use `sc`.
sc = SparkContext.getOrCreate()
# Bare expression so the notebook displays the context object.
sc

In [2]:
# Record the OpenCV version this notebook was executed with.
print(cv2.__version__)

3.4.2


### Feature extraction from an image

In [3]:
def extract_features(image_path, vector_size=128):
    """Build a fixed-length KAZE descriptor vector for one image.

    The image is read as RGB, KAZE keypoints are detected, and the
    ``vector_size`` keypoints with the highest response are described.
    Their descriptors are flattened into one 1-D array and zero-padded up
    to ``vector_size * 64`` values.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    vector_size : int, optional
        Maximum number of keypoints to keep (default 128).

    Returns
    -------
    numpy.ndarray or None
        The flattened (and possibly zero-padded) descriptor vector, an
        all-zero vector when no descriptors could be computed, or ``None``
        when OpenCV raised ``cv2.error``.
    """
    # NOTE(review): scipy.misc.imread is reported as removed in newer SciPy
    # releases -- confirm against the installed version.
    image = imread(image_path, mode="RGB")
    # 64 values per keypoint is the constant this notebook has always used
    # (presumably the KAZE descriptor length -- TODO confirm).
    needed_size = vector_size * 64
    try:
        alg = cv2.KAZE_create()
        kps = alg.detect(image)
        # Strongest keypoints first, then keep at most `vector_size` of them.
        kps = sorted(kps, key=lambda kp: -kp.response)[:vector_size]
        kps, dsc = alg.compute(image, kps)
    except cv2.error as e:
        print('Error: ', e)
        return None

    # Descriptors can come back as None (e.g. no keypoints were found);
    # calling .flatten() on that used to raise an uncaught AttributeError.
    if dsc is None:
        return np.zeros(needed_size)

    dsc = dsc.flatten()
    if dsc.size < needed_size:
        # Pad so every image yields a vector of identical length.
        dsc = np.concatenate([dsc, np.zeros(needed_size - dsc.size)])
    return dsc


### Feature extraction from a batch of images

In [4]:
def batch_extractor(img_path):
    """Extract features for every file in a directory and pickle them.

    Each entry of ``img_path`` (in sorted order) is passed to
    ``extract_features``. The resulting vectors are pickled as a list to
    ``./db_dump/<name>.pkl``, where ``<name>`` is the lower-cased last
    component of ``img_path``.

    Parameters
    ----------
    img_path : str
        Directory containing the images of one category.

    Returns
    -------
    list
        The feature vectors that were extracted. Files for which
        ``extract_features`` returned ``None`` are skipped. When the list is
        empty nothing is written to disk.
    """
    files = [os.path.join(img_path, p) for p in sorted(os.listdir(img_path))]

    # The dump is named after the directory itself; derive it once from the
    # argument instead of splitting every file path on '/'.
    name = os.path.basename(os.path.normpath(img_path)).lower()

    result = []
    for f in files:
        print("%s" % f)
        features = extract_features(f)
        if features is None:
            # A None entry in the dump would break the distance computation
            # that later reads this pickle.
            continue
        result.append(features)

    if not result:
        # Previously an empty directory raised NameError because the dump
        # file name was only assigned inside the loop.
        return result

    os.makedirs("./db_dump", exist_ok=True)
    dump_file_name = "./db_dump/" + name + ".pkl"
    # `with` guarantees the file is closed even if pickling fails.
    with open(dump_file_name, 'wb') as outfile:
        pickle.dump(result, outfile)
    return result


### Extracting features and storing them in a pickle file

In [5]:
# folder_path='./image'
# files = [os.path.join(folder_path, p) for p in sorted(os.listdir(folder_path))]
# for i in files:
#     print(i)
#     batch_extractor(i)

# batch_extractor('./image/pond')

In [6]:
import numpy

In [5]:
class Matcher(object):
    """Compare a query image against pickled feature dumps.

    Every ``*.pkl`` file in ``db_dump`` is expected to hold a list of
    feature vectors (as written by ``batch_extractor``); the file name
    without extension is used as the category name.
    """

    def __init__(self, db_dump='./db_dump', vector_size=128):
        """Store the dump directory and the number of keypoints to keep."""
        self.db_dump = db_dump
        self.vector_size = vector_size

    def extract_features(self, image_path):
        """Build a fixed-length KAZE descriptor vector for one image.

        Keeps the ``self.vector_size`` keypoints with the highest response,
        flattens their descriptors and zero-pads the result up to
        ``self.vector_size * 64`` values.

        Returns the vector, an all-zero vector when no descriptors could be
        computed, or ``None`` when OpenCV raised ``cv2.error``.
        """
        image = imread(image_path, mode="RGB")
        # 64 values per keypoint is the constant this notebook has always
        # used (presumably the KAZE descriptor length -- TODO confirm).
        needed_size = self.vector_size * 64
        try:
            alg = cv2.KAZE_create()
            kps = alg.detect(image)
            # Strongest keypoints first, capped at vector_size.
            kps = sorted(kps, key=lambda kp: -kp.response)[:self.vector_size]
            kps, dsc = alg.compute(image, kps)
        except cv2.error as e:
            print('Error: ', e)
            return None

        # Descriptors can come back as None (e.g. no keypoints were found);
        # .flatten() on that used to raise an uncaught AttributeError.
        if dsc is None:
            return np.zeros(needed_size)

        dsc = dsc.flatten()
        if dsc.size < needed_size:
            dsc = np.concatenate([dsc, np.zeros(needed_size - dsc.size)])
        return dsc

    def match(self, image_path):
        """Return the smallest Euclidean distance to each stored category.

        Parameters
        ----------
        image_path : str
            Path of the query image.

        Returns
        -------
        dict
            Maps category name (lower-cased dump file name up to the first
            '.') to the minimum distance between the query vector and any
            vector stored in that dump. Also kept on ``self.dist``.

        Raises
        ------
        ValueError
            If no feature vector could be extracted from ``image_path``.
        """
        # Imported here so the submodule is guaranteed to be loaded; a bare
        # `import scipy` does not promise that `scipy.spatial` is available.
        from scipy.spatial.distance import cdist

        features = self.extract_features(image_path)
        if features is None:
            raise ValueError(
                'could not extract features from %r' % (image_path,))
        print(features)
        query = features.reshape(1, -1)

        self.dist = {}
        for entry in sorted(os.listdir(self.db_dump)):
            path = os.path.join(self.db_dump, entry)
            # Skip anything that is not a dump file (sub-directories such as
            # .ipynb_checkpoints, stray notes, ...); opening those used to
            # abort the whole match.
            if not (entry.lower().endswith('.pkl') and os.path.isfile(path)):
                continue

            # SECURITY: pickle.load executes arbitrary code from the file;
            # only point db_dump at dumps you created yourself.
            with open(path, 'rb') as pkl_file:
                stored = pickle.load(pkl_file)

            # Dumps written by older runs may contain None for images whose
            # extraction failed.
            vectors = [vec for vec in stored if vec is not None]
            if not vectors:
                continue

            img_dist = cdist(vectors, query, 'euclidean').reshape(-1)
            name = entry.lower().split('.')[0]
            self.dist[name] = img_dist.min()
        print(self.dist)
        return self.dist

In [6]:
# Time one full match (feature extraction + comparison) for a query image.
m = Matcher()
start = time.time()
m.match('cut_watermelon.jpg.653x0_q80_crop-smart.jpg')
# `end` holds the elapsed seconds, not an end timestamp.
end = time.time() - start
print('time = ',end)

[-0.08858097  0.03786298  0.09164749 ...  0.0316714   0.05101985
  0.03982824]
{'fruit': 0.0, 'pond': 7.887596372197239, 'staircase': 8.042629064206457}
time =  1.1024937629699707


In [8]:
# Same timing as the previous match cell, for a different query image.
m = Matcher()
start = time.time()
m.match('./gsun_0b054a9f68b8d0ba36f620ed9e99850d.jpg')
# `end` holds the elapsed seconds, not an end timestamp.
end = time.time() - start
print('time = ',end)

[-0.02409058  0.05696743  0.02413429 ...  0.03673164  0.17281295
  0.03693292]
{'fruit': 9.621196014778496, 'pond': 8.881111575463148, 'staircase': 0.0}
time =  0.17062854766845703


In [9]:
# Same timing again for a third query image; `m` is reused by later cells.
m = Matcher()
start = time.time()
m.match('./gsun_0a2cc110677e3611d8b9ae97278fd489.jpg')
# `end` holds the elapsed seconds, not an end timestamp.
end = time.time() - start
print('time = ',end)

[-0.00488521  0.00223366  0.0104039  ...  0.00152859  0.03190349
  0.06472354]
{'fruit': 10.512500950081577, 'pond': 0.0, 'staircase': 9.983319028083455}
time =  0.1675713062286377


In [11]:
# Reuses the Matcher `m` from an earlier cell; match() prints and returns the distances.
m.match('water-bottle.png')

[0.10285926 0.15143955 0.10808402 ... 0.08395358 0.13674533 0.09273567]
{'fruit': 0.0, 'pond': 8.867754493042764, 'staircase': 8.769415293237534}


{'fruit': 0.0, 'pond': 8.867754493042764, 'staircase': 8.769415293237534}

In [12]:
# Reuses the Matcher `m` from an earlier cell for another query image.
m.match('foun.jpg')

[-0.02653385 -0.02261907  0.09161882 ... -0.00085813  0.0057378
  0.00831428]
{'fruit': 8.654220710487563, 'pond': 8.17867356885083, 'staircase': 8.19352921667635}


{'fruit': 8.654220710487563,
 'pond': 8.17867356885083,
 'staircase': 8.19352921667635}

In [13]:
# batch_extractor('./test/img/')

In [14]:
# Reuses the Matcher `m` from an earlier cell for another query image.
m.match('stir.jpg')

[-0.08028362  0.04824756  0.09115983 ...,  0.00141928  0.09569666
  0.01787554]
{'fruit': 8.5395672746138285, 'pond': 7.5304099914639302, 'staircase': 7.5548699058538711}


{'fruit': 8.5395672746138285,
 'pond': 7.5304099914639302,
 'staircase': 7.5548699058538711}

In [15]:
# Reuses the Matcher `m`; the file name suggests a converted copy of the watermelon image -- TODO confirm.
m.match('cut_watermelon.jpg.653x0_q80_crop-smart-ConvertImage.jpg')

[-0.08810823  0.03952925  0.09145768 ..., -0.0113938   0.03848445
  0.04105413]
{'fruit': 7.5656677237029024, 'pond': 7.9878637794427609, 'staircase': 8.0902829720718952}


{'fruit': 7.5656677237029024,
 'pond': 7.9878637794427609,
 'staircase': 8.0902829720718952}

In [13]:

# Spark smoke test: distribute a small list of strings and count its elements.
words = sc.parallelize([
    "scala", "java", "hadoop", "spark", "akka",
    "spark vs hadoop", "pyspark", "pyspark and spark",
])
counts = words.count()
print("Number of elements in RDD -> %i" % counts)

Number of elements in RDD -> 8


In [17]:
# Spark map demo: pair every word with the value 1 and collect the pairs.
words = sc.parallelize([
    "scala", "java", "hadoop", "spark", "akka",
    "spark vs hadoop", "pyspark", "pyspark and spark",
])

words_map = words.map(lambda word: (word, 1))
print(words_map)  # prints the RDD object itself, not its contents
mapping = words_map.collect()

print("Key value pair -> %s" % mapping)

PythonRDD[3] at RDD at PythonRDD.scala:53
Key value pair -> [('scala', 1), ('java', 1), ('hadoop', 1), ('spark', 1), ('akka', 1), ('spark vs hadoop', 1), ('pyspark', 1), ('pyspark and spark', 1)]


In [16]:
def f(x):
    """Return x multiplied by 43."""
    return x*43


# Apply a named function to every element of a small RDD.
words = sc.parallelize([1, 2, 3, 4])
fore = words.map(f)
print(fore.collect())


# print("Key value pair -> %s" % (mapping))

[43, 86, 129, 172]


In [17]:
# words_map.max()
# words_map.sortByKey()

NameError: name 'words_map' is not defined

In [18]:
def extract_features1(image_path, vector_size=128):
    """Build a fixed-length KAZE descriptor vector for one image.

    Same procedure as ``extract_features`` defined earlier in this
    notebook: keep the ``vector_size`` keypoints with the highest response,
    flatten their descriptors and zero-pad up to ``vector_size * 64``
    values.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.
    vector_size : int, optional
        Maximum number of keypoints to keep (default 128).

    Returns
    -------
    numpy.ndarray or None
        The flattened (and possibly zero-padded) descriptor vector, an
        all-zero vector when no descriptors could be computed, or ``None``
        when OpenCV raised ``cv2.error``.
    """
    image = imread(image_path, mode="RGB")
    # 64 values per keypoint is the constant this notebook has always used
    # (presumably the KAZE descriptor length -- TODO confirm).
    needed_size = vector_size * 64
    try:
        alg = cv2.KAZE_create()
        kps = alg.detect(image)
        # Strongest keypoints first, then keep at most `vector_size` of them.
        kps = sorted(kps, key=lambda kp: -kp.response)[:vector_size]
        kps, dsc = alg.compute(image, kps)
    except cv2.error as e:
        print('Error: ', e)
        return None

    # Descriptors can come back as None (e.g. no keypoints were found);
    # calling .flatten() on that used to raise an uncaught AttributeError.
    if dsc is None:
        return np.zeros(needed_size)

    dsc = dsc.flatten()
    if dsc.size < needed_size:
        # Pad so every image yields a vector of identical length.
        dsc = np.concatenate([dsc, np.zeros(needed_size - dsc.size)])
    return dsc


In [19]:
# features = extract_features1('./cut_watermelon.jpg.653x0_q80_crop-smart.jpg',128)
# features = extract_features1('./staircase/gsun_0b054a9f68b8d0ba36f620ed9e99850d.jpg')
# features = extract_features1('./image/pond/gsun_0a2cc110677e3611d8b9ae97278fd489.jpg')
# Notebook-level containers kept for backward compatibility: `dist` is not
# written by match1 (it builds its own dict); `list2` collects every minimum
# distance match1 has computed so far.
dist = {}
list2 = []


def match1(i, query=None):
    """Return the smallest distance between a query vector and one dump.

    Parameters
    ----------
    i : str
        Path of a pickle file holding a list of feature vectors.
    query : array-like, optional
        Feature vector to compare against. When omitted, the notebook-level
        variable ``features`` is used, as in the original implementation.

    Returns
    -------
    dict
        Single-entry dict mapping the lower-cased file name (up to the first
        '.') to the minimum Euclidean distance. The same distance is also
        appended to the notebook-level ``list2``.

    Raises
    ------
    ValueError
        If the dump contains no usable feature vectors.
    """
    # Imported here so the submodule is guaranteed to be loaded; a bare
    # `import scipy` does not promise that `scipy.spatial` is available.
    from scipy.spatial.distance import cdist

    if query is None:
        # Fall back to the notebook-level query vector.
        query = features

    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # dumps you created yourself. `with` closes the file even on failure.
    with open(i, 'rb') as pkl_file:
        stored = pickle.load(pkl_file)

    # Dumps may contain None for images whose extraction failed.
    vectors = [vec for vec in stored if vec is not None]
    if not vectors:
        raise ValueError('no feature vectors stored in %r' % (i,))

    img_dist = cdist(vectors, np.asarray(query).reshape(1, -1), 'euclidean').reshape(-1)
    best = img_dist.min()
    name = os.path.basename(i).lower().split('.')[0]
    list2.append(best)
    return {name: best}

In [20]:
# match1 reads the query vector from this notebook-level `features` variable.
# Extraction happens before `start`, so it is excluded from the time reported
# below (the Matcher cells earlier time extraction and matching together).
features = extract_features1('./cut_watermelon.jpg.653x0_q80_crop-smart.jpg',128)
db_dump = './db_dump'
start = time.time()
# match1 is called here in plain Python, once per entry of db_dump, before
# anything is handed to Spark.
# NOTE(review): despite its name, `files` holds the result dicts, not paths.
files = [match1(os.path.join(db_dump, p)) for p in sorted(os.listdir(db_dump))]
print(files)
# NOTE(review): Spark only receives the already-computed results, so the
# timing does not measure distributed matching -- confirm this is intended.
listt  = sc.parallelize(files)
# yu = listt.map(lambda x:match1(x))
# match1('./db_dump/img.pkl')
# NOTE(review): `a` is not used anywhere in the visible notebook.
a= listt.values()
listt.collect()
# `end` holds the elapsed seconds, not an end timestamp.
end = time.time()-start
print('time =',end)

[{'fruit': 0.0}, {'pond': 7.887596372197239}, {'staircase': 8.042629064206457}]
time = 0.05432915687561035


In [21]:
# match1 reads the query vector from this notebook-level `features` variable.
# Extraction happens before `start`, so it is excluded from the reported time.
features = extract_features1('./gsun_0b054a9f68b8d0ba36f620ed9e99850d.jpg')
db_dump = './db_dump'
start = time.time()
# match1 is called here in plain Python, once per entry of db_dump, before
# anything is handed to Spark.
# NOTE(review): despite its name, `files` holds the result dicts, not paths.
files = [match1(os.path.join(db_dump, p)) for p in sorted(os.listdir(db_dump))]
print(files)
# NOTE(review): Spark only receives the already-computed results, so the
# timing does not measure distributed matching -- confirm this is intended.
listt  = sc.parallelize(files)
# yu = listt.map(lambda x:match1(x))
# match1('./db_dump/img.pkl')
# NOTE(review): `a` is not used anywhere in the visible notebook.
a= listt.values()
listt.collect()
# `end` holds the elapsed seconds, not an end timestamp.
end = time.time()-start
print('time =',end)

[{'fruit': 9.621196014778496}, {'pond': 8.881111575463148}, {'staircase': 0.0}]
time = 0.060945749282836914


In [22]:
# match1 reads the query vector from this notebook-level `features` variable.
# Extraction happens before `start`, so it is excluded from the reported time.
features = extract_features1('./gsun_0a2cc110677e3611d8b9ae97278fd489.jpg')
db_dump = './db_dump'
start = time.time()
# match1 is called here in plain Python, once per entry of db_dump, before
# anything is handed to Spark.
# NOTE(review): despite its name, `files` holds the result dicts, not paths.
files = [match1(os.path.join(db_dump, p)) for p in sorted(os.listdir(db_dump))]
print(files)
# NOTE(review): Spark only receives the already-computed results, so the
# timing does not measure distributed matching -- confirm this is intended.
listt  = sc.parallelize(files)
# yu = listt.map(lambda x:match1(x))
# match1('./db_dump/img.pkl')
# NOTE(review): `a` is not used anywhere in the visible notebook.
a= listt.values()
listt.collect()
# `end` holds the elapsed seconds, not an end timestamp.
end = time.time()-start
print('time =',end)

[{'fruit': 10.512500950081577}, {'pond': 0.0}, {'staircase': 9.983319028083455}]
time = 0.07507157325744629
