In [1]:
from collections import defaultdict
import glob
import hashlib 
from PIL import Image, ImageCms
import numpy as np
from blockhash import blockhash
import cv2
import pandas as pd
from siftdwthash import sift_dwt_hash


In [2]:
# Names of the sub-folders under images/ that hold the transformed copies,
# grouped by the family of transformation applied to the originals.
blur_folders = [
    'boxblurred',
    'gaussianblurred',
    'medianblurred',
    'modeblurred',
]
# NOTE(review): 'diffformat' is grouped with the noise folders here — presumably
# a re-encoded copy rather than added noise; confirm.
noise_folders = [
    'gaussiannoise',
    'peppernoise',
    'poissonnoise',
    'saltnoise',
    'specklenoise',
    'diffformat',
]
geo_folders = [
    'cropped',
    'rotated',
    'scaled',
]

In [3]:
def get_images_originals():
    """Open every path matched by ``images/original/*`` with ``Image.open``.

    Returns:
        list: one opened image object per matched path, in the order
        ``glob.glob`` yields them.
    """
    # NOTE(review): the opened images are returned as-is; nothing in this
    # function closes them.
    return [Image.open(path) for path in glob.glob('images/original/*')]


# Evaluated once when the cell runs.
images = get_images_originals()

def shuffle_sort(a):
    """Group records by their first element and order the groups by that key.

    Args:
        a: iterable of sequences; element 0 of each record is the grouping
            key. Records of length zero are skipped.

    Returns:
        A ``dict`` values view with one list of records per distinct key,
        ordered by ascending key. Records keep their input order inside
        each group.
    """
    groups = {}
    for record in a:
        if len(record) == 0:
            # Nothing to key on; drop the record.
            continue
        groups.setdefault(record[0], []).append(record)
    ordered = {key: groups[key] for key in sorted(groups)}
    return ordered.values()
def reduce_aggr(xs):
    """Collapse one group of ``[key, count]`` records into a single record.

    Args:
        xs: non-empty indexable sequence of records that share the same
            key in position 0 and carry a numeric count in position 1.

    Returns:
        list: ``[key, total]`` where ``key`` is taken from the first record
        and ``total`` is the sum of every record's count.

    Raises:
        IndexError: if ``xs`` is empty (there is no first record to take
            the key from).
    """
    # Use the built-in sum() rather than a local variable named `sum`,
    # which would shadow the built-in inside this function.
    total = sum(x[1] for x in xs)
    return [xs[0][0], total]
def map_sift_dwt_hash():
    """Emit one ``[hash, 1]`` record per file matched by ``images/original/*``.

    Each hash is whatever ``sift_dwt_hash`` returns for the file path.

    Returns:
        list: ``[sift_dwt_hash(path), 1]`` for every matched path, in the
        order ``glob.glob`` yields them.
    """
    # The previous version also incremented a `counter` that was never read.
    return [[sift_dwt_hash(filename), 1]
            for filename in glob.glob('images/original/*')]

# def map_md5_hash():
#     files = []
#     for filename in glob.glob('images/original/*'):
#         f = open(filename, 'rb')
#         jpgdata = f.read()
#         f.close() 
#         files.append([hashlib.md5(jpgdata).hexdigest(), 1])
#     return files




In [4]:
# get blur results
def map_md5_hash(folder):
    """Emit one ``[md5_hexdigest, 1]`` record per file in two image folders.

    Files matched by ``images/original/*`` are processed first, followed by
    files matched by ``images/<folder>/*``.

    NOTE(review): a later cell defines ``map_md5_hash`` again (same logic
    plus a progress print), which replaces this definition once it runs.

    Args:
        folder (str): name of the sub-folder of ``images/`` to hash in
            addition to the originals.

    Returns:
        list: ``[hexdigest, 1]`` records, originals first.
    """
    files = []
    for directory in ('original', folder):
        for filename in glob.glob('images/' + directory + '/*'):
            # `with` closes the handle even when read() raises.
            with open(filename, 'rb') as f:
                files.append([hashlib.md5(f.read()).hexdigest(), 1])
    return files


In [5]:
# test md5 hash
def map_md5_hash(folder):
    """Emit one ``[md5_hexdigest, 1]`` record per file in two image folders.

    Prints a progress line, then hashes the raw bytes of every file matched
    by ``images/original/*`` followed by every file matched by
    ``images/<folder>/*``.

    Args:
        folder (str): name of the sub-folder of ``images/`` to hash in
            addition to the originals.

    Returns:
        list: ``[hexdigest, 1]`` records, originals first.
    """
    print("CHECKING MD5", folder)
    files = []
    for directory in ('original', folder):
        for filename in glob.glob('images/' + directory + '/*'):
            # `with` closes the handle even when read() raises.
            with open(filename, 'rb') as f:
                files.append([hashlib.md5(f.read()).hexdigest(), 1])
    return files
def map_block_hash(folder):
    """Emit one ``[blockhash, 1]`` record per image in two image folders.

    Prints a progress line, then for every file matched by
    ``images/original/*`` followed by every file matched by
    ``images/<folder>/*``: opens the image, resizes it to 256x256 and
    passes the result to ``blockhash(img, 16)``.

    Args:
        folder (str): name of the sub-folder of ``images/`` to hash in
            addition to the originals.

    Returns:
        list: ``[hash, 1]`` records, originals first.
    """
    print("CHECKING BLOCK", folder)
    records = []
    for directory in ('original', folder):
        pattern = 'images/' + directory + '/*'
        for path in glob.glob(pattern):
            # Every image is brought to the same size before hashing.
            resized = Image.open(path).resize((256, 256))
            records.append([blockhash(resized, 16), 1])
    return records

# Blur experiment: for every blur folder, hash the originals together with the
# blurred copies and report what share of the copies collided with an
# existing hash (i.e. were detected as duplicates).
#
# NOTE(review): the arithmetic assumes images/original and each variant folder
# both contain exactly NUM_ORIGINALS files, all originals hashing to distinct
# values — confirm against the dataset.
NUM_ORIGINALS = 331

results = []
md_5_results = []
block_results = []
for folder in blur_folders:
    print(folder)
    map_result_md5 = map_md5_hash(folder)
    shuffle_sort_result_md5 = shuffle_sort(map_result_md5)
    reduce_result_md5 = [reduce_aggr(x) for x in shuffle_sort_result_md5]

    map_result_block = map_block_hash(folder)
    shuffle_sort_result_block = shuffle_sort(map_result_block)
    reduce_result_block = [reduce_aggr(x) for x in shuffle_sort_result_block]

    # Every distinct hash beyond NUM_ORIGINALS is a copy that matched nothing.
    undetected_dups_md5 = len(reduce_result_md5) - NUM_ORIGINALS
    print("UNDETECTED MD%")
    detected_pct_md5 = ((NUM_ORIGINALS - undetected_dups_md5) / NUM_ORIGINALS) * 100
    print("md5", detected_pct_md5)
    md_5_results.append(detected_pct_md5)

    undetected_dups_block = len(reduce_result_block) - NUM_ORIGINALS
    detected_pct_block = ((NUM_ORIGINALS - undetected_dups_block) / NUM_ORIGINALS) * 100
    print("block", detected_pct_block)
    block_results.append(detected_pct_block)

# Row 0 holds the md5 percentages, row 1 the block-hash percentages.
results.append(md_5_results)
results.append(block_results)

df = pd.DataFrame(results, columns = blur_folders)
df.to_csv('blur.csv')
df

boxblurred
CHECKING MD5 boxblurred
CHECKING BLOCK boxblurred
UNDETECTED MD%
md5 0.0
block 38.972809667673715
gaussianblurred
CHECKING MD5 gaussianblurred
CHECKING BLOCK gaussianblurred
UNDETECTED MD%
md5 0.0
block 27.492447129909365
medianblurred
CHECKING MD5 medianblurred
CHECKING BLOCK medianblurred
UNDETECTED MD%
md5 0.0
block 36.25377643504532
modeblurred
CHECKING MD5 modeblurred
CHECKING BLOCK modeblurred
UNDETECTED MD%
md5 0.0
block 18.12688821752266


Unnamed: 0,boxblurred,gaussianblurred,medianblurred,modeblurred
0,0.0,0.0,0.0,0.0
1,38.97281,27.492447,36.253776,18.126888


In [6]:
# Noise experiment: for every noise folder, hash the originals together with
# the noisy copies and report what share of the copies collided with an
# existing hash (i.e. were detected as duplicates).
#
# NOTE(review): the arithmetic assumes images/original and each variant folder
# both contain exactly NUM_ORIGINALS files, all originals hashing to distinct
# values — confirm against the dataset.
NUM_ORIGINALS = 331

results = []
md_5_results = []
block_results = []
for folder in noise_folders:
    print(folder)
    map_result_md5 = map_md5_hash(folder)
    shuffle_sort_result_md5 = shuffle_sort(map_result_md5)
    reduce_result_md5 = [reduce_aggr(x) for x in shuffle_sort_result_md5]

    map_result_block = map_block_hash(folder)
    shuffle_sort_result_block = shuffle_sort(map_result_block)
    reduce_result_block = [reduce_aggr(x) for x in shuffle_sort_result_block]

    # Every distinct hash beyond NUM_ORIGINALS is a copy that matched nothing.
    undetected_dups_md5 = len(reduce_result_md5) - NUM_ORIGINALS
    print("UNDETECTED MD%")
    detected_pct_md5 = ((NUM_ORIGINALS - undetected_dups_md5) / NUM_ORIGINALS) * 100
    print("md5", detected_pct_md5)
    md_5_results.append(detected_pct_md5)

    undetected_dups_block = len(reduce_result_block) - NUM_ORIGINALS
    detected_pct_block = ((NUM_ORIGINALS - undetected_dups_block) / NUM_ORIGINALS) * 100
    print("block", detected_pct_block)
    block_results.append(detected_pct_block)

# Row 0 holds the md5 percentages, row 1 the block-hash percentages.
results.append(md_5_results)
results.append(block_results)

df = pd.DataFrame(results, columns = noise_folders)
df.to_csv('noise.csv')
df

gaussiannoise
CHECKING MD5 gaussiannoise
CHECKING BLOCK gaussiannoise
UNDETECTED MD%
md5 0.0
block 12.386706948640484
peppernoise
CHECKING MD5 peppernoise
CHECKING BLOCK peppernoise
UNDETECTED MD%
md5 0.0
block 10.876132930513595
poissonnoise
CHECKING MD5 poissonnoise
CHECKING BLOCK poissonnoise
UNDETECTED MD%
md5 0.0
block 44.71299093655589
saltnoise
CHECKING MD5 saltnoise
CHECKING BLOCK saltnoise
UNDETECTED MD%
md5 0.0
block 10.876132930513595
specklenoise
CHECKING MD5 specklenoise
CHECKING BLOCK specklenoise
UNDETECTED MD%
md5 0.0
block 37.764350453172206
diffformat
CHECKING MD5 diffformat
CHECKING BLOCK diffformat
UNDETECTED MD%
md5 0.0
block 100.0


Unnamed: 0,gaussiannoise,peppernoise,poissonnoise,saltnoise,specklenoise,diffformat
0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.386707,10.876133,44.712991,10.876133,37.76435,100.0


In [7]:
# Geometric experiment: for every geometric-transform folder, hash the
# originals together with the transformed copies and report what share of the
# copies collided with an existing hash (i.e. were detected as duplicates).
#
# NOTE(review): the arithmetic assumes images/original and each variant folder
# both contain exactly NUM_ORIGINALS files, all originals hashing to distinct
# values — confirm against the dataset.
NUM_ORIGINALS = 331

results = []
md_5_results = []
block_results = []
for folder in geo_folders:
    print(folder)
    map_result_md5 = map_md5_hash(folder)
    shuffle_sort_result_md5 = shuffle_sort(map_result_md5)
    reduce_result_md5 = [reduce_aggr(x) for x in shuffle_sort_result_md5]

    map_result_block = map_block_hash(folder)
    shuffle_sort_result_block = shuffle_sort(map_result_block)
    reduce_result_block = [reduce_aggr(x) for x in shuffle_sort_result_block]

    # Every distinct hash beyond NUM_ORIGINALS is a copy that matched nothing.
    undetected_dups_md5 = len(reduce_result_md5) - NUM_ORIGINALS
    print("UNDETECTED MD%")
    detected_pct_md5 = ((NUM_ORIGINALS - undetected_dups_md5) / NUM_ORIGINALS) * 100
    print("md5", detected_pct_md5)
    md_5_results.append(detected_pct_md5)

    undetected_dups_block = len(reduce_result_block) - NUM_ORIGINALS
    detected_pct_block = ((NUM_ORIGINALS - undetected_dups_block) / NUM_ORIGINALS) * 100
    print("block", detected_pct_block)
    block_results.append(detected_pct_block)

# Row 0 holds the md5 percentages, row 1 the block-hash percentages.
results.append(md_5_results)
results.append(block_results)

df = pd.DataFrame(results, columns = geo_folders)
df.to_csv('geo.csv')
df

cropped
CHECKING MD5 cropped
CHECKING BLOCK cropped
UNDETECTED MD%
md5 0.0
block 0.0
rotated
CHECKING MD5 rotated
CHECKING BLOCK rotated
UNDETECTED MD%
md5 0.0
block 1.5105740181268883
scaled
CHECKING MD5 scaled
CHECKING BLOCK scaled
UNDETECTED MD%
md5 0.0
block 28.3987915407855


Unnamed: 0,cropped,rotated,scaled
0,0.0,0.0,0.0
1,0.0,1.510574,28.398792


In [8]:
# test block hash
# def map_md5_hash(folder):
#     files = []
#     for filename in glob.glob('images/original/*'):
#         f = open(filename, 'rb')
#         jpgdata = f.read()
#         f.close() 
#         files.append([hashlib.md5(jpgdata).hexdigest(), 1])
#     for filename in glob.glob('images/'+folder+'/*'):
#         f = open(filename, 'rb')
#         jpgdata = f.read()
#         f.close() 
#         files.append([hashlib.md5(jpgdata).hexdigest(), 1])
#     return files
# def map_block_hash(folder):
#     files = []
#     for filename in glob.glob('images/original/*'):
# #         print("ORIGINAL", filename)

#         img = Image.open(filename)
#         img = img.resize((256,256))
#         files.append([blockhash(img,16), 1])
#     for filename in glob.glob('images/'+folder+'/*'):
# #         print(filename)
#         img = Image.open(filename)
#         img = img.resize((256,256))
#         files.append([blockhash(img,16), 1])    
#     return files
# results = []
# for folder in folders:
#     print(folder)
#     map_result = map_block_hash(folder)
#     shuffle_sort_result = shuffle_sort(map_result)
#     reduce_result = [reduce_aggr(x) for x in shuffle_sort_result]
#     undetected_dups = len(reduce_result) - 339
#     percentage_detected = (339 - undetected_dups)
#     print((percentage_detected / 339) * 100)
#     results.append((percentage_detected / 339) * 100)
# df = pd.DataFrame([results], columns = folders)
# df
    

In [9]:
# NOTE(review): the preceding "test block hash" cell is entirely commented out,
# so `df` here is whatever an earlier cell last assigned (in a top-to-bottom
# run, the frame built from geo_folders, which has both an md5 row and a
# block-hash row). Confirm that this is the frame intended for block_hash.csv.
df.to_csv("block_hash.csv")