## install
 - pip install imagehash
 - pip install Image

### 文字列を数値変換する
1. binascii.hexlifyで文字列を16進数に変換
2. int(,16)で10進数に変換
3. binで2進数に変換

In [1]:
from PIL import Image
import imagehash
import binascii

In [2]:
bin(int(binascii.hexlify(b'select * from test_table'),16))

'0b11100110110010101101100011001010110001101110100001000000010101000100000011001100111001001101111011011010010000001110100011001010111001101110100010111110111010001100001011000100110110001100101'

In [3]:
bin(int(binascii.hexlify(b'select * from test_table2'),16))

'0b1110011011001010110110001100101011000110111010000100000001010100010000001100110011100100110111101101101001000000111010001100101011100110111010001011111011101000110000101100010011011000110010100110010'

In [4]:
bin(int(binascii.hexlify(b'sample * format test_tablet'),16))

'0b11100110110000101101101011100000110110001100101001000000010101000100000011001100110111101110010011011010110000101110100001000000111010001100101011100110111010001011111011101000110000101100010011011000110010101110100'

### 画像に変換してみる
1. 2進数文字列の先頭"0b"を削除し、画像に変換する
 - "0b"は2進数を意味するが、画像にする際は不要

In [5]:
sample1 = bin(int(binascii.hexlify('select * from test_table'),16))[2:]
canvas = Image.new('1', (len(sample1), 1))

In [6]:
for index,bit in enumerate(sample1):
    canvas.putpixel((index, 0), int(bit))

In [7]:
canvas.save('sample1.png', 'PNG')

In [8]:
sample2 = bin(int(binascii.hexlify('select * from test_table2'),16))[2:]
canvas = Image.new('1', (len(sample2), 1))

In [9]:
for index,bit in enumerate(sample2):
    canvas.putpixel((index, 0), int(bit))

In [10]:
canvas.save('sample2.png', 'PNG')

In [11]:
sample3 = bin(int(binascii.hexlify('sample * format test_tablet'),16))[2:]
canvas = Image.new('1', (len(sample3), 1))

In [12]:
for index,bit in enumerate(sample3):
    canvas.putpixel((index, 0), int(bit))

In [13]:
canvas.save('sample3.png', 'PNG')

### 画像をdhashでハッシュ化する
1. 画像をインプットにdhashを算出
2. 単純な差分で距離(類似度に相当)を算出

In [14]:
sample1hash = imagehash.dhash(Image.open('sample1.png'))
print(sample1hash)

3434343434343434


In [15]:
sample2hash = imagehash.dhash(Image.open('sample2.png'))
print(sample2hash)

3535353535353535


In [16]:
sample3hash = imagehash.dhash(Image.open('sample3.png'))
print(sample3hash)

2525252525252525


In [17]:
print(sample1hash - sample2hash)
print(sample1hash - sample3hash)
print(sample2hash - sample3hash)

8
16
8


2と3がやけに近い

### 10進数で同様の処理を実行
1. 10進数をカラー255に変換する
 - 視覚的にわかりやすくするためにmax255に正規化する

In [18]:
sample1 = str(int(binascii.hexlify('select * from test_table'),16))
canvas = Image.new('L', (len(sample1), 1))
_max = int(max(sample1))

In [19]:
for index,bit in enumerate(sample1):
    canvas.putpixel((index, 0), int(int(bit)/float(_max)*255))

In [20]:
canvas.save('sample1.png', 'PNG')

In [21]:
sample2 = str(int(binascii.hexlify('select * from test_table2'),16))
canvas = Image.new('L', (len(sample2), 1))
_max = int(max(sample2))

In [22]:
for index,bit in enumerate(sample2):
    canvas.putpixel((index, 0), int(int(bit)/float(_max)*255))

In [23]:
canvas.save('sample2.png', 'PNG')

In [24]:
sample3 = str(int(binascii.hexlify('sample * format test_tablet'),16))
canvas = Image.new('L', (len(sample3), 1))
_max = int(max(sample3))

In [25]:
for index,bit in enumerate(sample3):
    canvas.putpixel((index, 0), int(bit))

In [26]:
canvas.save('sample3.png', 'PNG')

In [27]:
sample1hash = imagehash.dhash(Image.open('sample1.png'))
print(sample1hash)
sample2hash = imagehash.dhash(Image.open('sample2.png'))
print(sample2hash)
sample3hash = imagehash.dhash(Image.open('sample3.png'))
print(sample3hash)

7676767676767676
9393939393939393
0909090909090909


In [28]:
print(sample1hash - sample2hash)
print(sample1hash - sample3hash)
print(sample2hash - sample3hash)

40
56
32


さっきよりも悪い結果に

### LSHで分類してみる
1. 文字列を2進数変換
2. csv形式で出力する
 - 0列目はデータの名称
    - format:hogehoge_[0-9]{1,}
 - 各列にビットを立てる
3. LSHでclusteringする

In [29]:
from LocalitySensitiveHashing import *
import pandas as pd

In [30]:
sample1 = bin(int(binascii.hexlify('select * from test_table'),16))[2:]
sample2 = bin(int(binascii.hexlify('select * from test_table2'),16))[2:]
sample3 = bin(int(binascii.hexlify('sample * format test_tablet'),16))[2:]
print(sample1)
print(sample2)
print(sample3)

11100110110010101101100011001010110001101110100001000000010101000100000011001100111001001101111011011010010000001110100011001010111001101110100010111110111010001100001011000100110110001100101
1110011011001010110110001100101011000110111010000100000001010100010000001100110011100100110111101101101001000000111010001100101011100110111010001011111011101000110000101100010011011000110010100110010
11100110110000101101101011100000110110001100101001000000010101000100000011001100110111101110010011011010110000101110100001000000111010001100101011100110111010001011111011101000110000101100010011011000110010101110100


In [31]:
length = max([len(sample1),len(sample2),len(sample3)])
print(length)

215


In [32]:
df = pd.DataFrame()

In [33]:
list = []
list.append('sample_10')
for data in enumerate(sample1):
    list.append(data[1])
for index in range(length - len(sample1)):
    list.append('0')
df = df.append([list])

In [34]:
list = []
list.append('sample_1')
for data in enumerate(sample2):
    list.append(data[1])
for index in range(length - len(sample2)):
    list.append('0')
df = df.append([list])

In [35]:
list = []
list.append('sample_2')
for data in enumerate(sample3):
    list.append(data[1])
for index in range(length - len(sample3)):
    list.append('0')
df = df.append([list])

In [36]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,206,207,208,209,210,211,212,213,214,215
0,sample_10,1,1,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
0,sample_1,1,1,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
0,sample_2,1,1,1,0,0,1,1,0,1,...,0,1,0,1,1,1,0,1,0,0


In [37]:
datafile = "data.csv"

In [38]:
df.to_csv(datafile, index=False, header=False)

In [39]:
lsh = LocalitySensitiveHashing(
                   datafile = datafile,
                   dim = length,
                   r = 50,
                   b = 100,
                   expected_num_of_clusters = len(df), # 全てのデータがバラバラのクラスタであることも想定
          )

In [40]:
lsh.get_data_from_csv()
lsh.initialize_hash_store()
lsh.hash_all_data()
similarity_groups = lsh.lsh_basic_for_neighborhood_clusters()
coalesced_similarity_groups = lsh.merge_similarity_groups_with_coalescence( similarity_groups )
merged_similarity_groups = lsh.merge_similarity_groups_with_l2norm_sample_based( coalesced_similarity_groups )
lsh.write_clusters_to_file( merged_similarity_groups, "clusters.txt")

.

In [41]:
merged_similarity_groups

[{'sample_1', 'sample_10'}, {'sample_2'}]