In [1]:
import cv2
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
### First step is to extract the features using ROOT_SIFT
class FeatureEx():
    
    #keypoints will be a list of keypoints and img_features is a descriptor, numpy array of shape Number_of_Keypoints×128.
    def __init__(self):
        self.sift = cv2.xfeatures2d.SIFT_create()
        
    def extract(self, img):
        keypoints, img_features = self.sift.detectAndCompute(img, None)
        return keypoints, img_features
    
    #### For computing ROOT_SIFT feature
    def compute(self, img):
        img = cv2.imread(img, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return "No image"
        
        kpts, descriptors = self.extract(img)
        
        ## Hellinger normalization
        descriptors += np.finfo(np.float32).eps
        descriptors /= np.sum(descriptors, axis=1)[:, np.newaxis]
        descriptors = np.sqrt(descriptors)
        
        return (kpts, descriptors)

In [3]:

def get_feature_matrix(path):
    
    feature = {}
    fEx = FeatureEx() #creating Feature extraction object
    
    directory = path
    
    # features is a matrix of features(row-wise)
    features = np.asarray(())
    
    for file in os.listdir(directory):
        #print (file)
        if file.endswith(".png"): 
            #get file path of image joined eith directory path
            
            filePath = os.path.join(directory, file)
            
            kpts, descriptors = fEx.compute(filePath)
            features = descriptors
            
            print(os.path.join(directory, file))
            
            
            x = file.find('.')
            writerno = file[0:x]
            
            #feature[writerno]=descriptors
            
            mydf = pd.DataFrame(descriptors)
            #mydf.to_csv(writerno + ".csv", index = False, header = False)
            
            linearvector = mydf.mean(axis=0)
                        
            feature[writerno]=linearvector
            
            #if writerno in feature.keys():
                 #vertically stack features and descriptor array into features
                #feature[writerno] = np.vstack((feature[writerno], descriptors))
            #else:
                #feature[writerno] = mydf
        else:
            continue
    
    return feature

In [4]:
result = get_feature_matrix('../iam/')

../iam/b10_1.png
../iam/b10_10.png
../iam/b10_2.png
../iam/b10_3.png
../iam/b10_4.png
../iam/b10_5.png
../iam/b10_6.png
../iam/b10_7.png
../iam/b10_8.png
../iam/b10_9.png
../iam/b11_1.png
../iam/b11_10.png
../iam/b11_2.png
../iam/b11_3.png
../iam/b11_4.png
../iam/b11_5.png
../iam/b11_6.png
../iam/b11_7.png
../iam/b11_8.png
../iam/b11_9.png
../iam/b12_1.png
../iam/b12_10.png
../iam/b12_2.png
../iam/b12_3.png
../iam/b12_4.png
../iam/b12_5.png
../iam/b12_6.png
../iam/b12_7.png
../iam/b12_8.png
../iam/b12_9.png
../iam/b13_1.png
../iam/b13_10.png
../iam/b13_2.png
../iam/b13_3.png
../iam/b13_4.png
../iam/b13_5.png
../iam/b13_6.png
../iam/b13_7.png
../iam/b13_8.png
../iam/b13_9.png
../iam/b14_1.png
../iam/b14_10.png
../iam/b14_2.png
../iam/b14_3.png
../iam/b14_4.png
../iam/b14_5.png
../iam/b14_6.png
../iam/b14_7.png
../iam/b14_8.png
../iam/b14_9.png
../iam/b15_1.png
../iam/b15_10.png
../iam/b15_2.png
../iam/b15_3.png
../iam/b15_4.png
../iam/b15_5.png
../iam/b15_6.png
../iam/b15_7.png
../iam/b

../iam/b9_4.png
../iam/b9_5.png
../iam/b9_6.png
../iam/b9_7.png
../iam/b9_8.png
../iam/b9_9.png


In [5]:
w = {}
for key in result.keys():
    x = key.find('_')
    writerno = key[0:x]
    
    if writerno in w.keys():
        w[writerno] = np.vstack((w[writerno], result[key]))
    else:
        w[writerno] = result[key]
    
print(w['b25'].shape)


(10, 128)


In [6]:
same = {}

classification = np.array([['same'] for i in range(1, 46)])

for writer in w.keys():
    a = np.array(w['b10'])
    b = np.zeros((1, 128), dtype=float)

    for i in range(0, 10):
        if i != 9:
            for j in range(i+1, 10):
                c = np.absolute(a[i] - a[j])
                b=np.vstack((b, c))
    b=b[1:]    
    b = np.hstack((b, classification))
    same[writer]=b

print(same['b35'])


[['0.004768161103129387' '0.015896301716566086' '0.009512141346931458'
  ... '0.004417382180690765' '0.005958840250968933' 'same']
 ['0.0012669898569583893' '0.008074725046753883' '0.01642483100295067'
  ... '0.008855694904923439' '0.008310552686452866' 'same']
 ['0.003215167671442032' '0.003311626613140106' '0.012342799454927444'
  ... '0.010465409606695175' '0.018021777272224426' 'same']
 ...
 ['0.011423163115978241' '0.00037731602787971497' '0.0008055716753005981'
  ... '0.0010998845100402832' '0.0048574358224868774' 'same']
 ['0.010636668652296066' '0.001584429293870926' '0.0037166178226470947'
  ... '0.0016585402190685272' '0.0032677724957466125' 'same']
 ['0.0007864944636821747' '0.001207113265991211' '0.004522189497947693'
  ... '0.000558655709028244' '0.00812520831823349' 'same']]


In [7]:
different = {}

def compute_differences(w1, w2):
    ans = np.zeros((1, 128), dtype=float)
    for i in range(0, 10):
        for j in range(0, 10):
            temp= np.absolute(w1[i]-w2[j])
            ans = np.vstack((ans, temp))
    ans=ans[1:]
    return ans
            
classification = np.array([['different'] for i in range(1, 101)])
    
for w1 in w.keys():
    for w2 in w.keys():
        x=w1[1:]
        y=w2[1:]
        
        if x < y:
            w12 = w1 + '&' + w2
        else:
            w12=w2+'&'+w1
        
        if (w1 != w2) and (w12 not in different):
            temp = compute_differences(w[w1], w[w2])
            temp = np.hstack((temp, classification))
            different[w12]=temp;
            

print(different['b10&b11'])

[['0.01303018257021904' '0.013847589492797852' '0.008120067417621613' ...
  '0.006311729550361633' '0.017393235117197037' 'different']
 ['0.021646562963724136' '0.019614141434431076' '0.0045570433139801025'
  ... '0.007529962807893753' '0.007243156433105469' 'different']
 ['0.01090966910123825' '0.013372428715229034' '0.0054592229425907135'
  ... '0.003444027155637741' '0.017392363399267197' 'different']
 ...
 ['0.0025962665677070618' '0.010824877768754959' '0.0029168538749217987'
  ... '0.005583859980106354' '0.007859814912080765' 'different']
 ['0.0022453218698501587' '0.0033995211124420166' '0.003199409693479538'
  ... '0.00028515979647636414' '0.006359990686178207' 'different']
 ['0.0001255720853805542' '0.00337422639131546' '0.010293126106262207'
  ... '0.005613315850496292' '0.000916946679353714' 'different']]


In [8]:
def extract_writers(w):
    w = w[1:]
    pos=w.find('b')
    
    n1=int(w[0:pos-1])
    n2 = int(w[pos+1:])
    
    return n1, n2
    
def generate_trainandtest_dataset(w1, w2):
    
    train = np.zeros((1, 129), dtype=float)
    test = np.zeros((1, 129), dtype=float)
    
    testarray = np.array(range(w1, w2+1))
    allwriters = np.array(range(1, 51))
    trainarray = np.setdiff1d(allwriters, testarray)
    
    for i in trainarray:
        train = np.vstack((train, same['b'+str(i)]))
    
    for i in testarray:
        test = np.vstack((test, same['b'+str(i)]))
        
    for key in different.keys():
        w1, w2 = extract_writers(key)
        
        if w1 in trainarray and w2 in trainarray:
            temp = different[key]
            train = np.vstack((train, temp[0:3, :]))
        elif w1 in testarray and w2 in testarray:
            temp = different[key]
            test = np.vstack((test, temp))
            
    train=train[1:]
    test=test[1:]
    
    return train, test    

In [9]:
train1, test1= generate_trainandtest_dataset(1, 10)

a = np.array(['A' + str(i) for i in range(1, 129)])
b=np.array(['Class'])
a = np.hstack((a, b))
header=a
a=np.vstack((a, train1));
a= np.vstack((a, test1))

mydataset1 = pd.DataFrame(a)
mydataset1.to_csv("Dataset1(1-10)" + ".csv", index = False, header = False)

print("Dataset1(1-10) : train ratio : " + str((train1.shape[0]*100)/(train1.shape[0]+test1.shape[0])))

#mytest1 = pd.DataFrame(test1)
#mytest1.to_csv("Dataset1(1-10)_test" + ".csv", index = False, header = False)
            

Dataset1(1-10) : train ratio : 45.54455445544554


In [10]:
a = np.array([['A' + str(i) for i in range(1, 129)]])
b=np.array([['Class']])
a = np.hstack((a, b))
print(a.shape[0])

1


In [11]:
#Creation of 2nd dataset

train1, test = generate_trainandtest_dataset(11, 20)

a=np.vstack((header, train1));
a= np.vstack((a, test))

mydataset1 = pd.DataFrame(a)
mydataset1.to_csv("Dataset2(11-20)" + ".csv", index = False, header = False)

print("Dataset1(1-10) : train ratio : " + str(((train1.shape[0])*100)/(train1.shape[0]+test.shape[0])))


Dataset1(1-10) : train ratio : 45.54455445544554


In [12]:
#Creation of 3rd dataset

train1, test = generate_trainandtest_dataset(21, 30)


a=np.vstack((header, train1));
a= np.vstack((a, test))

mydataset1 = pd.DataFrame(a)
mydataset1.to_csv("Dataset3(21-30)" + ".csv", index = False, header = False)

print("Dataset3(21-30): train ratio : " + str(((train1.shape[0]*100)/(train1.shape[0]+test.shape[0]))))


Dataset3(21-30): train ratio : 45.54455445544554


In [13]:
#Creation of 4th dataset

train1, test = generate_trainandtest_dataset(31, 40)

a=np.vstack((header, train1));
a= np.vstack((a, test))

mydataset1 = pd.DataFrame(a)
mydataset1.to_csv("Dataset4(31-40)" + ".csv", index = False, header = False)

print("Dataset4(31-40) : train ratio : " + str(((train1.shape[0])*100)/(train1.shape[0]+test.shape[0])))


Dataset4(31-40) : train ratio : 45.54455445544554


In [16]:
#Creation of the 5th dataset
train1, test1 = generate_trainandtest_dataset(41, 50)

a=np.vstack((header, train1))
a= np.vstack((a, test1))

mydataset1 = pd.DataFrame(a)
mydataset1.to_csv("Dataset5(41-50)" + ".csv", index = False, header = False)

print("Dataset5(41-50) : train ratio : " + str((train1.shape[0]*100)/(train1.shape[0]+test1.shape[0])))


Dataset5(41-50) : train ratio : 45.54455445544554


In [12]:
train, test = generate_trainandtest_dataset(1, 10)

In [16]:
part1 = train[0:828]
part2 = train[828:1656]
part3 = train[1656:2484]
part4 = train[2484:3312]
part5 = train[3312:4140]

a = np.array(['A' + str(i) for i in range(1, 129)])
b=np.array(['Class'])
a = np.hstack((a, b))
header=a

train = np.vstack((header, train))
mydataset1 = pd.DataFrame(np.vstack((train, test)))
mydataset1.to_csv("Dataset_1.1" + ".csv", index = False, header = False)


train = np.vstack((header, part2))
train = np.vstack((train, part3))
train = np.vstack((train, part4))
train = np.vstack((train, part5))
train = np.vstack((train, part1))

mydataset1 = pd.DataFrame(np.vstack((train, test)))
mydataset1.to_csv("Dataset_1.2" + ".csv", index = False, header = False)

train = np.vstack((header, part3))
train = np.vstack((train, part4))
train = np.vstack((train, part5))
train = np.vstack((train, part1))
train = np.vstack((train, part2))

mydataset1 = pd.DataFrame(np.vstack((train, test)))
mydataset1.to_csv("Dataset_1.3" + ".csv", index = False, header = False)

train = np.vstack((header, part4))
train = np.vstack((train, part5))
train = np.vstack((train, part1))
train = np.vstack((train, part2))
train = np.vstack((train, part3))

mydataset1 = pd.DataFrame(np.vstack((train, test)))
mydataset1.to_csv("Dataset_1.4" + ".csv", index = False, header = False)

train = np.vstack((header, part5))
train = np.vstack((train, part1))
train = np.vstack((train, part2))
train = np.vstack((train, part3))
train = np.vstack((train, part4))

mydataset1 = pd.DataFrame(np.vstack((train, test)))
mydataset1.to_csv("Dataset_1.5" + ".csv", index = False, header = False)


In [14]:
print("Dataset5(41-50) : train ratio : " + str((train.shape[0]*100)/(train.shape[0]+test.shape[0])))


Dataset5(41-50) : train ratio : 45.54455445544554
