In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

## PROCESS DATA FOR HUMAN OBSERVED DATASET ONLY

In [3]:
def processdata(dataset):
    
    imagename=dataset['img_id'].values
    features=dataset[['f1','f2','f3','f4','f5','f6','f7','f8','f9']].values
    
    return imagename,features

## PROCESSING DATA FOR GSC DATASET ONLY

In [4]:
def processdataGSC(dataset):
    
    imagename=(dataset[:,0])
   
    features=dataset[:,1:dataset.shape[1]]
    
    return imagename, features

In [5]:
def extract_pair_features(nptrainingpairs,imagename,features):
    
    
    feature1=np.asarray([[0.0 for i in range(0,features.shape[1])] for j in range(0,nptrainingpairs.shape[0])])
    feature2=np.asarray([[0.0 for i in range(0,features.shape[1])] for j in range(0,nptrainingpairs.shape[0])])

    for i in range (0,nptrainingpairs.shape[0]):
        img1=nptrainingpairs[i][0]

        for j in range (0,imagename.shape[0]):
            if(img1 == imagename[j]):
                feature1[i]=features[j]
               
    
    for i in range (0,nptrainingpairs.shape[0]):
        img2=nptrainingpairs[i][1]
      
        for j in range (0,imagename.shape[0]):
            if img2==imagename[j]:
                feature2[i]=features[j]           
    
    return (feature1,feature2)
            

In [6]:
def subtract(first,second):
    rows=first.shape[0]
    cols=first.shape[1]
    difference=np.asarray([[0.0 for i in range(0,cols)] for j in range(0,rows)])
    
    for i in range (0,rows):
        for j in range (0,cols):
            difference[i][j]=first[i][j]-second[i][j]
    return difference

In [7]:
def concat(first, second):
    concat=np.concatenate((first,second),axis=1)
    print(concat.shape)
    return concat

## PREPARING HUMAN OBSERVED DATASET

In [8]:
    trainingdata1= pd.read_csv("HumanObserved-Features-Data.csv")
    imagenames, features=processdata(trainingdata1)
    
    
    trainingpairs=np.array(pd.read_csv("same_pairs.csv").values)
    different_train_pairs=np.array(pd.read_csv("diffn_pairs.csv").values)
    
    
    #deleting extra pairs from different pair dataset
    different_train_pairs=np.delete(different_train_pairs,np.s_[trainingpairs.shape[0]:],axis=0)
    #creating training dataset with equal same and different pairs
    pairs=np.concatenate((trainingpairs,different_train_pairs),axis=0)
    #shuffling the dataset
    np.random.shuffle(pairs)
    print(pairs.shape)
    
    
    

(1582, 3)


## PREPARING GSC DATASET 

## REMOVE FROM COMMENT IF YOU WANNA RUN THIS

In [9]:
#     trainingdata1= pd.read_csv("GSC-Features.csv")
#     trainingdata1=np.array(trainingdata1)
#     #print(trainingdata1.shape)
#     imagenames, features=processdataGSC(trainingdata1)
    
#     trainingpairs=np.array(pd.read_csv("same_pairs_gsc.csv").values)  #71531 rows
#     different_train_pairs=np.array(pd.read_csv("diffn_pairs_gsc.csv").values) #7622557 rows
#     trainingpairs=trainingpairs[0:1000]
# #     print("same pair",trainingpairs.shape)
#     #deleting extra pairs from different pair dataset
#     different_train_pairs=np.delete(different_train_pairs,np.s_[trainingpairs.shape[0]:],axis=0)
#     #creating training dataset with equal same and different pairs
#     pairs=np.concatenate((trainingpairs,different_train_pairs),axis=0)
#     #shuffling the dataset
#     np.random.shuffle(pairs)
#     print(pairs)
    

In [10]:


feature1, feature2 = extract_pair_features(pairs,imagenames,features)
#-------------------checks to ensure correct evaluation--------------------------#
print(feature1[0])
print(feature2[0])


[2. 1. 1. 0. 2. 2. 0. 2. 2.]
[1. 1. 1. 1. 2. 1. 0. 0. 2.]


##  PREPARING TRAINING VALIDATION AND TESTING SETS FOR DATA WITH SUBTRACTED FEATURES

In [11]:
    #CREATING SUBTRACTED FEATURE DATASET
    subtracted_features=subtract(feature1,feature2)
#-------------------checks to ensure correct evaluation--------------------------#    
    print(subtracted_features.shape)
    print(subtracted_features[0])

(1582, 9)
[ 1.  0.  0. -1.  0.  1.  0.  2.  0.]


In [12]:
#PREPARING TRAINING VALIDATION AND TESTING DATASETS WITH SUBTRACTED FEATURES

trainingpercent=0.8
validationpercent=0.1
testingpercent=0.1

sub_Training_x=(subtracted_features[0:int((len(subtracted_features))*trainingpercent)])
sub_Training_t=pairs[0:int((len(pairs))*trainingpercent),2]


sub_Validation_x=subtracted_features[int((len(subtracted_features))*trainingpercent):int((len(subtracted_features))*trainingpercent)+int(len(subtracted_features)*validationpercent)]
sub_Validation_t=pairs[int((len(subtracted_features))*trainingpercent):int((len(subtracted_features))*trainingpercent)+int(len(subtracted_features)*validationpercent),2]


sub_Test_x=subtracted_features[int((len(subtracted_features))*trainingpercent)+int(len(subtracted_features)*validationpercent):int((len(subtracted_features))*trainingpercent)+int(len(subtracted_features)*validationpercent)+int(len(subtracted_features)*testingpercent)]
sub_Test_t=pairs[int((len(subtracted_features))*trainingpercent)+int(len(subtracted_features)*validationpercent):int((len(subtracted_features))*trainingpercent)+int(len(subtracted_features)*validationpercent)+int(len(subtracted_features)*testingpercent),2]

print("training shape",sub_Training_t.shape)
print("validation shape",sub_Validation_t.shape)
print("testing shape", sub_Test_t.shape)

training shape (1265,)
validation shape (158,)
testing shape (158,)


## PREPARING TRAINING VALIDATION AND TESTING SETS FOR DATA WITH CONCATENATED FEATURES

In [13]:
#CREATING CONCATENATED FEATURES DATASET
concat_features=concat(feature1,feature2)
print(concat_features)

(1582, 18)
[[2. 1. 1. ... 0. 0. 2.]
 [2. 1. 1. ... 0. 3. 2.]
 [2. 1. 1. ... 0. 3. 2.]
 ...
 [2. 1. 1. ... 0. 1. 1.]
 [2. 1. 1. ... 1. 1. 2.]
 [2. 1. 1. ... 0. 3. 2.]]


## TRAINING, VALIDATION AND TESTING DATA FOR CONCATENATED FEATURES

In [14]:
#PREPARING TRAINING VALIDATION AND TESTING DATASETS WITH CONCATENATED FEATURES


trainingpercent=0.8
validationpercent=0.1
testingpercent=0.1


concat_Training_x=(concat_features[0:int((len(concat_features))*trainingpercent)])
concat_Training_t=pairs[0:int((len(pairs))*trainingpercent),2]


concat_Validation_x=concat_features[int((len(concat_features))*trainingpercent):int((len(concat_features))*trainingpercent)+int(len(concat_features)*validationpercent)]
concat_Validation_t=pairs[int((len(concat_features))*trainingpercent):int((len(concat_features))*trainingpercent)+int(len(concat_features)*validationpercent),2]


concat_Test_x=concat_features[int((len(concat_features))*trainingpercent)+int(len(concat_features)*validationpercent):int((len(concat_features))*trainingpercent)+int(len(concat_features)*validationpercent)+int(len(concat_features)*testingpercent)]
concat_Test_t=pairs[int((len(concat_features))*trainingpercent)+int(len(concat_features)*validationpercent):int((len(concat_features))*trainingpercent)+int(len(concat_features)*validationpercent)+int(len(concat_features)*testingpercent),2]

print(concat_Training_x.shape)
print(concat_Validation_x.shape)
print(concat_Test_x.shape)





(1265, 18)
(158, 18)
(158, 18)


In [15]:
def sigmoid(z):
    return (1.0 / (1 + np.exp(-z)))

In [16]:
def derivative(training_data, calculated_y, expected_y):
        delta=np.dot(training_data,(calculated_y-expected_y))
        return delta

In [17]:
def update_weights(x,a,t,epochs,w,l_rate):
    for epoch in range(epochs):
        a = np.array(sigmoid(x.dot(w.T)))
        w = w - (l_rate*((a-t.T).dot(x)))
    return w

In [18]:
def Accuracy(calc_output, target):
    counter=0
    for i in range(0,target.shape[0]):
        if (round(calc_output[i])==target[i]):
            counter=counter+1
    return(counter/(i+1))

## LOGISTIC REGRESSION FOR SUBTRACTED FEATURES

In [19]:

#INITIALIZING THE WEIGHTS, epochs and learning rate
Epochs=150
Learning_rate=0.05
# w=np.random.rand(512) #initial weights for GSC dataset
w=np.array([0.81130218, 0.1011935,  0.8209283,  0.87660455, 0.58273304, 0.57651164, 0.07282913, 0.08728755, 0.62051645])
# print("initial weights are ",w)
#print("shape of weights is ",w.shape)
#print(subtracted_features.shape)


for i in range(0,Epochs):

    wtx= np.dot(w,np.transpose(sub_Training_x))
    wtx = wtx.astype(float)
    a=sigmoid(wtx)
    deltaw=derivative(np.transpose(sub_Training_x),a,sub_Training_t)
    w=w-Learning_rate*deltaw
    #print(i,"th set of updated weights are as follows",w)
        
        
v=np.dot(w,np.transpose(sub_Validation_x))
v=v.astype(float)
final=sigmoid(v)


#print(final.shape)
print("!!***************SUBTRACTED FEATURES******************!!")
print("%================logistic regression===================")
print("||||||||||||||||   VALIDATION SET   ||||||||||||||")
print("NUMBER OF EPOCHS",Epochs,)
print("Learning Rate ", Learning_rate)
print("Accuracy")
print(Accuracy(final,sub_Validation_t))
print("%================logistic regression===================")
print("||||||||||||||||   TESTING SET   ||||||||||||||")
t=np.dot(w,np.transpose(sub_Test_x))
t=t.astype(float)
finalt=sigmoid(t)

print("NUMBER OF EPOCHS",Epochs,)
print("Learning Rate ", Learning_rate)
print("Accuracy")
print(Accuracy(finalt,sub_Test_t))

    

!!***************SUBTRACTED FEATURES******************!!
||||||||||||||||   VALIDATION SET   ||||||||||||||
NUMBER OF EPOCHS 150
Learning Rate  0.05
Accuracy
0.6772151898734177
||||||||||||||||   TESTING SET   ||||||||||||||
NUMBER OF EPOCHS 150
Learning Rate  0.05
Accuracy
0.6075949367088608


## LOGISTIC REGRESSION FOR CONCATENATED FEATURES

In [20]:
# w_concat=np.random.rand(1024)# initial weights for GSC dataset
w_concat=np.array([0.81130218, 0.1011935,  0.8209283,  0.87660455, 0.58273304, 0.57651164,0.6475638,
                   0.07282913, 0.08728755, 0.62051645,0.653624,0.213523532,0.356436,0.984264,0.12454,0,0.1234567,0.438563])
#print(w_concat)
concat_Epochs=1000
Learning_rate_concat=0.05


for i in range(0,concat_Epochs):

    wtx_concat= np.dot(w_concat,np.transpose(concat_Training_x))
    wtx_concat = wtx_concat.astype(float)
    a_concat=sigmoid(wtx_concat)
    deltaw_concat=derivative(np.transpose(concat_Training_x),a_concat,concat_Training_t)
    w_concat=w_concat-Learning_rate_concat*deltaw_concat

v=np.dot(w_concat,np.transpose(concat_Validation_x))
v=v.astype(float)
finalv=sigmoid(v)

print("!!**************CONCATENATED FEATURES****************!!")
print("%================logistic regression===================")
print("||||||||||||||||   VALIDATION SET   ||||||||||||||")
print("NUMBER OF EPOCHS",concat_Epochs)
print("Learning Rate ", Learning_rate_concat)
print("Accuracy")
print(Accuracy(finalv,concat_Validation_t))

print("%================logistic regression===================")
print("||||||||||||||||   TESTING SET   ||||||||||||||")
testing=np.dot(w_concat,np.transpose(concat_Test_x))
testing=testing.astype(float)
finalt=sigmoid(testing)
print("NUMBER OF EPOCHS",concat_Epochs)
print("Learning Rate ", Learning_rate_concat)
print("Accuracy")
    
print(Accuracy(finalt,concat_Test_t))


!!**************CONCATENATED FEATURES****************!!
||||||||||||||||   VALIDATION SET   ||||||||||||||
NUMBER OF EPOCHS 1000
Learning Rate  0.05
Accuracy
0.9493670886075949
||||||||||||||||   TESTING SET   ||||||||||||||
NUMBER OF EPOCHS 1000
Learning Rate  0.05
Accuracy
0.9050632911392406


## LINEAR REGRESSION MODEL

In [21]:
def linear_regression(train_data,train_target,val_data,val_target,test_data,test_target):
    M = 5
    epochs =3200
    learningRate = 0.001
    kmeans = KMeans(n_clusters=M, random_state=0).fit(train_data)
    Mu = np.array(kmeans.cluster_centers_)

    #print(kmeans,Mu)
    train_target = np.array(train_target)
    BigSigma = GenerateBigSigma(train_data,Mu)
    training_PHI = GetPhiMatrix(train_data,Mu,BigSigma)
#     print(training_PHI,training_PHI.shape)    
    
    Wts = np.random.rand(training_PHI.shape[1])
    L_Erms_Val   = []
    L_Erms_TR    = []
    L_Erms_Test  = []
    for i in range(0,epochs):
    
        wTx = np.array(np.dot(Wts,training_PHI.T))
#         print("wtx",wTx.shape)
        Delta_E_D = -np.dot((np.subtract(train_target.T,wTx)),training_PHI)
#         print("deled",Delta_E_D.shape)
        Wts = Wts - np.dot(learningRate,Delta_E_D)
#         print(Wts.shape)

    testing_PHI= GetPhiMatrix(test_data,Mu, BigSigma)
    final= np.array(np.dot(Wts,testing_PHI.T))
#     print(final)
    acuuracy, erms = GetErms(final,test_target)
    return acuuracy, epochs, learningRate,erms
    
    
def GetErms(Output,Act_Target):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    Output = np.transpose(Output)
    for i in range (len(Output)):
        sum = sum + math.pow((Act_Target[i] - Output[i]),2)
#         print(sum)
        if(int(np.around(Output[i], 0)) == Act_Target[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(Output)))
    Erms=(math.sqrt(sum/len(Output)))
    return accuracy, Erms 
    
def GenerateBigSigma(Data, MuMatrix):
    #print("\t\t",Data.shape, MuMatrix.shape)
    BigSigma    = np.zeros((len(Data[1]),len(Data[1])))
    #print(BigSigma.shape)
    DataT       = np.transpose(Data)
    Len = len(DataT)       
    varVect     = []
    for i in range(len(Data[0])):
        vct = []
        for j in range(Len):
            vct.append(DataT[i][j])    
        varVect.append(np.var(vct))
    
    for j in range(len(DataT)):
        if varVect[j]==0:
            varVect[j] = varVect[j] + 0.00001
        BigSigma[j][j] = varVect[j]
    #print ("BigSigma Generated..")
    #print(BigSigma)
    return BigSigma

def GetScalar(DataRow,MuRow, BigSigInv):  
    #print(DataRow.shape,MuRow.shape)
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,R)  
    L = np.dot(R,T)
    return L

def GetRadialBasisOut(DataRow,MuRow, BigSigInv):    
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x

def GetPhiMatrix(Data, MuMatrix, BigSigma):
    #print(Data.shape)
    PHI = np.zeros((len(Data),len(MuMatrix))) 
    BigSigInv = np.linalg.inv(BigSigma)
    #print(BigSigInv.shape)
    #print(MuMatrix.shape)
    for  C in range(0,len(MuMatrix)):
        for R in range(0,len(Data)):
            #print(Data[R].shape,MuMatrix[C].shape,BigSigInv.shape)
            PHI[R][C] = GetRadialBasisOut(Data[R], MuMatrix[C], BigSigInv)

    #print (PHI)
    return PHI

## calling linear regression method for subtracted features

In [22]:
accuracy,epoch,learningrate,erms=linear_regression(sub_Training_x, sub_Training_t, sub_Validation_x,sub_Validation_t,sub_Test_x,sub_Test_t)
print("!!**************SUBTRACTED FEATURES****************!!")
print("%================LINEAR regression===================")
print("||||||||||||||||   TESTING SET   ||||||||||||||")
print("NUMBER OF EPOCHS",epoch)
print("Learning Rate ", learningrate)
# print("Accuracy ", accuracy)
print("Erms ", erms)


!!**************SUBTRACTED FEATURES****************!!
||||||||||||||||   TESTING SET   ||||||||||||||
NUMBER OF EPOCHS 3200
Learning Rate  0.001
Erms  0.7208163273949116


## calling linear regression method for concatenated features

In [23]:
accuracy,epoch,learningrate,erms=linear_regression(concat_Training_x, concat_Training_t, concat_Validation_x,concat_Validation_t,concat_Test_x,concat_Test_t)
print("!!**************CONCATENATED FEATURES****************!!")
print("%================LINEAR regression===================")
print("||||||||||||||||   TESTING SET   ||||||||||||||")
print("NUMBER OF EPOCHS",epoch)
print("Learning Rate ", learningrate)
# print("Accuracy", accuracy)
print("Erms  ", erms)

!!**************CONCATENATED FEATURES****************!!
||||||||||||||||   TESTING SET   ||||||||||||||
NUMBER OF EPOCHS 3200
Learning Rate  0.001
Erms   0.7334675749781334


## NEURAL NETWORK

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.callbacks import EarlyStopping, TensorBoard
from keras.optimizers import RMSprop, Adam
import numpy as np

Using TensorFlow backend.


## MODEL DEFINITION

In [None]:

print(sub_Training_x.shape)
input_size = sub_Training_x.shape[1]
drop_out = 0.15
first_dense_layer_nodes  = 256
second_dense_layer_nodes = 128
third_dense_layer_nodes = 2

def get_model():
    
    model = Sequential()
    
    model.add(Dense(first_dense_layer_nodes, input_dim=input_size))
    model.add(Activation('sigmoid'))
    model.add(Dropout(drop_out))

    model.add(Dense(second_dense_layer_nodes))
    model.add(Activation('sigmoid'))
    model.add(Dropout(drop_out))
    
    model.add(Dense(third_dense_layer_nodes))
    model.add(Activation('softmax'))
    
    model.summary()
    
    
    opt = Adam()

    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    
    return model

In [None]:
model = get_model()

## RUNNING THE TRAINING DATA ON THE MODEL

In [None]:
validation_data_split = 0.2
num_epochs = 10000
model_batch_size = 128
tb_batch_size = 32
early_patience = 100

tensorboard_cb   = TensorBoard(log_dir='logs', batch_size= tb_batch_size, write_graph= True)
earlystopping_cb = EarlyStopping(monitor='val_loss', verbose=1, patience=early_patience, mode='min')

# Read Dataset
# dataset, target = get_feature_matrix(data='hod', method='concatenate')

y_train = sub_Training_t
X_train   = sub_Training_x
print("X_train.shape: {}".format(X_train.shape))
print("y_train.shape: {}".format(y_train.shape))

y_train = encodeLabel(y_train)


# Process Dataset
# processedData, processedLabel = processData(dataset)
history = model.fit(X_train
                    , y_train
                    , validation_split=validation_data_split
                    , epochs=num_epochs
                    , batch_size=model_batch_size
                    , callbacks = [tensorboard_cb,earlystopping_cb]
                   )

In [None]:
def decodeLabel(encodedLabel):
    if encodedLabel == 0:
        return 0
    elif encodedLabel == 1:
        return 1

## TESTING ACCURACY for subtracted data human observed 

In [None]:
wrong   = 0
right   = 0

y_test = np.array(sub_Test_x) #if we un comment the part where GSC data is being processed into sub_test and concat test we will get accuracy for both those datasets 
X_test = np.transpose(sub_Test_t)


processedTestData  = X_test
processedTestLabel = encodeLabel(y_test)
predictedTestLabel = []

for i,j in zip(processedTestData,processedTestLabel):
    y = model.predict(np.array(i).reshape(-1,processedTestData.shape[1]))
    predictedTestLabel.append(decodeLabel(y.argmax()))
    
    if j.argmax() == y.argmax():
        right = right + 1
    else:
        wrong = wrong + 1

print("Errors: " + str(wrong), " Correct :" + str(right))

print("Testing Accuracy: {0:.2f}%".format(right/(right+wrong)*100))




## TESTING ACCURACY for concatenated data human observed 

In [None]:
wrong   = 0
right   = 0

y_test = np.array(concat_Test_t)
X_test = np.transpose(concat_Test_x)


processedTestData  = X_test
processedTestLabel = encodeLabel(y_test)
predictedTestLabel = []

for i,j in zip(processedTestData,processedTestLabel):
    y = model.predict(np.array(i).reshape(-1,processedTestData.shape[1]))
    predictedTestLabel.append(decodeLabel(y.argmax()))
    
    if j.argmax() == y.argmax():
        right = right + 1
    else:
        wrong = wrong + 1

print("Errors: " + str(wrong), " Correct :" + str(right))

print("Testing Accuracy: {0:.2f}%".format(right/(right+wrong)*100))


