In [69]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import math
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt


image_specs_df = pd.read_csv("HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv", index_col=0)

same_pair_df = pd.read_csv("HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv")

diff_pair_df = pd.read_csv("HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv")


In [70]:
print(image_specs_df.shape)
print(same_pair_df.shape)
print(diff_pair_df.shape)


(1026, 10)
(791, 3)
(293032, 3)


In [71]:
maxAcc = 0.0
maxIter = 0
C_Lambda = 0.03
TrainingPercent = 80
ValidationPercent = 10
TestPercent = 10
M = 10
PHI = []
IsSynthetic = False

In [72]:
# This function takes dataset containing pairs and 
# returns a dataframe of n*21 dimention containing their image specifications 
# It performs feature concatination, i.e. It takes two sets of 9 features for each image
# and concats them into 18 features

def merge_data_set_feature_concatenation(image_specs, pair_set):
    df3 = pd.merge(pair_set, image_specs ,left_on = "img_id_A", right_on = "img_id",how="inner")
    df4 = pd.merge(df3, image_specs ,left_on = 'img_id_B', right_on = 'img_id',how="inner")
    df4 = df4.drop(['img_id_x', 'img_id_y'], axis=1)
    
    return df4

In [73]:
# create a dataframe using the image specifications from same writers (matched pairs, 791 rows)

matched_dataset = merge_data_set_feature_concatenation(image_specs_df, same_pair_df)
print(matched_dataset.shape)
matched_dataset.head()

(791, 21)


Unnamed: 0,img_id_A,img_id_B,target,f1_x,f2_x,f3_x,f4_x,f5_x,f6_x,f7_x,...,f9_x,f1_y,f2_y,f3_y,f4_y,f5_y,f6_y,f7_y,f8_y,f9_y
0,0359a,0359b,1,2,1,1,0,2,2,0,...,2,3,2,1,0,2,2,3,0,2
1,0577a,0577b,1,2,1,1,0,2,2,0,...,2,2,1,0,3,2,2,1,2,2
2,0577a,0577c,1,2,1,1,0,2,2,0,...,2,1,1,1,1,2,3,0,0,2
3,0577b,0577c,1,2,1,0,3,2,2,1,...,2,1,1,1,1,2,3,0,0,2
4,1120a,1120b,1,2,1,1,3,2,2,0,...,2,1,1,1,0,2,2,0,2,2


In [74]:
# create a dataframe using the image specifications from different writers (unmatched pairs)
# Since unmatched dataset is huge we are randomly creating a sample with same data size (791 rows)
# because if the amount of unmatched data overwhelms the matched data, the model may overfit

chosen_idx = np.random.choice(diff_pair_df.shape[0]-1, replace=False, size = matched_dataset.shape[0])
diff_pair_df_sample = diff_pair_df.iloc[chosen_idx]
unmatched_dataset = merge_data_set_feature_concatenation(image_specs_df, diff_pair_df_sample)
print(unmatched_dataset.shape)
unmatched_dataset.head()

(791, 21)


Unnamed: 0,img_id_A,img_id_B,target,f1_x,f2_x,f3_x,f4_x,f5_x,f6_x,f7_x,...,f9_x,f1_y,f2_y,f3_y,f4_y,f5_y,f6_y,f7_y,f8_y,f9_y
0,0546b,0348c,0,3,1,1,3,1,3,0,...,2,3,1,0,2,2,2,3,3,2
1,0950a,0348c,0,2,1,1,3,0,2,2,...,1,3,1,0,2,2,2,3,3,2
2,1144c,0348c,0,1,1,1,2,2,1,0,...,2,3,1,0,2,2,2,3,3,2
3,1286a,0324c,0,1,2,1,3,2,1,1,...,2,1,1,0,4,2,2,0,3,2
4,0477a,1227b,0,2,1,1,1,2,1,0,...,2,3,1,1,3,2,1,0,4,2


In [75]:
# merge the two dataset into one dataset containing 1582 entries and randomizing the order

dataset = pd.concat([unmatched_dataset,matched_dataset])
dataset = dataset.sample(frac=1).reset_index(drop=True)
print(dataset.shape)
dataset.head()

(1582, 21)


Unnamed: 0,img_id_A,img_id_B,target,f1_x,f2_x,f3_x,f4_x,f5_x,f6_x,f7_x,...,f9_x,f1_y,f2_y,f3_y,f4_y,f5_y,f6_y,f7_y,f8_y,f9_y
0,1567c,1567b,1,2,4,1,4,2,2,0,...,2,3,1,0,2,2,2,0,0,2
1,1375a,0321b,0,1,0,1,1,2,2,0,...,2,3,2,1,3,2,2,1,4,2
2,1540b,1540c,1,1,1,1,0,2,2,1,...,2,1,2,1,0,2,2,1,0,1
3,0456b,0336c,0,0,1,1,1,2,2,0,...,1,3,2,1,2,2,2,0,1,1
4,1305a,1370c,0,1,1,1,0,2,2,0,...,2,3,1,1,1,2,2,1,1,2


In [76]:
# def extract_target_vector(dset):
#     targets = dset['target']
#     dset = dset.drop(['img_id_A', 'img_id_B', 'target'], axis=1)
#     return dset, targets

# def generate_unseen_dataset(dataset):
#     unique_imgs = np.union1d(dataset.img_id_A.unique(), dataset.img_id_B.unique())
#     no_of_unique_imgs = unique_imgs.shape[0]

#     training_range_index = int(no_of_unique_imgs*0.7)
#     unseen_trainig_dataset = dataset[dataset.img_id_A.isin(unique_imgs[:training_range_index]) | dataset.img_id_B.isin(unique_imgs[:training_range_index])]
#     unseen_trainig_dataset, unseen_training_dataset_targets = extract_target_vector(unseen_trainig_dataset)
     

#     rest_dataset = dataset[~dataset.isin(unseen_trainig_dataset)].dropna()


#     testing_range_index = int(no_of_unique_imgs*0.85)
#     unseen_testing_dataset = rest_dataset[rest_dataset.img_id_A.isin(unique_imgs[training_range_index:testing_range_index]) | rest_dataset.img_id_B.isin(unique_imgs[training_range_index:testing_range_index])]
#     unseen_testing_dataset, unseen_testing_dataset_targets = extract_target_vector(unseen_testing_dataset)
    
    
#     unseen_validation_dataset = rest_dataset[~rest_dataset.isin(unseen_testing_dataset)].dropna()
#     unseen_validation_dataset, unseen_validation_dataset_targets = extract_target_vector(unseen_validation_dataset)


#     print("total entires: ", dataset.shape[0])
#     print("training entires (Unseen): ", unseen_trainig_dataset.shape[0])
#     print("testing entires (Unseen): ", unseen_testing_dataset.shape[0])
#     print("validation entires (Unseen): ", unseen_validation_dataset.shape[0])


In [77]:

#generate_unseen_dataset(dataset)

In [78]:
def GenerateTrainingTarget(rawTraining,TrainingPercent = 80):
    TrainingLen = int(math.ceil(len(rawTraining)*(TrainingPercent*0.01)))
    t           = rawTraining[:TrainingLen]
    #print(str(TrainingPercent) + "% Training Target Generated..")
    return t

def GenerateTrainingDataMatrix(rawData, TrainingPercent = 80):
    T_len = int(math.ceil(len(rawData[0])*0.01*TrainingPercent))
    d2 = rawData[:,0:T_len]
    #print(str(TrainingPercent) + "% Training Data Generated..")
    return d2

def GenerateValData(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:,TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Data Generated..")  
    return dataMatrix

def GenerateValTargetVector(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    V_End = TrainingCount + valSize
    t =rawData[TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Target Data Generated..")
    return t

def GenerateBigSigma(Data, MuMatrix,TrainingPercent,IsSynthetic):
    # 1582 x 18, M x 18, 
    BigSigma    = np.zeros((len(Data),len(Data)))
    DataT       = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))        
    varVect     = []
    for i in range(0,len(DataT[0])):
        # 0 to 18
        vct = []
        for j in range(0,int(TrainingLen)):
            #0 to 1200
            vct.append(Data[i][j])    
        varVect.append(np.var(vct))
    
    for j in range(len(Data)):
        BigSigma[j][j] = varVect[j]
    if IsSynthetic == True:
        BigSigma = np.dot(3,BigSigma)
    else:
        BigSigma = np.dot(200,BigSigma)
    ##print ("BigSigma Generated..")
    return BigSigma

def GetScalar(DataRow,MuRow, BigSigInv):  
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,np.transpose(R))  
    L = np.dot(R,T)
    return L

def GetRadialBasisOut(DataRow,MuRow, BigSigInv):    
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x

def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 80):
    DataT = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))         
    PHI = np.zeros((int(TrainingLen),len(MuMatrix))) 
    BigSigInv = np.linalg.inv(BigSigma)
    for  C in range(0,len(MuMatrix)):
        for R in range(0,int(TrainingLen)):
            PHI[R][C] = GetRadialBasisOut(DataT[R], MuMatrix[C], BigSigInv)
    #print ("PHI Generated..")
    return PHI

def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    ##print ("Test Out Generated..")
    return Y

def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    ##print ("Accuracy Generated..")
    ##print ("Validation E_RMS : " + str(math.sqrt(sum/len(VAL_TEST_OUT))))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

In [79]:
targets = dataset['target']

RawTarget = targets.values
datasetStriped = dataset.drop(['img_id_A', 'img_id_B', 'target'], axis=1)
RawData = np.transpose(datasetStriped.values)

TrainingTarget = GenerateTrainingTarget(RawTarget,TrainingPercent)
TrainingData   = GenerateTrainingDataMatrix(RawData,TrainingPercent)
print(TrainingTarget.shape)
print(TrainingData.shape)

ValDataAct = GenerateValTargetVector(RawTarget,ValidationPercent, (len(TrainingTarget)))
ValData    = GenerateValData(RawData,ValidationPercent, (len(TrainingTarget)))
print(ValDataAct.shape)
print(ValData.shape)

TestDataAct = GenerateValTargetVector(RawTarget,TestPercent, (len(TrainingTarget)+len(ValDataAct)))
TestData = GenerateValData(RawData,TestPercent, (len(TrainingTarget)+len(ValDataAct)))
print(TestDataAct.shape)
print(TestData.shape)

(1266,)
(18, 1266)
(158,)
(18, 158)
(157,)
(18, 157)


In [80]:
kmeans = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData))
Mu = kmeans.cluster_centers_

BigSigma     = GenerateBigSigma(RawData, Mu, TrainingPercent,IsSynthetic)
TRAINING_PHI = GetPhiMatrix(RawData, Mu, BigSigma, TrainingPercent)
# W            = GetWeightsClosedForm(TRAINING_PHI,TrainingTarget,(C_Lambda)) 
TEST_PHI     = GetPhiMatrix(TestData, Mu, BigSigma, 100) 
VAL_PHI      = GetPhiMatrix(ValData, Mu, BigSigma, 100)

print(BigSigma.shape)
print(TRAINING_PHI.shape)
print(TEST_PHI.shape)
print(VAL_PHI.shape)

(18, 18)
(1266, 10)
(157, 10)
(158, 10)


In [81]:
loop_range = TrainingTarget.shape[0]

W_Now        = np.ones(M)

La           = 2
learningRate = 0.01
L_Erms_Val   = []
L_Erms_TR    = []
L_Erms_Test  = []
W_Mat        = []


# We are going to ananlyze only a small number of data, as the weights 
# are already close to optimized, and there are not much room for improvement, 
# so analying whole training set is redundant.
for i in range(0,loop_range):
    
    # We are implementing Stochastic
    
    # here we are calculating the weight changes we have to get a more optimized solution
    # for that we have to calculate the error changes first and combining with learning rate
    # we decide what will be the new weights for next iteration.
    
    Delta_E_D     = -np.dot((TrainingTarget[i] - np.dot(np.transpose(W_Now),TRAINING_PHI[i])),TRAINING_PHI[i])
    La_Delta_E_W  = np.dot(La,W_Now)
    Delta_E       = np.add(Delta_E_D,La_Delta_E_W)    
    Delta_W       = -np.dot(learningRate,Delta_E)
    W_T_Next      = W_Now + Delta_W
    W_Now         = W_T_Next
    
    
    
    #-----------------TrainingData Accuracy---------------------#
    TR_TEST_OUT   = GetValTest(TRAINING_PHI,W_T_Next) 
    Erms_TR       = GetErms(TR_TEST_OUT,TrainingTarget)
    L_Erms_TR.append(float(Erms_TR.split(',')[1]))
    
    #-----------------ValidationData Accuracy---------------------#
    VAL_TEST_OUT  = GetValTest(VAL_PHI,W_T_Next) 
    Erms_Val      = GetErms(VAL_TEST_OUT,ValDataAct)
    L_Erms_Val.append(float(Erms_Val.split(',')[1]))
    
    #-----------------TestingData Accuracy---------------------#
    TEST_OUT      = GetValTest(TEST_PHI,W_T_Next) 
    Erms_Test = GetErms(TEST_OUT,TestDataAct)
    L_Erms_Test.append(float(Erms_Test.split(',')[1]))

In [84]:
print ('----------Gradient Descent Solution--------------------')
print ("M = 15 \nLambda  = 0.0001\neta=0.01")
print ("E_rms Training   = " + str(np.around(min(L_Erms_TR),5)))
print ("E_rms Validation = " + str(np.around(min(L_Erms_Val),5)))
print ("E_rms Testing    = " + str(np.around(min(L_Erms_Test),5)))
min(L_Erms_Val)

----------Gradient Descent Solution--------------------
M = 15 
Lambda  = 0.0001
eta=0.01
E_rms Training   = 0.49967
E_rms Validation = 0.49892
E_rms Testing    = 0.49789


0.4989191152741441