In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import csv
import math
import matplotlib.pyplot
from matplotlib import pyplot as plt



In [3]:
maxAcc = 0.0
maxIter = 0
C_Lambda = 0.03
TrainingPercent = 80
ValidationPercent = 10
TestPercent = 10
M = 5
PHI = []
IsSynthetic = False

In [4]:
def GenerateTrainingTarget(rawTraining,TrainingPercent = 80):
    TrainingLen = int(math.ceil(len(rawTraining)*(TrainingPercent*0.01)))
    t           = rawTraining[:TrainingLen]
    #print(str(TrainingPercent) + "% Training Target Generated..")
    return t

def GenerateTrainingDataMatrix(rawData, TrainingPercent = 80):
    T_len = int(math.ceil(len(rawData[0])*0.01*TrainingPercent))
    d2 = rawData[:,0:T_len]
    #print(str(TrainingPercent) + "% Training Data Generated..")
    return d2

def GenerateValData(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:,TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Data Generated..")  
    return dataMatrix

def GenerateValTargetVector(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    V_End = TrainingCount + valSize
    t =rawData[TrainingCount+1:V_End]
    #print (str(ValPercent) + "% Val Target Data Generated..")
    return t

def GenerateBigSigma(Data, MuMatrix,TrainingPercent,IsSynthetic):

    BigSigma    = np.zeros((len(Data),len(Data)))
    DataT       = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))        
    varVect     = []
    for i in range(0,len(DataT[0])):
        
        vct = []
        for j in range(0,int(TrainingLen)):
            
            vct.append(Data[i][j])    
        varVect.append(np.var(vct))
    
    for j in range(len(Data)):
        BigSigma[j][j] = varVect[j]
    if IsSynthetic == True:
        BigSigma = np.dot(3,BigSigma)
    else:
        BigSigma = np.dot(200,BigSigma)
    ##print ("BigSigma Generated..")
    return BigSigma

def GetScalar(DataRow,MuRow, BigSigInv):  
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,np.transpose(R))  
    L = np.dot(R,T)
    return L

def GetRadialBasisOut(DataRow,MuRow, BigSigInv):    
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x

def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 80):
    DataT = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))         
    PHI = np.zeros((int(TrainingLen),len(MuMatrix))) 
    BigSigInv = np.linalg.inv(BigSigma)
    for  C in range(0,len(MuMatrix)):
        for R in range(0,int(TrainingLen)):
            PHI[R][C] = GetRadialBasisOut(DataT[R], MuMatrix[C], BigSigInv)
    #print ("PHI Generated..")
    return PHI

def generate_phis(_RawData, _TrainingData, _ValData, _TestData):
    kmeans = KMeans(n_clusters=M, random_state=0).fit(np.transpose(_TrainingData))
    Mu = kmeans.cluster_centers_

    _BigSigma     = GenerateBigSigma(_RawData, Mu, TrainingPercent,IsSynthetic)
    print("BigSigma: " + str(_BigSigma.shape))
    
    _TRAINING_PHI = GetPhiMatrix(_RawData, Mu, _BigSigma, TrainingPercent)
    print("Training Phi: " + str(_TRAINING_PHI.shape))
    
    _TEST_PHI     = GetPhiMatrix(_TestData, Mu, _BigSigma, 100) 
    print("Testing Phi: " + str(_TEST_PHI.shape))
    
    _VAL_PHI      = GetPhiMatrix(_ValData, Mu, _BigSigma, 100)
    print("Validation Phi: " + str(_VAL_PHI.shape))
    
    
    return _TRAINING_PHI, _TEST_PHI, _VAL_PHI

def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    ##print ("Test Out Generated..")
    return Y

def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (0,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    ##print ("Accuracy Generated..")
    ##print ("Validation E_RMS : " + str(math.sqrt(sum/len(VAL_TEST_OUT))))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

In [5]:
# This function takes dataset containing pairs and 
# returns a dataframe of n*21 dimention containing their image specifications 
# It performs feature concatination, i.e. It takes two sets of 9 features for each image
# and concats them into 18 features

def merge_data_set_feature_concatenation(image_specs, pair_set):
    df3 = pd.merge(pair_set, image_specs ,left_on = "img_id_A", right_on = "img_id",how="inner")
    df4 = pd.merge(df3, image_specs ,left_on = 'img_id_B', right_on = 'img_id',how="inner")
    
    targets = df4['target']

    _RawTarget = targets.values
    datasetStriped = df4.drop(['img_id_A', 'img_id_B', 'target','img_id_x', 'img_id_y'], axis=1)
    
    uniques = datasetStriped.apply(lambda x: x.nunique())
    datasetStriped = datasetStriped.drop(uniques[uniques==1].index, axis=1)
    
    _RawData = np.transpose(datasetStriped.values)
    
    return _RawTarget, _RawData

# This function takes dataset containing pairs and 
# returns a dataframe of n*12 dimention containing their image specifications 
# It performs feature subtraction, i.e. It takes two sets of 9 features for each image
# and subtracts them and return 9 features

def merge_data_set_feature_subtraction(image_specs, pair_set):
    targets = pair_set['target']
    
    df3 = pd.merge(pair_set, image_specs ,left_on = "img_id_A", right_on = "img_id",how="inner")
    df3 = df3.drop(['img_id_A', 'img_id_B', 'target','img_id'], axis=1)
    

    df4 = pd.merge(pair_set, image_specs ,left_on = 'img_id_B', right_on = 'img_id',how="inner")
    df4 = df4.drop(['img_id_A', 'img_id_B', 'target','img_id'], axis=1)
    
    df5 = df3.sub(df4)
    df5 = df5.abs()
    
    uniques = df5.apply(lambda x: x.nunique())
    df5 = df5.drop(uniques[uniques==1].index, axis=1)
    
    temp_np = df5.values
    
#     temp_np = np.absolute(np.subtract(df3.values, df4.values))

    _RawTarget = targets.values
    _RawData = np.transpose(temp_np)
    
    return _RawTarget, _RawData

In [6]:
def generate_training_validation_testing_ds(RawTarget, RawData):
    _TrainingTarget = GenerateTrainingTarget(RawTarget,TrainingPercent)
    _TrainingData   = GenerateTrainingDataMatrix(RawData,TrainingPercent)
    print("Training Target: "+ str(_TrainingTarget.shape))
    print("Training Data: "+ str(_TrainingData.shape))

    _ValDataAct = GenerateValTargetVector(RawTarget,ValidationPercent, (len(_TrainingTarget)))
    _ValData    = GenerateValData(RawData,ValidationPercent, (len(_TrainingTarget)))
    print("Validation Target: "+ str(_ValDataAct.shape))
    print("Validation Data: "+ str(_ValData.shape))

    _TestDataAct = GenerateValTargetVector(RawTarget,TestPercent, (len(_TrainingTarget)+len(_ValDataAct)))
    _TestData = GenerateValData(RawData,TestPercent, (len(_TrainingTarget)+len(_ValDataAct)))
    print("Testing Target: "+ str(_TestDataAct.shape))
    print("Testing Data: "+ str(_TestData.shape))
    
    return _TrainingTarget, _TrainingData, _ValDataAct, _ValData, _TestDataAct, _TestData

In [7]:
def stochastic_gradient_descent(_loop_range, _TrainingTarget, _TRAINING_PHI, _ValDataAct, _VAL_PHI, _TestDataAct, _TEST_PHI):


    W_Now        = np.ones(M)

    La           = 2
    learningRate = 0.01
    L_Erms_Val   = []
    L_Erms_TR    = []
    L_Erms_Test  = []
    L_Acc_Val   = []
    L_Acc_TR    = []
    L_Acc_Test  = []
    W_Mat        = []

    for i in range(0,_loop_range):

        # here we are calculating the weight changes we have to get a more optimized solution
        # for that we have to calculate the error changes first and combining with learning rate
        # we decide what will be the new weights for next iteration.

        Delta_E_D     = -np.dot((_TrainingTarget[i] - np.dot(np.transpose(W_Now), _TRAINING_PHI[i])), _TRAINING_PHI[i])
        La_Delta_E_W  = np.dot(La,W_Now)
        Delta_E       = np.add(Delta_E_D,La_Delta_E_W)    
        Delta_W       = -np.dot(learningRate,Delta_E)
        W_T_Next      = W_Now + Delta_W
        W_Now         = W_T_Next



        #-----------------TrainingData Accuracy---------------------#
        TR_TEST_OUT   = GetValTest( _TRAINING_PHI, W_T_Next) 
        Erms_TR       = GetErms(TR_TEST_OUT, _TrainingTarget)
        L_Acc_TR.append(float(Erms_TR.split(',')[0]))
        L_Erms_TR.append(float(Erms_TR.split(',')[1]))

        #-----------------ValidationData Accuracy---------------------#
        VAL_TEST_OUT  = GetValTest( _VAL_PHI, W_T_Next) 
        Erms_Val      = GetErms(VAL_TEST_OUT, _ValDataAct)
        L_Acc_Val.append(float(Erms_Val.split(',')[0]))
        L_Erms_Val.append(float(Erms_Val.split(',')[1]))

        #-----------------TestingData Accuracy---------------------#
        TEST_OUT      = GetValTest( _TEST_PHI, W_T_Next) 
        Erms_Test = GetErms(TEST_OUT, _TestDataAct)
        L_Acc_Test.append(float(Erms_Test.split(',')[0]))
        L_Erms_Test.append(float(Erms_Test.split(',')[1]))
        
    print ('----------Gradient Descent Solution--------------------')
    print ("E_rms Training   = " + str(np.around(min(L_Erms_TR),5)))
    print ("E_rms Validation = " + str(np.around(min(L_Erms_Val),5)))
    print ("E_rms Testing    = " + str(np.around(min(L_Erms_Test),5)))
    print ("Accuracy Training   = " + str(np.around(max(L_Acc_TR),5)))
    print ("Accuracy Validation = " + str(np.around(max(L_Acc_Val),5)))
    print ("Accuracy Testing    = " + str(np.around(max(L_Acc_Test),5)))

# Human Observed Data

In [8]:
def staging_gradient_descent_hod(RawTarget, RawData):  
    TrainingTarget, TrainingData, ValDataAct, ValData, TestDataAct, TestData = generate_training_validation_testing_ds(RawTarget, RawData)
    TRAINING_PHI, TEST_PHI, VAL_PHI = generate_phis(RawData, TrainingData, ValData, TestData)
    stochastic_gradient_descent(TrainingTarget.shape[0], TrainingTarget, TRAINING_PHI, ValDataAct, VAL_PHI, TestDataAct, TEST_PHI)


In [9]:
image_specs_df = pd.read_csv("HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv", index_col=0)

same_pair_df = pd.read_csv("HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv")

diff_pair_df = pd.read_csv("HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv")


# create a dataframe by taking a subset from samples of different writers
# Since unmatched dataset is huge we are randomly creating a sample with same data size (791 rows)
# because if the amount of unmatched data overwhelms the matched data, the model may overfit

diff_pair_df_sample = diff_pair_df.sample(n=same_pair_df.shape[0], replace=True)

#Merging same and different writer's data set into one. (1582 rows)
dataset_pairs = pd.concat([diff_pair_df_sample,same_pair_df]).sample(frac=1).reset_index(drop=True)




In [10]:
# Now we are merging the image specifications with the dataset, we will use this dataset to train
# test or validate data.

RawTarget, RawData = merge_data_set_feature_concatenation(image_specs_df, dataset_pairs)
print("=========================")
print("== Human Observed Data ==")
print("==== Concatenation ======")
print("=========================")
print("RawData : " + str(RawData.shape))

staging_gradient_descent_hod(RawTarget, RawData)


== Human Observed Data ==
RawData : (18, 1582)
Training Target: (1266,)
Training Data: (18, 1266)
Validation Target: (158,)
Validation Data: (18, 158)
Testing Target: (157,)
Testing Data: (18, 157)
BigSigma: (18, 18)
Training Phi: (1266, 5)
Testing Phi: (157, 5)
Validation Phi: (158, 5)
----------Gradient Descent Solution--------------------
E_rms Training   = 0.49964
E_rms Validation = 0.49857
E_rms Testing    = 0.48941
Accuracy Training   = 55.13428
Accuracy Validation = 57.59494
Accuracy Testing    = 61.78344


In [11]:
targets, RawData = merge_data_set_feature_subtraction(image_specs_df, dataset_pairs)
print("=========================")
print("== Human Observed Data ==")
print("==== Subtraction ======")
print("=========================")
print("RawData : " + str(RawData.shape))

staging_gradient_descent_hod(RawTarget, RawData)

== Human Observed Data ==
RawData : (9, 1582)
Training Target: (1266,)
Training Data: (9, 1266)
Validation Target: (158,)
Validation Data: (9, 158)
Testing Target: (157,)
Testing Data: (9, 157)
BigSigma: (9, 9)
Training Phi: (1266, 5)
Testing Phi: (157, 5)
Validation Phi: (158, 5)
----------Gradient Descent Solution--------------------
E_rms Training   = 0.49992
E_rms Validation = 0.49988
E_rms Testing    = 0.49116
Accuracy Training   = 51.34281
Accuracy Validation = 53.79747
Accuracy Testing    = 62.42038


# Gradient Structural Concavity (GSC) Data

In [12]:
def staging_gradient_descent_gscd(RawTarget, RawData):  
    TrainingTarget, TrainingData, ValDataAct, ValData, TestDataAct, TestData = generate_training_validation_testing_ds(RawTarget, RawData)
    TRAINING_PHI, TEST_PHI, VAL_PHI = generate_phis(RawData, TrainingData, ValData, TestData)
    stochastic_gradient_descent(800, TrainingTarget, TRAINING_PHI, ValDataAct, VAL_PHI, TestDataAct, TEST_PHI)


In [13]:
image_specs_df = pd.read_csv("GSC-Dataset/GSC-Features-Data/GSC-Features.csv")

same_pair_df = pd.read_csv("GSC-Dataset/GSC-Features-Data/same_pairs.csv")

diff_pair_df = pd.read_csv("GSC-Dataset/GSC-Features-Data/diffn_pairs.csv")


# create a dataframe by taking a subset from samples of different writers
# Since unmatched dataset is huge we are randomly creating a sample with same data size (791 rows)
# because if the amount of unmatched data overwhelms the matched data, the model may overfit

diff_pair_df_sample = diff_pair_df.sample(n=same_pair_df.shape[0], replace=True)

#Merging same and different writer's data set into one. (1582 rows)
dataset_pairs = pd.concat([diff_pair_df_sample,same_pair_df]).sample(frac=1).reset_index(drop=True)

In [None]:
# Now we are merging the image specifications with the dataset, we will use this dataset to train
# test or validate data.

RawTarget, RawData = merge_data_set_feature_concatenation(image_specs_df, dataset_pairs)

print("=========================")
print("====== GSC Data =========")
print("==== Concatenation ======")
print("=========================")
print("RawData : " + str(RawData.shape))

staging_gradient_descent_gscd(RawTarget, RawData)


RawData : (1017, 143062)
Training Target: (114450,)
Training Data: (1017, 114450)
Validation Target: (14306,)
Validation Data: (1017, 14306)
Testing Target: (14305,)
Testing Data: (1017, 14305)
BigSigma: (1017, 1017)
Training Phi: (114450, 5)
Testing Phi: (14305, 5)
Validation Phi: (14306, 5)


In [14]:
# Now we are merging the image specifications with the dataset, we will use this dataset to train
# test or validate data.

RawTarget, RawData = merge_data_set_feature_subtraction(image_specs_df, dataset_pairs)

print("=========================")
print("====== GSC Data =========")
print("==== Subtraction ======")
print("=========================")
print("RawData : " + str(RawData.shape))

staging_gradient_descent_gscd(RawTarget, RawData)

RawData : (509, 143062)
Training Target: (114450,)
Training Data: (509, 114450)
Validation Target: (14306,)
Validation Data: (509, 14306)
Testing Target: (14305,)
Testing Data: (509, 14305)
BigSigma: (509, 509)
Training Phi: (114450, 5)
Testing Phi: (14305, 5)
Validation Phi: (14306, 5)
----------Gradient Descent Solution--------------------
E_rms Training   = 0.52341
E_rms Validation = 0.52097
E_rms Testing    = 0.52378
Accuracy Training   = 50.07427
Accuracy Validation = 50.71299
Accuracy Testing    = 50.42992
