In [326]:
import pandas as pd
import numpy as np
from pgmpy.models import BayesianModel
import networkx as nx
from pgmpy.factors.discrete import TabularCPD
import networkx as nx
import re
from tqdm import tqdm
mapping_dict = {}

In [327]:
def preprocess_data():
    features = pd.read_csv('15features.csv')
    training_pairs = pd.read_csv('seen-dataset/dataset_seen_training_siamese.csv')
    validation_pairs = pd.read_csv('seen-dataset/dataset_seen_validation_siamese.csv')
    training_features_pairs_info = pd.merge(training_pairs,features,left_on='left',right_on='imagename')
    training_features_pairs_info = pd.merge(training_features_pairs_info,
                                            features,left_on='right',
                                            right_on='imagename', 
                                            suffixes=('1', '2'))
    training_features_pairs_info = training_features_pairs_info.drop(training_features_pairs_info.columns[[0,4,20]],
                                                                     axis=1)
    training_features_pairs = training_features_pairs_info.drop(training_features_pairs_info.columns[[0,1]],
                                                           axis=1)

    validation_features_pairs_info = pd.merge(validation_pairs,features,left_on='left',right_on='imagename')
    validation_features_pairs_info = pd.merge(validation_features_pairs_info,features,
                                              left_on='right',
                                              right_on='imagename', 
                                              suffixes=('1', '2'))
    validation_features_pairs_info = validation_features_pairs_info.drop(validation_features_pairs_info.columns[[0,4,20]],
                                                                         axis=1)
    validation_features_pairs = validation_features_pairs_info.drop(validation_features_pairs_info.columns[[0,1]],axis=1)

    training_features_pairs.to_csv('training_features.csv')
    
def create_categorical_distance_data():
    feature_names = ['pen_pressure', 
                    'letter_spacing', 
                    'size', 
                    'dimension',
                    'is_lowercase', 
                    'is_continuous', 
                    'slantness', 
                    'tilt',
                    'entry_stroke_a', 
                    'staff_of_a', 
                    'formation_n', 
                    'staff_of_d',
                    'exit_stroke_d', 
                    'word_formation', 
                    'constancy']

    categories_dictionary = {}
    counter = 0

    for i in range(0,4):
        for j in range(0,4):
            category_key     = str(i) + str(j)
            category_key_rev = str(j) + str(i)
            if category_key_rev in categories_dictionary.keys():
                category_value = categories_dictionary[category_key_rev]
            else:
                counter += 1
                category_value = counter
            categories_dictionary[category_key] = category_value

    categorical_data = {}
    categorical_data['left'] = []
    categorical_data['right'] = []
    for feature_name in feature_names:
        categorical_data[feature_name] = []
    categorical_data['label'] = []

    for index,row in tqdm(validation_features_pairs_info.iterrows()):
        categorical_data['left'].append(row['left'])
        categorical_data['right'].append(row['right'])
        for feature_name in feature_names:
            image1_feat = int(row[feature_name + '1']) - 1
            image2_feat = int(row[feature_name + '2']) - 1
            category_key = str(image1_feat)+str(image2_feat)
            category_value = categories_dictionary[category_key]
            categorical_data[feature_name].append(category_value)
        categorical_data['label'].append(row['label'])

    categorical_df = pd.DataFrame(categorical_data)

In [328]:
def create_two_dataset():
    feature_names = ['pen_pressure', 
                    'letter_spacing', 
                    'size', 
                    'dimension',
                    'is_lowercase', 
                    'is_continuous', 
                    'slantness', 
                    'tilt',
                    'entry_stroke_a', 
                    'staff_of_a', 
                    'formation_n', 
                    'staff_of_d',
                    'exit_stroke_d', 
                    'word_formation', 
                    'constancy']
    categorical_df = pd.read_csv('categorical_training_seen.csv')

    for feature_name in feature_names:
        s1 = np.unique(np.array(list(categorical_df[categorical_df['label'] == 1][feature_name].values)))
        s0 = np.unique(np.array(list(categorical_df[categorical_df['label'] == 0][feature_name].values)))
        mapper_dict = {}
        for i, category in enumerate(s1):
            mapper_dict[category] = i
        mapping_dict[feature_name] = mapper_dict

    data_label_0 = categorical_df[categorical_df['label'] == 0].drop(['Unnamed: 0','left','right','label'],axis=1)
    data_label_1 = categorical_df[categorical_df['label'] == 1].drop(['Unnamed: 0','left','right','label'],axis=1)
    return ((data_label_0), (data_label_1))

In [329]:
def create_bn(data):
    dist_model = BayesianModel([('pen_pressure','size'),
                        ('letter_spacing', 'size'),
                        ('dimension', 'size'),
                        ('size', 'constancy'),
                        ('constancy', 'word_formation'),
                        ('word_formation', 'formation_n'),
                        ('entry_stroke_a', 'exit_stroke_d'),
                        ('is_lowercase', 'is_continuous'),
                        ('staff_of_a', 'staff_of_d'),
                        ('slantness', 'tilt')])
    options = {
    'node_color': 'black',
    'node_size': 1000,
    'width': 3,
    'with_labels':True,
    'font_color':'white',
    'font_size':13
    }
#     nx.draw(dist_model, **options)
    dist_model.fit(data)
    return dist_model

In [330]:
def create_cpd(model):
    cpd_dict = {}
    for cpd in model.get_cpds():
        cpd_dict[cpd.variables[0]] = cpd
    return cpd_dict

In [331]:
def joint_prob(cpd, col):
    p = 1
    for feature_name in feature_names:
        if(feature_name == 'is_lowercase'):
            p = p*cpd['is_lowercase'].values[mapping_dict['is_lowercase'][col['is_lowercase']]]
        if(feature_name == 'is_continuous'):
            p = p*cpd['is_continuous'].values[mapping_dict['is_continuous'][col['is_continuous']]][mapping_dict['is_lowercase'][col['is_lowercase']]]
        if(feature_name == 'entry_stroke_a'):
            p = p*cpd['entry_stroke_a'].values[mapping_dict['entry_stroke_a'][col['entry_stroke_a']]]
        if(feature_name == 'exit_stroke_d'):
            p = p*cpd['exit_stroke_d'].values[mapping_dict['exit_stroke_d'][col['exit_stroke_d']]][mapping_dict['entry_stroke_a'][col['entry_stroke_a']]]
        if(feature_name == 'pen_pressure'):
            p = p*cpd['pen_pressure'].values[mapping_dict['pen_pressure'][col['pen_pressure']]]
        if(feature_name == 'letter_spacing'):
            p = p*cpd['letter_spacing'].values[mapping_dict['letter_spacing'][col['letter_spacing']]]
        if(feature_name == 'dimension'):
            p = p*cpd['dimension'].values[mapping_dict['dimension'][col['dimension']]]
        if(feature_name == 'size'):
            p = p*cpd['size'].values[mapping_dict['size'][col['size']]][mapping_dict['dimension'][col['dimension']]][mapping_dict['letter_spacing'][col['letter_spacing']]][mapping_dict['pen_pressure'][col['pen_pressure']]]
        if(feature_name == 'constancy'):
            p = p*cpd['constancy'].values[mapping_dict['constancy'][col['constancy']]][mapping_dict['size'][col['size']]]
        if(feature_name == 'word_formation'):
            p = p*cpd['word_formation'].values[mapping_dict['word_formation'][col['word_formation']]][mapping_dict['constancy'][col['constancy']]]
        if(feature_name == 'formation_n'):
            p = p*cpd['formation_n'].values[mapping_dict['formation_n'][col['formation_n']]][mapping_dict['word_formation'][col['word_formation']]]
        if(feature_name == 'staff_of_a'):
            p = p*cpd['staff_of_a'].values[mapping_dict['staff_of_a'][col['staff_of_a']]]
        if(feature_name == 'staff_of_d'):
            p = p*cpd['staff_of_d'].values[mapping_dict['staff_of_d'][col['staff_of_d']]][mapping_dict['staff_of_a'][col['staff_of_a']]]
        if(feature_name == 'slantness'):
            p = p*cpd['slantness'].values[mapping_dict['slantness'][col['slantness']]]
        if(feature_name == 'tilt'):
            p = p*cpd['tilt'].values[mapping_dict['tilt'][col['tilt']]][mapping_dict['slantness'][col['slantness']]]
    return p

In [332]:
((data_label_0, data_label_1)) = create_two_dataset()

In [333]:
dist_model_label_0 = create_bn(data_label_0)
dist_model_label_1 = create_bn(data_label_1)

In [334]:
cpd_dict_label_0 = create_cpd(dist_model_label_0)
cpd_dict_label_1 = create_cpd(dist_model_label_1)

In [335]:
categorical_test_df = pd.read_csv('categorical_validation_seen.csv')

true_positives = 0
true_negatives = 0
false_positives = 0
false_negatives = 0

for i, col in categorical_test_df.iterrows():
    
    # Evaluating Joint Probability distribution for 
    jp_l0 = joint_prob(cpd_dict_label_0, col)
    jp_l1 = joint_prob(cpd_dict_label_1, col)
    
    # Likelihood Ratio calculation
    if(jp_l0/jp_l1 > 0):
        pred = 1
    else:
        pred = 0
    
    # Creating confusion matrix
    if(pred == 1 and col['label'] == 1):
        true_positives += 1
    elif(pred == 0 and col['label'] == 0):
        true_negatives += 1
    elif pred == 0 and col['label'] == 1:
        false_positives += 1
    elif pred == 1 and col['label'] == 0:
        false_negatives += 1

In [336]:
accuracy = (true_positives+true_negatives)/categorical_test_df.shape[0]
precision = true_positives/(true_positives+false_positives)
recall = true_positives/(true_positives+false_negatives)
f1 = 2 * (precision*recall) / (precision+recall)

print("Accuracy on seen validation set is:  " , accuracy)
print("Precision on seen validation set is: " , precision)
print("Recall on seen validation set is:    " , recall)
print("F1 Score on seen validation set is:  " , f1)

Accuracy on seen validation set is:   0.8515358361774744
Precision on seen validation set is:  0.9497907949790795
Recall on seen validation set is:     0.7516556291390728
F1 Score on seen validation set is:   0.8391866913123843
