In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import re
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
import inflect

from nltk.corpus import wordnet as wn

import json

from scipy.stats import spearmanr

In [3]:
def get_dataframe(file_name):
    return pd.read_csv(file_name)

In [2]:
def is_noun(word):
    tokenized_word = word_tokenize(word)
    tagged_word = pos_tag(tokenized_word)
    if len(tagged_word) >= 1:
        return tagged_word[0][1] in ('NN', 'NNS', 'NNP', 'NNPS')
    else:
        return False

In [4]:
def data_cleaning_process(df, with_loss_amount = True):
    inputs = df["inputs"].tolist()
    outputs = df["outputs"].tolist()
    loss = df["lossAmount"].tolist()
    indices_to_remove_for_empty = []
    for i in range(len(inputs)):
        if type(inputs[i]) == float:
            indices_to_remove_for_empty.append(i)
    for i in range(len(outputs)):
        if type(outputs[i]) == float:
            indices_to_remove_for_empty.append(i)
    if with_loss_amount == True:
        loss = [str(number) for number in loss]
        for i in range(len(loss)):
            if loss[i] == "nan":
                indices_to_remove_for_empty.append(i)
    indices_to_remove_for_empty = list(set(indices_to_remove_for_empty))
    
    df = df.drop(index = indices_to_remove_for_empty)
    inputs = df["inputs"].tolist()
    outputs = df["outputs"].tolist()
    loss = df["lossAmount"].tolist()
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        indices_to_remove = []
        for j in range(len(inp)):
            if len(inp[j]) == 1:
                indices_to_remove.append(j)
        inp = [k for m, k in enumerate(inp) if m not in indices_to_remove]
        inputs[i] = ",".join(inp)
    for i in range(len(outputs)):
        outp = outputs[i].split(",")
        indices_to_remove = []
        for j in range(len(outp)):
            if len(outp[j]) == 1:
                indices_to_remove.append(j)
        outp = [k for m, k in enumerate(outp) if m not in indices_to_remove]
        outputs[i] = ",".join(outp)

    inputs = [s.lower() for s in inputs]
    outputs = [s.lower() for s in outputs]
    inputs = [re.sub(r"\(.*?\)", "", s) for s in inputs]
    outputs = [re.sub(r"\(.*?\)", "", s) for s in outputs]
    inputs = [re.sub(r"'s|'", "", s) for s in inputs]
    outputs = [re.sub(r"'s|'", "", s) for s in outputs]

    # noun or not
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        for j in range(len(inp)):
            ns = inp[j].split(" ")
            indices = []
            for k in range(len(ns)):
                if is_noun(ns[k]) == False:
                    if k != 0 and k!= len(ns) - 1:
                        if is_noun(ns[k-1]) == False or is_noun(ns[k+1]) == False:
                            indices.append(k)
                    elif k == 0:
                        try:
                            if is_noun(ns[k+1]) == False:
                                indices.append(k)
                        except:
                            print(ns[k])
                    else:
                        indices.append(k)
            ns = [l for m, l in enumerate(ns) if m not in indices]
            inp[j] = " ".join(ns)
            if inp[j] == "medical diagnosis":
                inp[j] = "diagnosis"
            if inp[j] == "voter identfication card":
                inp[j] = "voter identification card"
        inputs[i] = ",".join(inp)
        
    
    for i in range(len(outputs)):
        outp = outputs[i].split(",")
        for j in range(len(outp)):
            ns = outp[j].split(" ")
            indices = []
            for k in range(len(ns)):
                if is_noun(ns[k]) == False:
                    if k != 0 and k!= len(ns) - 1:
                        if is_noun(ns[k-1]) == False or is_noun(ns[k+1]) == False:
                            indices.append(k)
                    elif k == 0:
                        try:
                            if is_noun(ns[k+1]) == False:
                                indices.append(k)
                        except:
                            print(ns[k])
                    else:
                        indices.append(k)
            ns = [l for m, l in enumerate(ns) if m not in indices]
            outp[j] = " ".join(ns)
            if outp[j] == "medical diagnosis":
                outp[j] = "diagnosis"
            if outp[j] == "voter identfication card":
                outp[j] = "voter identification card"
        outputs[i] = ",".join(outp)

    #plural to singular
    p = inflect.engine()
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        for j in range(len(inp)):
            ns = inp[j].split(" ")
            indices = []
            for k in range(len(ns)):
                if ns[k] != "":
                    if p.singular_noun(ns[k]):
                        if len(wn.synsets(p.singular_noun(ns[k]))) > 0 and ns[k] != "data":
                            ns[k] = p.singular_noun(ns[k])
                else:
                    indices.append(k)
            ns = [l for m, l in enumerate(ns) if m not in indices]
            inp[j] = " ".join(ns)
        inputs[i] = ",".join(inp)
    
    for i in range(len(outputs)):
        outp = outputs[i].split(",")
        for j in range(len(outp)):
            ns = outp[j].split(" ")
            indices = []
            for k in range(len(ns)):
                if ns[k] != "":
                    if p.singular_noun(ns[k]):
                        if len(wn.synsets(p.singular_noun(ns[k]))) > 0 and ns[k] != "data":
                            ns[k] = p.singular_noun(ns[k])
                else:
                    indices.append(k)
            ns = [l for m, l in enumerate(ns) if m not in indices]
            outp[j] = " ".join(ns)
        outputs[i] = ",".join(outp)
        
    if with_loss_amount == False:
        return inputs, outputs
    else:
        return inputs, outputs, loss

In [5]:
def graph_construction_return_loss_n_frequency(inputs, outputs, loss):
    graph = nx.DiGraph()
    weight_dict = {}
    node_loss_dict = {}
    node_freq_dict = {}
    node_avg_loss_dict = {}
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        outp = outputs[i].split(",")
        for j in range(len(inp)):
            for k in range(len(outp)):
                weight_dict[(inp[j], outp[k])] = 0
                node_loss_dict[inp[j]] = 0
                node_freq_dict[inp[j]] = 0
                node_avg_loss_dict[inp[j]] = 0
                node_loss_dict[outp[k]] = 0
                node_freq_dict[outp[k]] = 0
                node_avg_loss_dict[outp[k]] = 0
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        outp = outputs[i].split(",")
        for j in range(len(inp)):
            for k in range(len(outp)):
                weight_dict[(inp[j], outp[k])] += 1
                if loss[i] != 1 and loss[i] != 0:
                    node_loss_dict[inp[j]] += loss[i] 
                    node_freq_dict[inp[j]] += 1
                    node_avg_loss_dict[inp[j]] = node_avg_loss_dict[inp[j]]/node_freq_dict[inp[j]]
                    node_loss_dict[outp[k]] += loss[i]
                    node_freq_dict[outp[k]] += 1
                    node_avg_loss_dict[outp[k]] += node_loss_dict[outp[k]]/node_freq_dict[outp[k]]
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        outp = outputs[i].split(",")
        for j in range(len(inp)):
            for k in range(len(outp)):
                graph.add_edge(inp[j], outp[k], weight = weight_dict[(inp[j], outp[k])])
    return graph, node_loss_dict, node_freq_dict, node_avg_loss_dict



def graph_construction_using_loss_weight_return_loss_n_frequency(inputs, outputs, loss):
    graph = nx.DiGraph()
    weight_dict = {}
    weight_dict2 = {}
    node_loss_dict = {}
    node_freq_dict = {}
    node_avg_loss_dict = {}
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        outp = outputs[i].split(",")
        for j in range(len(inp)):
            for k in range(len(outp)):
                weight_dict[(inp[j], outp[k])] = 0
                weight_dict2[(inp[j], outp[k])] = 0
                node_loss_dict[inp[j]] = 0
                node_freq_dict[inp[j]] = 0
                node_avg_loss_dict[inp[j]] = 0
                node_loss_dict[outp[k]] = 0
                node_freq_dict[outp[k]] = 0
                node_avg_loss_dict[outp[k]] = 0
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        outp = outputs[i].split(",")
        for j in range(len(inp)):
            for k in range(len(outp)):
                weight_dict[(inp[j], outp[k])] += 1
                weight_dict2[(inp[j], outp[k])] += loss[i]
                if loss[i] != 1 and loss[i] != 0:
                    node_loss_dict[inp[j]] += loss[i] 
                    node_freq_dict[inp[j]] += 1
                    node_avg_loss_dict[inp[j]] = node_avg_loss_dict[inp[j]]/node_freq_dict[inp[j]]
                    node_loss_dict[outp[k]] += loss[i]
                    node_freq_dict[outp[k]] += 1
                    node_avg_loss_dict[outp[k]] += node_loss_dict[outp[k]]/node_freq_dict[outp[k]]
    for i in range(len(inputs)):
        inp = inputs[i].split(",")
        outp = outputs[i].split(",")
        for j in range(len(inp)):
            for k in range(len(outp)):
                graph.add_edge(inp[j], outp[k], weight = weight_dict2[(inp[j], outp[k])])
    return graph, node_loss_dict, node_freq_dict, node_avg_loss_dict

In [8]:
def get_pr_scores_with_initialization(graph, initialization, max_iter=100):
    return nx.pagerank(graph, weight = "weight", nstart = initialization)

def get_eHITS_score(graph, max_in_degree, max_out_degree, node_loss_dict, node_initialization = False, n_iter = 20):
    nodes = list(graph.nodes())
    hubs = {node: 1 for node in list(graph.nodes())}
    authorities = {node: 1 for node in list(graph.nodes())}
    if node_initialization == True:
        hubs = {node: node_loss_dict[node] for node in list(graph.nodes())}
        authorities = {node: node_loss_dict[node] for node in list(graph.nodes())}
    for i in tqdm(range(n_iter)):
        for node_i in nodes:
            authorities[node_i] = sum(hubs[node_j]*graph[node_j][node_i]['weight']/max_in_degree for node_j in nodes if node_i in list(graph[node_j]))
            hubs[node_i] = sum(authorities[node_j]*graph[node_i][node_j]['weight']/max_out_degree for node_j in list(graph[node_i]))
        s = sum(hubs[node]**2 for node in nodes)
        print(s)
        sum_hubs = np.sqrt(s)
        sum_authorities = np.sqrt(sum(authorities[node]**2 for node in nodes))
        hubs = {node: hubs[node]/sum_hubs if sum_hubs > 0 else 0 for node in nodes}
        authorities = {node: authorities[node]/sum_hubs if sum_authorities > 0 else 0 for node in nodes}
    return hubs, authorities

In [7]:
def normalize_n_scale_pagerank_to_with_90_quantile(pr):
    max_pr = np.percentile(list(pr.values()), 90)
    scores = {}
    for key, value in pr.items():
        scores[key] = round(value/max_pr*100,2)
    return scores

def normalize_n_scale_pagerank_to_with_90_quantile_list(l):
    max_pr = np.percentile(l, 90)
    #print(max_pr)
    scores = []
    for value in l:
        scores.append(round(value/max_pr*100,2))
       # print(round(value/max_pr*100,2))
    return scores

def normalize_n_scale_pagerank_to_with_90_quantile_list_n_whole(l, pr):
    max_pr = np.percentile(list(pr.values()), 90)
    #print(max_pr)
    scores = []
    for value in l:
        scores.append(round(value/max_pr*100,2))
        #print(round(value/max_pr*100,2))
    return scores

def normalize_n_scale_score(dictionary):
    max_pr = np.max(list(dictionary.values()))
    scores = {}
    for key, value in dictionary.items():
        scores[key] = round(value/max_pr*100,2)
    return scores

In [9]:
def check_correlation(dict1, dict2):
    l1 = list(dict1.values())
    l2 = list(dict2.values())
    correlation, p_value = spearmanr(l1, l2)
    return correlation, p_value

In [15]:
df = get_dataframe("example_dataset.csv")
inputs, outputs, loss = data_cleaning_process(df, with_loss_amount = True)
graph, node_loss_dict, node_freq_dict, node_avg_loss_dict = graph_construction_return_loss_n_frequency(inputs, outputs, loss)
graph2, node_loss_dict, node_freq_dict, node_avg_loss_dict = graph_construction_using_loss_weight_return_loss_n_frequency(inputs, 
                                                                                                                          outputs, loss)
pr_w_loss_initial = get_pr_scores_with_initialization(graph, node_loss_dict, max_iter=100)
pr_w_loss_initial_w_loss_weight = get_pr_scores_with_initialization(graph2, node_loss_dict, max_iter=100)

In [16]:
max_in_degree_freq = max(dict(graph.in_degree(weight = 'weight')).values())
max_out_degree_freq = max(dict(graph.out_degree(weight = 'weight')).values())

max_in_degree_loss = max(dict(graph2.in_degree(weight = 'weight')).values())
max_out_degree_loss = max(dict(graph2.out_degree(weight = 'weight')).values())

In [17]:
hubs_w_freq, authorities_w_freq = get_eHITS_score(graph, max_in_degree_freq, max_out_degree_freq, node_loss_dict)

100%|█████████████████████████████████████████| 20/20 [00:00<00:00, 1512.03it/s]

1.4411591220850477
0.2923768597529429
0.13326967439510795
0.13487792701838477
0.13508441414154546
0.13511030820764733
0.13511339698620803
0.135113755050591
0.13511379589917674
0.13511380051646116
0.135113801035554
0.13511380109372503
0.13511380110023147
0.13511380110095822
0.1351138011010395
0.13511380110104854
0.1351138011010494
0.1351138011010497
0.1351138011010496
0.1351138011010496





In [18]:
hubs_w_loss, authorities_w_loss = get_eHITS_score(graph2, max_in_degree_loss, max_out_degree_loss, node_loss_dict)

100%|█████████████████████████████████████████| 20/20 [00:00<00:00, 1736.99it/s]

1.5076085217033446
0.32067828940565296
0.20208486516441362
0.20311234631015876
0.20305813880207382
0.20305104184912515
0.2030504866855615
0.20305044892848575
0.20305044650453175
0.20305044635311884
0.2030504463437905
0.20305044634321995
0.20305044634318511
0.20305044634318306
0.20305044634318303
0.2030504463431829
0.20305044634318292
0.20305044634318298
0.20305044634318298
0.20305044634318298





In [19]:
hubs_w_freq_initLoss, authorities_w_freq_initLoss = get_eHITS_score(graph, max_in_degree_freq, 
                                                                    max_out_degree_freq, node_loss_dict, node_initialization = True)

100%|█████████████████████████████████████████| 20/20 [00:00<00:00, 1766.36it/s]

214416811286.41974
0.11227205862054696
0.13542960801234122
0.13517385490191294
0.13512216299333404
0.13511484578068506
0.13511392516224036
0.13511381544574133
0.13511380273510598
0.1351138012855951
0.13511380112178645
0.13511380110337273
0.13511380110130944
0.13511380110107876
0.13511380110105287
0.13511380110104998
0.13511380110104954
0.1351138011010497
0.1351138011010495
0.13511380110104976



