In [1]:
import networkx as nx
import pickle
import json
import torch

In [2]:
# Load the cleaned trait/definition graph (defs from wordsapi).
# `nx.read_gpickle` was removed in NetworkX 3.0; it was a thin wrapper around
# `pickle.load`, so reading the file directly works on old and new versions.
# NOTE: unpickling executes arbitrary code -- only load files you trust.
with open("../data/graph_cleaned.pickle", "rb") as graph_file:
    G = pickle.load(graph_file)
type(G)

networkx.classes.graph.Graph

In [3]:
# Inspect the neighbours of the 'active' node together with their edge attributes.
G['active']

AtlasView({'passive': {'weight': -1}, 'involved': {'weight': 1}, 'inactive': {'weight': -1}, 'engaged in or ready for military or naval operations': {'relation': 'defined as'}, "(used of verbs (e.g. `to run') and participial adjectives (e.g. `running' in `running water')) expressing action rather than a state of being": {'relation': 'defined as'}, 'taking part in an activity': {'relation': 'defined as'}, 'in operation': {'relation': 'defined as'}, 'characterized by energetic activity': {'relation': 'defined as'}, 'disposed to take action or effectuate change': {'relation': 'defined as'}, 'engaged in full-time work': {'relation': 'defined as'}, 'exerting influence or producing a change or effect': {'relation': 'defined as'}, 'expressing that the subject of the sentence has the semantic function of actor:': {'relation': 'defined as'}, 'full of activity or engaged in continuous activity': {'relation': 'defined as'}, '(of e.g. volcanos) capable of erupting': {'relation': 'defined as'}, '(o

### BERT cosine similarity as similarity score.

In [2]:
from transformers import AutoModel, AutoTokenizer, BertTokenizer
# Name of the pretrained checkpoint used for both the model and its tokenizer.
MODEL_NAME = "bert-base-cased"

# We need to create the model and tokenizer
# (both are referenced as globals by the embedding code in later cells).
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [3]:
# Embed every definition attached to a trait with BERT's pooled output.
# list will contain a tuple <trait, definition, bert representation of the definition>.
pooled_list = []
# Inference only: without no_grad every stored tensor would keep its autograd
# graph alive, which wastes memory for each definition kept in the list.
with torch.no_grad():
    for node1, node2, edge in G.edges(data='relation'):
        if edge != 'defined as':
            continue  # only trait -> definition edges are embedded
        tokens = tokenizer.tokenize(node2)
        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
        tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

        tokens_pt = torch.tensor([tokens_ids])  # batch of one sequence
        # Index the output instead of tuple-unpacking it: position 1 is the
        # pooled output both for plain tuples and for dict-like model outputs
        # (unpacking a dict-like output would yield its keys, not tensors).
        pooled_temp = model(tokens_pt)[1].squeeze(0)
        pooled_list.append((node1, node2, pooled_temp))

In [4]:
def tokenize_sentence(sentence):
    """
    Encode a sentence with the global BERT `tokenizer`/`model` and return its pooled output.

    :param sentence: the text to embed.
    :return: the model's pooled output for the sentence with the batch dimension removed.
    """
    tokens = tokenizer.tokenize(sentence)

    # The model requires integer ids rather than token strings.
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Add the required special tokens
    tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

    # Wrap in a list so the tensor has a leading batch dimension of size one.
    tokens_pt = torch.tensor([tokens_ids])

    # Pure inference: skip autograd bookkeeping so the returned tensor does not
    # drag a computation graph along with it.
    with torch.no_grad():
        model_output = model(tokens_pt)

    # Index instead of tuple-unpacking: position 1 is the pooled output for plain
    # tuples as well as for dict-like outputs (whose iteration yields key names).
    pooled = model_output[1]
    return pooled.squeeze(0)

In [5]:
def find_similar_trait(s, pooled_list, thres=0.99):
    """
    Find the definition (and its trait) whose embedding is most cosine-similar to a sentence.

    :param s: a candidate sentence.
    :param pooled_list: a list of tuples <trait, definition, tensor_rep_of_sentence>.
    :param thres: similarity at or above which a definition is counted in 'count_thres'.
    :return: dict with keys 'max' (best similarity, -1 if pooled_list is empty),
        'max_defn', 'max_trait' (both None if pooled_list is empty) and
        'count_thres' (number of definitions with similarity >= thres).
    """
    pooled = tokenize_sentence(s)
    maxi = -1
    max_defn = None
    max_trait = None
    found = False  # ensures the first candidate is taken even if its similarity is exactly -1
    count_thres = 0
    for trait, defn, rep in pooled_list:
        similarity = torch.cosine_similarity(rep, pooled, 0).item()
        if not found or similarity > maxi:
            found = True
            maxi = similarity
            max_defn = defn
            max_trait = trait
        if similarity >= thres:
            count_thres += 1
    return {
        'max': maxi,
        'max_defn': max_defn,
        'max_trait': max_trait,
        'count_thres': count_thres
    }

In [6]:
def find_traits(s1, s2, pooled_list):
    """
    Look up the best-matching trait for each of two sentences.

    :param s1: a candidate sentence.
    :param s2: another candidate sentence.
    :param pooled_list: a list of tuples <trait, definition, tensor_rep_of_sentence>.
    :return: tuple (trait for s1, trait for s2), each taken from the 'max_trait'
        entry returned by find_similar_trait.
    """
    best_traits = tuple(
        find_similar_trait(sentence, pooled_list)['max_trait']
        for sentence in (s1, s2)
    )
    return best_traits

In [4]:
import pandas as pd
# Load the sentence-pair dataset; the preview below shows 'first' and 'second' columns.
with open("../data/Astro1_decomposed_clean_pairs.csv") as f:
    dataset = pd.read_csv(f, delimiter=',')

In [5]:
# Preview the first rows of the sentence-pair dataset.
dataset.head()

Unnamed: 0.1,Unnamed: 0,first,second
0,0,You never give up.,You never find it difficult to change your min...
1,1,You never give up.,"Whatever you have set your sights on, you refu..."
2,2,You never give up.,You are patient unless someone takes you too far.
3,3,You never give up.,You are usually slow to anger unless someone t...
4,4,You never give up.,You are reliable.


### Roberta Entailment Scores as Similarity Scores

In [6]:
import torch
from fairseq.data.data_utils import collate_tokens
from math import exp
import numpy as np

class RobertaMNLI:
    """Thin wrapper around the fairseq `roberta.large.mnli` hub model for sentence-pair NLI."""
    # todo: create similar wrapper classes for other NLI engines.
    # todo: create a wrapper superclass and subclass from there.

    def __init__(self, rel_path):
        """
        Load the model through torch.hub and switch it to evaluation mode.

        :param rel_path: path to pytorch hub folder.
        """
        # Maps the predicted class index to its label.
        self.output_map = {
            0: 'contradiction',
            1: 'neutral',
            2: 'entailment'
        }

        # Note: this changes the process-wide torch.hub directory.
        torch.hub.set_dir(rel_path)
        self.roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')  # works
        self.roberta.eval()

    def predict_one(self, S1, S2, return_probs=False):
        """
        Run the MNLI head on a single sentence pair.

        :param S1: first sentence (String).
        :param S2: second sentence (String).
        :param return_probs: if truthy, return the per-class probabilities
            instead of the predicted class index.
        :return: list of three probabilities ordered as in `output_map`, or the
            index of the most likely class (0 is contradiction, 1 is neutral,
            2 is entailment).
        """
        batch = collate_tokens(
            [self.roberta.encode(S1, S2)], pad_idx=1
        )
        # Inference only: no autograd graph is needed for a prediction.
        with torch.no_grad():
            logprobs = self.roberta.predict('mnli', batch)

        if return_probs:
            # The values are exponentiated to turn log-probabilities into probabilities.
            return [exp(logprob) for logprob in logprobs[0].tolist()]

        return logprobs.argmax(dim=1).tolist()[0]

In [7]:
# Instantiate the NLI wrapper; the model is loaded via torch.hub from this hub directory.
predictor = RobertaMNLI(rel_path="../../roberta/hub")

Using cache found in ../../roberta/hub\pytorch_fairseq_master


In [8]:
# Sanity check on a single word pair: print the most likely class and all three probabilities.
probs = predictor.predict_one('auspicious', 'bright', return_probs=True)
cls = predictor.output_map[np.argmax(probs)]
print('predicted class: ', cls)
print('probabilities: ', probs)

predicted class:  entailment
probabilities:  [0.2838515728606713, 0.3048106316015907, 0.41133781278178166]


In [9]:
def entailment_score(predictor, S1, S2):
    """
    Symmetric entailment score for a pair of sentences.

    The pair is scored in both directions (S1 -> S2 and S2 -> S1) and the larger
    entailment probability is returned, so the order of the arguments does not matter.

    :param predictor: wrapper class for NLI engine. See RobertaMNLI above.
    :param S1: String.
    :param S2: String.
    :return: the higher of the two entailment probabilities.
    """
    entailment_idx = 2  # index 2 is for entailment.
    both_directions = [
        predictor.predict_one(premise, hypothesis, return_probs=True)
        for premise, hypothesis in ((S1, S2), (S2, S1))
    ]
    return max(probs[entailment_idx] for probs in both_directions)

def contradiction_score(predictor, S1, S2):
    """
    Symmetric contradiction score for a pair of sentences.

    The pair is scored in both directions (S1 -> S2 and S2 -> S1) and the larger
    contradiction probability is returned, so the order of the arguments does not matter.

    :param predictor: wrapper class for NLI engine. See RobertaMNLI above.
    :param S1: String.
    :param S2: String.
    :return: the higher of the two contradiction probabilities.
    """
    contradiction_idx = 0  # index 0 is for contradiction.
    both_directions = [
        predictor.predict_one(premise, hypothesis, return_probs=True)
        for premise, hypothesis in ((S1, S2), (S2, S1))
    ]
    return max(probs[contradiction_idx] for probs in both_directions)

In [11]:
# Sanity check: two paraphrases should score high on entailment and low on contradiction.
s1 = "you are a very brave man."
s2 = "you are a courageous person."
print(entailment_score(predictor, s1, s2))
print(contradiction_score(predictor, s1, s2))

0.9935886904587607
0.0016470654810573557


In [12]:
import json
# Load the decomposed sentences: a JSON object mapping an id string to a list of sentences.
with open("../data/Astro1_decomposed_clean.json", 'r') as f:
    data_dict = json.load(f)
# NOTE(review): this displays the entire dict; consider previewing only a few entries.
data_dict

{'0': ['You never give up.',
  'You never find it difficult to change your mind or course in mid-stream.'],
 '1': ['Whatever you have set your sights on, you refuse to give up or let go of it.'],
 '2': ['You are patient unless someone takes you too far.',
  'You are usually slow to anger unless someone takes you too far.'],
 '3': ['You are reliable.',
  'You are consistent.',
  'You can handle more of the workload than most around you.'],
 '4': ['You prefer a regular routine, defined responsibilities.',
  'You prefer a regular routine, defined tasks.'],
 '5': ['You enjoy building and seeing the results of your hard work.'],
 '6': ['Keeping things going is your strength, especially once someone else starts them.'],
 '7': ['Money and possessions are important to you for the feelings of security they bring.'],
 '8': ['You are handy with your hands.',
  'You may be ambidextrous or mechanically inclined.'],
 '9': ['Debate and argument appeal to you.'],
 '10': ['You grasp concepts easily.',


In [13]:
# Flatten the {id: [sentence, ...]} mapping into one list, keeping the file order.
sentences = [sent for group in data_dict.values() for sent in group]
len(sentences)

47

In [14]:
# One row per sentence, in a single 'sentence' column.
sentences_df = pd.DataFrame(data=sentences, columns=["sentence"])
sentences_df.head()

Unnamed: 0,sentence
0,You never give up.
1,You never find it difficult to change your min...
2,"Whatever you have set your sights on, you refu..."
3,You are patient unless someone takes you too far.
4,You are usually slow to anger unless someone t...


In [15]:
# Load the trait -> list-of-definitions mapping used as the candidate pool for NLI matching.
with open('../data/traits_only_graph_nli_dict.json', 'r') as f:
    def_dict = json.load(f)
# NOTE(review): this displays the entire dict; consider previewing only a few entries.
def_dict

{'teenage': ['being of the age 13 through 19'],
 'useful': ['being of use or service', 'having a useful function'],
 'hateful': ['characterized by malice', 'evoking or deserving hatred'],
 'motivated': ['provided with a motive or given incentive for action'],
 'aged': ["(used of tobacco) aging as a preservative process (`aged' is pronounced as one syllable)",
  "having attained a specific age; (`aged' is pronounced as one syllable)",
  "of wines, fruit, cheeses; having reached a desired or final condition; (`aged' pronounced as one syllable)",
  "advanced in years; (`aged' is pronounced as two syllables)",
  'at an advanced stage of erosion (pronounced as one syllable)'],
 'funny': ['beyond or deviating from the usual or expected',
  'not as expected',
  'arousing or provoking laughter',
  'experiencing odd bodily sensations'],
 'utile': ['being of use or service'],
 'unfortunate': ['not auspicious; boding ill',
  'not favored by fortune; marked or accompanied by or resulting in ill fo

In [16]:
# Number of traits that have at least one entry in the definition dictionary.
len(def_dict)

432

### Get similar/related definition and trait for each sentence in the dataset.

In [95]:
# For every sentence, find the definition (and its trait) with the highest
# symmetric entailment score. Each entailment_score call scores the pair in both
# directions, so this cell is slow; a timestamp is printed after each sentence.
from datetime import datetime
now = datetime.now().time()
print(now)
results = list()
for sent in sentences_df['sentence']:
    max_score = 0
    # Reset the best match for every sentence. The original reset an unused
    # `max_sent` instead, so a sentence with no score above 0 would silently
    # inherit the previous sentence's definition.
    max_defn = None
    max_trait = None
    for trait, definitions in def_dict.items():
        for defn in definitions:
            score = entailment_score(predictor, sent, defn)
            if score > max_score:
                max_score = score
                max_defn = defn
                max_trait = trait
    results.append([sent, max_score, max_defn, max_trait])
    now = datetime.now().time()
    print(now)

results_df = pd.DataFrame(data=results, columns=['sentence', 'score', 'definition', 'trait'])
results_df.to_csv("../data/roberta_entailment_similarity_Astro1_decomposed_clean_commondict.csv")

01:17:09.761288
01:21:17.501581
01:25:41.194209
01:30:09.643263
01:34:20.364938
01:38:35.521848
01:42:32.398583
01:46:25.911240
01:50:32.653000
01:54:33.538441
01:58:34.835162
02:02:40.626567
02:06:52.929772
02:11:09.613367
02:15:05.264748
02:19:06.859740
02:23:02.697798
02:26:53.801155
02:30:59.520506
02:35:12.434902
02:39:02.142227
02:42:51.813328
02:46:41.377139
02:51:33.535748
02:55:28.862220
02:59:24.529590
03:03:23.505884
03:07:14.644845
03:11:06.076105
03:15:03.701197
03:18:57.047538
03:22:49.822268
03:26:41.195353
03:30:31.221035
03:34:25.631068
03:38:18.516839
03:42:12.917705
03:46:08.750592
03:50:01.525269
03:53:54.320679
03:57:47.119402
04:01:51.306251
04:05:41.016705
04:09:31.146340
04:13:21.092468
04:17:11.145741
04:21:02.319379
04:24:52.353212


In [22]:
# Show the best-matching definition and trait found for each sentence.
results_df

Unnamed: 0,sentence,score,definition,trait
0,You never give up.,0.930777,never-ceasing,persistent
1,You never find it difficult to change your min...,0.939418,capable of being changed,flexible
2,"Whatever you have set your sights on, you refu...",0.910474,acting with a specific goal,shrewd
3,You are patient unless someone takes you too far.,0.927791,beyond or deviating from the usual or expected,curious
4,You are usually slow to anger unless someone t...,0.946681,not easily irritated,placid
5,You are reliable.,0.968605,dependable,stalwart
6,You are consistent.,0.956693,relating to a person who does something regularly,regular
7,You can handle more of the workload than most ...,0.901664,having substance or capable of being treated a...,real
8,"You prefer a regular routine, defined responsi...",0.963223,relating to a person who does something regularly,regular
9,"You prefer a regular routine, defined tasks.",0.960846,relating to a person who does something regularly,regular


In [30]:
# error analysis
# Score one probe sentence against every definition and keep all scores (not
# only the best one) so the full ranking can be inspected in the saved CSV.
from datetime import datetime
now = datetime.now().time()
print(now)
results = list()
for sent in ['You are shy.']:
    for trait, definitions in def_dict.items():
        for defn in definitions:
            score = entailment_score(predictor, sent, defn)
            results.append([score, defn, trait])
    now = datetime.now().time()
    print(now)

tmp_df = pd.DataFrame(data=results, columns=['score', 'definition', 'trait'])
tmp_df.to_csv("../data/v2_YouAreShy_traits.csv")

16:15:59.410112
16:20:06.362910


In [17]:
# load df of sentences' related definitions and traits.
# The preview below shows extra 'Unnamed: 0*' columns: the CSV was saved with its
# index and is read back here without index_col.
import pandas as pd
sent_df = pd.read_csv("../data/roberta_entailment_similarity_Astro1_decomposed_clean_commondict.csv")
sent_df.head()

Unnamed: 0.1,Unnamed: 0,sentence,score,definition,trait
0,0,You never give up.,0.900807,showing a fighting disposition,competitive
1,1,You never find it difficult to change your min...,0.939418,capable of being changed,flexible
2,2,"Whatever you have set your sights on, you refu...",0.910474,acting with a specific goal,shrewd
3,3,You are patient unless someone takes you too far.,0.927791,beyond or deviating from the usual or expected,funny
4,4,You are usually slow to anger unless someone t...,0.946681,not easily irritated,placid


In [18]:
# generate sentence index pairs
# Every unordered pair (i, j) of sent_df index labels, each pair listed once.
import itertools
pairs_list = list(itertools.combinations(sent_df.index, 2))

In [19]:
# Peek at the first few index pairs.
pairs_list[0:3]

[(0, 1), (0, 2), (0, 3)]

### graph inference

In [20]:
def get_paths_of_weights(G, paths_list, attr="weight"):
    """
    Look up the edge attribute along each path.

    :param G: source graph
    :param paths_list: iterable of paths (each a sequence of nodes),
        e.g. the result of networkx.all_shortest_paths().
    :param attr: name of the edge attribute that contains the weight information.
        default is "weight".
    :return: one list per path holding the attribute value of each consecutive
        node pair, in path order.
    """
    return [
        [G.get_edge_data(src, dst)[attr] for src, dst in zip(path, path[1:])]
        for path in paths_list
    ]

def get_entailment_paths_of_weights(predictor, paths_list):
    """
    Score each consecutive node pair along every path with entailment_score.

    :param predictor: wrapper class for NLI engine. See RobertaMNLI above.
    :param paths_list: iterable of paths (each a sequence of nodes),
        e.g. the result of networkx.all_shortest_paths().
    :return: one list per path holding the entailment score of each consecutive
        node pair, in path order.
    """
    return [
        [entailment_score(predictor, src, dst) for src, dst in zip(path, path[1:])]
        for path in paths_list
    ]


def vis_weighted_paths(paths_list, weight_paths_list):
    """
    Print each path as node-(weight)-node-..., followed by a blank line.

    :param paths_list: list of paths (each a sequence of nodes).
    :param weight_paths_list: list of per-path edge weights, aligned with paths_list.
    """
    for path_idx in range(len(paths_list)):
        nodes = paths_list[path_idx]
        edge_weights = weight_paths_list[path_idx]
        # One "node-(weight)-" segment per edge, then the final node.
        segments = [
            f"{node}-({edge_weights[pos]:.3f})-"
            for pos, node in enumerate(nodes[:-1])
        ]
        rendered = "".join(segments) + f"{nodes[-1]}"
        print(rendered + '\n')
        
def infer_trait_relations(paths_of_weights):
    """
    Simple algorithm for computing the relation between the start and end node of
    each path: the product of the edge weights along the path.

    :param paths_of_weights: a list. see get_paths_of_weights(G, paths_list) above. each item
    in this list represents a path's edge weights going from left (start) to right (end).
    Weights are expected to lie between -1 and 1, inclusive.
    :return: list with one product per path. A path without edges yields 1 (the
    empty product) instead of raising IndexError.
    """
    inferred_relations = list()
    for weight_path in paths_of_weights:
        relation = 1  # multiplicative identity; leaves the first weight unchanged
        for weight in weight_path:
            relation = relation * weight
        inferred_relations.append(relation)
    return inferred_relations
       
# Maps the sign-product of a path (-1 or 1) to the corresponding relation label.
inference_map = {-1: "contradiction", 1: "entailment"}  # todo: make this a Class attribute.

def choose_rel(inferred_relations):
    """
    From a list of graph-inferred relations, choose the most frequent value as the
    final inferred relation.

    Ties are detected explicitly: since Python 3.8 `statistics.mode` no longer
    raises on multimodal data (it returns the first mode), so relying on
    StatisticsError would silently turn ties into whichever value came first.

    :param inferred_relations: iterable of values that are keys of `inference_map`.
    :return: the label of the unique most frequent value, or "neutral" when the
        input is empty or the top two values are equally frequent.
    """
    from collections import Counter  # local import: keeps this cell self-contained

    top_two = Counter(inferred_relations).most_common(2)
    if not top_two:
        # nothing to vote on
        return "neutral"
    if len(top_two) == 2 and top_two[0][1] == top_two[1][1]:
        # no mode found. synonym and antonym equally likely
        return "neutral"
    return inference_map[top_two[0][0]]
    
def select_score(inferred_relations):
    """
    Computes final relation score by taking the average of the scores that are of the same polarity
    as the most frequent polarity (positive vs negative). If frequency of positive and negative
    scores are equal then the relation score is taken to be ambiguous and thus we return a score of 0
    by default, indicating neutrality, as it is prudent to consider the relation neutral when unsure.

    Scores equal to 0 are counted towards the positive (synonym) class when voting
    but are excluded from the average.

    The vote is counted explicitly rather than with `statistics.mode`, which since
    Python 3.8 returns the first mode on a tie instead of raising StatisticsError.

    :param inferred_relations: list of per-path relation scores.
    :return: val between -1 to 1, inclusive (given inputs in that range); 0 for an
        empty input, a tie, or when the winning class holds only zeros.
    """
    print(f'inferred relations: {inferred_relations}')
    negative_votes = sum(1 for score in inferred_relations if score < 0)  # antonym
    non_negative_votes = len(inferred_relations) - negative_votes  # synonym

    if negative_votes == non_negative_votes:
        # no mode found (or empty input). synonym and antonym equally likely
        return 0  # 0 for neutral

    if non_negative_votes > negative_votes:
        possible_scores = [score for score in inferred_relations if score > 0]
    else:
        possible_scores = [score for score in inferred_relations if score < 0]
    print(f'possible_scores: {possible_scores}')

    if not possible_scores:
        # the winning class consisted only of zeros: nothing to average
        return 0

    # arithmetic mean of possible scores.
    # it is possible to choose final_score based on max absolute score instead if we believe
    # that the strongest non-neutral relationship should be taken as the trait-pair's relation.
    return sum(possible_scores) / len(possible_scores)
        
def print_rel_counts(inferred_relations):
    """
    Print how often each relation value occurs, labelled through `inference_map`.

    :param inferred_relations: array-like of values that are keys of `inference_map`.
    """
    values, counts = np.unique(inferred_relations, return_counts=True)
    print("<relation: count>:")
    for value, count in zip(values, counts):
        print(f"{inference_map[value]}: {count}")

### Use the NLI-based graph together with the NLI-based related traits to perform graph inference and compute a distance score for each sentence pair.

In [21]:
import random
import numpy as np
import networkx as nx
from statistics import mode
from statistics import StatisticsError
from networkx import NetworkXNoPath
from networkx import NodeNotFound

# Load the traits-only graph whose edges carry NLI-derived 'score' values.
# `nx.read_gpickle` was removed in NetworkX 3.0; it was a thin wrapper around
# `pickle.load`, so reading the file directly works on old and new versions.
# NOTE: unpickling executes arbitrary code -- only load files you trust.
#G2 = nx.read_gpickle("../data/traits_only_graph_cleaned2.pickle")
with open("../data/traits_only_graph_nli.pickle", "rb") as graph_file:
    G2 = pickle.load(graph_file)
G2.edges(data=True)

EdgeDataView([('active', 'passive', {'score': -0.9962928040560369}), ('active', 'involved', {'score': 0.8323226615075182}), ('active', 'inactive', {'score': -0.9951286145356494}), ('passive', 'inactive', {'score': 0.9475867187093776}), ('involved', 'interested', {'score': 0.8728021251452028}), ('involved', 'participating', {'score': 0.9517589468412195}), ('inactive', 'quiescent', {'score': 0.8819435041473466}), ('inactive', 'dull', {'score': 0.7789697707268533}), ('inactive', 'slow', {'score': 0.6159981150560486}), ('inactive', 'supine', {'score': 0.7104365261292587}), ('considerate', 'inconsiderate', {'score': -0.9963289099869019}), ('considerate', 'thoughtful', {'score': 0.9640939834721406}), ('inconsiderate', 'thoughtless', {'score': 0.82419437526209}), ('inconsiderate', 'careless', {'score': 0.8225489297020572}), ('thoughtful', 'sensible', {'score': 0.8281778560611142}), ('convivial', 'sociable', {'score': 0.9021289763443758}), ('sociable', 'agreeable', {'score': 0.8358482712908396

In [23]:
# compute pairwise distances with kg
# For each sentence pair, look up the trait matched to each sentence and derive a
# relation score from the shortest paths between the two traits in G2.
rel_scores = list()
for first_idx, second_idx in pairs_list:
    first_row = sent_df.iloc[first_idx]
    second_row = sent_df.iloc[second_idx]
    trait1 = first_row['trait']
    trait2 = second_row['trait']
    try:
        if trait1 != trait2:
            # weight=None: shortest in number of hops; scores are combined afterwards.
            paths = nx.all_shortest_paths(G2, source=trait1, target=trait2, weight=None, method='dijkstra')
            weight_paths = get_paths_of_weights(G2, paths, attr="score")
            potential_rel_scores = infer_trait_relations(weight_paths)
            score = select_score(potential_rel_scores)
            # print for debug
            print(first_row['sentence'], second_row['sentence'])
            print(trait1, trait2)
            print(score)
            print()
        else:
            score = 1  # if traits are same, then by definition, their relation is '1' or synonymous/entailed.
    except NetworkXNoPath:
        score = 0  # unconnected traits are treated as neutral
    except NodeNotFound:
        print("trait does not exist in graph.")
        score = None  # kept as a missing value so the pair stays aligned with pairs_list
    rel_scores.append(score)

inferred relations: [0.00902159891322593, 0.00683537556026907]
possible_scores: [0.00902159891322593, 0.00683537556026907]
You never give up. You are usually slow to anger unless someone takes you too far.
competitive placid
0.0079284872367475

inferred relations: [-0.4453521925654388, -0.3869384689688729]
possible_scores: [-0.4453521925654388, -0.3869384689688729]
You never give up. You are reliable.
competitive confident
-0.4161453307671559

inferred relations: [-0.34617452108896524]
possible_scores: [-0.34617452108896524]
You never give up. You are consistent.
competitive concordant
-0.34617452108896524

inferred relations: [-0.13634766789699912]
possible_scores: [-0.13634766789699912]
You never give up. You can handle more of the workload than most around you.
competitive real
-0.13634766789699912

inferred relations: [-0.49153916339603876, -0.4270674187704984]
possible_scores: [-0.49153916339603876, -0.4270674187704984]
You never give up. You prefer a regular routine, defined resp

In [24]:
# Sanity check: there must be exactly one relation score per index pair.
print(len(pairs_list), len(rel_scores))
unzipped = zip(*pairs_list)  # separate the tuple elements. 
unzipped_list = list(unzipped)  # to get 2 new lists.

# NOTE: this rebinds `results_df`, which an earlier cell used for the per-sentence NLI results.
results_df = pd.DataFrame(data={"first_index":unzipped_list[0], "second_index":unzipped_list[1], "relation_score": rel_scores})
display(results_df.head(), results_df.tail())

1081 1081


Unnamed: 0,first_index,second_index,relation_score
0,0,1,0.0
1,0,2,0.0
2,0,3,0.0
3,0,4,0.007928
4,0,5,-0.416145


Unnamed: 0,first_index,second_index,relation_score
1076,43,45,1.0
1077,43,46,0.101918
1078,44,45,0.259512
1079,44,46,0.039288
1080,45,46,0.101918


In [25]:
# Attach the sentence text for both members of every pair.
sentence_at = sent_df['sentence'].iloc
first = [sentence_at[index] for index in results_df['first_index']]
second = [sentence_at[index] for index in results_df['second_index']]
to_save = results_df.assign(first=pd.Series(first), second=pd.Series(second))
to_save.head()

Unnamed: 0,first_index,second_index,relation_score,first,second
0,0,1,0.0,You never give up.,You never find it difficult to change your min...
1,0,2,0.0,You never give up.,"Whatever you have set your sights on, you refu..."
2,0,3,0.0,You never give up.,You are patient unless someone takes you too far.
3,0,4,0.007928,You never give up.,You are usually slow to anger unless someone t...
4,0,5,-0.416145,You never give up.,You are reliable.


In [26]:
# relation scores are between -1 to 1. we need to transform relation scores to distances in the range of 0 to 1 for the clustering algorithm.
# Linear map: relation 1 -> distance 0, relation 0 -> 0.5, relation -1 -> 1.
# NOTE(review): pairs whose score was stored as None presumably end up as NaN here -- confirm the clustering step handles that.
to_save = to_save.assign(dist=-0.5*to_save['relation_score'] + 0.5)
to_save.head()

Unnamed: 0,first_index,second_index,relation_score,first,second,dist
0,0,1,0.0,You never give up.,You never find it difficult to change your min...,0.5
1,0,2,0.0,You never give up.,"Whatever you have set your sights on, you refu...",0.5
2,0,3,0.0,You never give up.,You are patient unless someone takes you too far.,0.5
3,0,4,0.007928,You never give up.,You are usually slow to anger unless someone t...,0.496036
4,0,5,-0.416145,You never give up.,You are reliable.,0.708073


In [27]:
# Ask interactively where to write the distances, then save them as CSV.
# NOTE(review): interactive input prevents an unattended "Run All"; consider a path constant.
path = input("enter path to save csv file of computed distances:\n")
to_save.to_csv(path)

enter path to save csv file of computed distances:
 ../data/roberta_kg_astro1clean_distances(debugged1).csv


## Use BERT cosine similarity to find the related trait, then perform graph inference.

In [None]:
# For every sentence pair in `dataset`: match each sentence to a trait via BERT
# cosine similarity, then label the pair from the shortest paths between the traits.
# NOTE(review): the G2 edge dump shown earlier lists only a 'score' attribute, while
# get_paths_of_weights is called here with its default attr="weight", and choose_rel
# maps only -1/1 products -- this cell presumably expects the +/-1 weighted graph
# (the commented-out pickle); confirm before running.
# NOTE(review): only NetworkXNoPath is handled; a trait missing from G2 would raise NodeNotFound.
inferences = list()
for i in range(len(dataset)):
    s1 = dataset.iloc[i]['first']
    s2 = dataset.iloc[i]['second']
    display(s1, s2)
    t1, t2 = find_traits(s1, s2, pooled_list) # should "pre-compute" trait instead of computing this everytime.

    #t1 = random.choice(list(G2.nodes))  # example trait
    #t1 = 'ungrudging'
    print(t1)
    #t2 = random.choice(list(G2.nodes))  # example trait
    #t2 = 'disagreeable'
    print(t2)

    if t1 != t2:
        try:
            paths = nx.all_shortest_paths(G2, source=t1, target=t2, weight=None, method='dijkstra')
            # Materialise the generator; this is where a missing path surfaces as NetworkXNoPath.
            paths_list = [row for row in paths]

            paths_of_weights = get_paths_of_weights(G2, paths_list)
            #vis_weighted_paths(paths_list, paths_of_weights)

            #paths_of_scores = get_entailment_paths_of_weights(predictor, paths_list)
            #vis_weighted_paths(paths_list, paths_of_scores)

            inferred_relations = infer_trait_relations(paths_of_weights)

            rel = choose_rel(inferred_relations)


        except nx.NetworkXNoPath as e:
            # Unconnected traits are treated as unrelated.
            rel = "neutral"
    else:
        # Both sentences mapped to the same trait.
        rel = "entailment"

    inferences.append(rel)

In [21]:
# One predicted relation label per dataset row, in dataset order.
graph_inferences = pd.DataFrame(data=inferences, columns=['inference'])
graph_inferences.head()

Unnamed: 0,inference
0,entailment
1,contradiction
2,entailment
3,neutral
4,contradiction


In [22]:
# Ask interactively where to write the inferred labels, then save them as CSV.
# NOTE(review): interactive input prevents an unattended "Run All"; consider a path constant.
path = input("enter path to save csv results:\n")
graph_inferences.to_csv(path)

enter path to save csv results:
 ../data/kg_inferences_on_Astro1.csv


In [None]:
# Compare each graph-inferred label with the 'judge' column of the dataset, row by row,
# and count the matches and mismatches.
# NOTE(review): the dataset preview earlier shows no 'judge' column -- confirm the
# loaded CSV actually contains it, otherwise this lookup raises KeyError.
equals = list()
for i in range(len(graph_inferences)):
    equal = dataset.iloc[i]['judge'] == graph_inferences.iloc[i]['inference']
    equals.append(equal)
np.unique(equals, return_counts=True)

In [None]:
# Distribution of the inferred relation labels.
np.unique(graph_inferences, return_counts=True)

In [27]:
# Find the most similar definition/trait (BERT cosine similarity) for each
# distinct sentence in the 'first' column.
unique = np.unique(dataset['first'])
results = [find_similar_trait(candidate, pooled_list) for candidate in unique]

In [32]:
# Tabulate, per candidate sentence, the best similarity and the matching definition/trait.
rows = [
    [candidate, results[idx]['max'], results[idx]['max_defn'], results[idx]['max_trait']]
    for idx, candidate in enumerate(unique)
]

similar_defs_and_traits_df = pd.DataFrame(data=rows, columns=['candidate', 'max_similarity', 'max_defn', 'connected_trait'])
similar_defs_and_traits_df.head()

Unnamed: 0,candidate,max_similarity,max_defn,connected_trait
0,Debate and argument appeal to you.,0.99601,in keeping with the facts,straight
1,"Keeping things going is your strength, especia...",0.992741,abounding; having a lot of,thick
2,Money and possessions are important to you for...,0.995257,without envy or reluctance,ungrudging
3,"Whatever you have set your sights on, you refu...",0.994293,worthy of being depended on,honest
4,You are adventurous.,0.995988,not to be taken lightly,real


In [33]:
# Persist the BERT-based matches for later inspection.
similar_defs_and_traits_df.to_csv("../data/bert_similar_defs_and_traits_on_Astro1_decomposed_clean.csv")

### Try to use Wordnet to determine if adjacent words have similar word sense (for tracking word sense decay).

In [None]:
# Distribution of the labels in the 'judge' column.
# NOTE(review): the dataset preview earlier shows no 'judge' column -- confirm it exists.
np.unique(dataset['judge'], return_counts=True)

In [None]:
import numpy as np
from nltk.corpus import wordnet as wn
from itertools import product

# Compare every adjective sense of two words and report the most similar sense pair.
wordx, wordy = "dull","inactive"

# synsets(..., pos='a') gives pos == 'a' or 's'; keep only the plain 'a' senses.
sem1 = [sem for sem in wn.synsets(wordx, pos='a') if sem.pos() == 'a']
sem2 = [sem for sem in wn.synsets(wordy, pos='a') if sem.pos() == 'a']

# All cross-word sense pairs, built once and reused below.
synset_pairs = list(product(sem1, sem2))
display(synset_pairs)

scores = [left.lch_similarity(right) for left, right in synset_pairs]
print(scores)

# Replace missing similarities with NaN so nanargmax can skip them.
scores2 = [np.nan if score is None else score for score in scores]
max_index = np.nanargmax(scores2)
print(max_index)
print(scores2[max_index])
synset_pairs[max_index]