In [None]:
import dataset

In [None]:
# Load all explanation statements and index them by uid for direct lookup.
# (`regenerate` is passed straight through to the loader; presumably it forces
#  a rebuild of a cached copy -- confirm in dataset.load_statements.)
regenerate = False
statements = dataset.load_statements(regenerate=regenerate)
statements_by_uid = dict((stmt.uid, stmt) for stmt in statements)

In [None]:
# Mapping keyed by keyword (later cells iterate its .items() and index it by keyword);
# presumably keyword -> occurrence count across the statements -- confirm in dataset.
keyword_counts = dataset.get_keyword_counts_from_statements(statements)
# Number of distinct keywords
len(keyword_counts)

In [None]:
import networkx as nx
# Undirected graph; the following cells add statement uids and keywords as nodes
# and connect each statement to its keywords with weighted edges.
G = nx.Graph()

In [None]:
# Debug helper (disabled): peek at the first few statements
#for i, s in enumerate(statements):
#    print(s)
#    if i>10: break
# One node per statement, identified by its uid
G.add_nodes_from(s.uid for s in statements)
# Node count so far (statements only)
len(G)

In [None]:
# Every keyword becomes a node as well (the associated count is not needed here).
G.add_nodes_from(keyword for keyword, _count in keyword_counts.items())
# Node count now covers statements + keywords
len(G)

In [None]:
# https://networkx.github.io/documentation/stable/reference/classes/generated/
#   networkx.Graph.add_weighted_edges_from.html
#
# Connect every statement to each of its keywords.  Edge weightings that were tried:
#   1.0                                             Evenly weighted :: Poor
#   1.0+i                                           Demonstrates that weightings are 'weight'
#   2.0-1.0/keyword_counts[k]                       weight by how frequent keywords are
#   3.0-1.0/len(s.keywords)-1.0/keyword_counts[k]   bit of both
#   2.0-1.0/len(s.keywords)                         weight by how many keywords the statement has
#                                                   (Decent -- this is the one in use)
for i, s in enumerate(statements):  # `i` is only needed by the 1.0+i variant above
    # The generator is consumed lazily, so the division is never evaluated
    # for a statement that has no keywords.
    keyword_edges = ((k, s.uid, 2.0-1.0/len(s.keywords)) for k in s.keywords)
    G.add_weighted_edges_from(keyword_edges)
len(G.edges)

In [None]:
# Effectively kill some nodes: give every edge touching them a very large
# weight so that weighted shortest paths avoid routing through them.
for node in ['be']:
    # Snapshot the neighbours first: the edges being (re)written below live in
    # the very adjacency that G.neighbors() iterates over lazily.
    neighbours = list(G.neighbors(node))
    G.add_weighted_edges_from((node, n, 100.) for n in neighbours)

In [None]:
# Inspect the nodes adjacent to a single node.  Edges only ever join a keyword
# to a statement uid, so the neighbours of a statement uid are keywords.
#[n for n in G.neighbors('iron')]
[n for n in G.neighbors('2ead-9402-4803-38bc')]

In [None]:
# One cheapest path between two keyword nodes
# (networkx's dijkstra_path uses the 'weight' edge attribute by default).
nx.dijkstra_path(G, 'iron', 'water')

In [None]:
# Every path that ties for the minimum total edge weight between the two keywords
for p in nx.all_shortest_paths(G, 'iron', 'water', weight='weight'): 
    print(p)

In [None]:
def explain_path(p):
    """Print a graph path in readable form, one node per line.

    Nodes that are statement uids (i.e. keys of the notebook-level
    ``statements_by_uid`` mapping) are printed indented together with the
    statement's ``raw_txt``; every other node (a keyword) is printed as-is.
    A blank line is printed after the path.

    Args:
        p: iterable of node identifiers, e.g. a path returned by networkx.

    Returns:
        None -- the path is written to stdout.
    """
    for n in p:
        # Decide by membership rather than by string length, so that a keyword
        # which merely happens to be as long as a uid is not looked up (and
        # does not raise KeyError).
        statement = statements_by_uid.get(n)
        if statement is not None:  # a uid
            print("   > "+n+" : "+statement.raw_txt)
        else:
            print(n)
    print()

# Example: show the cheapest keyword -> statement -> ... -> keyword chain in readable form
explain_path( nx.dijkstra_path(G, 'rust', 'water') )

In [None]:
# Same idea, but explain every path that ties for the minimum total weight
for p in nx.all_shortest_paths(G, 'saltwater', 'reason', weight='weight'): 
    explain_path( p )

In [None]:
# Load the question/answer data; only the 'dev' split is used in this notebook
# (the trailing size notes are the author's).
regenerate=False
#qanda_train = dataset.load_qanda('train', regenerate=regenerate) # 1.8MB
qanda_dev   = dataset.load_qanda('dev', regenerate=regenerate)   # 400k in 496 lines
#qanda_test  = dataset.load_qanda('test', regenerate=regenerate)  # 800k

In [None]:
# Read the baseline scorer's predictions: one "<question id>\t<statement uid>"
# pair per line, with the uids for a question kept in file order.
preds_baseline=dict() # qa_id -> [statements in order]
with open('/tmp/scorer/predict.txt', 'rt') as f:
    for l in f:  # stream the file instead of loading every line up front
        l = l.strip()
        if not l: continue  # tolerate blank lines (e.g. a trailing newline)
        qid, uid = l.split('\t')
        preds_baseline.setdefault(qid, []).append(uid)

In [None]:
# Pick one dev question to study (other indices looked at: 212, and 4 = astronomy)
qa = qanda_dev[485]  #212  #4=astronomy

# Question and candidate answers, each followed by its keywords
print(qa.question_id, qa.flags)
print("Q: "+qa.question.raw_txt)
print("  ", qa.question.keywords)
for ans_idx, answer in enumerate(qa.answers):
    print(f"A{ans_idx:1d}: "+answer.raw_txt)
    print("  ", answer.keywords)

# Gold explanation statements, with their keywords aligned underneath
print("explanation_gold:")
keyword_indent = ' '*25
for gold in qa.explanation_gold:
    print(f"  {gold.uid} : {statements_by_uid[gold.uid].raw_txt}")
    print(f"{keyword_indent} : {statements_by_uid[gold.uid].keywords}")

print()
# Score the baseline's ranking for this question against the gold uids
baseline_map = dataset.silent_average_precision_score(
    set(e.uid for e in qa.explanation_gold), preds_baseline[qa.question_id][:])
print(f"baseline MAP = {baseline_map:.4f}")

In [None]:
# Keywords of the third gold explanation statement (index 2, so this needs at
# least three gold explanations for the current `qa`).
statements_by_uid[qa.explanation_gold[2].uid].keywords # .add('part')

In [None]:
# Question keywords and first-answer keywords, each as a comma-separated string
','.join(qa.question.keywords), ','.join(qa.answers[0].keywords)

In [None]:
# Union of question and first-answer keywords, restricted to graph nodes
qa_keywords = qa.question.keywords | qa.answers[0].keywords
kw_all = {kw for kw in qa_keywords if kw in G.nodes}  # Only use those in the vocab ...
','.join(kw_all)

In [None]:
# Keyword used as the destination by the (currently commented-out)
# multi-source search in the next cell.
target='core'
#kw_all-set([target])

In [None]:
# Cheapest path from any of the source nodes to the target: d receives the
# distance and p the path.  The first variant (all question/answer keywords
# -> `target`) is disabled in favour of a hand-picked pair.
#d,p=nx.multi_source_dijkstra(G, kw_all-set([target]), target=target, weight='weight')
d,p=nx.multi_source_dijkstra(G, ['component'], target='speedometer', weight='weight')
#print(p)
explain_path( p ) # This is not great...

In [None]:
# Count how often each statement appears on a minimum-weight path between a
# question keyword and a keyword of the first answer.
#kw_arr = list(kw_all)
#srcs, tgts, limit_i_j = kw_all, kw_all, True
srcs, tgts, limit_i_j = qa.question.keywords, qa.answers[0].keywords, False

srcs = set([n for n in srcs if n in G.nodes]) # Only use those in the vocab ...
tgts = set([n for n in tgts if n in G.nodes]) # Only use those in the vocab ...

node_cnt=dict()  # statement uid -> number of shortest paths passing through it
for i, src in enumerate(srcs):
    print(f"{i}/{len(srcs)}")
    for j, tgt in enumerate(tgts):
        if limit_i_j and i>=j: continue
        try:
            for p in nx.all_shortest_paths(G, src, tgt, weight='weight'):
                explain_path( p )
                # Paths alternate keyword, statement, keyword, ... (edges only join
                # a keyword to a statement), so the odd positions are the statements.
                for n in p[1::2]:
                    node_cnt[n] = node_cnt.get(n, 0) + 1
        except nx.NetworkXNoPath:
            # A disconnected pair should not abort the whole cell; report it
            # the same way get_min_distance_statements() does.
            print("  Cannot get to "+tgt)
len(node_cnt)

In [None]:
# (uid, count) pairs, most frequently visited first; ties keep their insertion order
nodes_desc = sorted(node_cnt.items(), key=lambda item: item[1], reverse=True)
#nodes_desc[:10]
len(nodes_desc)

In [None]:
# Ten most-visited statements as "count : uid : text"
[ f"{cnt:5d} : {uid:s} : {statements_by_uid[uid].raw_txt}" for uid,cnt in nodes_desc ][:10]

In [None]:
# Gold explanation uids for the current question, and the baseline's ranked
# uid predictions for the same question.
gold_ex_uid = set(e.uid for e in qa.explanation_gold)
preds_uid   = preds_baseline[qa.question_id]

def base_uids_list(arr):
    """Truncate each identifier to its first 19 characters and de-duplicate.

    The order of first appearance is preserved.

    Args:
        arr: iterable of identifier strings (19 characters or longer are
            cut to 19; shorter ones are kept whole).

    Returns:
        list of the distinct truncated identifiers, in first-seen order.
    """
    seen = set()   # O(1) membership test instead of rescanning the result list
    base_arr = []
    for a in arr:
        base = a[:19]
        if base not in seen:
            seen.add(base)
            base_arr.append(base)
    return base_arr

# Ranked statement uids from the graph walk (most-visited first, de-duplicated)
nodes_uid = base_uids_list([uid for uid, _cnt in nodes_desc])

# Average precision of both rankings at several cut-offs
for limit in (10000, 64, 32, 16):
    sc_baseline = dataset.silent_average_precision_score(gold_ex_uid, preds_uid[:limit])
    sc_graph = dataset.silent_average_precision_score(gold_ex_uid, nodes_uid[:limit])
    print(f"{limit:5d} : baseline={sc_baseline:.4f}, graph={sc_graph:.4f}")

# Fraction of the gold uids found within the top 64 of each ranking
recall_baseline = len(gold_ex_uid & set(preds_uid[:64]))/len(gold_ex_uid)
recall_graph = len(gold_ex_uid & set(nodes_uid[:64]))/len(gold_ex_uid)
print(f"recall:          {recall_baseline:.4f}        {recall_graph:.4f} ")

In [None]:
# Agrees with baseline scoring... (good job too, since it's the actual scorer)
# python ../tg2020task/evaluate.py --gold ../tg2020task/questions.dev.tsv /tmp/scorer/predict.txt 

In [None]:
def get_min_distance_statements(qa, all_to_all=False):
    """Rank statements by how often they lie on minimum-weight keyword paths.

    For each (source, target) keyword pair, every minimum-weight path in the
    notebook-level graph ``G`` is enumerated and each statement node on it is
    counted once per path.

    Args:
        qa: a question/answer record; ``qa.question.keywords`` and
            ``qa.answers[0].keywords`` are read.
        all_to_all: when False (default) sources are the question keywords and
            targets are the keywords of the first answer.  When True, both are
            the union of the two and each unordered pair is visited once.

    Returns:
        list of statement uids (truncated/de-duplicated via ``base_uids_list``),
        most frequently visited first.

    Pairs with no connecting path are reported on stdout and skipped.
    """
    if all_to_all:
        kw_all = qa.question.keywords | qa.answers[0].keywords
        srcs, tgts, limit_i_j = kw_all, kw_all, True
    else:
        srcs, tgts, limit_i_j = qa.question.keywords, qa.answers[0].keywords, False

    # Only use those in the vocab ...  Sorting gives both sequences the same,
    # reproducible order: the `i>=j` filter below is only correct if srcs and
    # tgts enumerate identically, and a fixed order also removes one source of
    # run-to-run variation in how ties end up ranked.
    srcs = sorted(n for n in set(srcs) if n in G.nodes)
    tgts = sorted(n for n in set(tgts) if n in G.nodes)

    node_cnt=dict()  # statement node -> number of shortest paths through it
    for i, src in enumerate(srcs):
        for j, tgt in enumerate(tgts):
            if limit_i_j and i>=j: continue  # skip self-pairs and mirrored pairs
            try:
                for p in nx.all_shortest_paths(G, src, tgt, weight='weight'):
                    for n in p:
                        if len(n)<19: continue  # Ignore the keywords
                        node_cnt[n] = node_cnt.get(n, 0) + 1
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                # Only "unreachable" is expected here; anything else (including
                # KeyboardInterrupt) is allowed to propagate.
                print("  Cannot get to "+tgt)
    nodes_desc = sorted(node_cnt.items(),key=lambda n:-n[1])
    nodes_uid   = base_uids_list( [uid for uid,_ in nodes_desc] )
    return nodes_uid


#qa = qanda_dev[4]  #212  #4=astronomy
# Flip to True to (re)write graph-based predictions for every dev question.
if False:
    #with open("/tmp/scorer/predict_graph.txt", "wt") as f:
    with open("../predictions/predict_graph-q-to-a.txt", "wt") as f:
        # The loop variable is deliberately NOT called `qa`: that name is the
        # question under study elsewhere in the notebook, and rebinding it here
        # would silently switch later cells to the last dev question.
        for qa_i, qa_dev in enumerate(qanda_dev):
            print(f"Running : {qa_i}")
            uids = get_min_distance_statements(qa_dev)  #, all_to_all=True)
            for uid in uids:
                f.write(f"{qa_dev.question_id}\t{uid}\n")
#get_min_distance_statements(qanda_dev[486])

In [None]:
# python ../tg2020task/evaluate.py --gold ../tg2020task/questions.dev.tsv /tmp/scorer/predict_graph.txt


In [None]:
# Print part-of-speech tag, surface text and lemma for each token of a probe sentence.
# Other probe sentences that were tried:
#   'About how many times does the moon orbit Earth in a year?'
#   'approximately means about'
probe_txt = 'if a substance has a higher density than another substance , then the molecules in the substance will be closer than those of the other substance'
for t in dataset.nlp(probe_txt):
    print(t.pos_, t.text, t.lemma_)
# ADP About about ...
# ADP About about ...

In [None]:
# Different idea : let's create a new graph from the QuestionAnswer and the known explanation_gold nodes
#   Then : Have a look at nodes that only have 1 edge

In [None]:
# https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.Graph.subgraph.html

In [None]:
# Question + first-answer keywords that exist as graph nodes
kw_all = qa.question.keywords | qa.answers[0].keywords
copy_qa = {kw for kw in kw_all if kw in G.nodes}  # Only use those in the vocab ...
len(copy_qa), ','.join(copy_qa)

In [None]:
# uids of the gold explanation statements for the current question
copy_gold = {gold.uid for gold in qa.explanation_gold}
len(copy_gold), ','.join(copy_gold)

In [None]:
# For each gold statement, list the nodes it is connected to (its keywords)
for c in copy_gold:
    print(c, list(G.neighbors(c)))

In [None]:
# All nodes adjacent to any gold statement.
# (Outer loop over statements, inner loop over each one's neighbours -- the order of the for clauses matters.)
copy_adj = {nbr for n in copy_gold for nbr in G.neighbors(n)}
len(copy_adj), ','.join(sorted(copy_adj))

In [None]:
#copy_nodes = set([n for lst in [copy_gold, copy_adj, copy_qa] for n in lst])
#len(copy_nodes), ','.join(sorted(copy_nodes))

In [None]:
# Hand-built subgraph: gold statements, their neighbours and the question/answer
# keywords as nodes; edges only from gold statements to nodes in those groups.
SG = type(G)()
for node_group in (copy_gold, copy_adj, copy_qa):
    # carry each node's attribute dict over from G
    SG.add_nodes_from((n, G.nodes[n]) for n in node_group)
for allowed_nbrs in (copy_adj, copy_qa):
    SG.add_edges_from(
        (n, nbr, d)
        for n, nbrs in G.adj.items() if n in copy_gold
        for nbr, d in nbrs.items() if nbr in allowed_nbrs
    )
len(SG)

In [None]:
# Now for the nodes with only one edge:
for n in SG.nodes:
    l=len(list(SG.neighbors(n)))
    if l<=1:
        print(n, l)

In [None]:
def get_lonely_nodes(ex_list, copy_qa=copy_qa, ):
    """Build the subgraph around a set of statements and report weakly-linked nodes.

    The subgraph contains the statements in ``ex_list``, every node adjacent
    to them in the notebook-level graph ``G``, and the nodes in ``copy_qa``;
    its edges are all edges of ``G`` that touch a statement in ``ex_list``.

    NOTE: the default for ``copy_qa`` is bound when this ``def`` is executed,
    and the question/answer keywords are read from the notebook-level ``qa``
    at call time.  After changing ``qa``, re-run the ``copy_qa`` cell and this
    definition (or pass ``copy_qa`` explicitly) to keep the two in step.

    Args:
        ex_list: iterable of statement uids that are nodes of ``G``.
        copy_qa: set of question/answer keyword nodes of ``G``.

    Returns:
        tuple ``(empty_q, empty_a, lonely_leaf, statement_links)``:
          empty_q: question keywords in the subgraph with no edges.
          empty_a: first-answer keywords in the subgraph with no edges.
          lonely_leaf: set of non-``copy_qa`` nodes with at most one edge.
          statement_links: remaining non-``copy_qa``, non-leaf nodes whose
            length differs from 19 (i.e. not uid-shaped).

    Also prints the question keywords, first-answer keywords and leaf nodes.
    """
    ex_list = list(ex_list)  # iterated several times below
    copy_adj = set([nbr for n in ex_list for nbr in G.neighbors(n) ]) # Order of for loops v important

    SG = G.__class__()
    for node_group in (ex_list, copy_adj, copy_qa):
        SG.add_nodes_from((n, G.nodes[n]) for n in node_group)
    # Walk only the adjacency of the explanation statements instead of scanning
    # the whole of G.  No neighbour filter is needed: copy_adj is by
    # construction every neighbour of every statement in ex_list, so each such
    # edge qualifies (and any edge into copy_qa is among them).
    SG.add_edges_from((n, nbr, d)
        for n in ex_list
        for nbr, d in G.adj[n].items())
    #print(len(SG))

    # qa.question.keywords | qa.answers[0].keywords
    question_kws = [n for n in (qa.question.keywords) if n in SG.nodes]
    answer_kws = [n for n in (qa.answers[0].keywords) if n in SG.nodes]
    leaf_nodes = [n for n in (SG.nodes - copy_qa) if len(SG[n])<=1 ]
    print("question keywords", question_kws)
    print("answers[0] keywords", answer_kws)
    print("leaf nodes", leaf_nodes)

    empty_q = [n for n in question_kws if len(SG[n])==0 ]
    empty_a = [n for n in answer_kws if len(SG[n])==0 ]
    lonely_leaf = set(leaf_nodes)

    # Links between explanation statements
    # == non-qa and non-leaf words?
    #  ?+? multi-linked qa words?
    statement_links = [n for n in (SG.nodes - copy_qa - lonely_leaf)
                                if len(n)!=19 ]

    return empty_q, empty_a, lonely_leaf, statement_links

# Run the analysis on the gold explanation statements of the current question
get_lonely_nodes(copy_gold)

In [None]:
# Text and keywords of each gold statement, in uid order
for gold_uid in sorted(copy_gold):
    gold_stmt = statements_by_uid[gold_uid]
    print(f"  {gold_uid} : {gold_stmt.raw_txt}")
    print(f"          {gold_stmt.keywords}")

In [None]:
# Now look at the predictions from baseline_retrieval
ex_guess = preds_baseline[qa.question_id][:5]
get_lonely_nodes( ex_guess )

In [None]:
# Text and keywords of each guessed statement, in ranked order
for guess_uid in ex_guess:
    guess_stmt = statements_by_uid[guess_uid]
    print(f"  {guess_uid} : {guess_stmt.raw_txt}")
    print(f"          {guess_stmt.keywords}")