In [1]:
import numpy as np
import json
import os
import sys
# Make the parent of the current working directory and its ``parsing``
# sub-directory importable, so that the ``parsing.*`` imports in the next cell
# resolve.
nb_dir = os.path.split(os.getcwd())[0]
# Check each entry independently so that the ``parsing`` directory is added
# even when ``nb_dir`` itself is already present on ``sys.path``.
for extra_path in (nb_dir, os.path.join(nb_dir, 'parsing')):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [2]:
import pandas as pd
from parsing.medhop_graph_extraction import extract_graph, extract_target_and_relation
from parsing.medhop_path_extraction import extract_paths_dataset
import networkx as nx
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Initialise Plotly's offline notebook mode; the figure built in plot_graph is
# displayed through ``iplot``.
init_notebook_mode(connected=True)

In [3]:
# Load the MedHop training queries from the QAngaroo v1.1 JSON file into a
# DataFrame (the file is read with the 'records' orientation).
df = pd.read_json('../qangaroo_v1.1/medhop/train.json', orient='records')

In [4]:
# Graph construction: the return values are discarded, and later cells read
# df['graph'] and df['target'], so these helpers presumably add those columns
# to ``df`` in place -- confirm against parsing.medhop_graph_extraction.
extract_graph(df, entity_list_path='../parsing/entities.txt', sentence_wise=False)
extract_target_and_relation(df)
# Path extraction settings, gathered in one place for readability.
# NOTE(review): ``cutoff`` and ``limit`` look like a maximum path length and a
# cap on extracted paths -- verify in parsing.medhop_path_extraction.
path_extraction_options = {
    'path_search_method_name': 'all',
    'cutoff': 4,
    'limit': 100,
}
train_dataset = extract_paths_dataset(df, **path_extraction_options)

  1%|          | 89/14466 [00:00<00:16, 882.22it/s]

20111 possible entities


100%|██████████| 14466/14466 [00:22<00:00, 632.32it/s]
1620it [05:14,  5.14it/s]

Total examples: 14436
Positive example failures: 0
Negative example failures: 0
Total with path: 14436





In [18]:
# Per-query coverage flags. Membership is tested against ``graph[0]``, the
# first element of each row's graph description.
df['has_answer'] = df.apply(lambda query: int(query['answer'] in query['graph'][0]), axis=1)
df['has_target'] = df.apply(lambda query: int(query['target'] in query['graph'][0]), axis=1)
# Number of candidates, other than the answer, that are present in the graph.
df['has_candidates'] = df.apply(
    lambda query: sum(1 for candidate in query['candidates']
                      if candidate != query['answer'] and candidate in query['graph'][0]),
    axis=1)
# Candidate count minus one (assumes the answer occurs exactly once among the
# candidates -- TODO confirm).
df['total_candidates_exc_answer'] = df.apply(lambda query: len(query['candidates']) - 1, axis=1)

#### Total number of queries

In [19]:
len(df)

1620

#### Queries that have the target in the graph

In [20]:
df['has_target'].sum()

1620

#### Queries that have the answer in the graph

In [21]:
df['has_answer'].sum()

1620

#### Total candidates except the answer candidates

In [22]:
df['total_candidates_exc_answer'].sum()

12816

#### Total candidates except answer candidates in the graph

In [23]:
df['has_candidates'].sum()

12816

#### Number of answer-target pairs in the same connected component

In [24]:
def are_connected(graph_info, source, target):
    """Return 1 if ``source`` and ``target`` are joined by a path, else 0.

    Parameters
    ----------
    graph_info : sequence
        Graph description. Element 0 maps node names to the node indices used
        in the graph; element 1 is the adjacency matrix (anything accepted by
        ``np.array``).
    source, target
        Node names to look up in ``graph_info[0]``.

    Returns
    -------
    int
        1 when both names are present in ``graph_info[0]`` and a path exists
        between the corresponding nodes; 0 when either name is missing or no
        path exists.
    """
    name_to_index = graph_info[0]
    # Bail out before building the graph: a missing endpoint can never be
    # connected, and constructing the graph is the expensive step.
    if source not in name_to_index or target not in name_to_index:
        return 0
    # ``from_numpy_array`` is used because ``from_numpy_matrix`` was removed
    # in NetworkX 3.0.
    G = nx.from_numpy_array(np.array(graph_info[1]))
    return 1 if nx.has_path(G, name_to_index[source], name_to_index[target]) else 0

def shortest_path_length(graph_info, source, target):
    """Return the length of a shortest path between two named nodes.

    Parameters
    ----------
    graph_info : sequence
        Graph description. Element 0 maps node names to node indices; element
        1 is the adjacency matrix (anything accepted by ``np.array``).
    source, target
        Node names; both must be present in ``graph_info[0]``.

    Returns
    -------
    The value returned by ``nx.shortest_path_length`` for the two nodes.

    Raises
    ------
    A lookup error (``KeyError`` when ``graph_info[0]`` is a dict) if a name is
    missing, and whatever ``nx.shortest_path_length`` raises when the nodes
    are not connected. Unlike ``are_connected`` this function does not guard
    against either case.
    """
    # ``from_numpy_array`` is used because ``from_numpy_matrix`` was removed
    # in NetworkX 3.0.
    G = nx.from_numpy_array(np.array(graph_info[1]))
    name_to_index = graph_info[0]
    return nx.shortest_path_length(G, name_to_index[source], name_to_index[target])

def path_count(graph_info, source, target, cutoff=4):
    """Count the simple paths between two named nodes.

    Parameters
    ----------
    graph_info : sequence
        Graph description. Element 0 maps node names to node indices; element
        1 is the adjacency matrix (anything accepted by ``np.array``).
    source, target
        Node names; both must be present in ``graph_info[0]``.
    cutoff : int, optional
        Passed to ``nx.all_simple_paths`` as its ``cutoff`` argument.
        Defaults to 4, the value that was previously hard-coded.

    Returns
    -------
    int
        Number of paths yielded by ``nx.all_simple_paths``.
    """
    # ``from_numpy_array`` is used because ``from_numpy_matrix`` was removed
    # in NetworkX 3.0.
    G = nx.from_numpy_array(np.array(graph_info[1]))
    name_to_index = graph_info[0]
    simple_paths = nx.all_simple_paths(
        G, name_to_index[source], name_to_index[target], cutoff=cutoff)
    # Consume the generator one path at a time rather than holding every path
    # in a list just to take its length.
    return sum(1 for _ in simple_paths)

In [25]:
# 1 when the answer and the target of a query are connected, 0 otherwise.
df['answer_target_connected'] = df.apply(
    lambda query: are_connected(query['graph'], query['answer'], query['target']),
    axis=1,
)
# Number of candidates, other than the answer, that are connected to the target.
df['answer_candidates_connected'] = df.apply(
    lambda query: sum(
        are_connected(query['graph'], candidate, query['target'])
        for candidate in query['candidates']
        if candidate != query['answer']
    ),
    axis=1,
)
# Shortest path length between the answer and the target of each query.
df['shortest_target_path'] = df.apply(
    lambda query: shortest_path_length(query['graph'], query['answer'], query['target']),
    axis=1,
)

#### How many answer-target tuples are connected: all

In [26]:
df['answer_target_connected'].sum()

1620

#### How many candidate-target tuples are connected: all

In [27]:
df['answer_candidates_connected'].sum()

12816

#### Maximum shortest answer-target path length

In [28]:
df['shortest_target_path'].max()

4

#### Average shortest answer-target path length

In [29]:
df['shortest_target_path'].mean()

2.9327160493827162

#### Number of paths of length at most 4 between answer and target

In [30]:
# Number of answer-target paths per query, as counted by ``path_count``.
df['answer_target_path_count'] = df.apply(
    lambda query: path_count(query['graph'], query['answer'], query['target']),
    axis=1,
)

In [31]:
df['answer_target_path_count'].mean()

88.62407407407407

In [32]:
df['answer_target_path_count'].max()

5200

In [33]:
df['answer_target_path_count'].min()

1

In [34]:
# Build the graph of the query at index label 0 from its adjacency matrix and
# compute a Kamada-Kawai layout for it.
# ``from_numpy_array`` is used because ``from_numpy_matrix`` was removed in
# NetworkX 3.0.
G = nx.from_numpy_array(np.array(df['graph'][0][1]))
pos = nx.kamada_kawai_layout(G)

In [35]:
def plot_graph(graph_info, answer, target, candidates):
    """Draw a query's graph as an interactive Plotly figure.

    Nodes are coloured by role: red for the target, green for the answer,
    blue for the remaining candidates and grey for every other node. Node
    hover text is the node name; edge labels are attached as hover text to
    the points of each edge segment.

    Parameters
    ----------
    graph_info : sequence
        Graph description. Element 1 is the adjacency matrix (anything
        accepted by ``np.array``), element 2 maps ``(u, v)`` node-index pairs
        to edge hover text, and element 3 maps node indices to node names.
    answer, target
        Names of the answer and target nodes.
    candidates : container
        Names of the candidate nodes (tested with ``in``).

    Returns
    -------
    None
        The figure is displayed with ``iplot``.
    """
    # ``from_numpy_array`` is used because ``from_numpy_matrix`` was removed
    # in NetworkX 3.0.
    G = nx.from_numpy_array(np.array(graph_info[1]))
    pos = nx.fruchterman_reingold_layout(G)
    edge_labels = graph_info[2]
    node_names = graph_info[3]

    # Collect the trace data in plain lists and hand them to ``go.Scatter``
    # once, instead of mutating trace properties in place: in-place ``+=`` /
    # ``.append`` fails when the trace exposes its arrays as tuples.
    edge_x, edge_y, edge_text = [], [], []
    for u, v in G.edges():
        x0, y0 = pos[u][0], pos[u][1]
        x1, y1 = pos[v][0], pos[v][1]
        # The ``None`` entry breaks the line so consecutive edges are not joined.
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
        # Hover text is matched to points by position, so each edge needs one
        # text entry per emitted point (two endpoints plus the separator).
        edge_text += [edge_labels[(u, v)]] * 3

    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        text=edge_text,
        line=dict(width=0.7, color='black'),
        hoverinfo='text',
        mode='lines')

    def role_color(name):
        # Target takes precedence over answer, which takes precedence over
        # the other candidates.
        if name == target:
            return 'red'
        if name == answer:
            return 'green'
        if name in candidates:
            return 'blue'
        return 'grey'

    node_x, node_y, node_text, node_colors = [], [], [], []
    for node in G.nodes():
        name = node_names[node]
        node_x.append(pos[node][0])
        node_y.append(pos[node][1])
        node_text.append(name)
        node_colors.append(role_color(name))

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        text=node_text,
        mode='markers',
        hoverinfo='text',
        marker={'color': node_colors,
                'size': 14,
                'line': {'color': 'black', 'width': 1}})

    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig = go.Figure(
        data=[edge_trace, node_trace],
        layout=go.Layout(
            title='<br>KB graph',
            titlefont=dict(size=16),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=5, r=5, t=40),
            xaxis=dict(hidden_axis),
            yaxis=dict(hidden_axis)))

    iplot(fig, filename='networkx')

In [36]:
# Index label of the query whose graph is drawn.
plot_idx = 0
plot_graph(*(df[column][plot_idx] for column in ('graph', 'answer', 'target', 'candidates')))