In [None]:
import collections
import glob
import json
import pandas as pd

import holoviews as hv
hv.extension('bokeh')
from bokeh.plotting import show



# Root directory; each split is read from the sub-directory of the same name.
dataset_dir = "../data_prep/final_dataset/"

SUBSETS = "train dev test".split()

# Maps split name -> list of parsed JSON documents found for that split.
datasets = collections.defaultdict(list)

for subset in SUBSETS:
    # glob yields paths in arbitrary order; sort so every run loads the files
    # in the same order.
    for filename in sorted(glob.glob(dataset_dir + subset + "/*")):
        # Read JSON as UTF-8 explicitly instead of relying on the platform's
        # default text encoding.
        with open(filename, 'r', encoding="utf-8") as f:
            datasets[subset].append(json.load(f))

In [None]:
# Flatten the per-split lists into a single list of documents. A comprehension
# is linear, whereas sum(list_of_lists, []) re-copies the accumulator each step.
all_pairs = [pair for subset_pairs in datasets.values() for pair in subset_pairs]

# mycounter[review "fine" label][rebuttal "fine" label] -> number of alignments;
# filled in by get_transition_probabilities.
mycounter = collections.defaultdict(collections.Counter)


# One list of rebuttal "fine" labels per processed pair; appended to by
# get_transition_probabilities.
rebuttal_sequences = []

def get_transition_probabilities(review_sentences, rebuttal_sentences):
    """Tally review-to-rebuttal label alignments for one review/rebuttal pair.

    Nothing is returned; despite the name, no probabilities are computed here.
    Results are accumulated in two module-level objects:

    * ``rebuttal_sequences`` gains one list holding the ``"fine"`` label of
      every aligned rebuttal sentence, repeated once per review index that the
      sentence is aligned to.
    * ``mycounter[review_fine][rebuttal_fine]`` is incremented once for each
      rebuttal label aligned to a review sentence whose ``"coarse"`` label is
      ``"arg_request"``.

    Rebuttal sentences whose ``alignment[1]`` is ``None`` contribute nothing.

    Args:
        review_sentences: sequence of dicts read via ``"coarse"`` and ``"fine"``.
        rebuttal_sentences: iterable of dicts read via ``"alignment"`` and
            ``"fine"``; ``alignment[1]`` is ``None`` or an iterable of positions
            into ``review_sentences``.
    """
    labels_by_review_index = {}
    aligned_labels = []
    for rebuttal in rebuttal_sentences:
        aligned_indices = rebuttal["alignment"][1]
        if aligned_indices is None:
            continue
        for review_index in aligned_indices:
            # labels_by_review_index[review_index].append((rebuttal["coarse"], rebuttal["fine"]))
            labels_by_review_index.setdefault(review_index, []).append(rebuttal["fine"])
            aligned_labels.append(rebuttal["fine"])
    rebuttal_sequences.append(aligned_labels)

    for position, review in enumerate(review_sentences):
        targets = labels_by_review_index.get(position, ())
        # Only request-type review sentences with at least one aligned
        # rebuttal sentence are counted.
        if targets and review["coarse"] == "arg_request":
            review_counts = mycounter[review["fine"]]
            for rebuttal_label in targets:
                review_counts[rebuttal_label] += 1
            
        

In [None]:
# get_transition_probabilities only ever adds to these module-level tallies,
# so start from empty: re-running this cell must not double-count.
mycounter.clear()
rebuttal_sequences.clear()

for x in all_pairs:
    # Only pairs recorded under annotator "anno0" are tallied.
    if x["metadata"]["annotator"] == "anno0":
        get_transition_probabilities(x["review_sentences"], x["rebuttal_sentences"])

# Transitions observed this many times or fewer are dropped from the table.
RARE_TRANSITION_CUTOFF = 3

for_df = [
    {"rev_type": rev_type, "reb_type": reb_type, "count": count}
    for rev_type, reb_types in mycounter.items()
    for reb_type, count in reb_types.items()
    if count > RARE_TRANSITION_CUTOFF
]
# Name the columns explicitly so the frame keeps its schema even when no
# transition passes the filter (a frame built from [] has no columns at all).
sankey_df = pd.DataFrame(for_df, columns=["rev_type", "reb_type", "count"])

In [None]:
# Show the (review type, rebuttal type, count) table for inspection.
sankey_df

In [None]:
# Sankey diagram with review "fine" labels as source nodes, rebuttal "fine"
# labels as target nodes, and the transition count as the edge value.
sankey2 = hv.Sankey(sankey_df, kdims=["rev_type", "reb_type"], vdims=["count"])

sankey2.opts(
    cmap="PuBuGn_r", label_position='outer',
    edge_color='rev_type', edge_line_width=0,
    node_alpha=1.0, node_width=40, node_sort=False,
    width=800, height=1200, bgcolor="snow",
    # The title must describe the data actually plotted in this figure.
    title="Rebuttal sentence types aligned to each review request type",
)

# Convert the HoloViews element to a Bokeh figure and display it.
show(hv.render(sankey2))

In [None]:
def collapse_sequence(sequence):
    """Return ``sequence`` as a tuple with runs of equal neighbours merged.

    Each maximal run of consecutive equal items is reduced to a single item,
    e.g. ``["a", "a", "b", "a"]`` becomes ``("a", "b", "a")``. Any falsy input
    (empty sequence, ``None``) yields the empty tuple.
    """
    if not sequence:
        return ()
    merged = list(sequence[:1])
    for position in range(1, len(sequence)):
        candidate = sequence[position]
        # Keep an item only when it differs from the last one kept.
        if not (candidate == merged[-1]):
            merged.append(candidate)
    return tuple(merged)
# Count how many collected rebuttal sequences share each collapsed pattern.
reb_seq_counter = collections.Counter(
    collapse_sequence(seq) for seq in rebuttal_sequences
)

# Average number of sequences per distinct collapsed pattern. With nothing
# collected the counter is empty and the ratio is undefined, so guard the
# division instead of raising ZeroDivisionError.
if reb_seq_counter:
    print(sum(reb_seq_counter.values())/len(reb_seq_counter))
else:
    print("No rebuttal sequences collected; nothing to average.")