In [1]:
import json

In [2]:
# Parse the email-graph JSON document straight from the open file handle.
with open("fauci-email-graph.json") as graph_file:
    data = json.load(graph_file)

In [3]:
emails = data["emails"]
names = data["names"]
clusters = data["clusters"]

# Iteratively shrink `keep` to the ids that appear in all three roles
# (sender, recipient, cc) among the emails whose participants are themselves
# still in `keep`. Every pass can only remove ids, so the loop terminates.
#
# `inds` is rebuilt on every pass and holds one (sender, recipient, cc, weight)
# tuple per recipient/cc pair of each qualifying email; the weights of a single
# email sum to 1.
#
# NOTE(review): the starting set runs one index past len(names). That extra id
# is pruned unless the data actually uses it -- confirm whether ids are 0-based.
keep = set(range(len(names) + 1))
while True:
    inds = []
    senders, receivers, copied = set(), set(), set()

    for chain in emails:
        for email in chain:
            s = email["sender"]
            if s not in keep:
                continue

            recipients = [r for r in email["recipients"] if r in keep]
            ccs = [c for c in email["cc"] if c in keep]

            # An email only contributes if it still has at least one kept
            # recipient AND at least one kept cc.
            num_inds = len(recipients) * len(ccs)
            if num_inds == 0:
                continue

            senders.add(s)
            receivers.update(recipients)
            copied.update(ccs)

            weight = 1.0 / num_inds
            inds.extend((s, r, c, weight) for r in recipients for c in ccs)

    new_keep = senders & receivers & copied
    # Stop only at a true fixed point. Exiting as soon as the set becomes empty
    # would leave `inds` populated with ids from the previous, larger `keep`,
    # and the re-indexing step that follows would fail with a KeyError. One
    # extra (trivial) pass over an empty `keep` leaves `inds` empty instead.
    if new_keep == keep:
        break
    keep = new_keep

keep = sorted(keep)
dim = len(keep)
print(dim, "nodes in SCC")

44 nodes in SCC


In [4]:
# Map every surviving original id to its position in the sorted `keep` list,
# then express names, clusters and tensor coordinates in that compact numbering.
id_map = {original_id: position for position, original_id in enumerate(keep)}
clean_names = [names[original_id] for original_id in keep]
clean_clusters = [clusters[original_id] for original_id in keep]

clean_inds = [
    (id_map[sender], id_map[recipient], id_map[cc], weight)
    for sender, recipient, cc, weight in inds
]
print(len(clean_inds), "emails")

1413 emails


In [5]:
# Collapse repeated coordinates: weights that share the same
# (cc, recipient, sender) key are summed into a single entry.
combined_inds = {}
for sender, recipient, cc, weight in clean_inds:
    key = (cc, recipient, sender)
    combined_inds[key] = combined_inds.get(key, 0.0) + weight

num_entries = len(combined_inds)
print(num_entries, "unique cc/receiver/sender indices")

551 unique cc/receiver/sender indices


In [6]:
# Write out human-readable json
tensor_fname = 'cc-recipient-sender-tensor.json'


def _write_json_array(f, key, rows, last=False):
    """Write `"key": [...]` to `f` with one already-serialized row per line.

    Rows are separated by commas; the closing bracket is followed by a comma
    unless `last` is true (i.e. this is the final member of the JSON object).
    """
    f.write(f'"{key}": [\n')
    for i, row in enumerate(rows):
        extra = ',' if i < len(rows) - 1 else ''
        f.write(f'{row}{extra}\n')
    f.write(']\n' if last else '],\n')


with open(tensor_fname, "w") as f:
    f.write('{\n')

    # tensor dimensions / number of nodes ^ 3
    _write_json_array(f, "dimensions", [f'[{dim}, {dim}, {dim}]'])

    # number of entries
    _write_json_array(f, "num_entries", [f'{num_entries}'])

    # tensor entries (index and value)
    _write_json_array(
        f,
        "entries",
        [f'[{c}, {r}, {s}, {v}]' for (c, r, s), v in combined_inds.items()],
    )

    # names of people -- serialized with json.dumps so quotes, backslashes and
    # control characters inside a name are escaped instead of corrupting the
    # file; ensure_ascii=False keeps ordinary names byte-for-byte as before.
    _write_json_array(
        f,
        "names",
        [json.dumps(str(name), ensure_ascii=False) for name in clean_names],
    )

    # clusters / organizations of people -- json.dumps keeps numbers unchanged
    # and emits valid JSON for None / bool / string cluster labels as well.
    _write_json_array(
        f,
        "clusters",
        [json.dumps(cluster, ensure_ascii=False) for cluster in clean_clusters],
        last=True,
    )

    f.write('}\n')

In [7]:
# Round-trip check: the file written above must parse as JSON.
with open(tensor_fname) as tensor_file:
    tensor = json.load(tensor_file)