In [1]:
import numpy as np
import json
from gensim.models import Word2Vec
import datetime
from datetime import date
from collections import Counter

In [2]:
# Parse the email graph JSON; later cells read its "emails" and "names" entries.
with open("fauci-email-graph-2.json") as graph_file:
    data = json.load(graph_file)

# Saved gensim Word2Vec model; its word vectors are queried below to find the
# vocabulary word nearest to each email's body embedding.
model = Word2Vec.load('fauci-email-w2v.model')

In [3]:
# Build a sparse 4-way tensor (sender, recipient, day offset, word index)
# from every email dated on or after start_date, then write it to JSON.
emails = data["emails"]
start_date = date.fromisoformat("2020-01-25")

tensor_inds = []   # (sender, recipient, day offset, word index) tuples
tensor_vals = []   # one weight per tuple, parallel to tensor_inds
words = []         # word index -> word, in order of first appearance
word_map = dict()  # word -> word index

for chain in emails:
    for email in chain:
        # Only the leading 10 characters of the timestamp (the ISO date) are used.
        t = date.fromisoformat(email["time"][:10])
        t_diff = (t - start_date).days
        if t_diff < 0:
            # Dated before start_date: not part of this tensor.
            continue

        sender = email["sender"]
        recipients = email["recipients"]
        body_emb = email["body_embedding"]

        # most_similar yields (word, similarity) pairs; keep the single best word.
        nearest = model.wv.most_similar(positive=[np.array(body_emb)], topn=1)
        body_word = nearest[0][0]
        if body_word not in word_map:
            # First time this word shows up: give it the next free index.
            word_map[body_word] = len(word_map)
            words.append(body_word)
        word_ind = word_map[body_word]

        # One entry per recipient, each weighted by 1 / (number of recipients).
        # The division sits inside the generator so an empty recipient list
        # contributes nothing and never divides by zero.
        tensor_inds.extend((sender, recip, t_diff, word_ind) for recip in recipients)
        tensor_vals.extend(1.0 / len(recipients) for _ in recipients)

description = """
 Tensor index (i, j, k, l) with value v corresponds to
 i sent 
 j an email 
 k days after 2020-01-25 and the
 lth word was closest to the embedding of the email body and
 v is the inverse of the number of recipients
 
 The third index covers 102 consecutive days with an email
 
 There can be duplicate indices corresponding to emails on the same day
 
 An alternative tensor could just come from the first three indices.
 
 CCed people are not included
"""

# Label every day offset from 0 through the largest one seen with its ISO date.
max_day = max(ind[2] for ind in tensor_inds)
dates = [
    (start_date + datetime.timedelta(days=offset)).isoformat()
    for offset in range(max_day + 1)
]

out_data = dict(
    indices=tensor_inds,
    values=tensor_vals,
    names=data["names"],
    words=words,
    dates=dates,
    description=description,
)
with open('fauci-email-tensor-1.json', 'w') as out_file:
    json.dump(out_data, out_file)

In [26]:
# Build a sparse 3-way tensor (sender, recipient, cc) restricted to a fixed
# subset of nodes, then write it to JSON together with the subset's names.
tensor_inds = []
tensor_vals = []

# Hand-picked node ids, shifted down by one before being used as indices into
# data["names"] (presumably they were recorded 1-based, and "scc" presumably
# stands for a strongly connected component -- confirm against the source of
# this list).
scc_inds = np.array([1, 2, 3, 4, 5, 7, 22, 30, 32, 34, 35, 36, 48, 54, 67, 68, 77,
                     79, 80, 81, 83, 88, 102, 123, 124, 125, 133, 137, 140, 159,
                     161, 162, 220, 229, 239, 241, 292, 293, 294, 368, 375, 399,
                     480, 498, 503, 508, 536, 550, 551, 556, 559, 576, 579, 584,
                     613, 635, 639, 666, 672, 677, 694, 715, 716, 728, 729, 730,
                     733, 734, 735, 737, 738, 740, 751, 795, 816, 817, 818]) - 1

# Original node id -> position within the subset.
node_map = {ind: i for (i, ind) in enumerate(scc_inds)}

for chain in emails:
    for email in chain:
        sender = email["sender"]
        scc_sender = node_map.get(sender)
        if scc_sender is None:
            # Sender is outside the subset: ignore the whole email.
            continue

        # Keep only recipients and CCs that are in the subset, re-indexed.
        recipients = email["recipients"]
        scc_recips = [node_map[r] for r in recipients if r in node_map]

        ccs = email["cc"]
        scc_ccs = [node_map[c] for c in ccs if c in node_map]
        # Diagnostic: number of CCs on the email vs. number kept in the subset.
        print(len(ccs), len(scc_ccs))

        n_pairs = len(scc_recips) * len(scc_ccs)
        if n_pairs == 0:
            # No (recipient, cc) pair survives the filter: nothing to add.
            continue

        # One entry per (recipient, cc) pair, all sharing the weight 1 / n_pairs.
        pairs = [(scc_sender, recip, cc) for recip in scc_recips for cc in scc_ccs]
        tensor_inds.extend(pairs)
        tensor_vals.extend([1.0 / n_pairs] * n_pairs)


out_data = dict(
    indices=tensor_inds,
    values=tensor_vals,
    names=[data["names"][i] for i in scc_inds],
)
with open('fauci-email-tensor-2.json', 'w') as out_file:
    json.dump(out_data, out_file)

3 3
3 3
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
2 2
2 2
0 0
2 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
3 2
0 0
2 2
0 0
3 3
0 0
5 5
0 0
0 0
0 0
0 0
0 0
0 0
0 0
3 3
3 3
0 0
0 0
0 0
2 1
2 0
0 0
0 0
0 0
0 0
5 3
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 0
1 0
0 0
0 0
0 0
4 3
4 3
0 0
0 0
0 0
1 1
1 0
0 0
0 0
1 1
0 0
1 1
1 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
2 2
0 0
0 0
0 0
0 0
0 0
4 4
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
2 1
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
4 2
0 0
1 1
0 0
0 0
0 0
0 0
1 1
2 2
1 1
4 3
3 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
2 1
3 2
3 2
3 2
1 0
1 0
1 1
1 1
1 1
0 0
0 0
0 0
2 2
1 1
3 0
0 0
0 0
3 0
1 0
0 0
0 0
1 1
1 1
0 0
0 0
0 0
0 0
0 0
0 0
4 3
0 0
1 0
5 4
5 5
5 5
0 0
1 1
1 1
0 0
0 0
0 0
5 4
0 0
0 0
3 3
3 3
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
5 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0


In [29]:
# Distinct values found in each of the first three positions of the current
# tensor_inds tuples (sender, recipient and cc when run after the 3-way build).
s = {ind[0] for ind in tensor_inds}
r = {ind[1] for ind in tensor_inds}
c = {ind[2] for ind in tensor_inds}