In [5]:
import json
import gensim
from gensim.utils import tokenize
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, stem_text, strip_multiple_whitespaces, strip_tags, strip_short, strip_punctuation
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from string import ascii_lowercase
import numpy as np
import copy

In [6]:
# Load the raw corpus. Later cells read data["emails"] and data["names"].
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of relying on the locale default.
with open("../fauci-email-data.json", encoding="utf-8") as f:
    data = json.load(f)

In [7]:
# Maps a truncated / misspelled stem to the canonical token that should
# replace it. Several keys look like OCR or truncation artifacts
# (e.g. "covtd", "lnstagram", "nlaid") -- presumably from the scanned
# source emails; confirm before extending the table.
stem_replacement_dict = {
    "fauc": "fauci",
    "covi": "covid",
    "coro": "corona",
    "coron": "corona",
    "corona": "corona",
    "coronav": "coronavirus",
    "coronavi": "coronavirus",
    "coronavir": "coronavirus",
    "coronavirn": "coronavirus",
    "coronaviru": "coronavirus",
    "coronavirus": "coronavirus",
    "coronoviru": "coronavirus",
    "covtd": "covid",
    "econom": "economi",
    "editori": "editoria",
    "edito": "editor",
    "forwa": "forward",
    "globa": "global",
    "healt": "health",
    "instit": "institut",
    "institu": "institut",
    "internationa": "internation",
    "interviewon": "interview",
    "lnstagram": "instagram",
    "orig": "origin",
    "origi": "origin",
    "quarant": "quarantin",
    "reat": "treat",
    "reatment": "treatment",
    "strateg": "strategi",
    "wro": "wrote",
    "wrot": "wrote",
    "iaid": "niaid",
    "viru": "virus",
    "foryour": "your",
    "nlaid": "niaid"
}


def dedup_replace(word):
    """Return the canonical replacement for ``word``, or ``word`` itself.

    Looks ``word`` up in ``stem_replacement_dict``; tokens without an entry
    are passed through unchanged.
    """
    if word in stem_replacement_dict:
        return stem_replacement_dict[word]
    return word

In [8]:
# Explicit gensim preprocessing pipeline.
# NOTE: this list is currently NOT used -- clean_text() calls
# preprocess_string() without it (the argument is commented out there), so
# preprocess_string's own default filters are what actually run.
filters = [strip_tags,
           strip_punctuation,
           strip_multiple_whitespaces, 
           remove_stopwords, 
           strip_short, 
           stem_text]

def clean_text(input_text):
    """Turn raw text into a list of token lists, one per non-empty line.

    Each newline-separated line is tokenized and lowercased with gensim's
    ``tokenize``, re-joined with single spaces, and passed through
    ``preprocess_string`` with its default filters (the module-level
    ``filters`` list is intentionally not passed). Every surviving token is
    normalised with ``dedup_replace``. Lines that end up with no tokens are
    dropped.
    """
    cleaned_sentences = []
    for line in input_text.split("\n"):
        lowered = " ".join(tokenize(line, lowercase=True))
        tokens = preprocess_string(lowered)
        if not tokens:
            continue
        cleaned_sentences.append([dedup_replace(tok) for tok in tokens])
    return cleaned_sentences

In [9]:
# Email chains: iterated below as chain -> email, where each email is a dict
# with at least "subject" and "body" keys.
emails = data["emails"]

In [10]:
# Clean every subject and body, cache the token lists on the email dict
# (later cells read email["clean_subj"] / email["clean_body"]), and pool all
# cleaned lines into all_text, the training corpus for word2vec.
all_text = []
for chain in emails:
    for email in chain:
        email["clean_subj"] = clean_text(email["subject"])
        # extend() is a no-op on an empty list, so no length guard is needed.
        all_text.extend(email["clean_subj"])

        email["clean_body"] = clean_text(email["body"])
        all_text.extend(email["clean_body"])

In [11]:
# Train word2vec on every cleaned subject/body line collected in all_text.
# The commented-out call is an earlier configuration kept for reference.
# NOTE(review): `workers` is left at its default; if training is
# multi-threaded the neighbour lists shown in the cells below may vary
# between runs -- confirm if exact reproducibility matters.
#model = Word2Vec(all_text, vector_size=50, min_count=10) # pretty good!
model = Word2Vec(all_text, vector_size=32, min_count=12,
                 alpha=0.025, min_alpha=0.0001,
                 epochs=8,
                 compute_loss=True  # lets get_latest_training_loss() (commented out below) report a value
                 )
# Uncomment to inspect the vocabulary size / training loss:
#len(model.wv)
#model.get_latest_training_loss() / 1e6

In [12]:
# Persist the trained model to the current working directory.
model.save('fauci-email-w2v.model')

In [13]:
# Sanity check: nearest neighbours of "fauci" as (token, similarity) pairs.
# Vocabulary entries are stemmed tokens, as the output below shows
# (e.g. 'colleagu').
model.wv.most_similar("fauci")

[('toni', 0.8493401408195496),
 ('franci', 0.8412538170814514),
 ('david', 0.8198693990707397),
 ('colleagu', 0.7967791557312012),
 ('rubenstein', 0.7939940690994263),
 ('cnni', 0.7893975377082825),
 ('collin', 0.7886248230934143),
 ('eli', 0.7831961512565613),
 ('sir', 0.7811444401741028),
 ('renat', 0.7785160541534424)]

In [14]:
# Sanity check: nearest neighbours of "trump".
model.wv.most_similar("trump")

[('donald', 0.9098467826843262),
 ('prof', 0.9026286005973816),
 ('head', 0.8996578454971313),
 ('fellow', 0.8890562057495117),
 ('doctor', 0.8698291182518005),
 ('award', 0.8622108101844788),
 ('cet', 0.861404299736023),
 ('alex', 0.8610497117042542),
 ('hall', 0.8596192598342896),
 ('richard', 0.8551174998283386)]

In [15]:
# Sanity check: nearest neighbours of "presid" -- the query must use the
# stemmed form stored in the vocabulary (presumably the stem of "president").
model.wv.most_similar("presid")

[('vice', 0.8999403715133667),
 ('execut', 0.8972741365432739),
 ('senior', 0.8835725784301758),
 ('deputi', 0.87000572681427),
 ('chief', 0.8657837510108948),
 ('director', 0.8408812284469604),
 ('polici', 0.8401379585266113),
 ('harvard', 0.8221364617347717),
 ('affair', 0.8180564641952515),
 ('trump', 0.8175964951515198)]

In [16]:
# Sanity check: nearest neighbours of "covid" (several raw variants are
# mapped onto this token by stem_replacement_dict).
model.wv.most_similar("covid")

[('treatment', 0.8399640321731567),
 ('develop', 0.8345067501068115),
 ('phase', 0.8297527432441711),
 ('ncov', 0.8125050067901611),
 ('antibodi', 0.8106117844581604),
 ('strategi', 0.8037755489349365),
 ('influenza', 0.7999055981636047),
 ('guidelin', 0.7941806316375732),
 ('detect', 0.7752757668495178),
 ('effect', 0.7700123190879822)]

In [17]:
# Sanity check: nearest neighbours of "twitter".
model.wv.most_similar("twitter")

[('googl', 0.983855664730072),
 ('youtub', 0.9806011915206909),
 ('nyt', 0.9786182641983032),
 ('nytim', 0.9775824546813965),
 ('facebook', 0.9739229679107666),
 ('url', 0.9703687429428101),
 ('pitt', 0.9648745059967041),
 ('figliola', 0.9643014669418335),
 ('net', 0.9618445038795471),
 ('doi', 0.9583131670951843)]

In [18]:
# Dump the model vocabulary, alphabetically sorted, one token per line.
# UTF-8 is requested explicitly so non-ASCII tokens cannot fail to encode
# under a narrower platform default.
words = sorted(model.wv.index_to_key)
with open("words.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in words)

In [19]:
# Embedding dimensionality. Read from the model rather than from the vector
# of a specific word, so it does not depend on any particular token having
# survived the min_count cut-off.
dim = model.wv.vector_size

def mean_emb_vec(sentences, keyed_vectors=None, size=None):
    """Average the embeddings of all in-vocabulary words in ``sentences``.

    Parameters
    ----------
    sentences : iterable of iterables of str
        Tokenised sentences (e.g. the output of ``clean_text``). Words from
        all sentences are pooled; sentence boundaries do not affect the mean.
    keyed_vectors : optional
        Any object supporting ``word in keyed_vectors`` and
        ``keyed_vectors[word]``. Defaults to the notebook-level ``model.wv``.
    size : int, optional
        Length of the returned vector. Defaults to the notebook-level ``dim``.

    Returns
    -------
    numpy.ndarray
        float64 vector of length ``size``: the mean of the vectors of every
        word found in ``keyed_vectors`` (repeats counted each time), or all
        zeros when no word is found.
    """
    if keyed_vectors is None:
        keyed_vectors = model.wv
    if size is None:
        size = dim

    # Accumulate in a float64 buffer, one word at a time.
    emb = np.zeros(size)
    num_words = 0
    for sentence in sentences:
        for word in sentence:
            if word in keyed_vectors:
                emb += keyed_vectors[word]
                num_words += 1

    # Avoid dividing by zero when nothing was in the vocabulary.
    if num_words == 0:
        return emb
    return emb / num_words


# Attach a mean word-vector to every email for its subject and its body.
# .tolist() stores plain Python floats, so the values stay JSON-serialisable
# regardless of the array dtype.
for chain in emails:
    for email in chain:
        email["subject_embedding"] = mean_emb_vec(email["clean_subj"]).tolist()
        email["body_embedding"] = mean_emb_vec(email["clean_body"]).tolist()

In [20]:
# Build shallow copies of every email without the intermediate token lists,
# keeping the chain structure; the originals in `emails` are left untouched.
embedded_emails = []
for chain in emails:
    embedded_chain = []
    for email in chain:
        embedded_email = copy.copy(email)
        for token_key in ("clean_subj", "clean_body"):
            del embedded_email[token_key]
        embedded_chain.append(embedded_email)
    embedded_emails.append(embedded_chain)

In [21]:
# Swap in the slimmed-down copies so the exported data omits the
# clean_subj / clean_body token lists but keeps the embeddings.
data["emails"] = embedded_emails

In [22]:
# One (dim x num_nodes) accumulator per (text field, role) pair; column k
# collects the embeddings of emails in which node k appears in that role.
num_nodes = len(data["names"])
feature_shape = (dim, num_nodes)
X_subj_sender, X_body_sender = np.zeros(feature_shape), np.zeros(feature_shape)
X_subj_recip, X_body_recip = np.zeros(feature_shape), np.zeros(feature_shape)
X_subj_cc, X_body_cc = np.zeros(feature_shape), np.zeros(feature_shape)

In [23]:
# Per-node tallies of how many emails each node sent, received, or was cc'd on.
sender_counts, recip_counts, cc_counts = (np.zeros(num_nodes) for _ in range(3))

In [24]:
# Sum each email's subject/body embedding into the column of every node
# involved, separately for the sender, recipient and cc roles, and count the
# number of contributions per node and role.
for chain in embedded_emails:
    for email in chain:
        subj_emb = email['subject_embedding']
        body_emb = email['body_embedding']

        # (node indices, subject matrix, body matrix, counter) for each role;
        # the single sender is wrapped in a list so all roles share one loop.
        role_targets = (
            ([email['sender']], X_subj_sender, X_body_sender, sender_counts),
            (email['recipients'], X_subj_recip, X_body_recip, recip_counts),
            (email['cc'], X_subj_cc, X_body_cc, cc_counts),
        )
        for nodes, X_subj, X_body, counts in role_targets:
            for node in nodes:
                X_subj[:, node] += subj_emb
                X_body[:, node] += body_emb
                counts[node] += 1

In [25]:
# Turn the per-role sums into means. Columns whose count is zero are skipped
# and therefore stay all-zero.
for counts, X_subj, X_body in (
    (sender_counts, X_subj_sender, X_body_sender),
    (recip_counts, X_subj_recip, X_body_recip),
    (cc_counts, X_subj_cc, X_body_cc),
):
    seen = counts > 0
    X_subj[:, seen] /= counts[seen]
    X_body[:, seen] /= counts[seen]

# Stack the six blocks vertically: X has 6 * dim rows and one column per node.
X = np.vstack([
    X_subj_sender, X_body_sender,
    X_subj_recip, X_body_recip,
    X_subj_cc, X_body_cc,
])

# One feature vector (a column of X) per node, in node-index order.
data['node_features'] = [list(column) for column in X.T]

In [26]:
# Write the augmented dataset (emails with embeddings, plus the per-node
# feature vectors in data['node_features']) to the working directory.
with open('fauci-email-graph-w2v.json', 'w') as f:
    json.dump(data, f)