In [1]:
import json
import gensim
from gensim.utils import tokenize
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, stem_text, strip_multiple_whitespaces, strip_tags, strip_short, strip_punctuation
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from string import ascii_lowercase
import numpy as np
import copy



In [2]:
# Load the raw email dataset. JSON is read as UTF-8 explicitly so that the
# result does not depend on the platform's default text encoding.
with open("../fauci-email-data.json", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Canonical stem -> variants that should be collapsed onto it. The variants
# look like truncated or mis-recognised forms of the canonical token
# (presumably extraction artefacts in the source text -- verify against the data).
_STEM_VARIANTS = {
    "fauci": ("fauc",),
    "covid": ("covi", "covtd"),
    "corona": ("coro", "coron", "corona"),
    "coronavirus": (
        "coronav",
        "coronavi",
        "coronavir",
        "coronavirn",
        "coronaviru",
        "coronavirus",
        "coronoviru",
    ),
    "economi": ("econom",),
    "editoria": ("editori",),
    "editor": ("edito",),
    "forward": ("forwa",),
    "global": ("globa",),
    "health": ("healt",),
    "institut": ("instit", "institu"),
    "internation": ("internationa",),
    "interview": ("interviewon",),
    "instagram": ("lnstagram",),
    "origin": ("orig", "origi"),
    "quarantin": ("quarant",),
    "treat": ("reat",),
    "treatment": ("reatment",),
    "strategi": ("strateg",),
    "wrote": ("wro", "wrot"),
    "niaid": ("iaid", "nlaid"),
    "virus": ("viru",),
    "your": ("foryour",),
}

# Flat lookup table: variant -> canonical stem.
stem_replacement_dict = {
    variant: canonical
    for canonical, variants in _STEM_VARIANTS.items()
    for variant in variants
}


def dedup_replace(word):
    """Return the canonical form of `word`, or `word` itself if it has no mapping."""
    if word in stem_replacement_dict:
        return stem_replacement_dict[word]
    return word

In [4]:
# Candidate custom filter pipeline.
# NOTE: this list is currently NOT passed to preprocess_string() in
# clean_text(), so gensim's default filters are what actually run.
filters = [
    strip_tags,
    strip_punctuation,
    strip_multiple_whitespaces,
    remove_stopwords,
    strip_short,
    stem_text,
]


def clean_text(input_text):
    """Turn raw text into a list of cleaned token lists, one per non-empty line.

    Each line of `input_text` (split on "\\n") is lower-cased and tokenized
    with gensim's `tokenize`, re-joined with spaces, run through
    `preprocess_string` with its default filters, and finally every token is
    mapped through `dedup_replace`. Lines that yield no tokens are dropped.

    Returns:
        list[list[str]]: cleaned tokens grouped by source line.
    """
    result = []
    for line in input_text.split("\n"):
        lowered = " ".join(tokenize(line, lowercase=True))
        tokens = preprocess_string(lowered)
        if not tokens:
            continue
        result.append([dedup_replace(token) for token in tokens])
    return result

In [5]:
# `emails` is iterated below as a list of chains, each chain being a list of
# email dicts (with at least "subject" and "body" entries).
emails = data["emails"]

In [6]:
# Build the training corpus: one entry per cleaned line of every subject and
# body. The cleaned token lists are also cached on each email dict (in place)
# under "clean_subj" / "clean_body" for later use.
all_text = []
for chain in emails:
    for email in chain:
        for raw_key, clean_key in (("subject", "clean_subj"), ("body", "clean_body")):
            sentences = clean_text(email[raw_key])
            email[clean_key] = sentences
            all_text.extend(sentences)

In [7]:
# Train word vectors on every cleaned subject/body line collected in `all_text`.
# An earlier configuration (vector_size=50, min_count=10) was noted as "pretty good!".
# Diagnostics that can be run after training:
#   len(model.wv)                            -> vocabulary size
#   model.get_latest_training_loss() / 1e6   -> training loss (needs compute_loss=True)
# NOTE(review): no seed/workers are passed, so gensim's defaults apply; confirm
# whether run-to-run reproducibility of the results matters here.
model = Word2Vec(
    sentences=all_text,
    vector_size=32,     # dimensionality of each word vector
    min_count=12,       # minimum token frequency to enter the vocabulary
    epochs=8,           # passes over the corpus
    alpha=0.025,        # initial learning rate
    min_alpha=0.0001,   # final learning rate
    compute_loss=True,
)

In [8]:
# Persist the trained model next to the notebook (relative path).
model.save('fauci-email-w2v.model')

In [9]:
# Sanity check: nearest neighbours (token, similarity) of "fauci" in the embedding space.
model.wv.most_similar("fauci")

[('toni', 0.878736674785614),
 ('franci', 0.8447346687316895),
 ('colleagu', 0.8063818216323853),
 ('collin', 0.7924655675888062),
 ('sir', 0.7867681980133057),
 ('prof', 0.7852154970169067),
 ('sun', 0.7787708044052124),
 ('david', 0.7730051875114441),
 ('avi', 0.7680595517158508),
 ('webbi', 0.7609789967536926)]

In [10]:
# Sanity check: nearest neighbours (token, similarity) of "trump".
model.wv.most_similar("trump")

[('donald', 0.9305621981620789),
 ('doctor', 0.8902467489242554),
 ('hall', 0.8899942636489868),
 ('virtual', 0.8759146332740784),
 ('head', 0.8754397034645081),
 ('member', 0.8742878437042236),
 ('told', 0.872067928314209),
 ('confer', 0.8711934685707092),
 ('town', 0.8627811074256897),
 ('council', 0.86131352186203)]

In [11]:
# Queries must use the stemmed form of a word: "presid", not "president".
model.wv.most_similar("presid")

[('vice', 0.9026037454605103),
 ('execut', 0.8942199349403381),
 ('director', 0.8809024691581726),
 ('senior', 0.8788819909095764),
 ('deputi', 0.8718844652175903),
 ('chief', 0.8537507653236389),
 ('harvard', 0.845360517501831),
 ('maryland', 0.8336153626441956),
 ('univers', 0.813807487487793),
 ('dean', 0.8108547329902649)]

In [12]:
# Sanity check: nearest neighbours (token, similarity) of "covid".
model.wv.most_similar("covid")

[('phase', 0.8731111884117126),
 ('treatment', 0.8536596298217773),
 ('develop', 0.8478230834007263),
 ('influenza', 0.8452311158180237),
 ('antibodi', 0.842274010181427),
 ('strategi', 0.8387377262115479),
 ('vaccin', 0.8261803388595581),
 ('ncov', 0.8245812058448792),
 ('studi', 0.8193559646606445),
 ('wuhan', 0.8079530000686646)]

In [13]:
# Sanity check: nearest neighbours (token, similarity) of "twitter".
model.wv.most_similar("twitter")

[('youtub', 0.9924458861351013),
 ('facebook', 0.9811912178993225),
 ('googl', 0.9796998500823975),
 ('nytim', 0.971889317035675),
 ('doi', 0.9712660908699036),
 ('nyt', 0.9672459363937378),
 ('podcast', 0.9646225571632385),
 ('bit', 0.9638143181800842),
 ('mike', 0.9636254906654358),
 ('url', 0.9635125398635864)]

In [14]:
# Dump the model vocabulary, one token per line, in sorted order.
# The file is written as UTF-8 explicitly: tokens may contain non-ASCII
# letters, and the platform default encoding is not guaranteed to handle them.
words = sorted(model.wv.index_to_key)
with open("words.txt", "w", encoding="utf-8") as f:
    f.writelines(word + "\n" for word in words)

In [15]:
# Embedding dimensionality, read from the trained vectors directly rather than
# by looking up one specific token (which would raise KeyError if that token
# were ever pruned from the vocabulary, e.g. by a higher min_count).
dim = model.wv.vector_size

def mean_emb_vec(sentences, wv=None, size=None):
    """Average the word vectors of all in-vocabulary tokens in `sentences`.

    Args:
        sentences: iterable of token lists (e.g. the output of `clean_text`).
        wv: word -> vector lookup supporting `word in wv` and `wv[word]`.
            Defaults to the notebook-level `model.wv`.
        size: length of the returned vector. Defaults to the notebook-level
            `dim`; must match the length of the vectors stored in `wv`.

    Returns:
        numpy.ndarray of shape (size,) and dtype float64: the mean vector, or
        all zeros when no token of `sentences` is present in `wv`.
    """
    # Fall back to the notebook globals only when the caller supplies nothing,
    # so existing one-argument calls behave exactly as before.
    if wv is None:
        wv = model.wv
    if size is None:
        size = dim

    # float64 accumulator: keeps the result JSON-serialisable via list(...)
    # even when the stored vectors are float32.
    total = np.zeros(size)
    hits = 0
    for sentence in sentences:
        for word in sentence:
            if word in wv:
                total += wv[word]
                hits += 1

    if hits == 0:
        return total
    return total / hits


# Attach a mean word-vector embedding for the subject and for the body to every
# email dict (in place). Stored as plain lists rather than numpy arrays.
for chain in emails:
    for email in chain:
        email.update(
            subject_embedding=list(mean_emb_vec(email["clean_subj"])),
            body_embedding=list(mean_emb_vec(email["clean_body"])),
        )

In [16]:
def _strip_token_lists(email):
    """Return a shallow copy of `email` without the cached cleaned-token lists."""
    slim = copy.copy(email)
    for key in ("clean_subj", "clean_body"):
        del slim[key]
    return slim


# Same chain/email nesting as `emails`; the originals are left untouched.
embedded_emails = [
    [_strip_token_lists(email) for email in chain]
    for chain in emails
]

In [17]:
# Swap in the copies that carry the embeddings but not the intermediate
# "clean_subj"/"clean_body" token lists.
data["emails"] = embedded_emails

In [18]:
num_nodes = len(data["names"])

# One (dim x num_nodes) accumulator per (text field, role) combination;
# column j holds the running sum for node j.
(X_subj_sender, X_body_sender,
 X_subj_recip, X_body_recip,
 X_subj_cc, X_body_cc) = (np.zeros((dim, num_nodes)) for _ in range(6))

In [19]:
# Per-node tallies of how many emails each node sent, received, or was cc'd on.
sender_counts, recip_counts, cc_counts = (np.zeros(num_nodes) for _ in range(3))

In [20]:
# (email key, subject accumulator, body accumulator, counter) for each role.
role_accumulators = (
    ('sender', X_subj_sender, X_body_sender, sender_counts),
    ('recipients', X_subj_recip, X_body_recip, recip_counts),
    ('cc', X_subj_cc, X_body_cc, cc_counts),
)

# Add each email's subject/body embedding to the column of every node involved
# in it, separately per role, and count the contributions for later averaging.
for chain in embedded_emails:
    for email in chain:
        subj_emb = email['subject_embedding']
        body_emb = email['body_embedding']

        for role, X_subj, X_body, counts in role_accumulators:
            value = email[role]
            # 'sender' holds a single node index; the other roles hold a list.
            nodes = (value,) if role == 'sender' else value
            for node in nodes:
                X_subj[:, node] += subj_emb
                X_body[:, node] += body_emb
                counts[node] += 1

In [21]:
# Turn the per-node sums into per-node means, role by role. Nodes with a zero
# count keep their all-zero column.
# NOTE: the division is in place, so re-running this cell without re-running
# the accumulation cells above divides the same arrays a second time.
role_totals = (
    (sender_counts, (X_subj_sender, X_body_sender)),
    (recip_counts, (X_subj_recip, X_body_recip)),
    (cc_counts, (X_subj_cc, X_body_cc)),
)
for counts, matrices in role_totals:
    for j in np.flatnonzero(counts > 0):
        for matrix in matrices:
            matrix[:, j] /= counts[j]

# Stack the six blocks vertically: each column is then one node's full
# feature vector of length 6 * dim.
X = np.vstack((
    X_subj_sender, X_body_sender,
    X_subj_recip, X_body_recip,
    X_subj_cc, X_body_cc,
))

# One plain list per node, in node-index order.
data['node_features'] = [list(column) for column in X.T]

In [22]:
# Write the augmented dataset (email embeddings + node features) to disk.
with open('fauci-email-data-w2v.json', 'w') as out_file:
    json.dump(data, fp=out_file)