In [1]:
# pdftotext -layout -r 300 leopold-nih-foia-anthony-fauci-emails.pdf 
import copy
import importlib
import json
import numpy as np
import re
from datetime import datetime
from string import ascii_lowercase
from collections import Counter

In [2]:
def line_metadata_re(name):
    """
    Build a compiled regex that recognises a header line such as "from: ...".

    Whitespace is tolerated before the label, between each of its characters
    and before the colon, so a label whose letters are spaced out
    (e.g. "f r o m :") still matches. The is_* helpers in this notebook
    lower-case the line before matching, so `name` should be lower case.

    Parameters: name -- the header label, e.g. "from".
    Returns: a compiled pattern intended for use with .match().
    """
    # Raw strings keep "\s" a regex escape rather than an invalid string
    # escape; re.escape keeps any regex metacharacter in `name` literal.
    gap = r"\s*"
    return re.compile(gap + gap.join(re.escape(ch) for ch in name) + gap + r":.*")

# One pattern per header label looked for in the extracted text.
from_re = line_metadata_re("from")
time_re = line_metadata_re("sent")
to_re   = line_metadata_re("to")
cc_re   = line_metadata_re("cc")
subj_re = line_metadata_re("subject")

def is_from(line):
    """Return a match object if the lower-cased line starts a "from" header, else None."""
    lowered = line.lower()
    return from_re.match(lowered)


def is_time(line):
    """Return a match object if the lower-cased line starts a "sent" header, else None."""
    lowered = line.lower()
    return time_re.match(lowered)


def is_to(line):
    """Return a match object if the lower-cased line starts a "to" header, else None."""
    lowered = line.lower()
    return to_re.match(lowered)


def is_cc(line):
    """Return a match object if the lower-cased line starts a "cc" header, else None."""
    lowered = line.lower()
    return cc_re.match(lowered)


def is_subj(line):
    """Return a match object if the lower-cased line starts a "subject" header, else None."""
    lowered = line.lower()
    return subj_re.match(lowered)

# Matches "fauci" anywhere in a line, tolerating whitespace between the letters.
# Raw strings keep "\s" a regex escape rather than an invalid string escape.
fauci_re = re.compile(r".*" + r"\s*".join("fauci") + r".*")


def contains_fauci(line):
    """Return a match object if the lower-cased line contains "fauci", else None."""
    return fauci_re.match(line.lower())

def has_redacted_sender(line):
    """
    Heuristic test for a header line whose value looks like a bare redaction
    marker (something like "(b)(6)").

    Only the text after the first ':' is inspected. It must contain each of
    the characters 'b', '6', '(' and ')', and at most five lowercase ASCII
    letters in total. Returns False when the line has no ':'.
    """
    _, colon, value = line.lower().partition(':')
    if not colon:
        return False

    # Every marker character has to be present somewhere in the value.
    if any(marker not in value for marker in "b6()"):
        return False

    # A real name would contribute more than a handful of letters.
    letter_count = 0
    for ch in value:
        if ch in ascii_lowercase:
            letter_count += 1
    return letter_count <= 5

In [3]:
class Email:
    """
    Mutable record for one message of a thread.

    All header fields start as empty strings; later cells overwrite
    `recipients` and `cc` with lists and `time` with a parsed timestamp.
    """

    def __init__(self):
        # Header values
        self.sender = self.time = self.recipients = self.cc = self.subject = ""
        # Message body
        self.text = ""
        self.nih = ""
        # Flag set by the caller when the first sender line looks redacted
        self.redacted_sender = False

    def __str__(self):
        header = (
            f'from: {self.sender}\n'
            f'sent: {self.time}\n'
            f'to: {self.recipients}\n'
            f'cc: {self.cc}\n'
            f'subject: {self.subject}\n'
        )
        return f'{header}\n{self.text}'

    def has_sender(self):
        """True unless the sender is the empty string."""
        return not (self.sender == "")

    def has_recipient(self):
        """True unless recipients is the empty string or an empty list."""
        return self.recipients not in ("", [])

    def has_cc(self):
        """True unless cc is the empty string or an empty list."""
        return self.cc not in ("", [])

    def has_timestamp(self):
        """True unless the time is the empty string."""
        return not (self.time == "")

In [4]:
def clean_metadata_lines(lines):
    """
    Collapse the lines of one header field into a single string.

    The label (everything up to and including the first ':') is dropped from
    the first line; spaces and quote characters are stripped from both ends
    of every line. Returns '' for an empty list.

    NOTE(review): the first value is concatenated directly with the
    newline-joined remainder, i.e. there is no separator between the first
    and second line -- confirm this is what the downstream parsers expect.
    """
    if not lines:
        return ''

    trim = ' \"\''
    head, *continuation = lines
    # Without a ':' the whole first line is kept.
    value = head.partition(':')[2] if ':' in head else head
    tail = '\n'.join(extra.strip(trim) for extra in continuation)
    return value.strip(trim) + tail

def find_next_from(chunk, start_ind):
    """
    Index of the first "from" line strictly after start_ind,
    or len(chunk) when no such line exists.
    """
    later_positions = range(start_ind + 1, len(chunk))
    hits = (pos for pos in later_positions if is_from(chunk[pos]))
    return next(hits, len(chunk))

def handle_special_case_1(chunk):
    """
    Repair a doubled sender header of the form
    # From:
    # From: Thomas Quinn
    by deleting, in place, the first of two consecutive "from" lines when
    the second one mentions "quinn". At most one line is removed.
    """
    for pos in range(len(chunk) - 1):
        following = chunk[pos + 1]
        if is_from(chunk[pos]) and is_from(following) and "quinn" in following.lower():
            chunk.pop(pos)
            return
        
def find_line(lines, start_ind, check_func):
    """
    Index of the first line at or after start_ind for which check_func is
    truthy, or -1 if there is none.
    """
    stop = len(lines)
    pos = start_ind
    while pos < stop:
        if check_func(lines[pos]):
            return pos
        pos += 1
    return -1

def _header_field_lines(lines, start, header_starts):
    """
    Lines belonging to the header field that begins at index `start`.

    The field runs up to the nearest header that was actually found after it;
    if there is none it is a single line. `start == -1` (header absent, the
    find_line "not found" value) yields an empty list.
    """
    if start == -1:
        return []
    later = [ind for ind in header_starts if ind > start]
    stop = min(later) if later else start + 1
    return lines[start:stop]


def process_chunk(chunk):
    """
    Extract structured email data from a single thread.

    The chunk is split at every "from" line; each piece becomes one Email
    whose sender/time/recipients/cc/subject come from the first matching
    header lines and whose text is everything after the subject line.
    The first email is expected to be Fauci's; subsequent ones are extracted
    from the rest of the thread.

    Parameters: chunk -- list of non-empty, stripped lines. It is modified in
        place by handle_special_case_1.
    Returns: list of Email objects in thread order ([] for an empty chunk).
    Raises: AssertionError if the first line is not a "from" line.
    """
    if not chunk:
        return []

    assert is_from(chunk[0]), f"chunk does not start with a from line: {chunk[0]!r}"

    handle_special_case_1(chunk)

    emails = []
    start_ind = 0

    while start_ind < len(chunk):
        end_ind = find_next_from(chunk, start_ind)
        # Slicing already copies the list; the strings are immutable.
        c_chunk = chunk[start_ind:end_ind]
        email = Email()

        # find_line returns -1 for a header that is absent from this piece.
        from_ind = find_line(c_chunk, 0, is_from)
        time_ind = find_line(c_chunk, 0, is_time)
        to_ind   = find_line(c_chunk, 0, is_to)
        cc_ind   = find_line(c_chunk, 0, is_cc)
        subj_ind = find_line(c_chunk, 0, is_subj)
        header_starts = [from_ind, time_ind, to_ind, cc_ind, subj_ind]

        # Each field extends to the next header that is really present, so a
        # multi-line value is not cut short just because the header that
        # usually follows it (e.g. cc after to) is missing.
        email.sender     = clean_metadata_lines(_header_field_lines(c_chunk, from_ind, header_starts))
        email.time       = clean_metadata_lines(_header_field_lines(c_chunk, time_ind, header_starts))
        email.recipients = clean_metadata_lines(_header_field_lines(c_chunk, to_ind, header_starts))
        email.cc         = clean_metadata_lines(_header_field_lines(c_chunk, cc_ind, header_starts))
        # The subject is always taken as a single line.
        subject_lines = c_chunk[subj_ind:subj_ind + 1] if subj_ind != -1 else []
        email.subject    = clean_metadata_lines(subject_lines)

        # Text consists of the remaining lines. With no subject line
        # (subj_ind == -1) this starts at the top of the piece, as before.
        email.text = '\n'.join(c_chunk[subj_ind + 1:]).strip()

        emails.append(email)

        # Move on to the next message of the thread
        start_ind = end_ind

    if has_redacted_sender(chunk[0]):
        emails[0].redacted_sender = True

    return emails

In [5]:
# Extract text in chunks corresponding to each email thread
all_chunks = []

# Decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the text file is UTF-8 (pdftotext's default) -- confirm.
with open("leopold-nih-foia-anthony-fauci-emails.txt", "r", encoding="utf-8") as f:
    curr_chunk = []

    for line in f:
        # A new thread starts on a page break (form feed) whose first line is
        # a "from" header that mentions Fauci or looks like a redacted sender.
        starts_thread = (
            line.startswith('\f')
            and is_from(line)
            and (contains_fauci(line) or has_redacted_sender(line))
        )
        if starts_thread and len(curr_chunk) > 0:
            all_chunks.append(curr_chunk)
            curr_chunk = []

        # Keep only non-blank lines, stripped of surrounding whitespace
        line = line.strip()
        if len(line) > 0:
            curr_chunk.append(line)

    # Flush the final thread
    if len(curr_chunk) > 0:
        all_chunks.append(curr_chunk)

In [6]:
# Extract emails from chunks
# Per thread, keep only the messages that have a recipient field; threads
# left with no messages are dropped.
emails = []
for chunk in all_chunks:
    kept = []
    for message in process_chunk(chunk):
        if message.has_recipient():
            kept.append(message)
    if kept:
        emails.append(kept)

In [7]:
import parse_timestamps
importlib.reload(parse_timestamps)
from parse_timestamps import *

In [8]:
import parse_names
importlib.reload(parse_names)
from parse_names import *

In [9]:
import parse_text
importlib.reload(parse_text)
from parse_text import *

In [10]:
# Map timestamps and names
# One tally per organisation: parsed name -> number of times a raw header
# entry for that name carried the organisation's tag.
nih_names, hhs_names, cdc_names = Counter(), Counter(), Counter()
fda_names, os_names, eop_names = Counter(), Counter(), Counter()


def check_membership(name, p_name):
    """
    Update the organisation tallies for one header entry.

    name   -- the raw header text (matched case-insensitively)
    p_name -- the parsed name used as the tally key

    Nothing is counted for an empty raw name or for the hard-coded names in
    the skip list; two names are pinned to a single tally regardless of tags.
    """
    if name == '':
        return
    raw = name.lower()

    # Hard-coded overrides
    if p_name == "lenihan, keagan":
        fda_names[p_name] += 1  # don't match nih
        return
    elif p_name == "aspa deputies":
        hhs_names[p_name] += 1
        return

    # Names that are never tallied
    never_tallied = (
        "shoc", "farrar, jeremey", "schwartlander, bernhard",
        "banks, lynn", "kanarek, morgan", "lapook, jon",
        "elias, chris", "pfeifer, hazel", "smith, steven",
        "hatchett, richard", "lancman, christine",
        "morrison, stephen", "oneail, shawn", "callahan, michael",
        "mcnamara, tracey", "caneva, duane", "verma, seema",
    )
    if p_name in never_tallied:
        return

    # (tally, tag looked for in the raw text, tags looked for in the parsed name)
    rules = (
        (nih_names, "(nih", ("niaid", "nih")),
        (cdc_names, "(cdc", ("cdc",)),
        (fda_names, "(fda", ("fda",)),
        (hhs_names, "(hhs", ("hhs",)),
        (os_names,  "(os",  ()),
        (eop_names, "eop/", ()),
    )
    for tally, raw_tag, parsed_tags in rules:
        if raw_tag in raw or any(p_name.find(tag) != -1 for tag in parsed_tags):
            tally[p_name] += 1
        
def _parse_name_field(field):
    """
    Parse a ';'-separated header field into a list of non-empty parsed names.

    parse_name may return either one name or a list of names. Organisation
    tallies are only updated (via check_membership) for single-name results.
    """
    parsed = []
    for raw_name in field.split(';'):
        p_name = parse_name(str(raw_name))
        if isinstance(p_name, list):
            parsed.extend(p_name)
        else:
            parsed.append(p_name)
            check_membership(str(raw_name), p_name)
    return [p_name for p_name in parsed if p_name != '']


# Build, for every thread, copies of its emails with the timestamp parsed and
# sender / recipients / cc replaced by parsed names.
parsed_emails = []
for chain in emails:
    parsed_chain = []
    for email in chain:
        # Shallow copy is enough: every changed attribute is reassigned.
        p_email = copy.copy(email)
        p_email.time = parse_timestamp(email.time)

        s_name = parse_name(str(email.sender))
        if email.redacted_sender:
            # A redacted sender on the first message is attributed to Fauci.
            s_name = "fauci, anthony"
        check_membership(str(email.sender), s_name)
        p_email.sender = s_name

        p_email.recipients = _parse_name_field(email.recipients)
        p_email.cc = _parse_name_field(email.cc) if email.has_cc() else []

        # Special case of duplicate Ashley Parkers
        # (guarded: recipients can be empty after unparseable names are dropped)
        if (p_email.sender == "abutaleb, yasmeen"
                and p_email.recipients
                and p_email.recipients[0] == "conrad, patricia"
                and p_email.subject.find("Washington Post") != -1):
            p_email.cc = ["parker, ashley (wapo)"]

        parsed_chain.append(p_email)

    parsed_emails.append(parsed_chain)

In [11]:
# Map all names to integers
# Ids are handed out in order of first appearance, starting at 0.
name_map = {}


def name_id(name):
    """Return the integer id of `name`, assigning the next free id on first use."""
    return name_map.setdefault(name, len(name_map))

# Convert every usable email into a plain dict with names replaced by ids.
mapped_emails = []
for chain in parsed_emails:
    mapped_chain = []
    for email in chain:
        # Skip if we can't parse the timestamp, or found no sender or no recipient
        usable = email.has_timestamp() and email.has_sender() and email.has_recipient()
        if not usable:
            continue

        # Ids are assigned in this order: sender, recipients, cc.
        mapped_email = {
            "sender": name_id(email.sender),
            "recipients": list(set(map(name_id, email.recipients))),
            "cc": list(set(map(name_id, email.cc))),
            "time": email.time.isoformat(),  # ISO-format string
            "subject": parse_text(email.subject),
            "body": parse_text(email.text),
        }
        mapped_chain.append(mapped_email)

        # Report any email that ended up with no recipient ids
        if len(mapped_email["recipients"]) == 0:
            print(list(email.recipients))

    # Threads with no usable email are left out
    if mapped_chain:
        mapped_emails.append(mapped_chain)

In [12]:
# Write out the digest
cluster_names = ["NIH", "HHS", "CDC", "FDA", "OS", "EOP", "other"]
# Same order as the first six cluster names
agency_tallies = (nih_names, hhs_names, cdc_names,
                  fda_names, os_names, eop_names)

# Names listed by ascending id, so list position == id
names = sorted(name_map, key=name_map.get)

# 1-based cluster number per name: the tally with the highest count
# (first one on ties), or the "other" cluster when every count is zero.
clusters = []
for name in names:
    counts = [tally[name] for tally in agency_tallies]
    if max(counts) == 0:
        cluster = len(counts) + 1
    else:
        cluster = int(np.argmax(counts)) + 1
    clusters.append(cluster)

data = {
    "names": names,
    "clusters": clusters,
    "cluster_names": cluster_names,
    "emails": mapped_emails,
}
with open('fauci-email-graph.json', 'w') as f:
    json.dump(data, f)