# Political Email NER 

Data from: https://dataverse.scholarsportal.info/dataset.xhtml?persistentId=doi:10.23685/GZQ8Z2

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import matplotlib.gridspec as gridspec
from datetime import timedelta, datetime, date
import xml.etree.ElementTree as ET
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
import spacy
from spacy import displacy

In [2]:
# STEP 1 - SELECT WHICH PARTY AND DONATION TYPE YOU'D LIKE TO START ANALYZING

# STEP 2 - THIS FUNCTION TAKES YOUR SELECTION AND DYNAMICALLY EXTRACTS THE 'PERSON' ENTITY FROM THE TEXT OF THE EMAILS
def _email_file_path(Party, Donation):
    """Return the path of the XML export for one party / donation-status pair.

    Raises:
        ValueError: if ``Party`` or ``Donation`` is not one of the known options.
    """
    # File names follow the pattern PE-<party>-<donation>-Subscription.xml.
    party_slugs = {
        "Conservative": "Conservatives",
        "Liberal": "Liberal",
        "NDP": "NDP",
        "Green": "GreenParty",
    }
    donation_slugs = {"donated": "Donation", "not donated": "NoDonation"}

    if Party not in party_slugs:
        raise ValueError(f"Unknown Party {Party!r}; expected one of {sorted(party_slugs)}")
    if Donation not in donation_slugs:
        raise ValueError(f"Unknown Donation {Donation!r}; expected one of {sorted(donation_slugs)}")

    return f"./data/PE-{party_slugs[Party]}-{donation_slugs[Donation]}-Subscription.xml"


def _read_emails(file_path):
    """Parse the XML export into a DataFrame with one row per <message> element."""
    tree = ET.parse(file_path)  # parse the .xml file
    # Read the fields message by message so that a message lacking one field
    # (findtext then yields None) cannot shift the columns of later messages.
    rows = [
        (
            message.findtext('received/date'),  # date of send
            message.findtext('from/name'),      # name of the sender
            message.findtext('subject'),        # subject title of the email
            message.findtext('text'),           # body of the email
        )
        for message in tree.findall('./message')
    ]
    return pd.DataFrame(rows, columns=['date', 'from', 'subject', 'text'])


def _count_persons(doc):
    """Map the text of each PERSON entity in a spaCy doc to its occurrence count."""
    counts = {}
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            counts[ent.text] = counts.get(ent.text, 0) + 1
    return counts


def load(Party, Donation):
    """Load one party's email export and print the PERSON entities found in each email.

    Args:
        Party: one of "Conservative", "Liberal", "NDP" or "Green".
        Donation: "donated" or "not donated".

    Returns:
        None. The per-email {person: count} dictionaries are printed, one line per email.

    Raises:
        ValueError: if ``Party`` or ``Donation`` is not a recognised option
            (previously this surfaced as an UnboundLocalError on ``file_path``).
    """
    print(f'Analyzing emails from the {Party} party, sent to those who have {Donation}.')
    file_path = _email_file_path(Party, Donation)
    print('loading file:', file_path)

    df = _read_emails(file_path)

    # import spacy model - english, size small
    nlp = spacy.load("en_core_web_sm")

    person_freq = []    # one {person: count} dictionary per email, in file order
    for i, body in enumerate(df['text']):
        # A missing body comes back as None; analyse it as empty text instead of crashing.
        email = nlp(body if isinstance(body, str) else "")
        person_freq.append(_count_persons(email))
        print("email number", i, person_freq[i])

# Dropdown UI for picking the party and the donation status; the selected values are passed to load().
outputf = interact(load, Party=['Liberal', 'Conservative', 'NDP', 'Green'], Donation=['donated', 'not donated'])



interactive(children=(Dropdown(description='Party', options=('Liberal', 'Conservative', 'NDP', 'Green'), value…

In [123]:
# Print, for every email in `df`, a {person: count} dictionary of its PERSON entities.
# NOTE(review): `df` is not defined at module level in any cell visible above
# (load() keeps its DataFrame local), so this cell presumably relies on a `df`
# left over in the kernel — confirm it survives Restart & Run All.
nlp = spacy.load("en_core_web_sm")

# Iterate over the values directly: df['text'][i] is a label lookup and only
# works while the frame still carries a default 0..n-1 index.
for body in df['text']:
    # A missing body (None/NaN) is analysed as empty text instead of crashing nlp().
    email = nlp(body if isinstance(body, str) else "")
    temp_dic = {}

    for ent in email.ents:
        if ent.label_ == "PERSON":
            temp_dic[ent.text] = temp_dic.get(ent.text, 0) + 1
    print(temp_dic)


{}
{'Fellow Canadians': 2, 'Laureen': 1, 'Ben': 1, 'Rachel': 1, 'God': 1}
{}
{'Stephen Harper': 1}
{}
{'Justin Trudeau': 2, 'Thomas Mulcair': 2}
{'Justin Trudeau': 1, 'Dustin Van Vugt': 1}
{'Justin Trudeau': 2, 'Fred DeLorey': 1}
{'Stephen Harper': 2, 'Justin Trudeau': 4, 'Harper': 3, 'Justin Trudeau’s': 1, 'Pierre Poilievre': 1}
{'Mike': 2, 'Fred DeLorey': 1, 'Albert St': 1}
{'Mike': 1, 'newtaxbreaks&v': 1, 'Cory Hann': 1, 'Justin Trudeau': 1, 'Albert St': 1}
{'Mike': 1, 'Stephen Harper': 1, 'Chris Alexander': 1, 'Albert St': 1}
{'Mike': 1, 'Stephen Harper': 1, 'Trudeau': 1, "Justin Trudeau's": 2, 'Justin Trudeau': 1, 'Harper': 1, 'Fred DeLorey': 1, 'Albert St': 1}
{'Mike': 1, 'Joe Oliver': 1, 'Stephen Harper': 1, 'Harper': 1, 'Kevin Sorenson': 1, 'Albert St': 1}
{'Albert St': 1}
{'Mike': 1, 'Stephen Harper': 1, 'Ted Opitz': 1, 'Albert St': 1}
{'Mike': 1, 'Albert St': 1}
{'Mike': 1, 'Pat Perkins': 1, 'Jim Eglinski': 1, 'Albert St': 1}
{'Mike': 1, 'Harper': 3, 'Justin Trudeau': 6, 'Jus

In [53]:
# Inspect the 'text' column of df; the notebook shows a truncated preview of the Series.
df['text']

0                                                       
1      http://www.conservative.ca\nSpeech from Prime ...
2      https://donate.conservative.ca/MembershipDrive...
3      http://www.conservative.ca/NewTaxBreaks/\nFrie...
4      https://donate.conservative.ca?mpi=c4864059e7&...
                             ...                        
650    Dear Mike,\n\nWhile the Liberals have been bus...
651    Your weekly update on the CPC...\nFollow the C...
652    Dear Mike,\n\nHalloween is a spooky time of ye...
653    The Q3 fundraising results are in.\n\nYou are ...
654    Dear Mike,\n\nYou may have heard about the Q3 ...
Name: text, Length: 655, dtype: object