# An Prediction of 2016 Election Result

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from fnmatch import fnmatch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import bs4
from pattern import web

from collections import Counter
import shelve

plt.style.use('seaborn-dark')
plt.rcParams['figure.figsize'] = (10, 6)

ModuleNotFoundError: No module named 'pattern'

Load the data set from http://charts.realclearpolitics.com/charts/%i.xml and transfer the url link into actual text.

In [8]:
def get_poll_xml(poll_id):
    url = "http://charts.realclearpolitics.com/charts/%i.xml" % int(poll_id)
    return requests.get(url).text

## Distance between speeches

### By President

Let's first compute a few useful things we'll need later, like the unique set of all presidents and their initials (for plot labeling).

Now, make a dataframe that will contain the word counts aggregated by president:

In [11]:
import xml.etree.ElementTree as ET

def rcp_poll_data(input):
    tree = ET.fromstring(input)
    dictionary = dict()
    
    dates = list()
    series = tree.findall('series')
    for value in series[0].findall('value'):
        dates.append(value.text)
    dictionary['date'] = pd.to_datetime(dates)

    graphs = tree.findall('graphs/graph')
    for graph in graphs:
            values = list()
            title = graph.get('title')
            for value in graph.findall('value'):
                try:
                    values.append(float(str(value.text)))
                except:
                    values.append(value.text)
            dictionary[title] = values
    
    df = pd.DataFrame(dictionary)
    df_clean = df.dropna()
    return df_clean


In [12]:
import re

def find_governor_races(url):
    text = requests.get(url).text
    links = re.findall('http://www.realclearpolitics.com/epolls/\d{4}/governor/\D{2}/.*?-\d{,4}.html',text)
    links = list(set(links))
    return links

And `pmn` is a normalized version that turns counts into probability distributions:

In [13]:
def race_result(url):
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'html.parser')
    tables = soup.findAll('table', {'class': 'data'})
    table = tables[0]
    rows = [row for row in table.find_all("tr")]
    columns = [str(col.get_text()) for col in rows[0].find_all("th")]
    candidates = [column.split('(')[0].strip() for column in columns[3:-1]]

    row = rows[1]
    tds = row.find_all("td")
    results = [float(str(t.get_text())) for t in tds[3:-1]]
    # convert to percentage 
    tot = sum(results)/100 
 
    return {l:r / tot for l, r in zip(candidates, results)}

Sanity check: all columns should sum to 1.

In [14]:
def plot_colors(xml):
    dom = web.Element(xml)
    result = {}
    for graph in dom.by_tag('graph'):
        title = _strip(graph.attributes['title'])
        result[title] = graph.attributes['color']
    return result

Make a numpy array version to use with Scikit-Learn:

In [15]:
def poll_plot(poll_id):
    xml = get_poll_xml(poll_id)
    data = rcp_poll_data(xml)
    colors = plot_colors(xml)
    data = data.rename(columns = {c: _strip(c) for c in data.columns})

    #normalize poll numbers so they add to 100%    
    norm = data[colors.keys()].sum(axis=1) / 100    
    for c in colors.keys():
        data[c] /= norm
    
    for label, color in colors.items():
        plt.plot(data.date, data[label], color=color, label=label)        
        
    plt.xticks(rotation=70)
    plt.legend(loc='best')
    plt.xlabel("Date")
    plt.ylabel("Normalized Poll Percentage")

You can use the JSdiv function below as-is, it already works.

In [16]:
poll_plot(1113)

NameError: name 'web' is not defined

This is a "naive" plot where we use the $L^2$ distance to build the embedding.

In [9]:
def id_from_url(url):
    """Given a URL, look up the RCP identifier number"""
    return url.split('-')[-1].split('.html')[0]

def plot_race(url):
    """Make a plot summarizing the historical poll data and the actual results
    """
    id = id_from_url(url)
    xml = get_poll_xml(id)    
    colors = plot_colors(xml)
    if len(colors) == 0:
        return
    result = race_result(url)
    poll_plot(id)
    plt.xlabel("Date")
    plt.ylabel("Polling Percentage")
    for r in result:
        plt.axhline(result[r], color=colors[_strip(r)], alpha=0.6, ls='--')

In [None]:
euclidean_embedding_df = pd.DataFrame(euclidean_embedding, columns=['x', 'y'])
euclidean_embedding_df['presidents'] = pres_mat.columns

This will be more informative if we make a little utility function for the plots, that carries a legend, annotates each point with the president's initials, and colors them all:

In [10]:
def plot_embedding(data, title='MDS Embedding', savepath=None, palette='viridis', 
                   size=7, annotate='initials'):
    """Plot an MDS embedding dataframe for all presidents.
    
    Uses Seaborn's `lmplot` to create an x-y scatterplot of the data, encoding the 
    value of the `President` field into the hue (which can be mapped to any desired
    color palette).
    
    Parameters
    ----------
    data : DataFrame
        A DataFrame that must contain 3 columns labeled 'x', 'y' and 'President'.
        
    title : optional, string
        Title for the plot
        
    savepath : optional, string
        If given, a path to save the figure into using matplotlib's `savefig`.
        
    palette : optional, string
        The name of a valid Seaborn palette for coloring the points.
    
    size : optional, float
        Size of the plot in inches (single number, square plot)
        
    annotate: optional, 'initials', 'name' or False
        If 'initials', annotate each point with the intials of each president,
        if 'name', use their last name, and if False, do not annotate at all.
        
    Returns
    -------
    FacetGrid
        The Seaborn FacetGrid object used to create the plot.
    """
    # First, get the annotations as required
    if annotate == 'initials':
        annotations = []
        for name in data.presidents.str.split():
            this_initial = ""
            for word in name:
                this_initial += word[0]
        
            annotations.append(this_initial)
    
    elif annotate == 'name':
        annotations = []
        for name in data.presidents.str.split():
            annotations.append(name[-1])
            
        
    # Next, build the plot
    myPlot = sns.lmplot('x', 'y', data, size=size, aspect=1, hue='presidents', palette=palette, fit_reg=False, x_ci=None, ci=None)
    if annotate:
        for label, x, y in zip(annotations, data.x, data.y):
            plt.annotate(label, xy=(x, y))
        
    # Save if necessary
    if savepath:
        myPlot.savefig(savepath)
    
    # Add title if necessary
    if title:
        plt.title(title)
    
    return myPlot

In [11]:
plot_embedding(euclidean_embedding_df, 'Naive MDS - euclidean distance', 'fig/mds_naive.png', annotate='name');

NameError: name 'embed_peu' is not defined

Now we use a distance metric defined on probability mass functions, the Jensen-Shannon Metric.

In [None]:
# Calculating dissimilarity matrix from jensen-shannon metric
JSM_dissim = np.zeros((len(pmm), len(pmm)))
for i in range(len(JSM_dissim)):
    for j in range(len(JSM_dissim)):
        JSM_dissim[i][j] = JSdiv(pmm[i], pmm[j])
# Applying MDS on this dissimilarity matrix
JSM_MDS = MDS(verbose=1, n_jobs=3, dissimilarity='precomputed')
JSM_embedding = JSM_MDS.fit_transform(JSM_dissim)

In [None]:
JSM_embedding_df = pd.DataFrame(JSM_embedding, columns=['x', 'y'])
JSM_embedding_df['presidents'] = pres_mat.columns

In [None]:
plot_embedding(JSM_embedding_df, 'MDS - Jensen-Shannon Distance', 'fig/mds_jsdiv.png', annotate='name');

### By Speech
First we normalize the term-document matrix

In [None]:
# Computing normalized TDM
normalized_tdm = []
for i in range(len(wmat.columns)):
    normalized_tdm.append(normalize(wmat.iloc[:,i]))
normalized_tdm = pd.DataFrame(np.array(normalized_tdm).T, columns = wmat.columns)
normalized_tdm.columns = addresses.president
normalized_tdm = normalized_tdm.T

We make a similar naive plot embedding using the $L^2$ distance.

In [None]:
normalized_tdm = normalized_tdm.T

In [None]:
euclidean_embedding_df = pd.DataFrame(euclidean_embedding, columns=['x', 'y'])
euclidean_embedding_df['presidents'] = normalized_tdm.index

In [None]:
plot_embedding(euclidean_embedding_df, 'Naive MDS - euclidean distance, all speeches',
               'fig/mds_naive_all.png', size=10);

Now with the Jensen-Shannon metric

In [None]:
# Calculating dissimilarity matrix from jensen-shannon metric
JSM_dissim = np.zeros((len(normalized_tdm), len(normalized_tdm)))
for i in range(len(JSM_dissim)):
    for j in range(len(JSM_dissim)):
        JSM_dissim[i][j] = JSdiv(normalized_tdm[i], normalized_tdm[j])

In [None]:
JSM_MDS = MDS(verbose=1, dissimilarity='precomputed', random_state=1)
JSM_embedding = JSM_MDS.fit_transform(JSM_dissim)
JSM_embedding_df = pd.DataFrame(JSM_embedding, columns=['x', 'y'])
JSM_embedding_df['presidents'] = normalized_tdm.index

In [None]:
plot_embedding(JSM_embedding_df, 'Naive MDS - Jensen-Shannon distance, all speeches', 
               'fig/mds_jdsiv_all.png', size=10);

Store some final results. For native numpy arrays, we can use the convenient numpy `npz` container format, which behaves in practice similar to the Pandas HDF5 store and the Python Shelves:

In [None]:
np.savez('results/npa4.npz', pmm=pmm)