# 538 Scrape

This project uses polling data collected and analyzed by FiveThirtyEight (538) to investigate the connection between Twitter and candidate polling.

This notebook uses code provided in the BuzzFeed News [GitHub repository](https://github.com/BuzzFeedNews/2016-11-grading-the-election-forecasts).

In [21]:
import requests
import forecast
import random
import itertools
import json
import re
import sys
import pandas as pd

In [4]:
# Page whose inline JavaScript is scraped for forecast data
# (passed to get_inline_data by get_historical_predictions).
BASE_URL = "https://projects.fivethirtyeight.com/2016-election-forecast/"

In [5]:
def process_historical(data, office):
    """Flatten a forecast payload into one record per (forecast entry, model).

    Parameters
    ----------
    data : dict
        Must provide ``data["forecasts"]["all"]``, an iterable of entries that
        each carry "date", "party", "candidate" and a "models" mapping of
        model name -> {"winprob": ..., "forecast": ...}, plus ``data["state"]``.
    office
        Stored unchanged under the "office" key of every record.

    Returns
    -------
    list of dict
        Records in entry order, then model order. "win_prob" and "est_share"
        are the source "winprob" / "forecast" values divided by 100;
        "est_diff" and "est_share_2p" are always None.
    """
    return [
        {
            "date": entry["date"],
            # Model names are namespaced with a "538_" prefix.
            "model": "538_" + name,
            "office": office,
            "state": data["state"],
            "party": entry["party"],
            "candidate": entry["candidate"].upper(),
            "win_prob": stats["winprob"] / 100,
            "est_diff": None,
            "est_share": stats["forecast"] / 100,
            "est_share_2p": None,
        }
        for entry in data["forecasts"]["all"]
        for name, stats in entry["models"].items()
    ]

In [6]:
def get_inline_data(url, variable):
    """Fetch a page and decode the JSON assigned to an inline JavaScript variable.

    The page source is searched for ``<variable> = <json>`` and the text up to
    (not including) the next ``;`` is parsed as JSON.

    Parameters
    ----------
    url : str
        Page to download.
    variable : str
        Name of the JavaScript variable, matched literally (e.g. a dotted
        name such as "race.stateData").

    Returns
    -------
    The decoded JSON value.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If no ``<variable> = ...`` assignment is present in the page, or the
        captured text is not valid JSON (``json.JSONDecodeError`` is a
        ``ValueError`` subclass).
    """
    # A random "r" query parameter is sent with every request
    # (presumably a cache-buster -- TODO confirm).
    res = requests.get(url, params={"r": random.random()}, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    res.raise_for_status()
    html = res.content.decode("utf-8")

    # Escape the name so regex metacharacters in it (the "." of a dotted
    # name) match literally rather than as wildcards.
    # NOTE: the capture stops at the first ";", so a payload containing ";"
    # inside a JSON string would be truncated.
    pattern = r"{0} = ([^;]+)".format(re.escape(variable))
    match = re.search(pattern, html)
    if match is None:
        raise ValueError(
            "could not find an inline assignment to {0!r} in {1}".format(variable, url)
        )
    return json.loads(match.group(1))

In [18]:
def get_historical_predictions():
    """Scrape BASE_URL and return the flattened forecast records.

    Reads the inline "race.stateData" variable from the page via
    get_inline_data, flattens it with process_historical using office
    code 'P', and returns the records as a new list.
    """
    state_data = get_inline_data(BASE_URL, "race.stateData")
    # Copy into a fresh list so the caller never shares the helper's list object.
    return list(process_historical(state_data, 'P'))

In [44]:
# Scrape the forecast page and load the resulting list of records into a DataFrame.
df = pd.DataFrame(get_historical_predictions())

In [45]:
# Preview the first rows of the scraped forecast table.
df.head()

Unnamed: 0,candidate,date,est_diff,est_share,est_share_2p,model,office,party,state,win_prob
0,CLINTON,2016-11-08,,0.485272,,538_now,P,D,US,0.71365
1,CLINTON,2016-11-08,,0.485272,,538_polls,P,D,US,0.71365
2,CLINTON,2016-11-08,,0.484984,,538_plus,P,D,US,0.7177
3,TRUMP,2016-11-08,,0.449499,,538_now,P,R,US,0.286
4,TRUMP,2016-11-08,,0.449499,,538_polls,P,R,US,0.286


In [46]:
# All forecast rows for Clinton, across every model.
clinton_df = df.loc[df["candidate"] == "CLINTON"]
# Keep only the "538_polls" model rows.
# NOTE(review): the variable name says "now" but the filter selects "538_polls".
clinton_now_df = clinton_df.loc[clinton_df["model"] == "538_polls"]

In [48]:
# All forecast rows for Trump, across every model.
trump_df = df.loc[df["candidate"] == "TRUMP"]
# Keep only the "538_polls" model rows.
# NOTE(review): the variable name says "now" but the filter selects "538_polls".
trump_now_df = trump_df.loc[trump_df["model"] == "538_polls"]

In [50]:
# Write each candidate's "538_polls" rows to a CSV file in the working directory.
# to_csv is called with defaults, so the DataFrame index is written as the first column.
clinton_now_df.to_csv('clinton_polls_only.csv')
trump_now_df.to_csv('trump_polls_only.csv')