## Comparisons of predictions to true data

In [7]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sys
sys.path.append("../")
from src import manipulators as man
from src import explorers
from src import cleaners

import plotly.express as px
from urllib.request import urlopen
import json

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Get prediction data and populations/fips data
predictions = pd.read_csv('../data/predictions.csv')
predictions_rf = pd.read_csv('../data/predictions_RF.csv')
populations = pd.read_csv('../data/population.csv')
populations.drop(columns='Unnamed: 0', inplace=True)
predictions.drop(columns='Unnamed: 0', inplace=True)

FileNotFoundError: [Errno 2] File data/predictions.csv does not exist: 'data/predictions.csv'

In [3]:
predict_indices = predictions['index_values']
predictions['fips'] = populations.loc[predict_indices, 'fips'].values

In [49]:
# Merge dataframes
master_ = man.merge_df(populations, predictions)

# Calculate predicted and true votes for each candidate
df = man.calculate_votes(master_)

# Predict winner for each county
df2 = cleaners.find_winner(df, 'trump_predict', 'clinton_predict', 'Winner_predict')
df3 = cleaners.find_winner(df2, 'trump_true', 'clinton_true', 'Winner_true')

# Make column for flipped counties
df4 = man.get_flipped_counties(df3, 'Winner_true', 'Winner_predict')

# Fix fips in case they are numeric
explorers.fix_fips(df4)

Unnamed: 0,trump_predict,trump_true,clinton_predict,clinton_true,fips,state,county,total_population,total_votes16
0,61.637127,65.935214,38.362873,34.064786,17169,Illinois,Schuyler,7205.0,3828
1,47.631373,47.395636,52.368627,52.604364,20091,Kansas,Johnson,572428.0,290090
2,31.44551,34.897959,68.55449,65.102041,46017,South Dakota,Buffalo,2052.0,490
3,63.302647,64.256702,36.697353,35.743298,38027,North Dakota,Eddy,2370.0,1231
4,87.305608,88.854213,12.694392,11.145787,40007,Oklahoma,Beaver,5479.0,2243


In [100]:
# Compare
total_predictions = df4.shape[0]
percentage_correct = 1 - df4['flipped'].sum()/total_predictions
print(f"The percentage of counties with a correct predictions is {100*percentage_correct:.2f}%")

The percentage of counties with a correct predictions is 97.04%


In [84]:
# Plot a choropleth of counties that were predicted incorrectly
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

fig = px.choropleth(df4, geojson=counties, locations='fips', color='flipped',
                    color_discrete_sequence=['#EF553B', '#636EFA'],
                           scope="usa"
                           )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## Get total vote discrepancy

In [112]:
df4.sum(axis=0)

trump_predict                                                   49767.2
trump_true                                                      49877.6
clinton_predict                                                 28032.8
clinton_true                                                    27922.4
fips                  1716920091460173802740007482994102329203400131...
state                 IllinoisKansasSouth DakotaNorth DakotaOklahoma...
county                SchuylerJohnsonBuffaloEddyBeaverLlanoGrantShan...
total_population                                            8.78866e+07
total_votes16                                                  37789202
trump_votes_pred                                            1.66163e+07
trump_votes_true                                             1.6553e+07
clinton_votes_pred                                          2.11729e+07
clinton_votes_true                                          2.12362e+07
Winner_predict        TrumpClintonClintonTrumpTrumpTrumpTrumpTru

In [114]:
(df4['Winner_true'] == 'Clinton').sum()

141