In [14]:
# Import modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import BayesianRidge
import numpy as np
import os
from pprint import pprint

In [2]:
# Load the two cleaned inputs: the election results (source of the target
# column) and the combined demographic table (source of the model features).
results_cleaned, features_cleaned = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Cleaned_Data/cleaned_final_election_Votes.csv',
        'Cleaned_Data/Combined_CC_final.csv',
    )
)

In [3]:
# The results and feature tables are not in the same row order, so re-sort the
# features by year first and then by state (instead of state then year).
# reset_index(drop=True) renumbers the rows 0..n-1 directly, instead of first
# materialising the old index as an 'index' column and then dropping it again.
# Later cells attach columns by index alignment, so this 0..n-1 index matters.
features_cleaned_sorted = (
    features_cleaned
    .sort_values(['YEAR', 'STNAME'], ascending=[True, True])
    .reset_index(drop=True)
)
features_cleaned_sorted.head(20)

Unnamed: 0,STNAME,STATE,YEAR,TOT_POP,MALE_PCT,WHITE_PCT,HISPANIC_PCT,Young_PCT
0,Alabama,1,2000,3521922,0.474877,0.735524,0.015647,0.544908
1,Alaska,2,2000,469829,0.518674,0.740891,0.036541,0.624455
2,Arizona,4,2000,4009385,0.495596,0.893063,0.22077,0.562405
3,Arkansas,5,2000,2116895,0.481669,0.832044,0.028325,0.534715
4,California,6,2000,26200355,0.494346,0.782061,0.288755,0.595113
5,Colorado,8,2000,3405978,0.501195,0.91457,0.153388,0.594183
6,Connecticut,9,2000,2702848,0.477235,0.87323,0.082819,0.531772
7,Delaware,10,2000,623603,0.478832,0.784429,0.041525,0.552255
8,District of Columbia,11,2000,474163,0.464252,0.37411,0.07466,0.587952
9,Florida,12,2000,13006409,0.482285,0.834175,0.163254,0.502145


In [4]:
# Add the target column to the feature data.
# NOTE: this is an index-aligned assignment, not a key-based join: the value at
# index i of results_cleaned lands on index i of features_cleaned_sorted. That
# is only correct if results_cleaned is already ordered by year then state with
# a 0..n-1 index -- TODO confirm, or replace with a merge on state/year keys.
features_cleaned_sorted['rep_vote_pct'] = results_cleaned['republic_vote_per']

# Index alignment silently fills unmatched rows with NaN; fail loudly instead.
assert features_cleaned_sorted['rep_vote_pct'].notna().all(), \
    'rep_vote_pct has missing values: results and features rows do not line up'
features_cleaned_sorted.tail()

Unnamed: 0,STNAME,STATE,YEAR,TOT_POP,MALE_PCT,WHITE_PCT,HISPANIC_PCT,Young_PCT,rep_vote_pct
301,Virginia,51,2020,7078671,0.49162,0.701516,0.089543,0.489966,0.448453
302,Washington,53,2020,6328721,0.503057,0.793233,0.114586,0.506295,0.400745
303,West Virginia,54,2020,1497088,0.497054,0.93725,0.016523,0.434419,0.697985
304,Wisconsin,55,2020,4851781,0.499328,0.88512,0.060982,0.466948,0.49682
305,Wyoming,56,2020,467881,0.512838,0.933485,0.09159,0.483369,0.724804


In [5]:
# Split by election year: 2000-2016 (both ends inclusive) is the training set,
# 2020 is the held-out test set
election_year = features_cleaned_sorted['YEAR']
training_data = features_cleaned_sorted.loc[(election_year >= 2000) & (election_year <= 2016)]
testing_data = features_cleaned_sorted.loc[election_year.eq(2020)]

# Sanity check: the last training rows should belong to 2016
training_data.tail()

Unnamed: 0,STNAME,STATE,YEAR,TOT_POP,MALE_PCT,WHITE_PCT,HISPANIC_PCT,Young_PCT,rep_vote_pct
250,Virginia,51,2016,6860068,0.487605,0.712389,0.080696,0.493956,0.471736
251,Washington,53,2016,5929127,0.497112,0.813611,0.103864,0.498283,0.412131
252,West Virginia,54,2016,1521483,0.491221,0.941437,0.013453,0.439648,0.721611
253,Wisconsin,55,2016,4717187,0.494019,0.890556,0.055222,0.466912,0.504059
254,Wyoming,56,2016,468705,0.509745,0.935923,0.087874,0.491941,0.757053


In [6]:
# Split each set into independent variables (X) and the dependent variable (y).
# X holds the cleaned demographic percentages; y is the vote percentage the
# Republicans received.
X_train, y_train = (
    training_data[['MALE_PCT', 'WHITE_PCT', 'HISPANIC_PCT', 'Young_PCT']],
    training_data['rep_vote_pct'],
)
X_test, y_test = (
    testing_data[['MALE_PCT', 'WHITE_PCT', 'HISPANIC_PCT', 'Young_PCT']],
    testing_data['rep_vote_pct'],
)

In [7]:
# Linear regression model, constructed with its default settings
linear_regression_model = LinearRegression()

# Fit on the training data. fit() returns the estimator itself, so `model`
# is the same object as `linear_regression_model`.
linear_regression_model.fit(X_train, y_train)
model = linear_regression_model

In [8]:
# Predict the target for the held-out 2020 rows
predictions_test = model.predict(X=X_test)

In [9]:
# Also predict on the training rows, so training and test error can be compared
# as an overfitting check
predictions_train = model.predict(X=X_train)

In [10]:
# Inspect the fitted parameters to see how the model weighs each feature
for label, value in (('Coefficients:', model.coef_), ('Intercept:', model.intercept_)):
    print(label, value)

Coefficients: [ 4.47992471  0.28392542 -0.30629907  0.33121076]
Intercept: -2.0561221639515077


In [11]:
# Stitch the predictions into one list (training rows first, then the 2020 test
# rows) so they can be attached back onto the full table
predictions = [*predictions_train, *predictions_test]
predictions

[0.4558132169044291,
 0.6734889292520179,
 0.536326676139868,
 0.5063851730356355,
 0.4892213474030025,
 0.5986815872895184,
 0.4805489481092655,
 0.48192169179905164,
 0.3017776558074292,
 0.4576330024563853,
 0.5044414430854336,
 0.4239400533216777,
 0.6156080535239146,
 0.49615731279260533,
 0.5462009150119806,
 0.5577956891923961,
 0.5563446368931393,
 0.5436270960167335,
 0.4498496701732049,
 0.5414673286967093,
 0.4380150989342386,
 0.48667343032446775,
 0.5251492609033983,
 0.5834443628361363,
 0.43911390032232145,
 0.5138525259375566,
 0.5886787982740298,
 0.564023990232192,
 0.5980051421032093,
 0.5755894204747927,
 0.4538906798604643,
 0.43440950185483507,
 0.42466800263708393,
 0.5060359837258464,
 0.6077758579461126,
 0.5101202714878079,
 0.5178030722409659,
 0.5671545204817394,
 0.4875424587566157,
 0.4767387248531696,
 0.4669771192526415,
 0.5825587433750421,
 0.5067959333322491,
 0.49813571921071276,
 0.6266496651783116,
 0.5637378484674196,
 0.5088914108744924,
 0.57269

In [12]:
# Attach the predictions to the full table. pd.Series(list) gets a default
# 0..n-1 index and the assignment aligns on index, so this relies on
# features_cleaned_sorted having that same 0..n-1 index.
linear_prediction_series = pd.Series(predictions)
features_cleaned_sorted['model_predicitons'] = linear_prediction_series
features_cleaned_sorted.head()

Unnamed: 0,STNAME,STATE,YEAR,TOT_POP,MALE_PCT,WHITE_PCT,HISPANIC_PCT,Young_PCT,rep_vote_pct,model_predicitons
0,Alabama,1,2000,3521922,0.474877,0.735524,0.015647,0.544908,0.575855,0.455813
1,Alaska,2,2000,469829,0.518674,0.740891,0.036541,0.624455,0.679369,0.673489
2,Arizona,4,2000,4009385,0.495596,0.893063,0.22077,0.562405,0.532826,0.536327
3,Arkansas,5,2000,2116895,0.481669,0.832044,0.028325,0.534715,0.528007,0.506385
4,California,6,2000,26200355,0.494346,0.782061,0.288755,0.595113,0.43797,0.489221


In [15]:
# Baseline error metric used to compare against the other models.
# The predictions column holds training and 2020 test rows together, so this
# MSE covers both.
mse = mean_squared_error(
    y_true=features_cleaned_sorted['rep_vote_pct'],
    y_pred=features_cleaned_sorted['model_predicitons'],
)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.00904619554343915


In [16]:
# Give the column a model-specific name, because many more columns of data
# will be added
features_cleaned_sorted = features_cleaned_sorted.rename(
    {'model_predicitons': 'linear_model_predictions'}, axis='columns'
)
features_cleaned_sorted.head(51)

Unnamed: 0,STNAME,STATE,YEAR,TOT_POP,MALE_PCT,WHITE_PCT,HISPANIC_PCT,Young_PCT,rep_vote_pct,linear_model_predictions
0,Alabama,1,2000,3521922,0.474877,0.735524,0.015647,0.544908,0.575855,0.455813
1,Alaska,2,2000,469829,0.518674,0.740891,0.036541,0.624455,0.679369,0.673489
2,Arizona,4,2000,4009385,0.495596,0.893063,0.22077,0.562405,0.532826,0.536327
3,Arkansas,5,2000,2116895,0.481669,0.832044,0.028325,0.534715,0.528007,0.506385
4,California,6,2000,26200355,0.494346,0.782061,0.288755,0.595113,0.43797,0.489221
5,Colorado,8,2000,3405978,0.501195,0.91457,0.153388,0.594183,0.544858,0.598682
6,Connecticut,9,2000,2702848,0.477235,0.87323,0.082819,0.531772,0.407443,0.480549
7,Delaware,10,2000,623603,0.478832,0.784429,0.041525,0.552255,0.432599,0.481922
8,District of Columbia,11,2000,474163,0.464252,0.37411,0.07466,0.587952,0.095123,0.301778
9,Florida,12,2000,13006409,0.482285,0.834175,0.163254,0.502145,0.499983,0.457633


In [17]:
# First polynomial regression attempt: polynomial feature expansion of degree 3
poly_features = PolynomialFeatures(3)

In [18]:
# Pipeline: polynomial feature expansion followed by a linear regression
poly_steps = (poly_features, LinearRegression())
poly_reg = make_pipeline(*poly_steps)

In [19]:
# Fit the polynomial pipeline. fit() returns the pipeline itself, so from here
# on `model` refers to the fitted polynomial pipeline, no longer the linear model.
poly_reg.fit(X_train, y_train)
model = poly_reg

In [20]:
# Predictions from the polynomial regression, for the test rows and the training rows
predictions_poly_test, predictions_poly_train = (
    model.predict(split) for split in (X_test, X_train)
)

In [22]:
# Combine into one list (training rows first, then test rows) to add to the
# predictions dataframe
predictions_poly_1 = [*predictions_poly_train, *predictions_poly_test]

In [23]:
# Collect every model's predictions next to the actual result.
# .copy() makes this an independent DataFrame: selecting columns from
# features_cleaned_sorted and then adding a column to the result is what raised
# SettingWithCopyWarning.
final_predictions_df = features_cleaned_sorted[
    ['STNAME', 'STATE', 'YEAR', 'rep_vote_pct', 'linear_model_predictions']
].copy()
# pd.Series(list) gets a 0..n-1 index and is aligned to the frame's index on assignment
final_predictions_df['polynomial_model_predictions'] = pd.Series(predictions_poly_1)
final_predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_predictions_df['polynomial_model_predictions'] = pd.Series(predictions_poly_1)


Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312
1,Alaska,2,2000,0.679369,0.673489,0.640625
2,Arizona,4,2000,0.532826,0.536327,0.564453
3,Arkansas,5,2000,0.528007,0.506385,0.566406
4,California,6,2000,0.437970,0.489221,0.435547
...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562
302,Washington,53,2020,0.400745,0.555347,0.519531
303,West Virginia,54,2020,0.697985,0.575573,0.630859
304,Wisconsin,55,2020,0.496820,0.568115,0.593750


In [24]:
# Check whether the degree-3 polynomial beats the linear model on MSE (it does)
mse_poly_3 = mean_squared_error(
    y_true=final_predictions_df['rep_vote_pct'],
    y_pred=final_predictions_df['polynomial_model_predictions'],
)
print(f'Mean Squared Error of Degree 3 Polynomial: {mse_poly_3}')

Mean Squared Error of Degree 3 Polynomial: 0.006434836979820076


In [25]:
# Degree 4 this time, with interaction_only=True: only interactions between
# different variables are generated, never a single variable squared, cubed or
# raised to the fourth power.
# This is one of the models that makes it into the presentation.
poly_features_4_only = PolynomialFeatures(interaction_only=True, degree=4)
poly_regression_4 = make_pipeline(poly_features_4_only, LinearRegression())
poly_regression_4.fit(X_train, y_train)
model3 = poly_regression_4  # fit() returns the pipeline itself
predictions_poly4_only_train, predictions_poly4_only_test = (
    model3.predict(split) for split in (X_train, X_test)
)
# Training rows first, then the test rows
predictions_poly4_only = [*predictions_poly4_only_train, *predictions_poly4_only_test]
final_predictions_df['polynomial4_only_predictions'] = pd.Series(predictions_poly4_only)
final_predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_predictions_df['polynomial4_only_predictions'] = pd.Series(predictions_poly4_only)


Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions,polynomial4_only_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312,0.535256
1,Alaska,2,2000,0.679369,0.673489,0.640625,0.694118
2,Arizona,4,2000,0.532826,0.536327,0.564453,0.569640
3,Arkansas,5,2000,0.528007,0.506385,0.566406,0.513410
4,California,6,2000,0.437970,0.489221,0.435547,0.464937
...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562,0.490846
302,Washington,53,2020,0.400745,0.555347,0.519531,0.552848
303,West Virginia,54,2020,0.697985,0.575573,0.630859,0.537330
304,Wisconsin,55,2020,0.496820,0.568115,0.593750,0.564661


In [27]:
# MSE of the degree-4 interaction-only model over all rows (training + 2020).
# The label names the interaction-only variant so the printed line cannot be
# mistaken for the full degree-4 polynomial.
mse_poly_4_only = mean_squared_error(final_predictions_df['rep_vote_pct'], final_predictions_df['polynomial4_only_predictions'])
print(f'Mean Squared Error of Degree 4 Polynomial (interaction only): {mse_poly_4_only}')

Mean Squared Error of Degree 4 Polynomial: 0.005636230358920835


In [28]:
# Full degree-4 polynomial (interaction_only left at its default)
poly_features_4 = PolynomialFeatures(4)
poly_regression4 = make_pipeline(poly_features_4, LinearRegression())
poly_regression4.fit(X_train, y_train)
model4 = poly_regression4  # fit() returns the pipeline itself
predictions_poly4_train, predictions_poly4_test = (
    model4.predict(split) for split in (X_train, X_test)
)
# Training rows first, then the test rows
predictions_poly4 = [*predictions_poly4_train, *predictions_poly4_test]
final_predictions_df['polynomial4_predictions'] = pd.Series(predictions_poly4)
final_predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_predictions_df['polynomial4_predictions'] = pd.Series(predictions_poly4)


Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions,polynomial4_only_predictions,polynomial4_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312,0.535256,0.587891
1,Alaska,2,2000,0.679369,0.673489,0.640625,0.694118,0.640625
2,Arizona,4,2000,0.532826,0.536327,0.564453,0.569640,0.615234
3,Arkansas,5,2000,0.528007,0.506385,0.566406,0.513410,0.541016
4,California,6,2000,0.437970,0.489221,0.435547,0.464937,0.505859
...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562,0.490846,0.574219
302,Washington,53,2020,0.400745,0.555347,0.519531,0.552848,0.644531
303,West Virginia,54,2020,0.697985,0.575573,0.630859,0.537330,0.757812
304,Wisconsin,55,2020,0.496820,0.568115,0.593750,0.564661,0.560547


In [29]:
# MSE of the full degree-4 polynomial over all rows (training + 2020).
# In the recorded run this is only marginally below the linear model's combined
# MSE and clearly above the other polynomial variants.
mse_poly_4 = mean_squared_error(final_predictions_df['rep_vote_pct'], final_predictions_df['polynomial4_predictions'])
# Label fixed: this is the degree-4 model (the message previously said "Degree 3")
print(f'Mean Squared Error of Degree 4 Polynomial: {mse_poly_4}')
# Root-mean-squared error, displayed as the cell result
np.sqrt(mse_poly_4)

Mean Squared Error of Degree 3 Polynomial: 0.00874885712314712


0.0935353255361156

In [30]:
# Degree 3 restricted to interaction terms (this one makes it into the final presentation)
poly_features_3_only = PolynomialFeatures(interaction_only=True, degree=3)
poly_regression_3only = make_pipeline(poly_features_3_only, LinearRegression())
poly_regression_3only.fit(X_train, y_train)
model3only = poly_regression_3only  # fit() returns the pipeline itself
predictions_poly3_only_train, predictions_poly3_only_test = (
    model3only.predict(split) for split in (X_train, X_test)
)
# Training rows first, then the test rows
predictions_poly3_only = [*predictions_poly3_only_train, *predictions_poly3_only_test]
final_predictions_df['polynomial_3_only_predictions'] = pd.Series(predictions_poly3_only)
final_predictions_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_predictions_df['polynomial_3_only_predictions'] = pd.Series(predictions_poly3_only)


Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions,polynomial4_only_predictions,polynomial4_predictions,polynomial_3_only_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312,0.535256,0.587891,0.536025
1,Alaska,2,2000,0.679369,0.673489,0.640625,0.694118,0.640625,0.712766
2,Arizona,4,2000,0.532826,0.536327,0.564453,0.569640,0.615234,0.579054
3,Arkansas,5,2000,0.528007,0.506385,0.566406,0.513410,0.541016,0.515258
4,California,6,2000,0.437970,0.489221,0.435547,0.464937,0.505859,0.447349
...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562,0.490846,0.574219,0.489971
302,Washington,53,2020,0.400745,0.555347,0.519531,0.552848,0.644531,0.551020
303,West Virginia,54,2020,0.697985,0.575573,0.630859,0.537330,0.757812,0.569085
304,Wisconsin,55,2020,0.496820,0.568115,0.593750,0.564661,0.560547,0.565939


In [31]:
# MSE of the degree-3 interaction-only model over all rows (training + 2020).
# The label names the interaction-only variant; the full degree-3 polynomial
# prints an otherwise identical message.
mse_poly_3_only = mean_squared_error(final_predictions_df['rep_vote_pct'], final_predictions_df['polynomial_3_only_predictions'])
print(f'Mean Squared Error of Degree 3 Polynomial (interaction only): {mse_poly_3_only}')
# Root-mean-squared error, displayed as the cell result
np.sqrt(mse_poly_3_only)

Mean Squared Error of Degree 3 Polynomial: 0.005690061018229299


0.075432493119539

In [32]:
# Make a list of mse's to see the best ones
# Order: linear, degree-3 polynomial, degree-4 interaction-only, degree-4
# polynomial, degree-3 interaction-only. Later cells append further entries, so
# list position is the only thing identifying which model a value belongs to.
mse_list = [mse, mse_poly_3, mse_poly_4_only, mse_poly_4, mse_poly_3_only]
mse_list

[0.00904619554343915,
 0.006434836979820076,
 0.005636230358920835,
 0.00874885712314712,
 0.005690061018229299]

In [33]:
# Another attempt: degree 2 restricted to interaction terms
poly_features_2_only = PolynomialFeatures(interaction_only=True, degree=2)
poly_regression_2only = make_pipeline(poly_features_2_only, LinearRegression())
poly_regression_2only.fit(X_train, y_train)
model2_only = poly_regression_2only  # fit() returns the pipeline itself
predictions_poly2_only_train, predictions_poly2_only_test = (
    model2_only.predict(split) for split in (X_train, X_test)
)
# Training rows first, then the test rows
predictions_poly2_only = [*predictions_poly2_only_train, *predictions_poly2_only_test]
final_predictions_df['polynomial_2_only_predictions'] = pd.Series(predictions_poly2_only)
final_predictions_df

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions,polynomial4_only_predictions,polynomial4_predictions,polynomial_3_only_predictions,polynomial_2_only_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312,0.535256,0.587891,0.536025,0.491116
1,Alaska,2,2000,0.679369,0.673489,0.640625,0.694118,0.640625,0.712766,0.641360
2,Arizona,4,2000,0.532826,0.536327,0.564453,0.569640,0.615234,0.579054,0.614263
3,Arkansas,5,2000,0.528007,0.506385,0.566406,0.513410,0.541016,0.515258,0.511789
4,California,6,2000,0.437970,0.489221,0.435547,0.464937,0.505859,0.447349,0.416263
...,...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562,0.490846,0.574219,0.489971,0.496328
302,Washington,53,2020,0.400745,0.555347,0.519531,0.552848,0.644531,0.551020,0.548741
303,West Virginia,54,2020,0.697985,0.575573,0.630859,0.537330,0.757812,0.569085,0.505611
304,Wisconsin,55,2020,0.496820,0.568115,0.593750,0.564661,0.560547,0.565939,0.554746


In [34]:
# MSE (and RMSE) of the degree-2 interaction-only model, appended to the running
# comparison list. Re-running this cell appends the value again.
# The label names the interaction-only variant; the full degree-2 polynomial
# prints an otherwise identical message.
mse_poly_2_only = mean_squared_error(final_predictions_df['rep_vote_pct'], final_predictions_df['polynomial_2_only_predictions'])
print(f'Mean Squared Error of Degree 2 Polynomial (interaction only): {mse_poly_2_only}')
mse_list.append(mse_poly_2_only)
print(np.sqrt(mse_poly_2_only))
mse_list

Mean Squared Error of Degree 2 Polynomial: 0.006762650567860223
0.08223533649143915


[0.00904619554343915,
 0.006434836979820076,
 0.005636230358920835,
 0.00874885712314712,
 0.005690061018229299,
 0.006762650567860223]

In [35]:
# Full degree-2 polynomial (interaction_only left at its default)
poly_features_2 = PolynomialFeatures(2)
poly_regression_2 = make_pipeline(poly_features_2, LinearRegression())
poly_regression_2.fit(X_train, y_train)
model2 = poly_regression_2  # fit() returns the pipeline itself
predictions_poly2_train, predictions_poly2_test = (
    model2.predict(split) for split in (X_train, X_test)
)
# Training rows first, then the test rows
predictions_poly2 = [*predictions_poly2_train, *predictions_poly2_test]
final_predictions_df['polynomial_2_predictions'] = pd.Series(predictions_poly2)
final_predictions_df

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions,polynomial4_only_predictions,polynomial4_predictions,polynomial_3_only_predictions,polynomial_2_only_predictions,polynomial_2_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312,0.535256,0.587891,0.536025,0.491116,0.510381
1,Alaska,2,2000,0.679369,0.673489,0.640625,0.694118,0.640625,0.712766,0.641360,0.681628
2,Arizona,4,2000,0.532826,0.536327,0.564453,0.569640,0.615234,0.579054,0.614263,0.561861
3,Arkansas,5,2000,0.528007,0.506385,0.566406,0.513410,0.541016,0.515258,0.511789,0.532629
4,California,6,2000,0.437970,0.489221,0.435547,0.464937,0.505859,0.447349,0.416263,0.452651
...,...,...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562,0.490846,0.574219,0.489971,0.496328,0.543717
302,Washington,53,2020,0.400745,0.555347,0.519531,0.552848,0.644531,0.551020,0.548741,0.563974
303,West Virginia,54,2020,0.697985,0.575573,0.630859,0.537330,0.757812,0.569085,0.505611,0.548485
304,Wisconsin,55,2020,0.496820,0.568115,0.593750,0.564661,0.560547,0.565939,0.554746,0.576261


In [36]:
# Error of the full degree-2 polynomial over all rows; record it in the
# comparison list and show the RMSE as well
mse_poly_2 = mean_squared_error(
    y_true=final_predictions_df['rep_vote_pct'],
    y_pred=final_predictions_df['polynomial_2_predictions'],
)
mse_list.append(mse_poly_2)
print(f'Mean Squared Error of Degree 2 Polynomial: {mse_poly_2}')
rmse_poly_2 = np.sqrt(mse_poly_2)
print(rmse_poly_2)
mse_list

Mean Squared Error of Degree 2 Polynomial: 0.0054117217248442605
0.07356440528437826


[0.00904619554343915,
 0.006434836979820076,
 0.005636230358920835,
 0.00874885712314712,
 0.005690061018229299,
 0.006762650567860223,
 0.0054117217248442605]

In [37]:
# Bayesian ridge regression for comparison (it ends up performing about the
# same as the linear model)
bayesian = BayesianRidge()
bayesian.fit(X_train, y_train)
bayesianmodel = bayesian  # fit() returns the estimator itself
predictions_bay_train, predictions_bay_test = (
    bayesianmodel.predict(split) for split in (X_train, X_test)
)
# Training rows first, then the test rows
predictions_bay = [*predictions_bay_train, *predictions_bay_test]
final_predictions_df['bay_predictions'] = pd.Series(predictions_bay)
final_predictions_df

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions,polynomial4_only_predictions,polynomial4_predictions,polynomial_3_only_predictions,polynomial_2_only_predictions,polynomial_2_predictions,bay_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312,0.535256,0.587891,0.536025,0.491116,0.510381,0.460030
1,Alaska,2,2000,0.679369,0.673489,0.640625,0.694118,0.640625,0.712766,0.641360,0.681628,0.661968
2,Arizona,4,2000,0.532826,0.536327,0.564453,0.569640,0.615234,0.579054,0.614263,0.561861,0.535623
3,Arkansas,5,2000,0.528007,0.506385,0.566406,0.513410,0.541016,0.515258,0.511789,0.532629,0.508781
4,California,6,2000,0.437970,0.489221,0.435547,0.464937,0.505859,0.447349,0.416263,0.452651,0.488935
...,...,...,...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562,0.490846,0.574219,0.489971,0.496328,0.543717,0.477996
302,Washington,53,2020,0.400745,0.555347,0.519531,0.552848,0.644531,0.551020,0.548741,0.563974,0.549720
303,West Virginia,54,2020,0.697985,0.575573,0.630859,0.537330,0.757812,0.569085,0.505611,0.548485,0.571965
304,Wisconsin,55,2020,0.496820,0.568115,0.593750,0.564661,0.560547,0.565939,0.554746,0.576261,0.563864


In [38]:
# MSE (and RMSE) of the Bayesian ridge model over all rows, appended to the
# running comparison list. Re-running this cell appends the value again.
mse_bay = mean_squared_error(final_predictions_df['rep_vote_pct'], final_predictions_df['bay_predictions'])
# Label fixed: this is the Bayesian ridge result (the message previously said
# "Degree 2 Polynomial")
print(f'Mean Squared Error of Bayesian Ridge: {mse_bay}')
mse_list.append(mse_bay)
print(np.sqrt(mse_bay))
mse_list

Mean Squared Error of Degree 2 Polynomial: 0.009039685178550205
0.09507725899788132


[0.00904619554343915,
 0.006434836979820076,
 0.005636230358920835,
 0.00874885712314712,
 0.005690061018229299,
 0.006762650567860223,
 0.0054117217248442605,
 0.009039685178550205]

In [39]:
# Try the higher-degree polynomials, degrees 5 through 10.
# Note: these end up severely overfitting the data.
for degree in range(5, 11):
    poly_features = PolynomialFeatures(degree)
    poly_regression = make_pipeline(poly_features, LinearRegression())
    poly_regression.fit(X_train, y_train)
    model = poly_regression  # fit() returns the pipeline itself
    predictions_poly_train, predictions_poly_test = (
        model.predict(split) for split in (X_train, X_test)
    )
    # Training rows first, then the test rows
    predictions_poly = [*predictions_poly_train, *predictions_poly_test]
    final_predictions_df[f'polynomial_{degree}_predictions'] = pd.Series(predictions_poly)
final_predictions_df

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial_model_predictions,polynomial4_only_predictions,polynomial4_predictions,polynomial_3_only_predictions,polynomial_2_only_predictions,polynomial_2_predictions,bay_predictions,polynomial_5_predictions,polynomial_6_predictions,polynomial_7_predictions,polynomial_8_predictions,polynomial_9_predictions,polynomial_10_predictions
0,Alabama,1,2000,0.575855,0.455813,0.570312,0.535256,0.587891,0.536025,0.491116,0.510381,0.460030,0.570801,0.583008,0.575880,0.575855,0.575855,0.575855
1,Alaska,2,2000,0.679369,0.673489,0.640625,0.694118,0.640625,0.712766,0.641360,0.681628,0.661968,0.687500,0.685547,0.679363,0.679370,0.679369,0.679370
2,Arizona,4,2000,0.532826,0.536327,0.564453,0.569640,0.615234,0.579054,0.614263,0.561861,0.535623,0.562012,0.473633,0.532855,0.532826,0.532826,0.532826
3,Arkansas,5,2000,0.528007,0.506385,0.566406,0.513410,0.541016,0.515258,0.511789,0.532629,0.508781,0.531250,0.567383,0.527990,0.528007,0.528007,0.528007
4,California,6,2000,0.437970,0.489221,0.435547,0.464937,0.505859,0.447349,0.416263,0.452651,0.488935,0.447266,0.340820,0.437997,0.437970,0.437970,0.437970
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.476562,0.490846,0.574219,0.489971,0.496328,0.543717,0.477996,0.382324,1.367188,-3.522718,0.010591,0.204758,0.272695
302,Washington,53,2020,0.400745,0.555347,0.519531,0.552848,0.644531,0.551020,0.548741,0.563974,0.549720,0.375000,11.982422,-1.583221,1.242138,0.866982,-0.110355
303,West Virginia,54,2020,0.697985,0.575573,0.630859,0.537330,0.757812,0.569085,0.505611,0.548485,0.571965,0.815918,0.808594,0.191940,-0.531846,0.590320,1.031090
304,Wisconsin,55,2020,0.496820,0.568115,0.593750,0.564661,0.560547,0.565939,0.554746,0.576261,0.563864,0.535645,0.070312,-0.395955,-0.050090,0.229268,0.480271


In [40]:
# MSE over all rows for the degree 5-10 polynomials, appended to the comparison list.
# Note: this is what overfitted data looks like -- the recorded values are
# orders of magnitude above the MSEs of the other models.
for i in range(5, 11):
    # Use a dedicated name: reusing `mse` here would overwrite the linear
    # model's MSE that an earlier cell stored under that name.
    mse_high_degree = mean_squared_error(
        final_predictions_df['rep_vote_pct'],
        final_predictions_df[f'polynomial_{i}_predictions'],
    )
    print(f'Mean Squared Error of Degree {i} Polynomial: {mse_high_degree}')
    mse_list.append(mse_high_degree)
    print(np.sqrt(mse_high_degree))
mse_list

Mean Squared Error of Degree 5 Polynomial: 0.11130944707462899
0.3336307046340744
Mean Squared Error of Degree 6 Polynomial: 784.1135159195686
28.00202699662238
Mean Squared Error of Degree 7 Polynomial: 7919.329765521813
88.99061616553631
Mean Squared Error of Degree 8 Polynomial: 6069.451737734956
77.90668609134235
Mean Squared Error of Degree 9 Polynomial: 2615.2039912468695
51.139065216787735
Mean Squared Error of Degree 10 Polynomial: 1400.0161909034516
37.41679022716208


[0.00904619554343915,
 0.006434836979820076,
 0.005636230358920835,
 0.00874885712314712,
 0.005690061018229299,
 0.006762650567860223,
 0.0054117217248442605,
 0.009039685178550205,
 0.11130944707462899,
 784.1135159195686,
 7919.329765521813,
 6069.451737734956,
 2615.2039912468695,
 1400.0161909034516]

In [41]:
# Compare each model's MSE on the 2020 test rows with its MSE over all rows,
# to make sure the two are not significantly different.
# The year mask and the actual 2020 values do not change per model, so build them once.
is_2020 = final_predictions_df['YEAR'] == 2020
actual_2020 = final_predictions_df.loc[is_2020, 'rep_vote_pct']

mse_test_list = []
# Prediction columns start at position 4 (after STNAME, STATE, YEAR and
# rep_vote_pct). The end comes from the frame itself rather than a hard-coded
# 18, so adding or removing a model column does not silently skip or break it.
for i in range(4, final_predictions_df.shape[1]):
    column = final_predictions_df.iloc[:, i]
    mse_2020 = mean_squared_error(actual_2020, column.loc[is_2020])
    print(f'Mean Squared Error of Column {i}: {mse_2020}')
    mse_test_list.append(mse_2020)
    print(np.sqrt(mse_2020))
print(mse_list)
print(mse_test_list)

Mean Squared Error of Column 4: 0.010630413978006134
0.10310389894667482
Mean Squared Error of Column 5: 0.013075641344233602
0.11434877062843134
Mean Squared Error of Column 6: 0.006865364372530589
0.0828574943655104
Mean Squared Error of Column 7: 0.026151410610864866
0.1617139777844354
Mean Squared Error of Column 8: 0.006961038799924918
0.08343284005668822
Mean Squared Error of Column 9: 0.008055987051554272
0.08975515055724809
Mean Squared Error of Column 10: 0.0071984008457326365
0.0848433901122099
Mean Squared Error of Column 11: 0.010529486486927335
0.10261328611309227
Mean Squared Error of Column 12: 0.6564828081983636
0.8102362668989606
Mean Squared Error of Column 13: 4704.674832491026
68.5906322502645
Mean Squared Error of Column 14: 47515.97859312469
217.98160150142186
Mean Squared Error of Column 15: 36416.710426409736
190.83162847497198
Mean Squared Error of Column 16: 15691.223947481216
125.26461570404156
Mean Squared Error of Column 17: 8400.097145420708
91.65204386930

In [42]:
# Decided the best models were the Poly_4_only, Poly_3_only, and the poly_2;
# the linear model is kept as a comparison.
# .copy() makes this an independent DataFrame, so adding columns to it later
# does not raise SettingWithCopyWarning.
best_pred_df = final_predictions_df[
    [
        'STNAME', 'STATE', 'YEAR', 'rep_vote_pct',
        'linear_model_predictions',
        'polynomial4_only_predictions',
        'polynomial_3_only_predictions',
        'polynomial_2_predictions',
    ]
].copy()
best_pred_df.head(51)

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial4_only_predictions,polynomial_3_only_predictions,polynomial_2_predictions
0,Alabama,1,2000,0.575855,0.455813,0.535256,0.536025,0.510381
1,Alaska,2,2000,0.679369,0.673489,0.694118,0.712766,0.681628
2,Arizona,4,2000,0.532826,0.536327,0.56964,0.579054,0.561861
3,Arkansas,5,2000,0.528007,0.506385,0.51341,0.515258,0.532629
4,California,6,2000,0.43797,0.489221,0.464937,0.447349,0.452651
5,Colorado,8,2000,0.544858,0.598682,0.594004,0.616979,0.639986
6,Connecticut,9,2000,0.407443,0.480549,0.485984,0.489185,0.423337
7,Delaware,10,2000,0.432599,0.481922,0.505255,0.507125,0.504107
8,District of Columbia,11,2000,0.095123,0.301778,0.125381,0.104802,0.073711
9,Florida,12,2000,0.499983,0.457633,0.454786,0.457432,0.417037


In [43]:
# Created an electoral votes dictionary for 2000 through 2020.
# Keys are state names (plus the District of Columbia); each value is a list of
# six electoral-vote counts, one per election in the data.
# NOTE(review): presumably the positions correspond to the 2000, 2004, 2008,
# 2012, 2016 and 2020 elections in that order -- confirm against the YEAR column.
# The key order matters: a later cell flattens this dict in insertion order and
# attaches the values to the dataframe by row position.
electoral_votes_dict = {
 'Alabama': [9, 9, 9, 9, 9, 9],
 'Alaska': [3, 3, 3, 3, 3, 3],
 'Arizona': [8, 10, 10, 11, 11, 11],
 'Arkansas': [6, 6, 6, 6, 6, 6],
 'California': [54, 55, 55, 55, 55, 55],
 'Colorado': [8, 9, 9, 9, 9, 9],
 'Connecticut': [8, 7, 7, 7, 7, 7],
 'Delaware': [3, 3, 3, 3, 3, 3],
 'District of Columbia': [3, 3, 3, 3, 3, 3],
 'Florida': [25, 27, 27, 29, 29, 29],
 'Georgia': [13, 15, 15, 16, 16, 16],
 'Hawaii': [4, 4, 4, 4, 4, 4],
 'Idaho': [4, 4, 4, 4, 4, 4],
 'Illinois': [22, 21, 21, 20, 20, 20],
 'Indiana': [12, 11, 11, 11, 11, 11],
 'Iowa': [7, 7, 7, 6, 6, 6],
 'Kansas': [6, 6, 6, 6, 6, 6],
 'Kentucky': [8, 8, 8, 8, 8, 8],
 'Louisiana': [9, 9, 9, 8, 8, 8],
 'Maine': [4, 4, 4, 4, 4, 4],
 'Maryland': [10, 10, 10, 10, 10, 10],
 'Massachusetts': [12, 12, 12, 11, 11, 11],
 'Michigan': [18, 17, 17, 16, 16, 16],
 'Minnesota': [10, 10, 10, 10, 10, 10],
 'Mississippi': [7, 6, 6, 6, 6, 6],
 'Missouri': [11, 11, 11, 10, 10, 10],
 'Montana': [3, 3, 3, 3, 3, 3],
 'Nebraska': [5, 5, 5, 5, 5, 5],
 'Nevada': [4, 5, 5, 6, 6, 6],
 'New Hampshire': [4, 4, 4, 4, 4, 4],
 'New Jersey': [15, 15, 15, 14, 14, 14],
 'New Mexico': [5, 5, 5, 5, 5, 5],
 'New York': [33, 31, 31, 29, 29, 29],
 'North Carolina': [14, 15, 15, 15, 15, 15],
 'North Dakota': [3, 3, 3, 3, 3, 3],
 'Ohio': [21, 20, 20, 18, 18, 18],
 'Oklahoma': [8, 7, 7, 7, 7, 7],
 'Oregon': [7, 7, 7, 7, 7, 7],
 'Pennsylvania': [23, 21, 21, 20, 20, 20],
 'Rhode Island': [4, 4, 4, 4, 4, 4],
 'South Carolina': [8, 8, 8, 9, 9, 9],
 'South Dakota': [3, 3, 3, 3, 3, 3],
 'Tennessee': [11, 11, 11, 11, 11, 11],
 'Texas': [32, 34, 34, 38, 38, 38],
 'Utah': [5, 5, 5, 6, 6, 6],
 'Vermont': [3, 3, 3, 3, 3, 3],
 'Virginia': [13, 13, 13, 13, 13, 13],
 'Washington': [11, 11, 11, 12, 12, 12],
 'West Virginia': [5, 5, 5, 5, 5, 5],
 'Wisconsin': [11, 10, 10, 10, 10, 10],
 'Wyoming': [3, 3, 3, 3, 3, 3],
}
    

In [44]:
# Flatten the dictionary into the dataframe's row order: all states for the
# first list position, then all states for the second, and so on through the
# sixth. States come out in the dictionary's insertion order.
ev_list = [
    state_votes[election_idx]
    for election_idx in range(6)
    for state_votes in electoral_votes_dict.values()
]
ev_list

[9,
 3,
 8,
 6,
 54,
 8,
 8,
 3,
 3,
 25,
 13,
 4,
 4,
 22,
 12,
 7,
 6,
 8,
 9,
 4,
 10,
 12,
 18,
 10,
 7,
 11,
 3,
 5,
 4,
 4,
 15,
 5,
 33,
 14,
 3,
 21,
 8,
 7,
 23,
 4,
 8,
 3,
 11,
 32,
 5,
 3,
 13,
 11,
 5,
 11,
 3,
 9,
 3,
 10,
 6,
 55,
 9,
 7,
 3,
 3,
 27,
 15,
 4,
 4,
 21,
 11,
 7,
 6,
 8,
 9,
 4,
 10,
 12,
 17,
 10,
 6,
 11,
 3,
 5,
 5,
 4,
 15,
 5,
 31,
 15,
 3,
 20,
 7,
 7,
 21,
 4,
 8,
 3,
 11,
 34,
 5,
 3,
 13,
 11,
 5,
 10,
 3,
 9,
 3,
 10,
 6,
 55,
 9,
 7,
 3,
 3,
 27,
 15,
 4,
 4,
 21,
 11,
 7,
 6,
 8,
 9,
 4,
 10,
 12,
 17,
 10,
 6,
 11,
 3,
 5,
 5,
 4,
 15,
 5,
 31,
 15,
 3,
 20,
 7,
 7,
 21,
 4,
 8,
 3,
 11,
 34,
 5,
 3,
 13,
 11,
 5,
 10,
 3,
 9,
 3,
 11,
 6,
 55,
 9,
 7,
 3,
 3,
 29,
 16,
 4,
 4,
 20,
 11,
 6,
 6,
 8,
 8,
 4,
 10,
 11,
 16,
 10,
 6,
 10,
 3,
 5,
 6,
 4,
 14,
 5,
 29,
 15,
 3,
 18,
 7,
 7,
 20,
 4,
 9,
 3,
 11,
 38,
 6,
 3,
 13,
 12,
 5,
 10,
 3,
 9,
 3,
 11,
 6,
 55,
 9,
 7,
 3,
 3,
 29,
 16,
 4,
 4,
 20,
 11,
 6,
 6,
 8,
 8,
 4,
 10,
 11,
 16,


In [45]:
# Attach the electoral votes to the predictions.
# best_pred_df was created as a slice of another DataFrame, so it is detached with an
# explicit copy first; assigning a column to the slice triggers SettingWithCopyWarning.
best_pred_df = best_pred_df.copy()
# Build the Series on the frame's own index: values are attached by position, and a
# length mismatch raises instead of silently leaving NaN in unmatched rows.
best_pred_df['electoral_votes'] = pd.Series(ev_list, index=best_pred_df.index)
best_pred_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  best_pred_df['electoral_votes'] = pd.Series(ev_list)


Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial4_only_predictions,polynomial_3_only_predictions,polynomial_2_predictions,electoral_votes
0,Alabama,1,2000,0.575855,0.455813,0.535256,0.536025,0.510381,9
1,Alaska,2,2000,0.679369,0.673489,0.694118,0.712766,0.681628,3
2,Arizona,4,2000,0.532826,0.536327,0.569640,0.579054,0.561861,8
3,Arkansas,5,2000,0.528007,0.506385,0.513410,0.515258,0.532629,6
4,California,6,2000,0.437970,0.489221,0.464937,0.447349,0.452651,54
...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.490846,0.489971,0.543717,13
302,Washington,53,2020,0.400745,0.555347,0.552848,0.551020,0.563974,12
303,West Virginia,54,2020,0.697985,0.575573,0.537330,0.569085,0.548485,5
304,Wisconsin,55,2020,0.496820,0.568115,0.564661,0.565939,0.576261,10


In [46]:
# Convert every Republican vote-share column (actual and per model) into a 0/1 outcome by
# rounding: a share that rounds to 1 marks a Republican win, one that rounds to 0 a
# Democratic win.
# Source columns are looked up by name rather than by position, so adding or reordering
# columns in best_pred_df cannot silently shift which model feeds which result column.
result_sources = {
    'actual_result': 'rep_vote_pct',
    'linear_result': 'linear_model_predictions',
    'poly_4_result': 'polynomial4_only_predictions',
    'poly_3_result': 'polynomial_3_only_predictions',
    'poly_2_result': 'polynomial_2_predictions',
}
# Kept as a plain list because later cells iterate over `columns`
columns = list(result_sources)
outcome_df = best_pred_df.copy()
for result_col, source_col in result_sources.items():
    outcome_df[result_col] = best_pred_df[source_col].round()
outcome_df

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial4_only_predictions,polynomial_3_only_predictions,polynomial_2_predictions,electoral_votes,actual_result,linear_result,poly_4_result,poly_3_result,poly_2_result
0,Alabama,1,2000,0.575855,0.455813,0.535256,0.536025,0.510381,9,1.0,0.0,1.0,1.0,1.0
1,Alaska,2,2000,0.679369,0.673489,0.694118,0.712766,0.681628,3,1.0,1.0,1.0,1.0,1.0
2,Arizona,4,2000,0.532826,0.536327,0.569640,0.579054,0.561861,8,1.0,1.0,1.0,1.0,1.0
3,Arkansas,5,2000,0.528007,0.506385,0.513410,0.515258,0.532629,6,1.0,1.0,1.0,1.0,1.0
4,California,6,2000,0.437970,0.489221,0.464937,0.447349,0.452651,54,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.490846,0.489971,0.543717,13,0.0,0.0,0.0,0.0,1.0
302,Washington,53,2020,0.400745,0.555347,0.552848,0.551020,0.563974,12,0.0,1.0,1.0,1.0,1.0
303,West Virginia,54,2020,0.697985,0.575573,0.537330,0.569085,0.548485,5,1.0,1.0,1.0,1.0,1.0
304,Wisconsin,55,2020,0.496820,0.568115,0.564661,0.565939,0.576261,10,0.0,1.0,1.0,1.0,1.0


In [47]:
# Turns out that Florida 2000 was...interesting: its Republican share sits just below 0.5
# and therefore rounds to 0, so the result is manually changed from 0 to 1 (Dem to Republican).
# The row is selected by state and year instead of a hard-coded row label, so the override
# keeps hitting the right row if the frame is re-sorted or re-indexed.
florida_2000 = (outcome_df['STNAME'] == 'Florida') & (outcome_df['YEAR'] == 2000)
assert florida_2000.sum() == 1, "expected exactly one Florida 2000 row to override"
outcome_df.loc[florida_2000, 'actual_result'] = 1
outcome_df.loc[outcome_df['STNAME'] == 'Florida']

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial4_only_predictions,polynomial_3_only_predictions,polynomial_2_predictions,electoral_votes,actual_result,linear_result,poly_4_result,poly_3_result,poly_2_result
9,Florida,12,2000,0.499983,0.457633,0.454786,0.457432,0.417037,25,1.0,0.0,0.0,0.0,0.0
60,Florida,12,2004,0.525237,0.456949,0.447443,0.450037,0.425764,27,1.0,0.0,0.0,0.0,0.0
111,Florida,12,2008,0.485823,0.451203,0.434967,0.438593,0.427366,27,0.0,0.0,0.0,0.0,0.0
162,Florida,12,2012,0.495577,0.423509,0.392203,0.396235,0.394272,29,0.0,0.0,0.0,0.0,0.0
213,Florida,12,2016,0.506188,0.412048,0.374122,0.378194,0.386221,29,1.0,0.0,0.0,0.0,0.0
264,Florida,12,2020,0.516948,0.422439,0.392178,0.403671,0.405975,29,1.0,0.0,0.0,0.0,0.0


In [48]:
# Translate the 0/1 outcome codes into party labels, adding one "<column>_mapped"
# column for each result column
mapping = {0: 'DEMOCRAT', 1: 'REPUBLICAN'}
for result_col in columns:
    party_labels = outcome_df[result_col].map(mapping)
    outcome_df[f"{result_col}_mapped"] = party_labels
outcome_df

Unnamed: 0,STNAME,STATE,YEAR,rep_vote_pct,linear_model_predictions,polynomial4_only_predictions,polynomial_3_only_predictions,polynomial_2_predictions,electoral_votes,actual_result,linear_result,poly_4_result,poly_3_result,poly_2_result,actual_result_mapped,linear_result_mapped,poly_4_result_mapped,poly_3_result_mapped,poly_2_result_mapped
0,Alabama,1,2000,0.575855,0.455813,0.535256,0.536025,0.510381,9,1.0,0.0,1.0,1.0,1.0,REPUBLICAN,DEMOCRAT,REPUBLICAN,REPUBLICAN,REPUBLICAN
1,Alaska,2,2000,0.679369,0.673489,0.694118,0.712766,0.681628,3,1.0,1.0,1.0,1.0,1.0,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN
2,Arizona,4,2000,0.532826,0.536327,0.569640,0.579054,0.561861,8,1.0,1.0,1.0,1.0,1.0,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN
3,Arkansas,5,2000,0.528007,0.506385,0.513410,0.515258,0.532629,6,1.0,1.0,1.0,1.0,1.0,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN
4,California,6,2000,0.437970,0.489221,0.464937,0.447349,0.452651,54,0.0,0.0,0.0,0.0,0.0,DEMOCRAT,DEMOCRAT,DEMOCRAT,DEMOCRAT,DEMOCRAT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,Virginia,51,2020,0.448453,0.480330,0.490846,0.489971,0.543717,13,0.0,0.0,0.0,0.0,1.0,DEMOCRAT,DEMOCRAT,DEMOCRAT,DEMOCRAT,REPUBLICAN
302,Washington,53,2020,0.400745,0.555347,0.552848,0.551020,0.563974,12,0.0,1.0,1.0,1.0,1.0,DEMOCRAT,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN
303,West Virginia,54,2020,0.697985,0.575573,0.537330,0.569085,0.548485,5,1.0,1.0,1.0,1.0,1.0,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN
304,Wisconsin,55,2020,0.496820,0.568115,0.564661,0.565939,0.576261,10,0.0,1.0,1.0,1.0,1.0,DEMOCRAT,REPUBLICAN,REPUBLICAN,REPUBLICAN,REPUBLICAN


In [49]:
# Group the state rows by election year and by the party that actually won the state;
# the same grouping is repeated below for each model's predicted winner
actual_group_keys = ['YEAR', 'actual_result_mapped']
outcome_groupby = outcome_df.groupby(by=actual_group_keys)

In [50]:
# Electoral votes actually won by each party in each election year
outcome_groupby['electoral_votes'].agg('sum')

YEAR  actual_result_mapped
2000  DEMOCRAT                267
      REPUBLICAN              271
2004  DEMOCRAT                252
      REPUBLICAN              286
2008  DEMOCRAT                364
      REPUBLICAN              174
2012  DEMOCRAT                332
      REPUBLICAN              206
2016  DEMOCRAT                233
      REPUBLICAN              305
2020  DEMOCRAT                306
      REPUBLICAN              232
Name: electoral_votes, dtype: int64

In [51]:
# Electoral votes per election year, split by the party the linear model assigned each state to
linear_group_keys = ['YEAR', 'linear_result_mapped']
outcome_groupby_linear = outcome_df.groupby(by=linear_group_keys)
outcome_groupby_linear['electoral_votes'].agg('sum')

YEAR  linear_result_mapped
2000  DEMOCRAT                286
      REPUBLICAN              252
2004  DEMOCRAT                327
      REPUBLICAN              211
2008  DEMOCRAT                344
      REPUBLICAN              194
2012  DEMOCRAT                372
      REPUBLICAN              166
2016  DEMOCRAT                378
      REPUBLICAN              160
2020  DEMOCRAT                336
      REPUBLICAN              202
Name: electoral_votes, dtype: int64

In [52]:
# Electoral votes per election year, split by the party the degree-4 polynomial model assigned each state to
poly4_group_keys = ['YEAR', 'poly_4_result_mapped']
outcome_groupby_poly4 = outcome_df.groupby(by=poly4_group_keys)
outcome_groupby_poly4['electoral_votes'].agg('sum')

YEAR  poly_4_result_mapped
2000  DEMOCRAT                225
      REPUBLICAN              313
2004  DEMOCRAT                252
      REPUBLICAN              286
2008  DEMOCRAT                294
      REPUBLICAN              244
2012  DEMOCRAT                330
      REPUBLICAN              208
2016  DEMOCRAT                312
      REPUBLICAN              226
2020  DEMOCRAT                283
      REPUBLICAN              255
Name: electoral_votes, dtype: int64

In [53]:
# Electoral votes per election year, split by the party the degree-3 polynomial model assigned each state to
poly3_group_keys = ['YEAR', 'poly_3_result_mapped']
outcome_groupby_poly3 = outcome_df.groupby(by=poly3_group_keys)
outcome_groupby_poly3['electoral_votes'].agg('sum')

YEAR  poly_3_result_mapped
2000  DEMOCRAT                224
      REPUBLICAN              314
2004  DEMOCRAT                252
      REPUBLICAN              286
2008  DEMOCRAT                294
      REPUBLICAN              244
2012  DEMOCRAT                321
      REPUBLICAN              217
2016  DEMOCRAT                317
      REPUBLICAN              221
2020  DEMOCRAT                277
      REPUBLICAN              261
Name: electoral_votes, dtype: int64

In [54]:
# Electoral votes per election year, split by the party the degree-2 polynomial model assigned each state to
poly2_group_keys = ['YEAR', 'poly_2_result_mapped']
outcome_groupby_poly2 = outcome_df.groupby(by=poly2_group_keys)
outcome_groupby_poly2['electoral_votes'].agg('sum')

YEAR  poly_2_result_mapped
2000  DEMOCRAT                225
      REPUBLICAN              313
2004  DEMOCRAT                225
      REPUBLICAN              313
2008  DEMOCRAT                229
      REPUBLICAN              309
2012  DEMOCRAT                286
      REPUBLICAN              252
2016  DEMOCRAT                282
      REPUBLICAN              256
2020  DEMOCRAT                261
      REPUBLICAN              277
Name: electoral_votes, dtype: int64

In [55]:
# Persist the per-state outcome table (shares, 0/1 results and party labels) as a CSV file
outcome_df.to_csv(path_or_buf='Cleaned_Data/outcome_df.csv')

In [56]:
# Baseline results table: electoral votes actually won per year and party, with the
# (YEAR, actual_result_mapped) group index flattened into ordinary columns
actual_ev_totals = outcome_groupby['electoral_votes'].sum()
results = actual_ev_totals.reset_index()
results.head()

Unnamed: 0,YEAR,actual_result_mapped,electoral_votes
0,2000,DEMOCRAT,267
1,2000,REPUBLICAN,271
2,2004,DEMOCRAT,252
3,2004,REPUBLICAN,286
4,2008,DEMOCRAT,364


In [57]:
# Add one electoral-vote column per model next to the actual totals.
# The numeric suffix is the model's position in `gbs`: _1 linear, then _2, _3, _4 for the
# polynomial models of degree 2, 3 and 4.
gbs = [outcome_groupby_linear, outcome_groupby_poly2, outcome_groupby_poly3, outcome_groupby_poly4]
merge_keys = ['YEAR', 'actual_result_mapped']
# Restart from the actual-result columns only, so re-running this cell replaces the model
# columns instead of merging duplicates (suffixed _x/_y) onto the previous run's output
results = results[merge_keys + ['electoral_votes']]
for degree, model_groupby in enumerate(gbs, start=1):
    column_name = "electoral_votes_" + str(degree)
    model_votes = model_groupby['electoral_votes'].sum().reset_index()
    # Rename the model's party column to the shared key so totals line up by year and party
    model_votes.columns = merge_keys + [column_name]
    results = pd.merge(results, model_votes, on=merge_keys, how='left')
    # A party for which a model predicted no state in a given year has no group to merge;
    # that means zero electoral votes, not a missing value
    results[column_name] = results[column_name].fillna(0).astype('int64')
results

    YEAR actual_result_mapped  electoral_votes  electoral_votes_1  \
0   2000             DEMOCRAT              267                286   
1   2000           REPUBLICAN              271                252   
2   2004             DEMOCRAT              252                327   
3   2004           REPUBLICAN              286                211   
4   2008             DEMOCRAT              364                344   
5   2008           REPUBLICAN              174                194   
6   2012             DEMOCRAT              332                372   
7   2012           REPUBLICAN              206                166   
8   2016             DEMOCRAT              233                378   
9   2016           REPUBLICAN              305                160   
10  2020             DEMOCRAT              306                336   
11  2020           REPUBLICAN              232                202   

    electoral_votes_2  electoral_votes_3  electoral_votes_4  
0                 225                224

In [58]:
# Persist the actual-vs-model electoral vote totals as a CSV file
results.to_csv(path_or_buf='Cleaned_Data/model_ev_results.csv')

In [59]:
# Display the combined table of actual and model-predicted electoral vote totals
results

Unnamed: 0,YEAR,actual_result_mapped,electoral_votes,electoral_votes_1,electoral_votes_2,electoral_votes_3,electoral_votes_4
0,2000,DEMOCRAT,267,286,225,224,225
1,2000,REPUBLICAN,271,252,313,314,313
2,2004,DEMOCRAT,252,327,225,252,252
3,2004,REPUBLICAN,286,211,313,286,286
4,2008,DEMOCRAT,364,344,229,294,294
5,2008,REPUBLICAN,174,194,309,244,244
6,2012,DEMOCRAT,332,372,286,321,330
7,2012,REPUBLICAN,206,166,252,217,208
8,2016,DEMOCRAT,233,378,282,317,312
9,2016,REPUBLICAN,305,160,256,221,226
