In [None]:
import pandas as pd
# Load the predictor table (X) and the response (y) straight from GitHub.
# header=None makes pandas treat the first line of y.csv as data, not as column names.
X = pd.read_csv('https://raw.githubusercontent.com/mawalz05/Paper_Machine_Learning_and_Forecasting_Elections/master/X.csv')
y = pd.read_csv("https://raw.githubusercontent.com/mawalz05/Paper_Machine_Learning_and_Forecasting_Elections/master/y.csv", header=None)

# Quick sanity check: show (rows, columns) for each frame, X first and then y
for frame in (X, y):
    print(frame.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) identical on every run.
splits = train_test_split(X, y, test_size=0.30, random_state=333)
X_train, X_test, y_train, y_test = splits

In [None]:
import time

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Single-step pipeline. The 'regr' slot is a placeholder that the search below
# replaces with each candidate algorithm, so all model families compete in one run.
pipe = Pipeline([('regr', LinearRegression())])


def _scaled(estimator):
    """Return a pipeline that standardises the features and then fits `estimator`.

    This replaces the `normalize=True` constructor argument of the linear
    models, which was deprecated and later removed from scikit-learn; passing
    it to a recent release raises a TypeError before the search even starts.

    NOTE: StandardScaler divides by the standard deviation, whereas the old
    `normalize=True` divided by the L2 norm, so the alpha values below are not
    numerically equivalent to the old setting. The grids span several orders
    of magnitude to compensate.
    """
    return Pipeline([('scale', StandardScaler()), ('model', estimator)])


# Candidate learning algorithms and their hyperparameter grids.
# Each dict is one model family; keys follow the '<step>__<param>' convention.
# Families wrapped by _scaled() address the inner estimator as 'regr__model__<param>'.
search_space = [
    # Ordinary least squares: nothing to tune
    {'regr': [LinearRegression()]},

    # L1-penalised regression
    {'regr': [_scaled(Lasso(max_iter=1000))],
     'regr__model__alpha': [1e-3, 1e-2, 1, 5, 10, 20]},

    # L2-penalised regression
    {'regr': [_scaled(Ridge())],
     'regr__model__alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]},

    # Mixed L1/L2 penalty; l1_ratio runs over 0.0, 0.1, ..., 1.0.
    # linspace pins the end point at exactly 1.0 (arange with a float step
    # depends on rounding to decide whether the last value is included).
    {'regr': [_scaled(ElasticNet(max_iter=10000))],
     'regr__model__alpha': [1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20],
     'regr__model__l1_ratio': np.linspace(0.0, 1.0, 11)},

    # Random forest. max_features=1.0 (use every feature) is what the removed
    # 'auto' option meant for regressors; 1.0 is accepted by old and new releases.
    {'regr': [RandomForestRegressor()],
     'regr__n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
     'regr__max_features': [1.0, 'sqrt'],
     'regr__max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
     'regr__min_samples_split': [2, 5, 10],
     'regr__min_samples_leaf': [1, 2, 4],
     'regr__bootstrap': [True, False]},

    # Gradient boosting. max_depth must be an integer: np.linspace yields
    # floats (1.0, 2.0, ...), which current scikit-learn rejects, so an integer
    # range is used instead. min_samples_split / min_samples_leaf stay as
    # floats on purpose: they are interpreted as fractions of the sample count.
    # NOTE(review): max_features stops at n_features - 1, so "use all features"
    # is never sampled; confirm whether that is intended.
    {'regr': [GradientBoostingRegressor()],
     'regr__learning_rate': [0.01, .05, .1, .5, 1],
     'regr__n_estimators': [1, 2, 4, 8, 16, 32, 64, 100, 200],
     'regr__max_depth': list(range(1, 33)),
     'regr__min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
     'regr__min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True),
     'regr__max_features': list(range(1, X_train.shape[1]))},

    # Support vector regression ('degree' only matters for the 'poly' kernel)
    {'regr': [SVR()],
     'regr__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
     'regr__C': [1, 5, 10, 50, 100],
     'regr__degree': [2, 3, 4, 5, 6, 7, 8],
     'regr__gamma': np.logspace(-3, 3, num=6, endpoint=False)},

    # Multi-layer perceptron with a single hidden layer of varying width
    {'regr': [MLPRegressor(max_iter=10000)],
     'regr__hidden_layer_sizes': [1, 3, 5, 10, 15, 30, 100],
     'regr__activation': ['logistic', 'tanh', 'relu'],
     'regr__solver': ['lbfgs', 'sgd', 'adam'],
     'regr__alpha': [float(x) for x in np.linspace(.0000001, .01, num=10)]},
]
                
# Randomized search over the candidate models: 1000 sampled configurations,
# each scored with 5-fold cross-validation, using all available CPU cores.
# random_state fixes which configurations are sampled so that re-running the
# notebook evaluates the same candidates (same seed as the train/test split).
# Estimators that are themselves stochastic can still vary between runs.
randomsearch = RandomizedSearchCV(pipe, search_space, n_iter = 1000, cv = 5, verbose =0, n_jobs = -1,
                                  random_state = 333)

# Record the wall-clock start time of the search
start_run = time.time()
local_time = time.ctime(start_run)
print('start time run: {}'.format(local_time))


# Run the randomized search on the training data.
# y_train is a one-column DataFrame; np.ravel flattens it to the 1-D target
# scikit-learn expects, which avoids a column-vector DataConversionWarning
# on every one of the cross-validation fits.
best_model = randomsearch.fit(X_train, np.ravel(y_train))

# Pull the winning estimator out of the 'regr' slot of the refitted pipeline
model = best_model.best_estimator_.get_params()["regr"]
print("The best Model and parameters:")
# View best model
print(model)

# Record the end time and report the elapsed time (in minutes, two decimals)
end_run1 = time.time()
local_time = time.ctime(end_run1)
print('end time: {} '.format(local_time))
duration_run = round((end_run1 - start_run)/60, 2)
print('Run 1 run time: {}'.format(duration_run))
 

In [None]:
# Fit the selected estimator (with its best hyperparameters) on the full training set.
# The one-column y_train is flattened to 1-D so that every estimator family
# receives the target shape it expects and returns 1-D predictions.
model = model.fit(X_train, np.ravel(y_train))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the selected model on the training set.
# The scorer is 'neg_root_mean_squared_error', so every value is the fold's
# RMSE multiplied by -1 (closer to zero is better).
scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
print(scores)
scores.mean()

In [None]:
from math import sqrt
from sklearn.metrics import mean_squared_error

# Predict on both the training rows and the held-out test rows
y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

# RMSE = square root of the mean squared error, computed per split
mse_test = mean_squared_error(y_test, y_pred)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_test = sqrt(mse_test)
rmse_train = sqrt(mse_train)

# Test RMSE is printed first, then training RMSE
print(rmse_test)
print(rmse_train)

In [None]:
# Convert y_test to a NumPy array (rebinding the name) and display its shape
y_test = np.array(y_test)
y_test.shape

In [None]:
# Pair every actual value with its prediction in an (n_test, 2) array:
# [[y, y_hat], [y, y_hat], ...]. The row count is derived from the data
# (reshape(-1, 1)) rather than hard-coded to 180, so the cell keeps working
# if the split ratio or the dataset changes.
y_test = np.asarray(y_test).reshape(-1, 1)
y_pred = np.asarray(y_pred).reshape(-1, 1)
results = np.concatenate((y_test, y_pred), axis = 1)

# A prediction counts as correct when it falls on the same side of 50 percent
# as the actual value. A value of exactly 50 on either side counts as a miss.
actual, predicted = results[:, 0], results[:, 1]
both_above = (actual > 50) & (predicted > 50)
both_below = (actual < 50) & (predicted < 50)
results2 = int(np.count_nonzero(both_above | both_below))

# Testing accuracy = share of test rows with a correct call
print("The testing accuracy is: " + str(results2/len(results)))