In [155]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

import pandas as pd
import numpy as np

# Imports
import os
from google.cloud import bigquery

In [156]:
# Environment variables
gcp_project_id = os.environ['GCP_PROJECT']
gcp_service_account_key = os.environ['GCP_SERVICE_ACCOUNT_KEY']
bq_source_dataset = os.environ['BQ_SOURCE_DATASET']
bq_cleaned_dataset = os.environ['BQ_CLEANED_DATASET']

In [157]:
# Import data from BQ and set data to X
# Initialize a BigQuery client using the service account JSON file
bq_client = bigquery.Client(project=gcp_project_id).from_service_account_json(gcp_service_account_key)

# Setting the table we want from the source dataset
select_table = 'cleaned_full_polls_combined_national_results_2004_2019'

# SQL query for querying Big Query and fetching entire table
query = f"""
    SELECT *
    FROM `{gcp_project_id}.{bq_cleaned_dataset}.{select_table}`
"""

In [158]:
# Use BQ client to create DF from the selected table
data = bq_client.query(query).to_dataframe()

In [159]:
# Replace any found NaN values with 0
data.replace(np.nan, 0, inplace=True)

In [160]:
# Add a field 'poll length' that shows number of days the poll was held for
data['poll_length'] = pd.to_datetime(data.enddate) - pd.to_datetime(data.startdate)
data['poll_length'] = data['poll_length'].dt.days

In [161]:
# Divide forecasts by 100 to create values between 0-1
for column in ['BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC', 'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC']:
    data[column] = data[column] / 100

In [162]:
# Divide actuals by 100 to create values between 0-1
for column in ['BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE']:
    data[column] = data[column] / 100

In [163]:
# Drop columns we don't need
data = data.drop(columns=['startdate', 'enddate', 'pollster'])

In [164]:
# Handle numerical transformer
num_columns_selector = ['samplesize', 'days_to_elec', 'poll_length']
num_transformer = MinMaxScaler()

In [165]:
# Handle categorical transformer
cat_columns_selector = ['rating']
cat_transformer = OrdinalEncoder(categories = [['F','D-','D','D+','C-','B','B+','A-']])

In [166]:
#TODO – a neural network will converge qicker if everything is scaled. Also put ordinal encoding through minmax scaler

In [167]:
# Build the preprocessing pipeline
preproc_pipeline = make_column_transformer(
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [168]:
# Build our feature matrix, by dropping irrelevant features and target columns
X = data.drop(columns=['next_elec_date','NAT_ACT', 'BRX_ACT', 'CON_ACT',
       'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_PERCENTAGE'])

In [169]:
# Build our target matrix, retaining each party share vote columns only
y = data[['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT',
                    'NAT_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE']]

In [170]:
# Handle data splitting
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [171]:
# Fit transform preprocessing pipeline to X_train
X_train = preproc_pipeline.fit_transform(X_train)

In [172]:
# Transform X_test
X_test = preproc_pipeline.transform(X_test)

In [173]:
#TODO – refactor the below into a function > model instantiation, fitting, scoring, etc.
# Handle target extraction for test and train sets respectively
#TODO Kwarg > defaults to either cross val or prediction
y_train_LAB = y_train['LAB_ACT']
y_train_CON = y_train['CON_ACT']
y_train_LIB = y_train['LIB_ACT']
y_train_GRE = y_train['GRE_ACT']
y_train_BRX = y_train['BRX_ACT']
y_train_NAT = y_train['NAT_ACT']
y_train_SNP = y_train['SNP_ACT']
y_train_UKI = y_train['UKI_ACT']
y_train_PLC = y_train['PLC_ACT']
y_train_OTH = y_train['OTH_PERCENTAGE']

y_test_LAB = y_test['LAB_ACT']
y_test_CON = y_test['CON_ACT']
y_test_LIB = y_test['LIB_ACT']
y_test_GRE = y_test['GRE_ACT']
y_test_BRX = y_test['BRX_ACT']
y_test_NAT = y_test['NAT_ACT']
y_test_SNP = y_test['SNP_ACT']
y_test_UKI = y_test['UKI_ACT']
y_test_PLC = y_test['PLC_ACT']
y_test_OTH = y_test['OTH_PERCENTAGE']

In [174]:
# Instantiate XGBRegressor Model
xgb_regression_model = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)

In [175]:
# Set XGBRegressor default parameters
xgbr_model_params = {
    "learning_rate": 0.3,
    "n_estimators": 300,
    "max_depth": 3,
    "subsample": 0.7,
    "objective": 'reg:squarederror',
    "nthread": -1,
    "enable_categorical": True
}

# Instantiate the XGBRegressor model for each party
model_LAB = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_CON = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_LIB = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_GRE = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_BRX = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_NAT = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_SNP = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_UKI = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_PLC = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )
model_OTH = XGBRegressor(learning_rate=xgbr_model_params["learning_rate"],
                         n_estimators=xgbr_model_params["n_estimators"],
                                     max_depth=xgbr_model_params["max_depth"],
                                     subsample=xgbr_model_params["subsample"],
                                     objective=xgbr_model_params["objective"],
                                     nthread=xgbr_model_params["nthread"],
                                     enable_categorical=
                                     xgbr_model_params["enable_categorical"]
                                     )

In [176]:
# Handle model training
X_train_matrix = np.array(X_train)

model_LAB.fit(X_train_matrix, y_train_LAB)
model_CON.fit(X_train_matrix, y_train_CON)
model_LIB.fit(X_train_matrix, y_train_LIB)
model_GRE.fit(X_train_matrix, y_train_GRE)
model_BRX.fit(X_train_matrix, y_train_BRX)
model_NAT.fit(X_train_matrix, y_train_NAT)
model_SNP.fit(X_train_matrix, y_train_SNP)
model_UKI.fit(X_train_matrix, y_train_UKI)
model_PLC.fit(X_train_matrix, y_train_PLC)
model_OTH.fit(X_train_matrix, y_train_OTH)

In [177]:
X_test_matrix = np.array(X_test)

In [178]:
# Evaluate model performance with cross validation
X_test_matrix = np.array(X_test)

score_LAB = cross_val_score(model_LAB, X_test_matrix, y_test_LAB, scoring="neg_root_mean_squared_error").mean()
score_CON = cross_val_score(model_CON, X_test_matrix, y_test_CON, scoring="neg_root_mean_squared_error").mean()
score_LIB = cross_val_score(model_LIB, X_test_matrix, y_test_LIB, scoring="neg_root_mean_squared_error").mean()
score_GRE = cross_val_score(model_GRE, X_test_matrix, y_test_GRE, scoring="neg_root_mean_squared_error").mean()
score_BRX = cross_val_score(model_BRX, X_test_matrix, y_test_BRX, scoring="neg_root_mean_squared_error").mean()
score_NAT = cross_val_score(model_NAT, X_test_matrix, y_test_NAT, scoring="neg_root_mean_squared_error").mean()
score_SNP = cross_val_score(model_SNP, X_test_matrix, y_test_SNP, scoring="neg_root_mean_squared_error").mean()
score_UKI = cross_val_score(model_UKI, X_test_matrix, y_test_UKI, scoring="neg_root_mean_squared_error").mean()
score_PLC = cross_val_score(model_PLC, X_test_matrix, y_test_PLC, scoring="neg_root_mean_squared_error").mean()
score_OTH = cross_val_score(model_OTH, X_test_matrix, y_test_OTH, scoring="neg_root_mean_squared_error").mean()

InvalidParameterError: The 'scoring' parameter of cross_val_score must be a str among {'average_precision', 'neg_mean_poisson_deviance', 'f1', 'f1_weighted', 'neg_brier_score', 'jaccard', 'roc_auc_ovo', 'precision_micro', 'neg_mean_squared_log_error', 'matthews_corrcoef', 'neg_negative_likelihood_ratio', 'v_measure_score', 'max_error', 'positive_likelihood_ratio', 'neg_mean_gamma_deviance', 'jaccard_samples', 'roc_auc_ovr', 'precision_weighted', 'neg_root_mean_squared_error', 'completeness_score', 'recall', 'f1_macro', 'f1_micro', 'neg_log_loss', 'f1_samples', 'precision_samples', 'top_k_accuracy', 'homogeneity_score', 'normalized_mutual_info_score', 'explained_variance', 'recall_samples', 'roc_auc_ovo_weighted', 'recall_micro', 'd2_absolute_error_score', 'neg_mean_absolute_error', 'recall_weighted', 'neg_median_absolute_error', 'neg_mean_absolute_percentage_error', 'roc_auc_ovr_weighted', 'accuracy', 'jaccard_weighted', 'precision', 'r2', 'mutual_info_score', 'jaccard_micro', 'neg_root_mean_squared_log_error', 'precision_macro', 'adjusted_mutual_info_score', 'balanced_accuracy', 'neg_mean_squared_error', 'rand_score', 'adjusted_rand_score', 'recall_macro', 'jaccard_macro', 'roc_auc', 'fowlkes_mallows_score'}, a callable or None. Got ['neg_root_mean_squared_error', 'max_error'] instead.

In [None]:
print(f"RMSE score for LAB: {score_LAB}")
print(f"RMSE score for CON: {score_CON}")
print(f"RMSE score for LIB: {score_LIB}")
print(f"RMSE score for GRE: {score_GRE}")
print(f"RMSE score for BRX: {score_BRX}")
print(f"RMSE score for NAT: {score_NAT}")
print(f"RMSE score for SNP: {score_SNP}")
print(f"RMSE score for UKI: {score_UKI}")
print(f"RMSE score for PLC: {score_PLC}")
print(f"RMSE score for OTH: {score_OTH}")

RMSE score for LAB: -0.009647370552783304
RMSE score for CON: -0.007041803470523608
RMSE score for LIB: -0.007837127889967896
RMSE score for GRE: -0.0019031463040710417
RMSE score for BRX: -0.0021086246896250957
RMSE score for NAT: -0.13008735841306482
RMSE score for SNP: -0.0019742143953015
RMSE score for UKI: -0.009142521531030482
RMSE score for PLC: -0.00012845882566453882
RMSE score for OTH: -0.0005440674135258442


In [None]:
#TODO Create a function for prediction