In [229]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV

import pandas as pd
import numpy as np

# Imports
import os
from google.cloud import bigquery

In [230]:
# Required configuration, read from the process environment.
# A missing variable raises KeyError.
(
    gcp_project_id,
    gcp_service_account_key,
    bq_source_dataset,
    bq_cleaned_dataset,
) = (
    os.environ[env_name]
    for env_name in (
        'GCP_PROJECT',
        'GCP_SERVICE_ACCOUNT_KEY',
        'BQ_SOURCE_DATASET',
        'BQ_CLEANED_DATASET',
    )
)

In [231]:
# Set up the BigQuery client and the query used to load the modelling data.
#
# `from_service_account_json` is a classmethod, so it is called on the class
# itself. Calling it on an already constructed `bigquery.Client(project=...)`
# first required default credentials to be resolvable and then threw that
# instance (and its `project` argument) away.
# presumably gcp_service_account_key holds the path to the key file — confirm
bq_client = bigquery.Client.from_service_account_json(
    gcp_service_account_key, project=gcp_project_id
)

# Table to read from the cleaned dataset
select_table = 'cleaned_full_polls_combined_national_results_2004_2019'

# SQL query fetching the entire table
query = f"""
    SELECT *
    FROM `{gcp_project_id}.{bq_cleaned_dataset}.{select_table}`
"""

In [232]:
# Run the query and load its result set into a pandas DataFrame
query_job = bq_client.query(query)
data = query_job.to_dataframe()

In [233]:
# Overwrite every NaN in the frame with 0 (mutates `data` in place).
# NOTE(review): this applies to all columns, including 'rating'; a 0 there is
# not one of the categories later given to the OrdinalEncoder — confirm that
# 'rating' never has missing values.
data.replace(to_replace=np.nan, value=0, inplace=True)

In [234]:
# Add 'poll_length': the number of whole days between a poll's start and end date
poll_end = pd.to_datetime(data['enddate'])
poll_start = pd.to_datetime(data['startdate'])
data['poll_length'] = (poll_end - poll_start).dt.days

In [235]:
# Rescale the forecast columns by dividing by 100 (percentages -> 0-1 fractions)
forecast_columns = ['BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC',
                    'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC']
data[forecast_columns] = data[forecast_columns] / 100

In [236]:
# Rescale the actual-result columns by dividing by 100 (percentages -> 0-1 fractions)
# NOTE(review): 'NAT_ACT' is not in this list although it is used as a target
# later in the notebook — confirm it is already on a 0-1 scale.
actual_columns = ['BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT',
                  'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE']
data[actual_columns] = data[actual_columns] / 100

In [237]:
# Remove the raw date and pollster columns, which are not needed from here on.
# Re-running this cell on the already reduced frame raises KeyError.
data = data.drop(['startdate', 'enddate', 'pollster'], axis='columns')

In [238]:
# Numerical features: min-max scaled
num_transformer = MinMaxScaler()
num_columns_selector = [
    'samplesize',
    'days_to_elec',
    'poll_length',
]

In [239]:
# Categorical features: 'rating' grades mapped to integers in the order listed
# (presumably worst to best — confirm).
# NOTE(review): grades such as 'C', 'C+', 'B-', 'A' and 'A+' are not listed —
# confirm they never occur in the data.
rating_grades = ['F', 'D-', 'D', 'D+', 'C-', 'B', 'B+', 'A-']
cat_columns_selector = ['rating']
cat_transformer = OrdinalEncoder(categories=[rating_grades])

In [None]:
#TODO – a neural network will converge quicker if everything is scaled. Also put the ordinal encoding through a MinMax scaler

In [240]:
# Assemble the preprocessing column transformer:
#   - numerical columns go through the scaler, 'rating' through the ordinal encoder
#   - every other column is passed through untouched (remainder='passthrough')
#   - output feature names are not prefixed with the transformer name
transformer_specs = [
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
]
preproc_pipeline = make_column_transformer(
    *transformer_specs,
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [241]:
# Fit the preprocessing pipeline on the full dataset and transform it in one step.
# NOTE(review): the transformers are fitted before the train/test split, so
# they also see the rows that later form the test set — confirm this is acceptable.
data_processed = preproc_pipeline.fit_transform(X=data)

In [242]:
# Wrap the transformed output in a DataFrame labelled with the pipeline's
# output feature names, so the columns can be checked and selected by name
processed_feature_names = preproc_pipeline.get_feature_names_out()
data_processed = pd.DataFrame(data_processed, columns=processed_feature_names)

In [243]:
# Feature matrix: everything except the election date and the actual-result (target) columns
non_feature_columns = [
    'next_elec_date',
    'NAT_ACT', 'BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT',
    'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
    'OTH_PERCENTAGE',
]
X = data_processed.drop(columns=non_feature_columns)

In [244]:
# Target matrix: one actual vote-share column per party
target_columns = [
    'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT',
    'NAT_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE',
]
y = data_processed[target_columns]

In [245]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
# NOTE(review): rows are split at random; if several polls share the same
# election outcome, train and test rows carry identical targets — confirm
# this is intended.
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [246]:
#TODO – refactor the below into a function > model instantiation, fitting, scoring, etc.
#TODO Kwarg > defaults to either cross val or prediction
# Pull one target Series per party out of the train and test target frames.
party_target_columns = (
    'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT',
    'NAT_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE',
)

# Training targets, unpacked in the same order as party_target_columns
(y_train_LAB, y_train_CON, y_train_LIB, y_train_GRE, y_train_BRX,
 y_train_NAT, y_train_SNP, y_train_UKI, y_train_PLC, y_train_OTH) = (
    y_train[column] for column in party_target_columns
)

# Test targets, unpacked in the same order as party_target_columns
(y_test_LAB, y_test_CON, y_test_LIB, y_test_GRE, y_test_BRX,
 y_test_NAT, y_test_SNP, y_test_UKI, y_test_PLC, y_test_OTH) = (
    y_test[column] for column in party_target_columns
)

In [247]:
# Stand-alone XGBRegressor instance.
# It is not referenced again in the visible part of the notebook.
baseline_xgb_settings = dict(
    learning_rate=0.3,
    n_estimators=300,
    max_depth=3,
    subsample=0.7,
    objective='reg:squarederror',
    nthread=-1,
    enable_categorical=True,
)
xgb_regression_model = XGBRegressor(**baseline_xgb_settings)

In [254]:
# Hyper-parameters shared by every per-party XGBRegressor
xgbr_model_params = {
    "learning_rate": 0.3,
    "n_estimators": 300,
    "max_depth": 3,
    "subsample": 0.7,
    "objective": 'reg:squarederror',
    "nthread": -1,
    # This key was missing although every constructor call looked it up, so
    # the cell raised KeyError. The value matches the stand-alone
    # XGBRegressor instantiated earlier in the notebook.
    "enable_categorical": True,
}

# Instantiate one independent model per party. Each call builds a separate
# estimator, so fitting one never affects another; unpacking the shared dict
# keeps all ten configurations identical.
model_LAB = XGBRegressor(**xgbr_model_params)
model_CON = XGBRegressor(**xgbr_model_params)
model_LIB = XGBRegressor(**xgbr_model_params)
model_GRE = XGBRegressor(**xgbr_model_params)
model_BRX = XGBRegressor(**xgbr_model_params)
model_NAT = XGBRegressor(**xgbr_model_params)
model_SNP = XGBRegressor(**xgbr_model_params)
model_UKI = XGBRegressor(**xgbr_model_params)
model_PLC = XGBRegressor(**xgbr_model_params)
model_OTH = XGBRegressor(**xgbr_model_params)

In [260]:
# Train every per-party model on the same training features, each against
# its own party's target column
X_train_matrix = np.array(X_train)

for party_model, party_target in (
    (model_LAB, y_train_LAB),
    (model_CON, y_train_CON),
    (model_LIB, y_train_LIB),
    (model_GRE, y_train_GRE),
    (model_BRX, y_train_BRX),
    (model_NAT, y_train_NAT),
    (model_SNP, y_train_SNP),
    (model_UKI, y_train_UKI),
    (model_PLC, y_train_PLC),
    (model_OTH, y_train_OTH),
):
    party_model.fit(X_train_matrix, party_target)

In [261]:
# Score the fitted BRX model on the held-out test rows; the value is shown as
# the cell output.
# NOTE(review): the recorded output for this cell is strongly negative, unlike
# the cross-validated BRX score printed further down — worth investigating.
X_test_matrix = np.array(X_test)
brx_holdout_score = model_BRX.score(X_test_matrix, y_test_BRX)
brx_holdout_score

-18.95487403869629

In [250]:
# Cross-validate each per-party estimator and keep the mean fold score.
# NOTE(review): cross-validation here runs on the *test* split, and
# cross_val_score refits the estimator on each fold, so these scores do not
# measure the models as fitted on the training split — confirm this is the
# intended evaluation (cross-validating on the training split is the usual choice).
X_test_matrix = np.array(X_test)

(score_LAB, score_CON, score_LIB, score_GRE, score_BRX,
 score_NAT, score_SNP, score_UKI, score_PLC, score_OTH) = (
    cross_val_score(party_estimator, X_test_matrix, party_test_target).mean()
    for party_estimator, party_test_target in (
        (model_LAB, y_test_LAB),
        (model_CON, y_test_CON),
        (model_LIB, y_test_LIB),
        (model_GRE, y_test_GRE),
        (model_BRX, y_test_BRX),
        (model_NAT, y_test_NAT),
        (model_SNP, y_test_SNP),
        (model_UKI, y_test_UKI),
        (model_PLC, y_test_PLC),
        (model_OTH, y_test_OTH),
    )
)

In [251]:
# Report the mean cross-validated score of every party, one line each
for party_code, party_score in (
    ('LAB', score_LAB),
    ('CON', score_CON),
    ('LIB', score_LIB),
    ('GRE', score_GRE),
    ('BRX', score_BRX),
    ('NAT', score_NAT),
    ('SNP', score_SNP),
    ('UKI', score_UKI),
    ('PLC', score_PLC),
    ('OTH', score_OTH),
):
    print(f"R^2 score for {party_code}: {party_score}")

R^2 score for LAB: 0.8850999355316163
R^2 score for CON: 0.9366500854492188
R^2 score for LIB: 0.9798464059829712
R^2 score for GRE: 0.9677788138389587
R^2 score for BRX: 0.912101149559021
R^2 score for NAT: 0.9652236819267273
R^2 score for SNP: 0.9711585521697998
R^2 score for UKI: 0.9673395037651062
R^2 score for PLC: 0.9096218705177307
R^2 score for OTH: 0.9512455344200135


In [None]:
#TODO Create a function for prediction