In [61]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

# Imports
import os
from google.cloud import bigquery

In [16]:
# Environment variables
gcp_project_id = os.environ['GCP_PROJECT']
gcp_service_account_key = os.environ['GCP_SERVICE_ACCOUNT_KEY']
bq_source_dataset = os.environ['BQ_SOURCE_DATASET']
bq_cleaned_dataset = os.environ['BQ_CLEANED_DATASET']

In [17]:
# Import data from BQ and set data to X
# Initialize a BigQuery client using the service account JSON file
bq_client = bigquery.Client(project=gcp_project_id).from_service_account_json(gcp_service_account_key)

# Setting the table we want from the source dataset
select_table = 'cleaned_full_polls_combined_national_results_2004_2019'

# SQL query for querying Big Query and fetching entire table
query = f"""
    SELECT *
    FROM `{gcp_project_id}.{bq_cleaned_dataset}.{select_table}`
"""

In [18]:
# Use BQ client to create DF from the selected table
data = bq_client.query(query).to_dataframe()



In [19]:
# Replace any found NaN values with 0
data.replace(np.nan, 0, inplace=True)

In [20]:
# Add a field 'poll length' that shows number of days the poll was held for
data['poll_length'] = pd.to_datetime(data.enddate) - pd.to_datetime(data.startdate)
data['poll_length'] = data['poll_length'].dt.days

In [21]:
# Divide forecasts by 100 to create values between 0-1
for column in ['BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC', 'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC']:
    data[column] = data[column] / 100

In [22]:
# Divide actuals by 100 to create values between 0-1
for column in ['BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE']:
    data[column] = data[column] / 100

In [23]:
# Drop columns we don't need
data = data.drop(columns=['startdate', 'enddate', 'pollster'])

In [24]:
# Handle numerical transformer
num_columns_selector = ['samplesize', 'days_to_elec', 'poll_length']
num_transformer = MinMaxScaler()

In [25]:
# Handle categorical transformer
cat_columns_selector = ['rating']
cat_transformer = OrdinalEncoder(categories = [['F','D-','D','D+','C-','B','B+','A-']])

In [26]:
# Build the preprocessing pipeline
preproc_pipeline = make_column_transformer(
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [27]:
# Need to add step to make sure we train/test split before fitting preproc pipeline

In [28]:
# Fit transform preprocessing pipeline to data
data_processed = preproc_pipeline.fit_transform(data)

In [29]:
# Check feature names make sense
data_processed = pd.DataFrame(
    data_processed, columns=preproc_pipeline.get_feature_names_out()
)

In [30]:
# Define our X by dropping irrelevant and y columns

X = data_processed.drop(columns=['next_elec_date','NAT_ACT', 'BRX_ACT', 'CON_ACT',
       'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_PERCENTAGE'])

In [31]:
# Build our target matrix, retaining each party share vote columns only
y = data_processed[['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT',
                    'NAT_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE']]

In [32]:
# Handle data splitting
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [35]:
y_train

Unnamed: 0,LAB_ACT,CON_ACT,LIB_ACT,GRE_ACT,BRX_ACT,NAT_ACT,SNP_ACT,UKI_ACT,PLC_ACT,OTH_PERCENTAGE
237,0.399893,0.423427,0.073654,0.015909,0.0,0.014415,0.030356,0.018447,0.005107,0.033062
2248,0.304506,0.36811,0.078705,0.03621,0.0,0.005431,0.047382,0.126435,0.005919,0.032678
1192,0.289896,0.360539,0.230289,0.008934,0.0,1.901011,0.016552,0.030986,0.005571,0.038223
109,0.399893,0.423427,0.073654,0.015909,0.0,0.014415,0.030356,0.018447,0.005107,0.033062
2132,0.304506,0.36811,0.078705,0.03621,0.0,0.005431,0.047382,0.126435,0.005919,0.032678
...,...,...,...,...,...,...,...,...,...,...
1095,0.289896,0.360539,0.230289,0.008934,0.0,1.901011,0.016552,0.030986,0.005571,0.038223
1130,0.289896,0.360539,0.230289,0.008934,0.0,1.901011,0.016552,0.030986,0.005571,0.038223
1294,0.351872,0.323596,0.220256,0.009491,0.0,0.710007,0.015186,0.022322,0.00644,0.043738
860,0.289896,0.360539,0.230289,0.008934,0.0,1.901011,0.016552,0.030986,0.005571,0.038223


In [36]:
#TODO – refactor the below into a function > model instantiation, fitting, scoring, etc.
# Handle target extraction for test and train sets respectively
#TODO Kwarg > defaults to either cross val or prediction
y_train_LAB = y_train['LAB_ACT']
y_train_CON = y_train['CON_ACT']
y_train_LIB = y_train['LIB_ACT']
y_train_GRE = y_train['GRE_ACT']
y_train_BRX = y_train['BRX_ACT']
y_train_NAT = y_train['NAT_ACT']
y_train_SNP = y_train['SNP_ACT']
y_train_UKI = y_train['UKI_ACT']
y_train_PLC = y_train['PLC_ACT']
y_train_OTH = y_train['OTH_PERCENTAGE']

y_test_LAB = y_test['LAB_ACT']
y_test_CON = y_test['CON_ACT']
y_test_LIB = y_test['LIB_ACT']
y_test_GRE = y_test['GRE_ACT']
y_test_BRX = y_test['BRX_ACT']
y_test_NAT = y_test['NAT_ACT']
y_test_SNP = y_test['SNP_ACT']
y_test_UKI = y_test['UKI_ACT']
y_test_PLC = y_test['PLC_ACT']
y_test_OTH = y_test['OTH_PERCENTAGE']

In [40]:
# Instantiate XGBRegressor Model
xgb_regression_model = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)

In [49]:
# Instantiate the model for each party
model_LAB = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_CON = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_LIB = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_GRE = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_BRX = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_NAT = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_SNP = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_UKI = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_PLC = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_OTH = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)

In [50]:
# Handle model training
X_train_matrix = np.array(X_train)

model_LAB.fit(X_train_matrix, y_train_LAB)
model_CON.fit(X_train_matrix, y_train_CON)
model_LIB.fit(X_train_matrix, y_train_LIB)
model_GRE.fit(X_train_matrix, y_train_GRE)
model_BRX.fit(X_train_matrix, y_train_BRX)
model_NAT.fit(X_train_matrix, y_train_NAT)
model_SNP.fit(X_train_matrix, y_train_SNP)
model_UKI.fit(X_train_matrix, y_train_UKI)
model_PLC.fit(X_train_matrix, y_train_PLC)
model_OTH.fit(X_train_matrix, y_train_OTH)

In [51]:
assert model_LAB == model_CON

AssertionError: 

In [54]:
cv_score_LAB = cross_val_score(model_LAB, X_train_matrix, y_train_LAB).mean()
cv_score_CON = cross_val_score(model_CON, X_train_matrix, y_train_CON).mean()
cv_score_LIB = cross_val_score(model_LIB, X_train_matrix, y_train_LIB).mean()
cv_score_GRE = cross_val_score(model_GRE, X_train_matrix, y_train_GRE).mean()
cv_score_BRX = cross_val_score(model_BRX, X_train_matrix, y_train_BRX).mean()
cv_score_NAT = cross_val_score(model_NAT, X_train_matrix, y_train_NAT).mean()
cv_score_SNP = cross_val_score(model_SNP, X_train_matrix, y_train_SNP).mean()
cv_score_UKI = cross_val_score(model_UKI, X_train_matrix, y_train_UKI).mean()
cv_score_PLC = cross_val_score(model_PLC, X_train_matrix, y_train_PLC).mean()
cv_score_OTH = cross_val_score(model_OTH, X_train_matrix, y_train_OTH).mean()

In [55]:
# Evaluate model performance with cross validation
X_test_matrix = np.array(X_test)

y_pred_LAB = model_LAB.predict(X_test_matrix)
y_pred_CON = model_CON.predict(X_test_matrix)
y_pred_LIB = model_LIB.predict(X_test_matrix)
y_pred_GRE = model_GRE.predict(X_test_matrix)
y_pred_BRX = model_BRX.predict(X_test_matrix)
y_pred_NAT = model_NAT.predict(X_test_matrix)
y_pred_SNP = model_SNP.predict(X_test_matrix)
y_pred_UKI = model_UKI.predict(X_test_matrix)
y_pred_PLC = model_PLC.predict(X_test_matrix)
y_pred_OTH = model_OTH.predict(X_test_matrix)

# Going to try model.evaluate on test data instead of cross_val
# score_LAB = cross_val_score(model_LAB, X_test_matrix, y_test_LAB).mean()
# score_CON = cross_val_score(model_CON, X_test_matrix, y_test_CON).mean()
# score_LIB = cross_val_score(model_LIB, X_test_matrix, y_test_LIB).mean()
# score_GRE = cross_val_score(model_GRE, X_test_matrix, y_test_GRE).mean()
# score_BRX = cross_val_score(model_BRX, X_test_matrix, y_test_BRX).mean()
# score_NAT = cross_val_score(model_NAT, X_test_matrix, y_test_NAT).mean()
# score_SNP = cross_val_score(model_SNP, X_test_matrix, y_test_SNP).mean()
# score_UKI = cross_val_score(model_UKI, X_test_matrix, y_test_UKI).mean()
# score_PLC = cross_val_score(model_PLC, X_test_matrix, y_test_PLC).mean()
# score_OTH = cross_val_score(model_OTH, X_test_matrix, y_test_OTH).mean()

In [56]:
model_NAT

In [60]:
print(f"2019 prediction for LAB: {y_pred_LAB.mean()}")
print(f"2019 prediction for CON: {y_pred_CON.mean()}")
print(f"2019 prediction for LIB: {y_pred_LIB.mean()}")
print(f"2019 prediction for GRE: {y_pred_GRE.mean()}")
print(f"2019 prediction for BRX: {y_pred_BRX.mean()}")
print(f"2019 prediction for NAT: {y_pred_NAT.mean()}")
print(f"2019 prediction for SNP: {y_pred_SNP.mean()}")
print(f"2019 prediction for UKI: {y_pred_UKI.mean()}")
print(f"2019 prediction for PLC: {y_pred_PLC.mean()}")
print(f"2019 prediction for OTH: {y_pred_OTH.mean()}")

2019 prediction for LAB: 0.31363046169281006
2019 prediction for CON: 0.3812522292137146
2019 prediction for LIB: 0.1121877133846283
2019 prediction for GRE: 0.02770712971687317
2019 prediction for BRX: 0.0030804567504674196
2019 prediction for NAT: 0.3495117425918579
2019 prediction for SNP: 0.038696981966495514
2019 prediction for UKI: 0.07978854328393936
2019 prediction for PLC: 0.005629132501780987
2019 prediction for OTH: 0.03445062413811684


In [None]:
# Manually calculating MSE for each prediction
mse_pred_LAB = mean_squared_error(y_pred_LAB,y_test_LAB)
r2_LAB = r2_score(y_pred_LAB,y_test_LAB)

In [23]:
print(f"R^2 score for LAB: {score_LAB}")
print(f"R^2 score for CON: {score_CON}")
print(f"R^2 score for LIB: {score_LIB}")
print(f"R^2 score for GRE: {score_GRE}")
print(f"R^2 score for BRX: {score_BRX}")
print(f"R^2 score for NAT: {score_NAT}")
print(f"R^2 score for SNP: {score_SNP}")
print(f"R^2 score for UKI: {score_UKI}")
print(f"R^2 score for PLC: {score_PLC}")
print(f"R^2 score for OTH: {score_OTH}")

R^2 score for LAB: 0.9034786343574523
R^2 score for CON: 0.9454878568649292
R^2 score for LIB: 0.9790375351905822
R^2 score for GRE: 0.9669990301132202
R^2 score for BRX: 0.9406910538673401
R^2 score for NAT: 0.9635486602783203
R^2 score for SNP: 0.9732215166091919
R^2 score for UKI: 0.9694663763046265
R^2 score for PLC: 0.9111235022544861
R^2 score for OTH: 0.9595659613609314
