In [22]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Imports
import os
from google.cloud import bigquery
from  datetime import datetime

# Stamp the run with the local wall-clock time so the printed outputs below can be dated.
now = datetime.now()
current_time = f"{now:%Y-%m-%d %H:%M:%S}"
print("Current Time:", current_time)
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

from sklearn.preprocessing import OneHotEncoder

Current Time: 2024-06-10 12:53:08


In [23]:
# Load the combined polling-and-results table from the processed-data folder (path is relative
# to the notebook's working directory).
polls_csv_path = '../processed_data/1988_to_2024_combined_clean_polling_and_results.csv'
data = pd.read_csv(polls_csv_path)

In [24]:
# Convert the date columns read from the CSV into pandas datetimes; next_elec_date in particular
# is compared against a timestamp when the data is split into train and test sets.
for date_column in ('enddate', 'next_elec_date', 'startdate'):
    data[date_column] = pd.to_datetime(data[date_column])


In [25]:
# Numeric columns that the preprocessing pipeline min-max scales.
# Not all of them are selected as model inputs later; the others are scaled but left unused.
num_columns_selector = [
    'samplesize',
    'days_to_elec',
    'poll_length',
    'months_to_elec',
    'months_to_elec_weight',
]
num_transformer = MinMaxScaler()

In [26]:
# The pollster rating is a letter grade, so it is encoded as its position on an explicit grade
# scale (presumably ordered worst to best) and that position is then min-max scaled.
# NOTE(review): the scale stops at 'A-'; with an explicit category list the encoder rejects any
# value not listed (e.g. 'A' or 'A+') when fitted — confirm no such ratings occur in the data.
cat_columns_selector = ['rating']
rating_scale = ['F', 'F+', 'E-', 'E', 'E+', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-']
cat_transformer = make_pipeline(
    OrdinalEncoder(categories=[rating_scale]),
    MinMaxScaler(),
)

In [27]:
# One-hot encoder for party_in_power; it is attached to that column when the column
# transformer is built.
# NOTE(review): default settings are used, so a party value not seen during fit would raise
# at transform time — confirm that is acceptable for the held-out data.
ohe = OneHotEncoder()

In [28]:
# Assemble the column-wise preprocessing: each (transformer, columns) pair handles its own
# columns, and every column not listed is passed through untouched (this includes the date,
# pollster and election-result columns). Output feature names are kept unprefixed.
column_steps = [
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    (ohe, ['party_in_power']),
]
preproc_pipeline = make_column_transformer(
    *column_steps,
    verbose_feature_names_out=False,
    remainder='passthrough',
)

In [29]:
# Hold out one election: polls whose next election falls before HOLDOUT_ELECTION_DATE form the
# training set, and polls for the election on that date form the test set.
# Rows with a later or missing next_elec_date end up in neither set.
HOLDOUT_ELECTION_DATE = pd.Timestamp('2024-07-04')

# Make sure the column is a nanosecond datetime so both comparisons are datetime-vs-datetime.
data['next_elec_date'] = data['next_elec_date'].astype("datetime64[ns]")
data_train = data[data['next_elec_date'] < HOLDOUT_ELECTION_DATE]
data_test = data[data['next_elec_date'] == HOLDOUT_ELECTION_DATE]

In [31]:
# Fit the preprocessing pipeline on the training rows only and transform them in the same call,
# so the scaling ranges and encoder categories are learned without seeing the held-out rows.
data_train_processed = preproc_pipeline.fit_transform(data_train)

# Apply the already-fitted pipeline to the held-out rows (transform only, no re-fitting).
data_test_processed = preproc_pipeline.transform(data_test)

In [32]:
# Wrap the transformed train and test outputs back into DataFrames labelled with the
# transformer's output feature names, so later cells can select columns by name.
processed_feature_names = preproc_pipeline.get_feature_names_out()

data_train_processed = pd.DataFrame(data_train_processed, columns=processed_feature_names)
data_test_processed = pd.DataFrame(data_test_processed, columns=processed_feature_names)

In [33]:
# Display the processed column names: the transformed features come first, followed by the
# columns that were passed through unchanged.
data_train_processed.columns

Index(['samplesize', 'days_to_elec', 'poll_length', 'months_to_elec',
       'months_to_elec_weight', 'rating', 'party_in_power_Conservative',
       'party_in_power_Labour', 'Unnamed: 0', 'startdate', 'enddate',
       'pollster', 'next_elec_date', 'CON_FC', 'LAB_FC', 'LIB_FC', 'BRX_FC',
       'GRE_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC', 'CON_ACT', 'LAB_ACT',
       'LIB_ACT', 'BRX_ACT', 'GRE_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_PERCENTAGE'],
      dtype='object')

In [34]:
# Model inputs: a subset of the scaled poll metadata, the scaled rating, the Conservative
# party-in-power indicator, and the per-party *_FC columns. Date/identifier columns and the
# election-result columns used as targets are left out.
# The list is defined once so the train and test feature sets cannot drift apart.
feature_columns = [
    'samplesize', 'poll_length', 'months_to_elec_weight', 'rating',
    'party_in_power_Conservative',
    'CON_FC', 'LAB_FC', 'LIB_FC', 'BRX_FC', 'GRE_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC',
]

# The processed frames were rebuilt from a single array that also carries passed-through text and
# date columns, so their columns are likely object dtype. Cast the features to float so the
# models receive a genuinely numeric matrix instead of relying on implicit conversion.
X_train = data_train_processed[feature_columns].astype(float)
X_test = data_test_processed[feature_columns].astype(float)

In [35]:
# Target matrices: one column per party, taken from the *_ACT columns plus OTH_PERCENTAGE.
# The same column list is used for the training and the held-out frame.
target_columns = ['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT',
                  'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE']

y_train = data_train_processed[target_columns]
y_test = data_test_processed[target_columns]

In [36]:
# Split the target frames into one Series per party so that each party can be fitted and
# evaluated with its own model. The unpacking order matches the column order below.
party_target_columns = ('LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT',
                        'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE')

(y_train_LAB, y_train_CON, y_train_LIB, y_train_GRE, y_train_BRX,
 y_train_SNP, y_train_UKI, y_train_PLC, y_train_OTH) = (
    y_train[column] for column in party_target_columns
)

(y_test_LAB, y_test_CON, y_test_LIB, y_test_GRE, y_test_BRX,
 y_test_SNP, y_test_UKI, y_test_PLC, y_test_OTH) = (
    y_test[column] for column in party_target_columns
)

In [37]:
# Stand-alone XGBoost regressor with the notebook's hyperparameters.
# It is not referenced by the later cells visible here, which build one model per party instead.
baseline_xgb_params = {
    'learning_rate': 0.3,
    'n_estimators': 300,
    'max_depth': 3,
    'subsample': 0.7,
    'objective': 'reg:squarederror',
    'nthread': -1,
    'enable_categorical': True,
}
xgb_regression_model = XGBRegressor(**baseline_xgb_params)

In [38]:
# One independent XGBoost regressor per party, all created with the same hyperparameters.
# The settings live in a single dict so the nine models cannot drift apart; each call still
# creates its own separate estimator instance.
party_model_params = dict(
    learning_rate=0.3,
    n_estimators=300,
    max_depth=3,
    subsample=0.7,
    objective='reg:squarederror',
    nthread=-1,
    enable_categorical=True,
)

model_LAB = XGBRegressor(**party_model_params)
model_CON = XGBRegressor(**party_model_params)
model_LIB = XGBRegressor(**party_model_params)
model_GRE = XGBRegressor(**party_model_params)
model_BRX = XGBRegressor(**party_model_params)
model_SNP = XGBRegressor(**party_model_params)
model_UKI = XGBRegressor(**party_model_params)
model_PLC = XGBRegressor(**party_model_params)
model_OTH = XGBRegressor(**party_model_params)

In [39]:
# Train every party model on the same feature matrix, each against its own party's target.
X_train_matrix = np.array(X_train)

for party_model, party_target in (
    (model_LAB, y_train_LAB),
    (model_CON, y_train_CON),
    (model_LIB, y_train_LIB),
    (model_GRE, y_train_GRE),
    (model_BRX, y_train_BRX),
    (model_SNP, y_train_SNP),
    (model_UKI, y_train_UKI),
    (model_PLC, y_train_PLC),
    (model_OTH, y_train_OTH),
):
    party_model.fit(X_train_matrix, party_target)

In [40]:
# Mean cross-validation score per party on the training data. cross_val_score is called with its
# defaults (no cv or scoring argument), and the per-fold scores are averaged.
# NOTE(review): the rows are polls and the targets come from election-result columns, so polls
# for one election may fall into both the training and validation folds — consider a grouped or
# time-ordered split (TimeSeriesSplit is imported in this notebook but not used).
(cv_score_LAB, cv_score_CON, cv_score_LIB, cv_score_GRE, cv_score_BRX,
 cv_score_SNP, cv_score_UKI, cv_score_PLC, cv_score_OTH) = (
    cross_val_score(party_model, X_train_matrix, party_target).mean()
    for party_model, party_target in (
        (model_LAB, y_train_LAB),
        (model_CON, y_train_CON),
        (model_LIB, y_train_LIB),
        (model_GRE, y_train_GRE),
        (model_BRX, y_train_BRX),
        (model_SNP, y_train_SNP),
        (model_UKI, y_train_UKI),
        (model_PLC, y_train_PLC),
        (model_OTH, y_train_OTH),
    )
)

In [41]:
# Predict every party's target for each held-out row using the fitted per-party models.
# Each y_pred_* array holds one prediction per row of the test feature matrix.
X_test_matrix = np.array(X_test)

(y_pred_LAB, y_pred_CON, y_pred_LIB, y_pred_GRE, y_pred_BRX,
 y_pred_SNP, y_pred_UKI, y_pred_PLC, y_pred_OTH) = (
    party_model.predict(X_test_matrix)
    for party_model in (
        model_LAB, model_CON, model_LIB, model_GRE, model_BRX,
        model_SNP, model_UKI, model_PLC, model_OTH,
    )
)


In [42]:
# Print each party's prediction averaged over all held-out rows.
# The held-out rows are the polls for the 2024-07-04 election (that is the date the test set is
# filtered on), so the label reads 2024; it previously read 2019, which did not match the split.
forecast_label = "2024"

for party, party_pred in (
    ("LAB", y_pred_LAB),
    ("CON", y_pred_CON),
    ("LIB", y_pred_LIB),
    ("GRE", y_pred_GRE),
    ("BRX", y_pred_BRX),
    ("SNP", y_pred_SNP),
    ("UKI", y_pred_UKI),
    ("PLC", y_pred_PLC),
    ("OTH", y_pred_OTH),
):
    print(f"{forecast_label} prediction for {party}: {party_pred.mean()}")

2019 prediction for LAB: 0.33735108375549316
2019 prediction for CON: 0.42973893880844116
2019 prediction for LIB: 0.12248048186302185
2019 prediction for GRE: 0.02654256857931614
2019 prediction for BRX: 0.02154829539358616
2019 prediction for SNP: 0.03979972377419472
2019 prediction for UKI: 0.001750043360516429
2019 prediction for PLC: 0.005207299254834652
2019 prediction for OTH: 0.037113647907972336


In [43]:
# Print each party's prediction for the last held-out row only.
# The held-out rows are the polls for the 2024-07-04 election (that is the date the test set is
# filtered on), so the label reads 2024; it previously read 2019, which did not match the split.
# NOTE(review): [-1] is simply the final row of the test matrix; it is the most recent poll only
# if the source data is ordered by date — confirm. The raw regression output is not clipped, so
# a small negative value can appear for a minor party.
forecast_label = "2024"

for party, party_pred in (
    ("LAB", y_pred_LAB),
    ("CON", y_pred_CON),
    ("LIB", y_pred_LIB),
    ("GRE", y_pred_GRE),
    ("BRX", y_pred_BRX),
    ("SNP", y_pred_SNP),
    ("UKI", y_pred_UKI),
    ("PLC", y_pred_PLC),
    ("OTH", y_pred_OTH),
):
    print(f"{forecast_label} prediction for {party}: {party_pred[-1]}")

2019 prediction for LAB: 0.3435162305831909
2019 prediction for CON: 0.4115138351917267
2019 prediction for LIB: 0.12784186005592346
2019 prediction for GRE: 0.02434469759464264
2019 prediction for BRX: 0.019418325275182724
2019 prediction for SNP: 0.03902982920408249
2019 prediction for UKI: -0.0077798678539693356
2019 prediction for PLC: 0.004930238705128431
2019 prediction for OTH: 0.043957922607660294
