In [108]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Imports
import os
from google.cloud import bigquery
from  datetime import datetime

now = datetime.now()
current_time = now.strftime("%Y-%m-%d %H:%M:%S")
print("Current Time:", current_time)
from sklearn.model_selection import TimeSeriesSplit, cross_val_score


Current Time: 2024-06-09 20:17:26


In [109]:
# Imports data
data = pd.read_csv('../processed_data/2004_to_2019_combined_clean_polling_and_results.csv')

In [110]:
data['enddate'] = pd.to_datetime(data['enddate'])

In [111]:
data['next_elec_date'] = pd.to_datetime(data['next_elec_date'])

In [113]:
# Handle numerical transformer
num_columns_selector = ['samplesize', 'poll_length', 'days_to_elec', 'months_to_elec_weight']
num_transformer = MinMaxScaler()

In [114]:
# Handle categorical transformer
cat_columns_selector = ['rating']
cat_transformer = make_pipeline(OrdinalEncoder(categories = [['F','D-','D','D+','C-','B','B+','A-']]),MinMaxScaler())

In [115]:
# Build the preprocessing pipeline
preproc_pipeline = make_column_transformer(
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [116]:
data

Unnamed: 0.1,Unnamed: 0,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,months_to_election,months_to_elec_weight,poll_length,...,UKI_FC,CON_ACT,LAB_ACT,LIB_ACT,BRX_ACT,GRE_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE
0,0,2004-01-04,Populus,566,D+,2005-05-05,489,16,0.310800,2,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
1,1,2004-01-18,ICM,1007,D+,2005-05-05,475,16,0.310800,2,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
2,2,2004-02-08,Populus,580,D+,2005-05-05,454,15,0.325548,2,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
3,3,2004-02-22,ICM,1006,D+,2005-05-05,440,15,0.325548,2,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
4,4,2004-03-07,Populus,573,D+,2005-05-05,426,14,0.341247,2,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,3255,2019-12-11,IpsosMORI,2213,A-,2019-12-12,3,0,1.000000,2,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837
3256,3256,2019-12-11,Kantar,2815,B+,2019-12-12,3,0,1.000000,2,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837
3257,3257,2019-12-11,Opinium,3005,A-,2019-12-12,2,0,1.000000,1,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837
3258,3258,2019-12-11,Panelbase,3174,A-,2019-12-12,2,0,1.000000,1,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837


In [117]:
split_date = datetime.strptime('2019-09-13', '%Y-%m-%d')


In [118]:
# Split data up to 90 days before the 2019 election, and test the final 90 days running into it
data['next_elec_date'] = data['next_elec_date'].astype("datetime64[ns]")
data_train = data[data['enddate'] < split_date]
data_test = data[(data['enddate'] >= split_date)]

In [119]:
# Fit transform preprocessing pipeline to data_train
data_train_processed = preproc_pipeline.fit_transform(data_train)

# Transform preprocessing pipeline to data_test
data_test_processed = preproc_pipeline.transform(data_test)

In [120]:
# Check feature names make sense for data_train
data_train_processed = pd.DataFrame(
    data_train_processed, columns=preproc_pipeline.get_feature_names_out()
)

# Check feature names make sense for data_test
data_test_processed = pd.DataFrame(
    data_test_processed, columns=preproc_pipeline.get_feature_names_out()
)

In [121]:
# Define our X by dropping irrelevant and y columns

X_train = data_train_processed.drop(columns=['pollster', 'next_elec_date', 'BRX_ACT', 'CON_ACT',
       'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_PERCENTAGE', 'enddate', 'party_in_power'])

X_test = data_test_processed.drop(columns=['pollster', 'next_elec_date', 'BRX_ACT', 'CON_ACT',
       'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_PERCENTAGE', 'enddate', 'party_in_power'])

In [122]:
# Build our target matrix, retaining each party share vote columns only
y_train = data_train_processed[['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE']]

# Build our target matrix, retaining each party share vote columns only
y_test = data_test_processed[['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE']]

In [123]:
y_train_LAB = y_train['LAB_ACT']
y_train_CON = y_train['CON_ACT']
y_train_LIB = y_train['LIB_ACT']
y_train_GRE = y_train['GRE_ACT']
y_train_BRX = y_train['BRX_ACT']
y_train_SNP = y_train['SNP_ACT']
y_train_UKI = y_train['UKI_ACT']
y_train_PLC = y_train['PLC_ACT']
y_train_OTH = y_train['OTH_PERCENTAGE']

y_test_LAB = y_test['LAB_ACT']
y_test_CON = y_test['CON_ACT']
y_test_LIB = y_test['LIB_ACT']
y_test_GRE = y_test['GRE_ACT']
y_test_BRX = y_test['BRX_ACT']
y_test_SNP = y_test['SNP_ACT']
y_test_UKI = y_test['UKI_ACT']
y_test_PLC = y_test['PLC_ACT']
y_test_OTH = y_test['OTH_PERCENTAGE']

In [124]:
# Instantiate XGBRegressor Model
xgb_regression_model = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)

In [125]:
# Instantiate the model for each party
model_LAB = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_CON = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_LIB = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_GRE = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_BRX = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_NAT = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_SNP = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_UKI = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_PLC = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)
model_OTH = XGBRegressor(learning_rate=0.3, n_estimators=300,
                                     max_depth=3, subsample=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     enable_categorical=True)

In [126]:
# Handle model training
X_train_matrix = np.array(X_train)

model_LAB.fit(X_train_matrix, y_train_LAB)
model_CON.fit(X_train_matrix, y_train_CON)
model_LIB.fit(X_train_matrix, y_train_LIB)
model_GRE.fit(X_train_matrix, y_train_GRE)
model_BRX.fit(X_train_matrix, y_train_BRX)
model_SNP.fit(X_train_matrix, y_train_SNP)
model_UKI.fit(X_train_matrix, y_train_UKI)
model_PLC.fit(X_train_matrix, y_train_PLC)
model_OTH.fit(X_train_matrix, y_train_OTH)

In [127]:
cv_score_LAB = cross_val_score(model_LAB, X_train_matrix, y_train_LAB).mean()
cv_score_CON = cross_val_score(model_CON, X_train_matrix, y_train_CON).mean()
cv_score_LIB = cross_val_score(model_LIB, X_train_matrix, y_train_LIB).mean()
cv_score_GRE = cross_val_score(model_GRE, X_train_matrix, y_train_GRE).mean()
cv_score_BRX = cross_val_score(model_BRX, X_train_matrix, y_train_BRX).mean()
cv_score_SNP = cross_val_score(model_SNP, X_train_matrix, y_train_SNP).mean()
cv_score_UKI = cross_val_score(model_UKI, X_train_matrix, y_train_UKI).mean()
cv_score_PLC = cross_val_score(model_PLC, X_train_matrix, y_train_PLC).mean()
cv_score_OTH = cross_val_score(model_OTH, X_train_matrix, y_train_OTH).mean()

In [128]:
# Evaluate model performance with cross validation
X_test_matrix = np.array(X_test)

y_pred_LAB = model_LAB.predict(X_test_matrix)
y_pred_CON = model_CON.predict(X_test_matrix)
y_pred_LIB = model_LIB.predict(X_test_matrix)
y_pred_GRE = model_GRE.predict(X_test_matrix)
y_pred_BRX = model_BRX.predict(X_test_matrix)
y_pred_SNP = model_SNP.predict(X_test_matrix)
y_pred_UKI = model_UKI.predict(X_test_matrix)
y_pred_PLC = model_PLC.predict(X_test_matrix)
y_pred_OTH = model_OTH.predict(X_test_matrix)


In [129]:
print(f"2019 prediction for LAB: {y_pred_LAB.mean()}")
print(f"2019 prediction for CON: {y_pred_CON.mean()}")
print(f"2019 prediction for LIB: {y_pred_LIB.mean()}")
print(f"2019 prediction for GRE: {y_pred_GRE.mean()}")
print(f"2019 prediction for BRX: {y_pred_BRX.mean()}")
print(f"2019 prediction for SNP: {y_pred_SNP.mean()}")
print(f"2019 prediction for UKI: {y_pred_UKI.mean()}")
print(f"2019 prediction for PLC: {y_pred_PLC.mean()}")
print(f"2019 prediction for OTH: {y_pred_OTH.mean()}")

2019 prediction for LAB: 0.3279475271701813
2019 prediction for CON: 0.4346550405025482
2019 prediction for LIB: 0.11195586621761322
2019 prediction for GRE: 0.025448476895689964
2019 prediction for BRX: 0.018202191218733788
2019 prediction for SNP: 0.03825394809246063
2019 prediction for UKI: 0.0019260755507275462
2019 prediction for PLC: 0.004840616602450609
2019 prediction for OTH: 0.03659659996628761
