In [111]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.cloud import bigquery
from datetime import datetime, timedelta

In [112]:
# Print current time
now = datetime.now()
current_time = now.strftime("%Y-%m-%d %H:%M:%S")

In [113]:
# Imports data
data = pd.read_csv('../processed_data/1988_to_2024_combined_clean_polling_and_results.csv')

In [114]:
# Convert date columns to datetime
data['enddate'] = pd.to_datetime(data['enddate'])
data['next_elec_date'] = pd.to_datetime(data['next_elec_date'])
data['startdate'] = pd.to_datetime(data['startdate'])

In [115]:
# Handle numerical transformer
num_columns_selector = ['samplesize', 'months_to_elec_weight']
num_transformer = MinMaxScaler()

In [116]:
# Handle categorical transformer
cat_columns_selector = ['rating']
cat_transformer = make_pipeline(OrdinalEncoder(categories = [['F','F+','E-','E','E+','D-','D','D+','C-','C','C+','B-','B','B+','A-']]),MinMaxScaler())

In [117]:
# One-hot encoder for party_in_power
ohe = OneHotEncoder()

In [118]:
# Build the preprocessing pipeline
preproc_pipeline = make_column_transformer(
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    (ohe,['party_in_power']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [119]:
data['next_elec_date'].value_counts()

next_elec_date
2015-05-07    1930
2024-07-04    1397
2010-05-06     584
1992-04-09     453
2019-12-12     440
1997-05-01     338
2017-06-08     254
2001-06-07     213
2005-05-05      98
Name: count, dtype: int64

In [120]:
# Define election dates
election_date= datetime.strptime('2005-05-05', '%Y-%m-%d')
cutoff_date = election_date - timedelta(days=33)
prediction_date = election_date - timedelta(days=3)

In [121]:
# Split data
data_train = data[data['startdate'] > '2003-12-31']
data_train = data_train[data_train['startdate'] < cutoff_date]
data_test_1 = data[(data['startdate'] >= cutoff_date) & (data['startdate'] < prediction_date)]
data_test = data_test_1[data_test_1['next_elec_date'] == election_date]

In [122]:
data_test

Unnamed: 0.1,Unnamed: 0,startdate,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,months_to_elec,months_to_elec_weight,...,UKI_FC,CON_ACT,LAB_ACT,LIB_ACT,BRX_ACT,GRE_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_ACT
1083,1083,2005-04-10,2005-04-12,ICM,1169,D+,2005-05-05,25,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1084,1084,2005-04-14,2005-04-17,Populus,586,D+,2005-05-05,21,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1085,1085,2005-04-17,2005-04-19,ICM,1163,D+,2005-05-05,18,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1086,1086,2005-04-16,2005-04-19,Populus,863,D+,2005-05-05,19,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1087,1087,2005-04-17,2005-04-20,Populus,836,D+,2005-05-05,18,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1088,1088,2005-04-18,2005-04-21,Populus,806,D+,2005-05-05,17,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1089,1089,2005-04-19,2005-04-22,Populus,798,D+,2005-05-05,16,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1090,1090,2005-04-20,2005-04-23,Populus,798,D+,2005-05-05,15,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1091,1091,2005-04-21,2005-04-24,Populus,819,D+,2005-05-05,14,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838
1092,1092,2005-04-22,2005-04-25,Populus,831,D+,2005-05-05,13,1,0.831387,...,,0.323596,0.351872,0.220256,0.0,0.009491,0.00644,0.015186,0.022322,0.050838


In [123]:
# Fit transform preprocessing pipeline to data_train
data_train_processed = preproc_pipeline.fit_transform(data_train)


In [124]:
# Transform preprocessing pipeline to data_test
data_test_processed = preproc_pipeline.transform(data_test)

In [125]:
# Check feature names make sense for data_train
data_train_processed = pd.DataFrame(
    data_train_processed, columns=preproc_pipeline.get_feature_names_out()
)

# Check feature names make sense for data_test
data_test_processed = pd.DataFrame(
    data_test_processed, columns=preproc_pipeline.get_feature_names_out()
)

In [126]:
# Define our X by dropping irrelevant and y columns
X_train = data_train_processed.drop(columns=['startdate', 'enddate', 'pollster', 'Unnamed: 0', 'next_elec_date', 'days_to_elec', 'months_to_elec', 'party_in_power_Labour', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT'])
X_test = data_test_processed.drop(columns=['startdate', 'enddate', 'pollster', 'Unnamed: 0', 'next_elec_date', 'days_to_elec', 'months_to_elec', 'party_in_power_Labour', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT'])

In [127]:
# Build our target matrix
y_train = data_train_processed[['next_elec_date', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT']]
y_test = data_test_processed[['next_elec_date', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT']]

In [128]:
# Drops y_train actuals where the actual is the actual result we are trying to predict, replaces with NaNs
y_train.loc[y_train['next_elec_date'] == '2005-05-05',
         ['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT']] = np.nan

In [130]:
y_train

Unnamed: 0,next_elec_date,LAB_ACT,CON_ACT,LIB_ACT,GRE_ACT,BRX_ACT,SNP_ACT,UKI_ACT,PLC_ACT,OTH_ACT
0,2005-05-05,,,,,,,,,
1,2005-05-05,,,,,,,,,
2,2005-05-05,,,,,,,,,
3,2005-05-05,,,,,,,,,
4,2005-05-05,,,,,,,,,
5,2005-05-05,,,,,,,,,
6,2005-05-05,,,,,,,,,
7,2005-05-05,,,,,,,,,
8,2005-05-05,,,,,,,,,
9,2005-05-05,,,,,,,,,


In [131]:
X_test.tail(10)

Unnamed: 0,samplesize,months_to_elec_weight,rating,poll_length,CON_FC,LAB_FC,LIB_FC,BRX_FC,GRE_FC,OTH_FC,PLC_FC,SNP_FC,UKI_FC
7,0.163846,1.0,0.0,3,0.32,0.41,0.2,,,0.07,,,
8,0.178064,1.0,0.0,3,0.33,0.41,0.19,,,0.07,,,
9,0.186188,1.0,0.0,3,0.31,0.4,0.21,,,0.08,,,
10,0.442112,1.0,0.0,2,0.32,0.4,0.21,,,0.05,,,
11,0.188896,1.0,0.0,3,0.31,0.4,0.21,,,0.08,,,
12,0.192959,1.0,0.0,3,0.31,0.4,0.21,,,0.08,,,
13,0.201083,1.0,0.0,3,0.31,0.4,0.22,,,0.07,,,
14,0.207854,1.0,0.0,3,0.29,0.42,0.21,,,0.08,,,
15,0.209885,1.323891,0.0,2,0.27,0.41,0.23,,,0.09,,,
16,0.421124,1.323891,0.0,2,0.32,0.38,0.22,,,0.08,,,


In [132]:
# Calculates average median value of X_test
averages = X_test.mean()
averages

samplesize               0.233462
months_to_elec_weight    1.038105
rating                        0.0
poll_length              2.705882
CON_FC                   0.317059
LAB_FC                        0.4
LIB_FC                       0.21
BRX_FC                        NaN
GRE_FC                        NaN
OTH_FC                   0.072941
PLC_FC                        NaN
SNP_FC                        NaN
UKI_FC                        NaN
dtype: object

In [133]:
# Creates imputation values for y_train to impute over actuals we are trying to predict
imputation_values = {
    'CON_ACT': averages['CON_FC'],
    'LAB_ACT': averages['LAB_FC'],
    'LIB_ACT': averages['LIB_FC'],
    'BRX_ACT': 0, # doesnt exist in 2015
    'GRE_ACT': 0.006,
    'OTH_ACT': averages['OTH_FC'],
    'PLC_ACT':    0.006,
    'SNP_ACT': 0.04, # imputes value due to low forecasts
    'UKI_ACT': 0
}

In [134]:
# Applies imputation values to y_train
y_train = y_train.fillna(value=imputation_values)


  y_train = y_train.fillna(value=imputation_values)


In [135]:
# Instantiate the model for each party
models = {}
parties = ['CON', 'LAB', 'LIB', 'BRX', 'GRE', 'SNP', 'UKI', 'PLC', 'OTH']
for party in parties:
    models[party] = XGBRegressor(
        learning_rate=0.3, n_estimators=300, max_depth=3, subsample=0.7,
        objective='reg:squarederror', nthread=-1, enable_categorical=True
    )

In [136]:
# Train models
X_train_matrix = np.array(X_train)
for party in parties:
    models[party].fit(X_train_matrix, y_train[f'{party}_ACT'])

In [137]:
actuals_2005 = {
    "CON": 0.324,  # Conservative Party
    "LAB": 0.352,  # Labour Party
    "LIB": 0.220,  # Liberal Democrats
    "BRX": 0.000,  # Brexit Party (did not exist in 2010)
    "GRE": 0.009,  # Green Party
    "SNP": 0.019,  # Scottish National Party
    "UKI": 0.030,  # UK Independence Party
    "PLC": 0.005,  # Plaid Cymru
    "OTH": 0.048   # Other parties
}

In [138]:
# Evaluate predictions
X_test_matrix = np.array(X_test)
mean_predictions = {}
for party in parties:
    mean_predictions[party] = models[party].predict(X_test_matrix).mean()


In [139]:
# Print mean predictions
for party in parties:
    print(f"2005 mean prediction for {party}: {mean_predictions[party]}")
    print(f"2005 actual result for {party}  : {actuals_2005[party]}")


2005 mean prediction for CON: 0.31705883145332336
2005 actual result for CON  : 0.324
2005 mean prediction for LAB: 0.4000000059604645
2005 actual result for LAB  : 0.352
2005 mean prediction for LIB: 0.20999999344348907
2005 actual result for LIB  : 0.22
2005 mean prediction for BRX: 0.0
2005 actual result for BRX  : 0.0
2005 mean prediction for GRE: 0.006000000052154064
2005 actual result for GRE  : 0.009
2005 mean prediction for SNP: 0.03999999910593033
2005 actual result for SNP  : 0.019
2005 mean prediction for UKI: 0.0
2005 actual result for UKI  : 0.03
2005 mean prediction for PLC: 0.006000000052154064
2005 actual result for PLC  : 0.005
2005 mean prediction for OTH: 0.07294117659330368
2005 actual result for OTH  : 0.048


In [78]:
# Evaluate last five polls
last_predictions = {}
for party in parties:
    last_predictions[party] = models[party].predict(X_test_matrix)[-5:]

In [79]:
# Print mean of predictions
for party in parties:
    print(f"2010 last prediction for {party}: {last_predictions[party].mean()}")
    print(f"2010 actual result for {party}  : {actuals_2005[party]}")


2010 last prediction for CON: 0.33500000834465027
2010 actual result for CON  : 0.324
2010 last prediction for LAB: 0.3824999928474426
2010 actual result for LAB  : 0.352
2010 last prediction for LIB: 0.20499999821186066
2010 actual result for LIB  : 0.22
2010 last prediction for BRX: 0.0
2010 actual result for BRX  : 0.0
2010 last prediction for GRE: 0.006000000052154064
2010 actual result for GRE  : 0.009
2010 last prediction for SNP: 0.03999999910593033
2010 actual result for SNP  : 0.019
2010 last prediction for UKI: 0.0
2010 actual result for UKI  : 0.03
2010 last prediction for PLC: 0.006000000052154064
2010 actual result for PLC  : 0.005
2010 last prediction for OTH: 0.07750000059604645
2010 actual result for OTH  : 0.048
