In [1]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.cloud import bigquery
from datetime import datetime, timedelta
import os


In [2]:
# Load csvs from raw_data folder
trends1 = pd.read_csv('../raw_data/LAB_CON_LIB_GRE_BRX_Trends_2004_now.csv')
trends2 = pd.read_csv('../raw_data/LAB_PLC_SNP_UKI_NAT_Trends_2004_now.csv')

In [3]:
trends2.columns

Index(['Month', 'Labour Party: (United Kingdom)',
       'Plaid Cymru: (United Kingdom)',
       'Scottish National Party: (United Kingdom)',
       'UK Independence Party: (United Kingdom)',
       'British National Party: (United Kingdom)'],
      dtype='object')

In [4]:
# Drop duplicate Labour column
trends2.drop(columns='Labour Party: (United Kingdom)',inplace=True)

In [5]:
# Join two trends datasets on 'Month'
trends_merged = pd.merge(trends1,trends2,how='left',on='Month')

In [6]:
# Convert 'Month' field to pd.datetime64[ns] format for joining with polling data
trends_merged['Month'] = pd.to_datetime(trends_merged['Month'])

In [7]:
# Replace <1 values with 0.5 value
trends_merged.replace('<1', 0.5,inplace=True)

In [8]:
# Converting objects (pd.Series) to int dtypes
trends_merged = trends_merged.astype({'Green Party: (United Kingdom)':'int','Reform UK: (United Kingdom)':'int',\
    'Plaid Cymru: (United Kingdom)':'int','Scottish National Party: (United Kingdom)':'int',\
        'UK Independence Party: (United Kingdom)':'int','British National Party: (United Kingdom)':'int'})

In [9]:
# Rename columns to align them more easily with _FC and _ACT columns
trends_merged.rename(columns={'Labour Party: (United Kingdom)': 'LAB_trends',
        'Conservative Party: (United Kingdom)': 'CON_trends',
        'Liberal Democrats: (United Kingdom)': 'LIB_trends',
        'Green Party: (United Kingdom)': 'GRE_trends',
        'Reform UK: (United Kingdom)': 'BRX_trends',
        'Plaid Cymru: (United Kingdom)': 'PLC_trends',
        'Scottish National Party: (United Kingdom)': 'SNP_trends',
        'UK Independence Party: (United Kingdom)': 'UKI_trends',
        'British National Party: (United Kingdom)': 'NAT_trends'}
        ,inplace=True)

In [10]:
ons = pd.read_csv('../raw_data/ons_economic_data_master.csv')

In [11]:
# Print current time
now = datetime.now()
current_time = now.strftime("%Y-%m-%d %H:%M:%S")

In [12]:
# Imports data
data = pd.read_csv('../raw_data/1988_to_2024_combined_clean_polling_and_results.csv')

In [13]:
# Convert date columns to datetime
data['enddate'] = pd.to_datetime(data['enddate'])
data['next_elec_date'] = pd.to_datetime(data['next_elec_date'])
data['startdate'] = pd.to_datetime(data['startdate'])

In [14]:
# Create poll enddate field with YYYY-MM format, so we can join with Trends data
data['enddate_year_month'] = pd.to_datetime(data['enddate']).dt.to_period('M')

In [15]:
# Convert Period datatype to str and then to datetime64
data.enddate_year_month = pd.to_datetime(data.enddate_year_month.astype('str'))

In [16]:
# Merge data with trends data, to get a df with polls, election results and trends
data = pd.merge(data,trends_merged,how='left',left_on='enddate_year_month',right_on='Month')

In [17]:
ons['Month'] = pd.to_datetime(ons['Month'])

In [18]:
# Merge data with trends data, to get a df with polls, election results and trends
data = pd.merge(data,ons,how='left',left_on='enddate_year_month',right_on='Month')

In [19]:
# Scale trends columns by diving by 100
for column in ['LAB_trends', 'CON_trends', 'LIB_trends',
       'GRE_trends', 'BRX_trends', 'PLC_trends', 'SNP_trends', 'UKI_trends',
       'NAT_trends']:
    data[column] = data[column] / 100

In [20]:
# Handle numerical transformer
num_columns_selector = ['samplesize', 'months_to_elec_weight', 'GDP','Inflation','Unemployment']
num_transformer = MinMaxScaler()

In [21]:
# Handle categorical transformer
cat_columns_selector = ['rating']
cat_transformer = make_pipeline(OrdinalEncoder(categories = [['F','F+','E-','E','E+','D-','D','D+','C-','C','C+','B-','B','B+','A-']]),MinMaxScaler())

In [22]:
# One-hot encoder for party_in_power
ohe = OneHotEncoder()

In [23]:
# Build the preprocessing pipeline
preproc_pipeline = make_column_transformer(
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    (ohe,['party_in_power']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [24]:
data['next_elec_date'].value_counts()

next_elec_date
2015-05-07    1930
2024-07-04    1397
2010-05-06     584
1992-04-09     453
2019-12-12     440
1997-05-01     338
2017-06-08     254
2001-06-07     213
2005-05-05      98
Name: count, dtype: int64

In [32]:
# Define election dates
election_date= datetime.strptime('2024-07-04', '%Y-%m-%d')
cutoff_date = election_date - timedelta(days=84)
prediction_date = election_date - timedelta(days=24)

In [33]:
# Split data
data_train = data[data['startdate'] > '2003-12-31']
data_train = data_train[data_train['startdate'] < cutoff_date]
data_test_1 = data[(data['startdate'] >= cutoff_date) & (data['startdate'] < prediction_date)]
data_test = data_test_1[data_test_1['next_elec_date'] == election_date]

In [34]:
# Fit transform preprocessing pipeline to data_train
data_train_processed = preproc_pipeline.fit_transform(data_train)

In [35]:
# Transform preprocessing pipeline to data_test
data_test_processed = preproc_pipeline.transform(data_test)

In [36]:
# Check feature names make sense for data_train
data_train_processed = pd.DataFrame(
    data_train_processed, columns=preproc_pipeline.get_feature_names_out()
)

# Check feature names make sense for data_test
data_test_processed = pd.DataFrame(
    data_test_processed, columns=preproc_pipeline.get_feature_names_out()
)

In [37]:
data_train_processed.columns

Index(['samplesize', 'months_to_elec_weight', 'GDP', 'Inflation',
       'Unemployment', 'rating', 'party_in_power_Conservative',
       'party_in_power_Conservative_Liberal', 'party_in_power_Labour',
       'Unnamed: 0', 'startdate', 'enddate', 'pollster', 'next_elec_date',
       'days_to_elec', 'months_to_elec', 'poll_length', 'CON_FC', 'LAB_FC',
       'LIB_FC', 'BRX_FC', 'GRE_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC',
       'CON_ACT', 'LAB_ACT', 'LIB_ACT', 'BRX_ACT', 'GRE_ACT', 'PLC_ACT',
       'SNP_ACT', 'UKI_ACT', 'OTH_ACT', 'enddate_year_month', 'Month_x',
       'LAB_trends', 'CON_trends', 'LIB_trends', 'GRE_trends', 'BRX_trends',
       'PLC_trends', 'SNP_trends', 'UKI_trends', 'NAT_trends', 'Month_y'],
      dtype='object')

In [38]:
X_columns = ['samplesize', 'months_to_elec_weight','Inflation','rating','party_in_power_Conservative',
       'party_in_power_Conservative_Liberal', 'poll_length', 'CON_FC',
       'LAB_FC', 'LIB_FC', 'BRX_FC', 'GRE_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC','UKI_FC']

# Define our X by dropping irrelevant and y columns
X_train = data_train_processed[X_columns]
X_test = data_test_processed[X_columns]

In [39]:
X_train.columns

Index(['samplesize', 'months_to_elec_weight', 'Inflation', 'rating',
       'party_in_power_Conservative', 'party_in_power_Conservative_Liberal',
       'poll_length', 'CON_FC', 'LAB_FC', 'LIB_FC', 'BRX_FC', 'GRE_FC',
       'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC'],
      dtype='object')

In [40]:
# Build our target matrix
y_train = data_train_processed[['next_elec_date', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT']]
y_test = data_test_processed[['next_elec_date', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT']]

In [41]:
# Drops y_train actuals where the actual is the actual result we are trying to predict, replaces with NaNs
y_train.loc[y_train['next_elec_date'] == '2019-12-12',
         ['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_ACT']] = np.nan

In [42]:
# Calculates average median value of X_test
averages = X_test.mean()
averages

samplesize                             0.025576
months_to_elec_weight                   0.73944
Inflation                              0.297872
rating                                 0.643651
party_in_power_Conservative                 1.0
party_in_power_Conservative_Liberal         0.0
poll_length                            2.166667
CON_FC                                 0.232222
LAB_FC                                 0.442667
LIB_FC                                 0.094667
BRX_FC                                 0.118889
GRE_FC                                 0.058333
OTH_FC                                 0.021591
PLC_FC                                 0.007727
SNP_FC                                 0.027079
UKI_FC                                      NaN
dtype: object

In [43]:
# Creates imputation values for y_train to impute over actuals we are trying to predict
imputation_values = {
    'CON_ACT': averages['CON_FC'],
    'LAB_ACT': averages['LAB_FC'],
    'LIB_ACT': averages['LIB_FC'],
    'BRX_ACT': averages['BRX_FC'],
    'GRE_ACT': averages['GRE_FC'],
    'OTH_ACT': averages['OTH_FC'],
    'PLC_ACT': averages['PLC_FC'],
    'SNP_ACT': averages['SNP_FC'],
    'UKI_ACT': averages['UKI_FC']
}

In [44]:
# Applies imputation values to y_train
y_train = y_train.fillna(value=imputation_values)


  y_train = y_train.fillna(value=imputation_values)


In [49]:
# Instantiate the model for each party
models = {}
parties = ['CON', 'LAB', 'LIB', 'BRX', 'GRE', 'SNP', 'UKI', 'PLC', 'OTH']
for party in parties:
    models[party] = XGBRegressor(
        learning_rate=0.3, n_estimators=300, max_depth=3, subsample=0.7,
        objective='reg:squarederror', nthread=-1, enable_categorical=True
    )

In [52]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4565 entries, 0 to 4564
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   samplesize                           4565 non-null   object
 1   months_to_elec_weight                4565 non-null   object
 2   Inflation                            4564 non-null   object
 3   rating                               4565 non-null   object
 4   party_in_power_Conservative          4565 non-null   object
 5   party_in_power_Conservative_Liberal  4565 non-null   object
 6   poll_length                          4565 non-null   object
 7   CON_FC                               4565 non-null   object
 8   LAB_FC                               4565 non-null   object
 9   LIB_FC                               4565 non-null   object
 10  BRX_FC                               1321 non-null   object
 11  GRE_FC                               2588 n

In [50]:
# Train models
X_train_matrix = np.array(X_train)
for party in parties:
    models[party].fit(X_train_matrix, y_train[f'{party}_ACT'])

XGBoostError: [14:07:22] /Users/runner/work/xgboost/xgboost/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x00000001576f0994 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x00000001577adc9c xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const&, xgboost::StringView, xgboost::Json) + 3452
  [bt] (2) 3   libxgboost.dylib                    0x00000001577acdc4 xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 164
  [bt] (3) 4   libxgboost.dylib                    0x0000000157707688 XGDMatrixSetInfoFromInterface + 224
  [bt] (4) 5   libffi.dylib                        0x00000001ab065050 ffi_call_SYSV + 80
  [bt] (5) 6   libffi.dylib                        0x00000001ab06dae0 ffi_call_int + 1212
  [bt] (6) 7   _ctypes.cpython-310-darwin.so       0x000000010391b9f0 _ctypes_callproc + 1348
  [bt] (7) 8   _ctypes.cpython-310-darwin.so       0x00000001039143ec PyCFuncPtr_call + 1176
  [bt] (8) 9   libpython3.10.dylib                 0x00000001032db204 _PyObject_MakeTpCall + 360



In [None]:
actuals_2010 = {
    "CON": 0.362,  # Conservative Party
    "LAB": 0.292,  # Labour Party
    "LIB": 0.235,  # Liberal Democrats
    "BRX": 0.000,  # Brexit Party (did not exist in 2010)
    "GRE": 0.009,  # Green Party
    "SNP": 0.019,  # Scottish National Party
    "UKI": 0.030,  # UK Independence Party
    "PLC": 0.005,  # Plaid Cymru
    "OTH": 0.048   # Other parties
}

In [None]:
# Evaluate predictions
X_test_matrix = np.array(X_test)
mean_predictions = {}
for party in parties:
    mean_predictions[party] = models[party].predict(X_test_matrix).mean()


In [None]:
# Print mean predictions
for party in parties:
    print(f"2010 mean prediction for {party}: {mean_predictions[party]}")
    print(f"2010 actual result for {party}  : {actuals_2010[party]}")


2010 mean prediction for CON: 0.35427021980285645
2010 actual result for CON  : 0.362
2010 mean prediction for LAB: 0.2999766767024994
2010 actual result for LAB  : 0.292
2010 mean prediction for LIB: 0.2287672460079193
2010 actual result for LIB  : 0.235
2010 mean prediction for BRX: 0.0
2010 actual result for BRX  : 0.0
2010 mean prediction for GRE: 0.009019231423735619
2010 actual result for GRE  : 0.009
2010 mean prediction for SNP: 0.016155105084180832
2010 actual result for SNP  : 0.019
2010 mean prediction for UKI: 0.029667291790246964
2010 actual result for UKI  : 0.03
2010 mean prediction for PLC: 0.005732825957238674
2010 actual result for PLC  : 0.005
2010 mean prediction for OTH: 0.056253883987665176
2010 actual result for OTH  : 0.048


In [47]:
# Evaluate last five polls
last_predictions = {}
for party in parties:
    last_predictions[party] = models[party].predict(X_test_matrix)[-5:]

NameError: name 'X_test_matrix' is not defined

In [48]:
# Print mean of predictions
for party in parties:
    print(f"2010 last prediction for {party}: {last_predictions[party].mean()}")
    print(f"2010 actual result for {party}  : {actuals_2010[party]}")


KeyError: 'CON'