In [118]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.cloud import bigquery
from datetime import datetime, timedelta

In [119]:
# Print current time
now = datetime.now()
current_time = now.strftime("%Y-%m-%d %H:%M:%S")

In [120]:
# Imports data
data = pd.read_csv('../processed_data/scotland_polling_results_cleaned.csv')

In [121]:
len(data)

217

In [122]:
data.isna().sum()

Unnamed: 0.1        0
Unnamed: 0          0
startdate           0
enddate             0
pollster            0
samplesize          0
rating              0
next_elec_date      0
days_to_elec        0
BRX_FC            189
CON_FC              0
GRE_FC             64
LAB_FC              0
LIB_FC              0
OTH_FC             36
SNP_FC              0
OTH.1              36
UKI_FC            132
SNP_ACT           110
LAB_ACT           110
LIB_ACT           110
CON_ACT           110
UKI_ACT           110
GRE_ACT           110
BRX_ACT           110
OTH_PERCENTAGE    110
dtype: int64

In [123]:
# Convert date columns to datetime
data['enddate'] = pd.to_datetime(data['enddate'])
data['next_elec_date'] = pd.to_datetime(data['next_elec_date'])
data['startdate'] = pd.to_datetime(data['startdate'])

In [124]:
# Handle numerical transformer
num_columns_selector = ['samplesize', 'days_to_elec']
num_transformer = MinMaxScaler()

In [125]:
# Handle categorical transformer
cat_columns_selector = ['rating']
cat_transformer = make_pipeline(OrdinalEncoder(categories = [['F','F+','E-','E','E+','D-','D','D+','C-','C','C+','B-','B','B+','A-']]),MinMaxScaler())

In [126]:
# # One-hot encoder for party_in_power
# ohe = OneHotEncoder()

In [127]:
# Build the preprocessing pipeline
preproc_pipeline = make_column_transformer(
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [128]:
data['next_elec_date'].value_counts()

next_elec_date
2024-07-04    110
2015-05-07     52
2019-12-12     38
2017-06-08     17
Name: count, dtype: int64

In [129]:
data.rename(columns={'OTH_PERCENTAGE':'OTH_ACT'}, inplace=True)

In [130]:
data.fillna(value=0,inplace=True)

In [131]:
# Define election dates
election_date = datetime.strptime('2019-12-12', '%Y-%m-%d')
cutoff_date = election_date - timedelta(days=100)
prediction_date = election_date - timedelta(days=1)

In [132]:
# Split data
data_train = data[data['startdate'] > '2003-12-31']
data_train = data_train[data_train['startdate'] < cutoff_date]
data_test_1 = data[(data['startdate'] >= cutoff_date) & (data['startdate'] < prediction_date)]
data_test = data_test_1[data_test_1['next_elec_date'] == election_date]

In [133]:
# Fit transform preprocessing pipeline to data_train
data_train_processed = preproc_pipeline.fit_transform(data_train)

In [134]:
# Transform preprocessing pipeline to data_test
data_test_processed = preproc_pipeline.transform(data_test)

In [135]:
# Check feature names make sense for data_train
data_train_processed = pd.DataFrame(
    data_train_processed, columns=preproc_pipeline.get_feature_names_out()
)

# Check feature names make sense for data_test
data_test_processed = pd.DataFrame(
    data_test_processed, columns=preproc_pipeline.get_feature_names_out()
)

In [136]:
# Define our X by dropping irrelevant and y columns
X_train = data_train_processed.drop(columns=['startdate', 'enddate', 'pollster', 'Unnamed: 0', 'next_elec_date', 'days_to_elec', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT','OTH_ACT'])
X_test = data_test_processed.drop(columns=['startdate', 'enddate', 'pollster', 'Unnamed: 0', 'next_elec_date', 'days_to_elec', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT','OTH_ACT'])

In [137]:
## Build our target matrix
y_train = data_train_processed[['next_elec_date', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT']]
y_test = data_test_processed[['next_elec_date', 'LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT']]

In [138]:
# Drops y_train actuals where the actual is the actual result we are trying to predict, replaces with NaNs
y_train.loc[y_train['next_elec_date'] == '2019-12-12',
         ['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_ACT']] = np.nan

In [139]:
# Calculates average median value of X_test
averages = X_test.mean()
averages

samplesize      0.053056
rating               1.0
Unnamed: 0.1       102.5
BRX_FC             1.875
CON_FC            26.125
GRE_FC             1.625
LAB_FC            17.875
LIB_FC            10.875
OTH_FC             0.125
SNP_FC              41.5
OTH.1              0.125
UKI_FC               0.0
dtype: object

In [140]:
# Creates imputation values for y_train to impute over actuals we are trying to predict
imputation_values = {
    'CON_ACT': averages['CON_FC'],
    'LAB_ACT': averages['LAB_FC'],
    'LIB_ACT': averages['LIB_FC'],
    'BRX_ACT': averages['BRX_FC'],
    'SNP_ACT': averages['SNP_FC'],
    'UKI_ACT': averages['UKI_FC'],
    'OTH_ACT': averages['OTH_FC'],
    'GRE_ACT': averages['GRE_FC']
}

In [141]:
# Applies imputation values to y_train
y_train = y_train.fillna(value=imputation_values)

  y_train = y_train.fillna(value=imputation_values)


In [142]:
len(X_train), len(y_train), len(X_test), len(y_test)

(99, 99, 8, 8)

In [143]:
# Instantiate the model for each party
models = {}
parties = ['CON', 'LAB', 'LIB', 'BRX', 'GRE', 'SNP', 'UKI', 'OTH']
for party in parties:
    models[party] = XGBRegressor(
        learning_rate=0.3, n_estimators=300, max_depth=3, subsample=0.7,
        objective='reg:squarederror', nthread=-1, enable_categorical=True
    )

In [144]:
# Train models
X_train_matrix = np.array(X_train)
for party in parties:
    models[party].fit(X_train_matrix, y_train[f'{party}_ACT'])

In [145]:
actuals_2019 = {
    "CON": 0.251,  # Conservative Party
    "LAB": 0.186,  # Labour Party
    "LIB":0.95,  # Liberal Democrats
    "BRX": 0.5,  # Brexit Party
    "GRE": 0.01,  # Green Party
    "SNP": 0.45,  # Scottish National Party
    "UKI": 0.0000,  # UK Independence Party
    "PLC": 0.0051,  # Plaid Cymru
    "OTH": 0.0307   # Other parties
}

In [146]:
# Evaluate predictions
X_test_matrix = np.array(X_test)
mean_predictions = {}
for party in parties:
    mean_predictions[party] = models[party].predict(X_test_matrix).mean()


In [147]:
# Print mean predictions
for party in parties:
    print(f"2019 mean prediction for {party}: {mean_predictions[party]}")


2019 mean prediction for CON: 26.163150787353516
2019 mean prediction for LAB: 17.95258331298828
2019 mean prediction for LIB: 10.832961082458496
2019 mean prediction for BRX: 1.8527486324310303
2019 mean prediction for GRE: 1.4968897104263306
2019 mean prediction for SNP: 41.393436431884766
2019 mean prediction for UKI: 0.0017363462829962373
2019 mean prediction for OTH: 0.125


In [149]:
# Evaluate last five polls
last_predictions = {}
for party in parties:
    last_predictions[party] = models[party].predict(X_test_matrix)[-5:]

In [150]:
# Print mean of predictions
for party in parties:
    print(f"2019 last prediction for {party}: {last_predictions[party].mean()}")


2019 last prediction for CON: 26.187114715576172
2019 last prediction for LAB: 17.98794937133789
2019 last prediction for LIB: 10.809826850891113
2019 last prediction for BRX: 1.8414560556411743
2019 last prediction for GRE: 1.4421885013580322
2019 last prediction for SNP: 41.33161926269531
2019 last prediction for UKI: 0.002801459515467286
2019 last prediction for OTH: 0.125
