In [33]:
# Imports
import pandas as pd
from datetime import datetime
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
import numpy as np


In [34]:
# Load data
data = pd.read_csv('../processed_data/2004_to_2019_combined_clean_polling_and_results.csv')

# Convert dates
data['enddate'] = pd.to_datetime(data['enddate'])
data['next_elec_date'] = pd.to_datetime(data['next_elec_date'])

In [35]:
# Handle numerical transformer
num_columns_selector = ['samplesize', 'poll_length', 'days_to_elec', 'months_to_elec_weight']
num_transformer = MinMaxScaler()

# Handle categorical transformer
cat_columns_selector = ['rating']
cat_transformer = make_pipeline(OrdinalEncoder(categories=[['F', 'D-', 'D', 'D+', 'C-', 'B', 'B+', 'A-']]), MinMaxScaler())

In [36]:
# Build the preprocessing pipeline
preproc_pipeline = make_column_transformer(
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Define the split date
split_date = datetime.strptime('2019-09-13', '%Y-%m-%d')

# Split data up to 90 days before the 2019 election, and test the final 90 days running into it
data_train = data[data['enddate'] < split_date]
data_test = data[data['enddate'] >= split_date]

In [37]:
# Fit transform preprocessing pipeline to data_train
data_train_processed = preproc_pipeline.fit_transform(data_train)

# Transform preprocessing pipeline to data_test
data_test_processed = preproc_pipeline.transform(data_test)

In [38]:
# Convert to DataFrame
data_train_processed = pd.DataFrame(data_train_processed, columns=preproc_pipeline.get_feature_names_out())
data_test_processed = pd.DataFrame(data_test_processed, columns=preproc_pipeline.get_feature_names_out())

In [39]:

# Define features for training (excluding election result columns and other irrelevant columns)
exclude_columns = ['pollster', 'next_elec_date', 'BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE', 'enddate', 'party_in_power']
X_train = data_train_processed.drop(columns=exclude_columns)
X_test = data_test_processed.drop(columns=exclude_columns)

In [40]:
# Define target variables for testing (actual election results)
y_test = data_test[['LAB_ACT', 'CON_ACT', 'LIB_ACT', 'GRE_ACT', 'BRX_ACT', 'SNP_ACT', 'UKI_ACT', 'PLC_ACT', 'OTH_PERCENTAGE']]


In [41]:
# Instantiate and train models for each party
model_LAB = XGBRegressor(learning_rate=0.3, n_estimators=300, max_depth=3, subsample=0.7, objective='reg:squarederror', nthread=-1, enable_categorical=True)

In [43]:
X_train = X_train.apply(pd.to_numeric)


In [66]:
exclude_rows = 330
filtered_df = data_train.iloc[:-exclude_rows]


In [70]:
filtered_df['LAB_ACT']

0       0.351872
1       0.351872
2       0.351872
3       0.351872
4       0.351872
          ...   
2815    0.399893
2816    0.399893
2817    0.399893
2818    0.399893
2819    0.399893
Name: LAB_ACT, Length: 2820, dtype: float64

In [67]:
model_LAB.fit(X_train, filtered_df['LAB_ACT'])  # Train only on LAB_ACT column of the training set


XGBoostError: [20:45:43] /workspace/src/data/data.cc:501: Check failed: this->labels.Size() % this->num_row_ == 0 (2820 vs. 0) : Incorrect size for labels.
Stack trace:
  [bt] (0) /home/chris/.pyenv/versions/3.10.6/envs/election_predictor/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x3588ca) [0x7f6fac6a38ca]
  [bt] (1) /home/chris/.pyenv/versions/3.10.6/envs/election_predictor/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x389af7) [0x7f6fac6d4af7]
  [bt] (2) /home/chris/.pyenv/versions/3.10.6/envs/election_predictor/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(+0x38ab51) [0x7f6fac6d5b51]
  [bt] (3) /home/chris/.pyenv/versions/3.10.6/envs/election_predictor/lib/python3.10/site-packages/xgboost/lib/libxgboost.so(XGDMatrixSetInfoFromInterface+0xb0) [0x7f6fac4a93a0]
  [bt] (4) /lib/x86_64-linux-gnu/libffi.so.8(+0x7e2e) [0x7f7060998e2e]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.8(+0x4493) [0x7f7060995493]
  [bt] (6) /home/chris/.pyenv/versions/3.10.6/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0x12e05) [0x7f705f93be05]
  [bt] (7) /home/chris/.pyenv/versions/3.10.6/lib/python3.10/lib-dynload/_ctypes.cpython-310-x86_64-linux-gnu.so(+0xc8d0) [0x7f705f9358d0]
  [bt] (8) /home/chris/.pyenv/versions/3.10.6/lib/libpython3.10.so.1.0(_PyObject_MakeTpCall+0x8c) [0x7f70614ef35c]



In [45]:
# Convert X_test to matrix
X_test_matrix = np.array(X_test)

# Predict using the trained model
y_pred_LAB = model_LAB.predict(X_test_matrix)

# Evaluate model performance
from sklearn.metrics import mean_squared_error
test_score_LAB = mean_squared_error(y_test['LAB_ACT'], y_pred_LAB)
print(f'Test Score for LAB: {test_score_LAB}')

Test Score for LAB: 0.0003975992288582633


In [50]:
y_pred_LAB.mean()

0.32794753