In [125]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Imports
import os
from google.cloud import bigquery
from  datetime import datetime

# Stamp the run so the printed output records when the notebook was executed.
TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S"

now = datetime.now()
current_time = now.strftime(TIMESTAMP_FORMAT)
print("Current Time:", current_time)

Current Time: 2024-06-09 18:50:31


In [126]:
# Load the combined polling-and-results CSV into the `data` DataFrame.
# NOTE(review): the path is relative, so it resolves against the kernel's
# working directory -- confirm the notebook is run from its own folder.
data = pd.read_csv('../processed_data/2004_to_2019_combined_clean_polling_and_results.csv')

In [127]:
# Add 'poll_length': whole days between a poll's start and end dates
# (identical start and end dates give 0).
poll_span = pd.to_datetime(data['enddate']) - pd.to_datetime(data['startdate'])
data['poll_length'] = poll_span.dt.days

In [128]:
# Parse the poll start/end and next-election columns into datetimes
for date_col in ('startdate', 'enddate', 'next_elec_date'):
    data[date_col] = pd.to_datetime(data[date_col])

In [129]:
# Rescale the forecast columns by 1/100 (intended to give values between 0-1).
# Re-running this cell divides again, so run it once per fresh load of `data`.
forecast_cols = ['BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC',
                 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC']
data[forecast_cols] = data[forecast_cols] / 100

In [130]:
# Rescale the actual-result columns by 1/100 (intended to give values between 0-1).
# Re-running this cell divides again, so run it once per fresh load of `data`.
actual_cols = ['BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT',
               'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE']
data[actual_cols] = data[actual_cols] / 100

In [131]:
# Helper: calendar-month gap between two dates
def calculate_month_diff(d1, d2):
    """Return the number of calendar months from ``d1`` to ``d2``.

    Only the ``year`` and ``month`` attributes are read; the day of the month
    is ignored, so 31 January -> 1 February counts as one month. The result is
    negative when ``d2`` falls in an earlier month than ``d1``.
    """
    year_gap = d2.year - d1.year
    month_gap = d2.month - d1.month
    return 12 * year_gap + month_gap

In [132]:
# Months from each poll's end date to the next election, computed row by row
def _months_until_election(row):
    """Month gap between a row's 'enddate' and its 'next_elec_date'."""
    return calculate_month_diff(row['enddate'], row['next_elec_date'])

data['months_to_election'] = data.apply(_months_until_election, axis=1)

In [133]:
# Helper: log-scaled weight that shrinks as the month count grows
def calculate_weight(months, max_months=60):
    """Return ``1 - log1p(months) / log1p(max_months)``, floored at 0.

    ``months == 0`` gives 1, and any ``months >= max_months`` gives 0.

    Parameters
    ----------
    months : number
        Month count to convert into a weight.
    max_months : number, default 60
        Month count at which the weight reaches 0.
    """
    decay = np.log1p(months) / np.log1p(max_months)
    return max(0, 1 - decay)

In [134]:
# Turn each row's months-to-election value into its weight
data['weight'] = data['months_to_election'].map(calculate_weight)


In [135]:
# Target column order: poll/context fields, then forecasts, then actual results
context_cols = [
    'startdate', 'enddate', 'pollster', 'samplesize', 'rating',
    'next_elec_date', 'days_to_elec', 'poll_length',
    'party_in_power', 'months_to_election', 'weight',
]
forecast_order = [
    'CON_FC', 'LAB_FC', 'LIB_FC', 'BRX_FC', 'GRE_FC',
    'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC',
]
actual_order = [
    'CON_ACT', 'LAB_ACT', 'LIB_ACT', 'BRX_ACT', 'GRE_ACT',
    'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE',
]
order = context_cols + forecast_order + actual_order

In [136]:
# Keep only the columns listed in `order`, arranged in that sequence.
# Rebinds `data`, so any column not named in `order` is dropped from here on.
data = data[order]

In [138]:
# Display the frame's column labels to check the resulting column order.
data.columns

Index(['startdate', 'enddate', 'pollster', 'samplesize', 'rating',
       'next_elec_date', 'days_to_elec', 'poll_length', 'party_in_power',
       'months_to_election', 'weight', 'CON_FC', 'LAB_FC', 'LIB_FC', 'BRX_FC',
       'GRE_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC', 'CON_ACT', 'LAB_ACT',
       'LIB_ACT', 'BRX_ACT', 'GRE_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_PERCENTAGE'],
      dtype='object')