In [15]:
import pandas as pd
import numpy as np

In [16]:
# Loads the cleaned election results and the GB polling data from ../processed_data for merging
election_results_df = pd.read_csv('../processed_data/2004_to_2019_results_clean.csv')
gb_polls = pd.read_csv('../processed_data/gb_polls.csv')

In [17]:
# Renames the results' 'Year' column to 'election_year' (the merge key) and casts it to int
election_results_df = (
    election_results_df
    .rename(columns={'Year': 'election_year'})
    .astype({'election_year': int})
)


In [18]:
# Option 1: keeps polls whose election_year is 2020 or earlier (elections up to and including 2019)
filtered_gb_polls_df = gb_polls.loc[gb_polls['election_year'].le(2020)]

In [20]:
# Option 2: filters election year to post 2019
# filtered_gb_polls_df = gb_polls[gb_polls['election_year'] >= 2020]

In [21]:
# Left-joins the election results onto every poll row via the shared 'election_year' key
gb_polls_actuals = pd.merge(
    filtered_gb_polls_df,
    election_results_df,
    how='left',
    on='election_year',
)

In [22]:
# Old-name -> new-name mapping: poll columns get an _FC suffix, election result columns an _ACT suffix
rename_columns = {
    # Poll columns are keyed by the bare party code
    **{
        party: f'{party}_FC'
        for party in ('BRX', 'CON', 'GRE', 'LAB', 'LIB', 'OTH', 'PLC', 'SNP', 'UKI')
    },
    # Result columns are '<prefix>_ACTUAL_PERCENTAGE'; Labour's prefix is spelled out in full
    **{
        f'{prefix}_ACTUAL_PERCENTAGE': f'{party}_ACT'
        for prefix, party in (
            ('BRX', 'BRX'),
            ('CON', 'CON'),
            ('GRE', 'GRE'),
            ('LIB', 'LIB'),
            ('LABOUR', 'LAB'),
            ('PLC', 'PLC'),
            ('SNP', 'SNP'),
            ('UKI', 'UKI'),
        )
    },
}

In [23]:
# Applies the rename_columns mapping so forecast and actual columns share one naming scheme
gb_polls_actuals = gb_polls_actuals.rename(columns=rename_columns)

In [24]:
# Removes the 'Country' column, which is not needed downstream
gb_polls_actuals = gb_polls_actuals.drop(columns='Country')

In [25]:
# Gives the merged frame a clearer working name.
# This binds a second name to the same DataFrame object (no copy is made), so
# in-place edits made through either name are visible through both.
combined_df = gb_polls_actuals

In [27]:
# Shortens three long column names in a single in-place rename
combined_df.rename(
    columns={
        'pollster_rating': 'rating',
        'next_election_date': 'next_elec_date',
        'days_until_next_election': 'days_to_elec',
    },
    inplace=True,
)

In [28]:
# Removes the 'Geography' column in place
combined_df.drop('Geography', axis=1, inplace=True)


In [29]:
# Adds 'poll_length': the whole-day difference between each poll's end date and start date
combined_df['poll_length'] = (
    pd.to_datetime(combined_df['enddate']) - pd.to_datetime(combined_df['startdate'])
).dt.days

In [30]:
# Parses the poll start/end dates and the next-election date into datetime columns
for date_col in ('startdate', 'enddate', 'next_elec_date'):
    combined_df[date_col] = pd.to_datetime(combined_df[date_col])

In [31]:
# Rescales the forecast columns from percentages to fractions (divide by 100)
forecast_columns = ['BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC']
combined_df[forecast_columns] = combined_df[forecast_columns] / 100

In [32]:
# Rescales the actual-result columns from percentages to fractions (divide by 100)
actual_columns = ['BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE']
combined_df[actual_columns] = combined_df[actual_columns] / 100

In [33]:
# Helper: calendar-month difference between two dates
def calculate_month_diff(d1, d2):
    """Return how many calendar months d2 lies after d1.

    Only the ``year`` and ``month`` attributes are read; the day of the month
    is ignored, so 2004-01-31 -> 2004-02-01 counts as 1. The result is
    negative when d2 falls in an earlier month than d1.
    """
    start_month_index = d1.year * 12 + d1.month
    end_month_index = d2.year * 12 + d2.month
    return end_month_index - start_month_index

In [63]:
# Adds 'months_to_elec': calendar months from each poll's end date to the next election date
combined_df['months_to_elec'] = [
    calculate_month_diff(poll_end, election_day)
    for poll_end, election_day in zip(combined_df['enddate'], combined_df['next_elec_date'])
]

In [64]:
# Function to calculate weight
def calculate_weight(months, max_months=60):
    """Return a weight in [0.0, 1.0] that decays logarithmically with `months`.

    The weight is ``1 - log1p(months) / log1p(max_months)``, clamped to the
    unit interval: 1.0 at 0 months, 0.0 at `max_months` and beyond.

    Args:
        months: Number of months (int or float).
        max_months: Horizon at which the weight reaches 0.0. Defaults to 60.

    Returns:
        float: The weight. Negative `months` are treated as 0 (weight 1.0);
        NaN `months` yield 0.0.

    Raises:
        ValueError: If `max_months` is not positive (the formula would divide
            by zero or take the log of a non-positive number).
    """
    if max_months <= 0:
        raise ValueError('max_months must be positive')
    # Handle missing values explicitly rather than relying on max() argument order
    if np.isnan(months):
        return 0.0
    # Clamp at 0 so log1p never sees a value <= -1 (which would give -inf or NaN)
    months = max(months, 0)
    weight = 1 - np.log1p(months) / np.log1p(max_months)
    # Always return a plain float (never the int 0) so the result type is uniform
    return float(max(0.0, weight))

In [65]:
# Adds 'months_to_elec_weight' by mapping calculate_weight over every 'months_to_elec' value
combined_df['months_to_elec_weight'] = combined_df['months_to_elec'].map(calculate_weight)

In [69]:
# Ruling periods: ISO start date -> party in power from that date until the next entry
party_in_power = dict([
    ('2004-01-01', 'Labour'),
    ('2005-05-05', 'Labour'),
    ('2010-05-06', 'Conservative'),
    ('2019-12-12', 'Conservative'),
])

In [70]:
# Looks up which party was in power on a given date
def get_party_in_power(date):
    """Return the party in power on `date`, or None if it precedes every period.

    `date` is parsed with ``pd.to_datetime``. The module-level
    ``party_in_power`` mapping (ISO start date -> party) is scanned from the
    latest start date backwards, and the first period that began on or before
    `date` wins.
    """
    when = pd.to_datetime(date)
    # ISO-formatted date strings sort chronologically, so a plain string sort is enough
    period_starts = sorted(party_in_power, reverse=True)
    return next(
        (party_in_power[start] for start in period_starts if when >= pd.to_datetime(start)),
        None,
    )

In [71]:
# Adds 'party_in_power' by looking up each poll's start date.
# Needs the 'startdate' column, which the later column reorder does not keep:
# re-running this cell after that reorder raises KeyError: 'startdate'.
combined_df['party_in_power'] = combined_df['startdate'].map(get_party_in_power)


KeyError: 'startdate'

In [57]:
# Displays the combined frame for a visual check
combined_df

Unnamed: 0,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,months_to_election,weight,poll_length,party_in_power,...,CON_ACT,LAB_ACT,LIB_ACT,BRX_ACT,GRE_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE,months_to_elec_weight
0,2004-01-04,Populus,566,D+,2005-05-05,489,16,0.310800,2,Labour,...,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838,0.310800
1,2004-01-18,ICM,1007,D+,2005-05-05,475,16,0.310800,2,Labour,...,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838,0.310800
2,2004-02-08,Populus,580,D+,2005-05-05,454,15,0.325548,2,Labour,...,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838,0.325548
3,2004-02-22,ICM,1006,D+,2005-05-05,440,15,0.325548,2,Labour,...,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838,0.325548
4,2004-03-07,Populus,573,D+,2005-05-05,426,14,0.341247,2,Labour,...,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838,0.341247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,2019-12-11,IpsosMORI,2213,A-,2019-12-12,3,0,1.000000,2,Conservative,...,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837,1.000000
3256,2019-12-11,Kantar,2815,B+,2019-12-12,3,0,1.000000,2,Conservative,...,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837,1.000000
3257,2019-12-11,Opinium,3005,A-,2019-12-12,2,0,1.000000,1,Conservative,...,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837,1.000000
3258,2019-12-11,Panelbase,3174,A-,2019-12-12,2,0,1.000000,1,Conservative,...,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837,1.000000


In [72]:
# Final column layout: poll metadata, then forecast (_FC) columns, then actual (_ACT) columns.
# NOTE(review): 'months_to_election' is not created in any visible cell (the computed
# column is 'months_to_elec'); presumably it comes from gb_polls.csv. Confirm, because
# otherwise selecting these columns raises KeyError on a fresh run.
order = (
    [
        'enddate', 'pollster', 'samplesize', 'rating', 'next_elec_date',
        'days_to_elec', 'months_to_election', 'months_to_elec_weight',
        'poll_length', 'party_in_power',
    ]
    + [f'{party}_FC' for party in ('CON', 'LAB', 'LIB', 'BRX', 'GRE', 'OTH', 'PLC', 'SNP', 'UKI')]
    + [f'{party}_ACT' for party in ('CON', 'LAB', 'LIB', 'BRX', 'GRE', 'PLC', 'SNP', 'UKI')]
    + ['OTH_PERCENTAGE']
)

In [73]:
# Keeps only the columns listed in `order`, in that order. Columns not listed
# (e.g. 'startdate') are dropped, and combined_df is rebound to the new frame.
combined_df = combined_df.loc[:, order]

In [74]:
# Lists the column names to confirm the final layout
combined_df.columns

Index(['enddate', 'pollster', 'samplesize', 'rating', 'next_elec_date',
       'days_to_elec', 'months_to_election', 'months_to_elec_weight',
       'poll_length', 'party_in_power', 'CON_FC', 'LAB_FC', 'LIB_FC', 'BRX_FC',
       'GRE_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC', 'CON_ACT', 'LAB_ACT',
       'LIB_ACT', 'BRX_ACT', 'GRE_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_PERCENTAGE'],
      dtype='object')

In [62]:
# Displays the combined frame for a final visual check
combined_df

Unnamed: 0,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,months_to_election,months_to_elec_weight,poll_length,party_in_power,...,UKI_FC,CON_ACT,LAB_ACT,LIB_ACT,BRX_ACT,GRE_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE
0,2004-01-04,Populus,566,D+,2005-05-05,489,16,0.310800,2,Labour,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
1,2004-01-18,ICM,1007,D+,2005-05-05,475,16,0.310800,2,Labour,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
2,2004-02-08,Populus,580,D+,2005-05-05,454,15,0.325548,2,Labour,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
3,2004-02-22,ICM,1006,D+,2005-05-05,440,15,0.325548,2,Labour,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
4,2004-03-07,Populus,573,D+,2005-05-05,426,14,0.341247,2,Labour,...,,0.323596,0.351872,0.220256,0.000000,0.009491,0.006440,0.015186,0.022322,0.050838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,2019-12-11,IpsosMORI,2213,A-,2019-12-12,3,0,1.000000,2,Conservative,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837
3256,2019-12-11,Kantar,2815,B+,2019-12-12,3,0,1.000000,2,Conservative,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837
3257,2019-12-11,Opinium,3005,A-,2019-12-12,2,0,1.000000,1,Conservative,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837
3258,2019-12-11,Panelbase,3174,A-,2019-12-12,2,0,1.000000,1,Conservative,...,,0.436317,0.320809,0.115477,0.020127,0.026120,0.004788,0.038812,0.000713,0.036837


In [75]:
# Option 1: saves the combined polling-and-results frame for the 2004-2019 data to CSV.
# NOTE(review): index=True writes the row index as an extra leading column, whereas the
# commented-out Option 2 save uses index=False. Confirm the difference is intended.
combined_df.to_csv('../processed_data/2004_to_2019_combined_clean_polling_and_results.csv', index=True)

In [None]:
#Option 2: Saves to CSV for post 2019
# combined_df.to_csv('../processed_data/2019_to_2024_combined_clean_polling_and_results.csv', index=False)