In [1]:
import pandas as pd
import numpy as np

In [2]:
# Loads the polling and election-results CSVs from processed_data for merging
gb_polls = pd.read_csv('../processed_data/gb_polls.csv')
election_results_df = pd.read_csv('../processed_data/1988_to_2019_results_clean.csv')

In [3]:
# Copies 'Year' into a new 'election_year' column (the merge key used later);
# the original 'Year' column is kept
election_results_df = election_results_df.assign(election_year=election_results_df['Year'])

In [4]:
# Takes the first four characters of next_elec_date as the election year (still text here)
gb_polls['election_year'] = gb_polls['next_elec_date'].str.slice(stop=4)

In [5]:
# Casts election_year from text to int ahead of the merge
gb_polls = gb_polls.astype({'election_year': int})

In [6]:
# Option 1: keeps only polls for elections up to and including 2019 (election_year <= 2020)
# filtered_gb_polls_df = gb_polls[gb_polls['election_year'] <= 2020]

In [7]:
# Option 2: filters election year to post 2019
# filtered_gb_polls_df = gb_polls[gb_polls['election_year'] >= 2020]

In [8]:
# Option 3: left join that keeps every poll; polls whose election_year has no
# matching row in the results frame get missing values in the results columns
gb_polls_actuals = pd.merge(gb_polls, election_results_df, how='left', on='election_year')

In [9]:
# Column rename mapping: forecast (poll) columns get an _FC suffix,
# actual-result columns get an _ACT suffix
forecast_renames = {
    'BRX': 'BRX_FC',
    'CON': 'CON_FC',
    'GRE': 'GRE_FC',
    'LAB': 'LAB_FC',
    'LIB': 'LIB_FC',
    'OTH': 'OTH_FC',
    'PLC': 'PLC_FC',
    'REF': 'REF_FC',
    'SNP': 'SNP_FC',
    'UKI': 'UKI_FC',
}
actual_renames = {
    'BRX_ACTUAL_PERCENTAGE': 'BRX_ACT',
    'CON_ACTUAL_PERCENTAGE': 'CON_ACT',
    'GRE_ACTUAL_PERCENTAGE': 'GRE_ACT',
    'LIB_ACTUAL_PERCENTAGE': 'LIB_ACT',
    # Note the source column spells the party out in full: LABOUR, not LAB
    'LABOUR_ACTUAL_PERCENTAGE': 'LAB_ACT',
    'PLC_ACTUAL_PERCENTAGE': 'PLC_ACT',
    'REF_ACTUAL_PERCENTAGE': 'REF_ACT',
    'SNP_ACTUAL_PERCENTAGE': 'SNP_ACT',
    'UKI_ACTUAL_PERCENTAGE': 'UKI_ACT',
    'OTH_ACTUAL_PERCENTAGE': 'OTH_ACT',
}
rename_columns = {**forecast_renames, **actual_renames}

In [10]:
# Applies the rename mapping so forecast and actual columns share one naming scheme
gb_polls_actuals = gb_polls_actuals.rename(columns=rename_columns)

In [11]:
# Removes the 'Country' column, which is not referenced again in this notebook
gb_polls_actuals = gb_polls_actuals.drop(columns=['Country'])

In [12]:
# Gives the merged frame a shorter working name.
# Note: this binds a second name to the same DataFrame object; no copy is made.
combined_df = gb_polls_actuals

In [13]:
# Shortens verbose column names in one pass; labels that are not present in the
# frame are simply left untouched by rename
combined_df.rename(
    columns={
        'pollster_rating': 'rating',
        'next_election_date': 'next_elec_date',
        'days_until_next_election': 'days_to_elec',
    },
    inplace=True,
)

In [14]:
# Removes the 'Geography' column from the working frame
combined_df.drop('Geography', axis='columns', inplace=True)


In [15]:
# Adds 'poll_length': the whole number of days between a poll's start and end dates
poll_span = pd.to_datetime(combined_df['enddate']) - pd.to_datetime(combined_df['startdate'])
combined_df['poll_length'] = poll_span.dt.days

In [16]:
# Cleans a single poll length: negative values become 0 and a zero length becomes 1
def adjust_poll_length(days):
    """Return a cleaned poll length in days.

    A negative input gives 0, an input of exactly 0 gives 1, and any other
    value is returned unchanged.
    """
    if days == 0:
        return 1
    return 0 if days < 0 else days

In [17]:
# Cleans every poll_length value with adjust_poll_length
combined_df['poll_length'] = combined_df['poll_length'].map(adjust_poll_length)

In [18]:
# Parses the start, end and next-election date columns into pandas datetimes
for date_column in ('startdate', 'enddate', 'next_elec_date'):
    combined_df[date_column] = pd.to_datetime(combined_df[date_column])

In [19]:
# Rescales the forecast columns by dividing by 100 (percentages -> 0-1 fractions)
forecast_columns = ['BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC', 'OTH_FC', 'PLC_FC', 'REF_FC', 'SNP_FC', 'UKI_FC']
combined_df[forecast_columns] = combined_df[forecast_columns] / 100

In [20]:
# Rescales the actual-result columns by dividing by 100 (percentages -> 0-1 fractions)
actual_columns = ['BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'REF_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_ACT']
combined_df[actual_columns] = combined_df[actual_columns].div(100)

In [21]:
# Calendar-month difference between two dates (the day of the month is ignored)
def calculate_month_diff(d1, d2):
    """Return the number of calendar months from d1 to d2.

    Only the `.year` and `.month` attributes of the arguments are used, so the
    result is negative when d2 falls in an earlier month than d1.
    """
    year_gap = d2.year - d1.year
    month_gap = d2.month - d1.month
    return 12 * year_gap + month_gap

In [22]:
# Computes, per poll, the calendar months from the poll's end date to the next election
months_until_election = combined_df.apply(
    lambda poll: calculate_month_diff(poll['enddate'], poll['next_elec_date']),
    axis='columns',
)
combined_df['months_to_elec'] = months_until_election

In [23]:
# Function to calculate weight
def calculate_weight(months, max_months=60):
    """Return a recency weight for a poll taken `months` before the election.

    The weight is 1 - log1p(months) / log1p(max_months), floored at 0: it is 1
    at zero months, falls off logarithmically, and is 0 at or beyond
    `max_months`.

    Parameters:
        months: Number of months between the poll and the election. Negative
            values are treated as 0.
        max_months: Horizon at which the weight reaches 0 (default 60).

    Returns:
        A weight between 0 and 1 (for a positive `max_months`).
    """
    # A negative gap would push the weight above 1, and exactly -1 hits the
    # singularity of log1p (giving inf), so clamp it to zero months.
    if months < 0:
        months = 0
    return max(0, 1 - np.log1p(months) / np.log1p(max_months))

In [24]:
# Derives each poll's weight from its months-to-election value
combined_df['months_to_elec_weight'] = combined_df['months_to_elec'].map(calculate_weight)

In [25]:
# Ruling periods: each key is the ISO date string from which the mapped party
# (or coalition) label applies, until the next later key takes over
party_in_power = dict([
    ('1985-01-01', 'Conservative'),
    ('1987-06-11', 'Conservative'),
    ('1992-04-09', 'Conservative'),
    ('1997-05-01', 'Labour'),
    ('2001-06-07', 'Labour'),
    ('2004-01-01', 'Labour'),
    ('2005-05-05', 'Labour'),
    ('2010-05-06', 'Conservative_Liberal'),
    ('2015-05-07', 'Conservative'),
    ('2017-06-08', 'Conservative'),
    ('2019-12-12', 'Conservative'),
])

In [26]:
# Looks up which party was in power on a given date
def get_party_in_power(date):
    """Return the party_in_power entry in force on `date`.

    The period start dates are scanned from latest to earliest and the label
    of the first one that is not after `date` is returned. Returns None when
    `date` is earlier than every start date.
    """
    moment = pd.to_datetime(date)
    latest_first = sorted(party_in_power, reverse=True)
    return next(
        (
            party_in_power[period_start]
            for period_start in latest_first
            if moment >= pd.to_datetime(period_start)
        ),
        None,
    )

In [27]:
# Labels each poll with the party in power on the poll's start date
combined_df['party_in_power'] = combined_df['startdate'].map(get_party_in_power)


In [28]:
# Shows the column labels of the working frame
combined_df.columns

Index(['Unnamed: 0_x', 'startdate', 'enddate', 'pollster', 'samplesize',
       'rating', 'next_elec_date', 'days_to_elec', 'BRX_FC', 'CON_FC',
       'GRE_FC', 'LAB_FC', 'LIB_FC', 'OTH_FC', 'PLC_FC', 'REF_FC', 'SNP_FC',
       'UKI_FC', 'election_year', 'Unnamed: 0_y', 'Year', 'BRX_ACT', 'CON_ACT',
       'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT',
       'OTH_ACT', 'REF_ACT', 'poll_length', 'months_to_elec',
       'months_to_elec_weight', 'party_in_power'],
      dtype='object')

In [37]:
# Final column order: poll details, election timing features, forecasts, then actuals
order = (
    ['startdate', 'enddate', 'pollster', 'samplesize', 'rating']
    + ['next_elec_date', 'days_to_elec', 'months_to_elec', 'months_to_elec_weight', 'poll_length']
    + ['party_in_power']
    + ['CON_FC', 'LAB_FC', 'LIB_FC', 'BRX_FC', 'GRE_FC', 'OTH_FC', 'PLC_FC', 'REF_FC', 'SNP_FC', 'UKI_FC']
    + ['CON_ACT', 'LAB_ACT', 'LIB_ACT', 'BRX_ACT', 'GRE_ACT', 'OTH_ACT', 'PLC_ACT', 'REF_ACT', 'SNP_ACT', 'UKI_ACT']
)

In [38]:
# Keeps only the columns listed in `order`, in that order; all other columns are dropped
combined_df = combined_df.loc[:, order]

In [39]:
# Displays the reordered frame for inspection
combined_df

Unnamed: 0,startdate,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,months_to_elec,months_to_elec_weight,poll_length,...,CON_ACT,LAB_ACT,LIB_ACT,BRX_ACT,GRE_ACT,OTH_ACT,PLC_ACT,REF_ACT,SNP_ACT,UKI_ACT
0,1988-01-11,1988-01-11,Marplan,1000,D-,1992-04-09,1550,51,0.038831,1,...,0.419323,0.343843,0.178082,0.0,0.004797,0.030561,0.004664,0.0,0.018731,0.0
1,1988-01-11,1988-01-11,NOP,1000,D-,1992-04-09,1550,51,0.038831,1,...,0.419323,0.343843,0.178082,0.0,0.004797,0.030561,0.004664,0.0,0.018731,0.0
2,1988-01-18,1988-01-18,Gallup,1000,D+,1992-04-09,1543,51,0.038831,1,...,0.419323,0.343843,0.178082,0.0,0.004797,0.030561,0.004664,0.0,0.018731,0.0
3,1988-01-26,1988-01-26,MORI,1000,B,1992-04-09,1535,51,0.038831,1,...,0.419323,0.343843,0.178082,0.0,0.004797,0.030561,0.004664,0.0,0.018731,0.0
4,1988-02-08,1988-02-08,NOP,1000,D-,1992-04-09,1522,50,0.043555,1,...,0.419323,0.343843,0.178082,0.0,0.004797,0.030561,0.004664,0.0,0.018731,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5729,2024-06-14,2024-06-17,RedfieldWilton,10000,D,2024-07-04,20,1,0.831387,3,...,,,,,,,,,,
5730,2024-06-18,2024-06-18,PeoplePolling,1228,D,2024-07-04,16,1,0.831387,1,...,,,,,,,,,,
5731,2024-06-07,2024-06-18,SavantaComRes,17812,B+,2024-07-04,27,1,0.831387,11,...,,,,,,,,,,
5732,2024-06-14,2024-06-18,Survation,1008,A-,2024-07-04,20,1,0.831387,4,...,,,,,,,,,,


In [40]:
# Option 1: writes the combined frame for all years to CSV.
# index=True also writes the row index as the first column of the file.
all_years_csv_path = '../processed_data/1988_to_2024_combined_clean_polling_and_results.csv'
combined_df.to_csv(all_years_csv_path, index=True)

In [None]:
#Option 2: Saves to CSV for post 2019
# combined_df.to_csv('../processed_data/2019_to_2024_combined_clean_polling_and_results.csv', index=False)