In [1]:
import pandas as pd

In [21]:
# Loads the cleaned election results and the GB polling data from processed_data
# so the two can be merged
results_csv = '../processed_data/2004_to_2019_results_clean.csv'
polls_csv = '../processed_data/gb_polls.csv'
election_results_df = pd.read_csv(results_csv)
gb_polls = pd.read_csv(polls_csv)

In [22]:
# Prepares the results frame for the merge: 'Year' becomes 'election_year'
# and its values are cast to int
election_results_df = (
    election_results_df
    .rename(columns={'Year': 'election_year'})
    .astype({'election_year': int})
)


In [23]:
# Option 1: keeps polls whose election_year is 2020 or earlier
# (the data set running up to the 2019 election)
is_up_to_2020 = gb_polls['election_year'] <= 2020
filtered_gb_polls_df = gb_polls[is_up_to_2020]


In [24]:
# Option 2 (disabled): filters election year to 2020 onwards; uncomment in place of Option 1
# filtered_gb_polls_df = gb_polls[gb_polls['election_year'] >= 2020]


In [25]:
# Attaches the actual election results to every poll of the same election year.
# The left join keeps every poll row even when no result exists for its year.
# validate='many_to_one' makes pandas raise a MergeError if the results frame
# holds more than one row per election_year, which would otherwise silently
# duplicate poll rows.
gb_polls_actuals = filtered_gb_polls_df.merge(
    election_results_df,
    on='election_year',
    how='left',
    validate='many_to_one',
)

In [26]:
# Column rename map: poll forecast columns gain an _FC suffix and the actual
# result percentages are shortened to <PARTY>_ACT
poll_party_codes = ['BRX', 'CON', 'GRE', 'LAB', 'LIB', 'OTH', 'PLC', 'SNP', 'UKI']
forecast_renames = {code: code + '_FC' for code in poll_party_codes}

# Written out in full because the Labour source column is spelled 'LABOUR', not 'LAB'.
# NOTE(review): 'OTH_PERCENTAGE' has no entry here and keeps its original name
# in the displayed output -- confirm that is intended.
actual_renames = {
    'BRX_ACTUAL_PERCENTAGE': 'BRX_ACT',
    'CON_ACTUAL_PERCENTAGE': 'CON_ACT',
    'GRE_ACTUAL_PERCENTAGE': 'GRE_ACT',
    'LIB_ACTUAL_PERCENTAGE': 'LIB_ACT',
    'LABOUR_ACTUAL_PERCENTAGE': 'LAB_ACT',
    'PLC_ACTUAL_PERCENTAGE': 'PLC_ACT',
    'SNP_ACTUAL_PERCENTAGE': 'SNP_ACT',
    'UKI_ACTUAL_PERCENTAGE': 'UKI_ACT',
}

rename_columns = {**forecast_renames, **actual_renames}

In [27]:
# Applies the _FC / _ACT naming scheme to the merged frame
gb_polls_actuals = gb_polls_actuals.rename(columns=rename_columns)


In [28]:
# Discards the 'Country' column, which is treated as irrelevant here
gb_polls_actuals = gb_polls_actuals.drop(columns='Country')

In [29]:
# Continues under a clearer name. A copy is taken instead of an alias so the
# in-place edits applied to combined_df afterwards do not also alter
# gb_polls_actuals; that keeps the cells from here on re-runnable.
combined_df = gb_polls_actuals.copy()

In [30]:
# Shortens three long column names to save space
combined_df.rename(
    columns={
        'pollster_rating': 'rating',
        'next_election_date': 'next_elec_date',
        'days_until_next_election': 'days_to_elec',
    },
    inplace=True,
)

In [31]:
# Drops the remaining columns that are not wanted in the output.
# Both labels are removed in a single call so the frame is never left
# half-modified if one of them is missing.
combined_df.drop(columns=['Geography', 'election_year'], inplace=True)

In [36]:
# Displays the finished frame for a visual check
combined_df

Unnamed: 0,startdate,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,CON_FC,LAB_FC,LIB_FC,...,party_in_power,CON_ACT,LAB_ACT,LIB_ACT,BRX_ACT,GRE_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE
0,2004-01-02,2004-01-04,Populus,566,D+,2005-05-05,489,35.0,40.0,18.0,...,Labour,32.359595,35.187187,22.025555,0.000000,0.94909,0.644030,1.518620,2.232152,5.083771
1,2004-01-16,2004-01-18,ICM,1007,D+,2005-05-05,475,34.0,39.0,20.0,...,Labour,32.359595,35.187187,22.025555,0.000000,0.94909,0.644030,1.518620,2.232152,5.083771
2,2004-02-06,2004-02-08,Populus,580,D+,2005-05-05,454,31.0,36.0,25.0,...,Labour,32.359595,35.187187,22.025555,0.000000,0.94909,0.644030,1.518620,2.232152,5.083771
3,2004-02-20,2004-02-22,ICM,1006,D+,2005-05-05,440,34.0,36.0,21.0,...,Labour,32.359595,35.187187,22.025555,0.000000,0.94909,0.644030,1.518620,2.232152,5.083771
4,2004-03-05,2004-03-07,Populus,573,D+,2005-05-05,426,34.0,36.0,22.0,...,Labour,32.359595,35.187187,22.025555,0.000000,0.94909,0.644030,1.518620,2.232152,5.083771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3255,2019-12-09,2019-12-11,IpsosMORI,2213,A-,2019-12-12,3,44.0,33.0,12.0,...,Conservative,43.631688,32.080872,11.547741,2.012681,2.61203,0.478805,3.881238,0.071281,3.683662
3256,2019-12-09,2019-12-11,Kantar,2815,B+,2019-12-12,3,44.0,32.0,13.0,...,Conservative,43.631688,32.080872,11.547741,2.012681,2.61203,0.478805,3.881238,0.071281,3.683662
3257,2019-12-10,2019-12-11,Opinium,3005,A-,2019-12-12,2,45.0,33.0,12.0,...,Conservative,43.631688,32.080872,11.547741,2.012681,2.61203,0.478805,3.881238,0.071281,3.683662
3258,2019-12-10,2019-12-11,Panelbase,3174,A-,2019-12-12,2,43.0,34.0,11.0,...,Conservative,43.631688,32.080872,11.547741,2.012681,2.61203,0.478805,3.881238,0.071281,3.683662


In [34]:
# Writes the combined polls-and-results frame to CSV without the index column.
# With the Option 1 filter active the frame holds the 2004-2019 data, so it is
# saved under a matching name rather than the 2019_to_2024 file; switch the
# name back if the Option 2 filter is enabled instead.
combined_df.to_csv('../processed_data/2004_to_2019_combined_clean_polling_and_results.csv', index=False)