In [2]:
# Imports using Sklearn make shortcut functions
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
data = pd.read_csv('/Users/nieksonneveld/code/nieksonneveld/election-predictor/raw_data/clean_polling_combined_with_national_results (1).csv')

In [21]:
data.head()

Unnamed: 0,startdate,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,BRX_FC,CON_FC,GRE_FC,...,UKI_FC,BRX_ACT,CON_ACT,GRE_ACT,LIB_ACT,LAB_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE
0,2004-01-02,2004-01-04,Populus,566,D+,2005-05-05,489,,35.0,,...,,0.0,32.359595,0.94909,22.025555,35.187187,0.64403,1.51862,2.232152,5.083771
1,2004-01-16,2004-01-18,ICM,1007,D+,2005-05-05,475,,34.0,,...,,0.0,32.359595,0.94909,22.025555,35.187187,0.64403,1.51862,2.232152,5.083771
2,2004-02-06,2004-02-08,Populus,580,D+,2005-05-05,454,,31.0,,...,,0.0,32.359595,0.94909,22.025555,35.187187,0.64403,1.51862,2.232152,5.083771
3,2004-02-20,2004-02-22,ICM,1006,D+,2005-05-05,440,,34.0,,...,,0.0,32.359595,0.94909,22.025555,35.187187,0.64403,1.51862,2.232152,5.083771
4,2004-03-05,2004-03-07,Populus,573,D+,2005-05-05,426,,34.0,,...,,0.0,32.359595,0.94909,22.025555,35.187187,0.64403,1.51862,2.232152,5.083771


In [22]:
data.columns

Index(['startdate', 'enddate', 'pollster', 'samplesize', 'rating',
       'next_elec_date', 'days_to_elec', 'BRX_FC', 'CON_FC', 'GRE_FC',
       'LAB_FC', 'LIB_FC', 'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC',
       'BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT',
       'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE'],
      dtype='object')

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3260 entries, 0 to 3259
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   startdate       3260 non-null   object 
 1   enddate         3260 non-null   object 
 2   pollster        3260 non-null   object 
 3   samplesize      3260 non-null   int64  
 4   rating          3260 non-null   object 
 5   next_elec_date  3260 non-null   object 
 6   days_to_elec    3260 non-null   int64  
 7   BRX_FC          187 non-null    float64
 8   CON_FC          3260 non-null   float64
 9   GRE_FC          1282 non-null   float64
 10  LAB_FC          3260 non-null   float64
 11  LIB_FC          3260 non-null   float64
 12  NAT_FC          78 non-null     float64
 13  OTH_FC          3228 non-null   float64
 14  PLC_FC          336 non-null    float64
 15  SNP_FC          617 non-null    float64
 16  UKI_FC          2528 non-null   float64
 17  BRX_ACT         3260 non-null   f

In [6]:
# Replace NaN with 0 values for _FC and _ACT columns
data = data.fillna(0)

In [7]:
# Add a field 'poll length' that shows number of days the poll was held for
data['poll_length'] = pd.to_datetime(data.enddate) - pd.to_datetime(data.startdate)
data['poll_length'] = data['poll_length'].dt.days

In [8]:
# Divide forecasts by 100 to create values between 0-1
for column in ['BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC', 'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC']:
    data[column] = data[column] / 100

In [9]:
# Divide actuals by 100 to create values between 0-1
for column in ['BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE']:
    data[column] = data[column] / 100

In [10]:
# Drop columns we don't need
data = data.drop(columns=['startdate', 'enddate', 'pollster'])

In [11]:
#TODO Build num transformer

num_col = ['samplesize', 'days_to_elec', 'poll_length']
num_transformer = MinMaxScaler()

In [12]:
data.rating.value_counts()

rating
A-    2130
D+     613
B+     281
D       65
F       56
D-      52
B       49
C-      14
Name: count, dtype: int64

In [13]:
#TODO Build cat transformer (encoding and imputing etc)

cat_col = ['rating']
cat_transformer = OrdinalEncoder(categories = [['F','D-','D','D+','C-','B','B+','A-']])

In [14]:
#TODO Build any required column transformers (use column selectors if necessary)

preproc_mvp = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    remainder='passthrough',
    verbose_feature_names_out=False
)

In [15]:
# Fit preproc_mvp to data

data_processed = preproc_mvp.fit_transform(data)
data_processed = pd.DataFrame(data_processed, columns=preproc_mvp.get_feature_names_out())

In [16]:
data_processed.head()

Unnamed: 0,samplesize,days_to_elec,poll_length,rating,next_elec_date,BRX_FC,CON_FC,GRE_FC,LAB_FC,LIB_FC,...,UKI_FC,BRX_ACT,CON_ACT,GRE_ACT,LIB_ACT,LAB_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE
0,0.000343,0.268132,0.91067,3.0,2005-05-05,0.0,0.35,0.0,0.4,0.18,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
1,0.004539,0.26044,0.91067,3.0,2005-05-05,0.0,0.34,0.0,0.39,0.2,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
2,0.000476,0.248901,0.91067,3.0,2005-05-05,0.0,0.31,0.0,0.36,0.25,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
3,0.00453,0.241209,0.91067,3.0,2005-05-05,0.0,0.34,0.0,0.36,0.21,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
4,0.000409,0.233516,0.91067,3.0,2005-05-05,0.0,0.34,0.0,0.36,0.22,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838


In [18]:
data_processed.describe()

Unnamed: 0,samplesize,days_to_elec,poll_length,rating,next_elec_date,BRX_FC,CON_FC,GRE_FC,LAB_FC,LIB_FC,...,UKI_FC,BRX_ACT,CON_ACT,GRE_ACT,LIB_ACT,LAB_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE
count,3260.0,3260.0,3260.0,3260.0,3260,3260.0,3260.0,3260.0,3260.0,3260.0,...,3260.0,3260.0,3260.0,3260.0,3260.0,3260.0,3260.0,3260.0,3260.0,3260.0
unique,1034.0,1507.0,21.0,8.0,5,27.0,35.0,17.0,29.0,31.0,...,26.0,2.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
top,0.004473,0.002747,0.908189,7.0,2015-05-07,0.0,0.33,0.0,0.38,0.09,...,0.0,0.0,0.36811,0.03621,0.078705,0.304506,0.005919,0.047382,0.126435,0.032733
freq,70.0,19.0,1962.0,2130.0,1933,3073.0,365.0,1978.0,257.0,577.0,...,782.0,2820.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0,1933.0


In [None]:
#TODO Build union of of transformers into full preprocessor pipeline

# not sure we need this

In [None]:
#TODO Export pipeline as a Pickle file for use in the API