In [97]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
import pandas as pd
import numpy as np

# Imports
import os
from google.cloud import bigquery

In [98]:
# Configuration read from the environment; a missing variable raises KeyError.
(
    gcp_project_id,
    gcp_service_account_key,
    bq_source_dataset,
    bq_cleaned_dataset,
) = (
    os.environ['GCP_PROJECT'],
    os.environ['GCP_SERVICE_ACCOUNT_KEY'],
    os.environ['BQ_SOURCE_DATASET'],
    os.environ['BQ_CLEANED_DATASET'],
)

In [99]:
# Initialize a BigQuery client from the service account JSON key file.
# `from_service_account_json` is a classmethod constructor, so it is called on
# the class itself: instantiating `bigquery.Client(project=...)` first would
# build a throwaway client from Application Default Credentials (which can fail
# when none are configured) and the `project` given to it would not carry over
# to the client that is actually returned.
bq_client = bigquery.Client.from_service_account_json(
    gcp_service_account_key,
    project=gcp_project_id,
)

# Table to read from the cleaned dataset
select_table = 'cleaned_full_polls_combined_national_results_2004_2019'

# Fully-qualified `project.dataset.table` identifier for the query below
full_table_id = f"{gcp_project_id}.{bq_cleaned_dataset}.{select_table}"

# SQL that fetches every row and column of the selected table
query = f"""
    SELECT *
    FROM `{full_table_id}`
"""

In [100]:
# Run the query through the BQ client, then materialise the result as a DataFrame
query_job = bq_client.query(query)
data = query_job.to_dataframe()

In [101]:
# # Logic for dealing with specific NaN values in specified columns

# # Select column names including _FC and _ACT for NaN searching
# nan_search_columns = list(data.filter(regex='_FC|_ACT', axis=1).columns)

# if data.isna().any().any():
#     # Replace NaN with 0 values for _FC and _ACT columns
#     data.loc[:, nan_search_columns] = data[nan_search_columns].fillna(0)

# if data.isna().any().any():
#     raise(ValueError('NaN values outside of _FC and _ACT columns still present in the dataset'))

In [102]:
# Swap every NaN for 0. This is applied to the whole frame (all columns),
# not only to the forecast/actual columns.
data = data.replace(np.nan, 0)

In [103]:
# Derive 'poll_length': whole days between a poll's start date and end date
data['poll_length'] = (
    pd.to_datetime(data['enddate']) - pd.to_datetime(data['startdate'])
).dt.days

In [104]:
# Rescale the forecast columns by a factor of 100 (percentages -> 0-1 fractions)
forecast_columns = [
    'BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC',
    'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC',
]
for fc_col in forecast_columns:
    data[fc_col] = data[fc_col].div(100)

In [105]:
# Rescale the actual-result columns by a factor of 100 (percentages -> 0-1 fractions)
# NOTE(review): the frame also has a NAT_ACT column that is not rescaled here,
# while NAT_FC is rescaled with the forecasts - confirm NAT_ACT is already a fraction.
actual_columns = [
    'BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT',
    'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE',
]
for act_col in actual_columns:
    data[act_col] = data[act_col].div(100)

In [106]:
# Remove the raw date and pollster columns, which are not used past this point
unused_columns = ['startdate', 'enddate', 'pollster']
data = data.drop(unused_columns, axis='columns')

In [107]:
# Numerical features: min-max scaled by the preprocessing pipeline
num_transformer = MinMaxScaler()
num_columns_selector = [
    'samplesize',
    'days_to_elec',
    'poll_length',
]

In [108]:
# Categorical feature: the 'rating' grade is ordinal-encoded using the explicit
# grade order below ('F' is position 0, 'A-' is position 7).
# NOTE(review): the scale omits grades such as C, C+, B-, A and A+ -
# presumably these do not occur in the data; confirm.
rating_order = ['F', 'D-', 'D', 'D+', 'C-', 'B', 'B+', 'A-']
cat_columns_selector = ['rating']
cat_transformer = OrdinalEncoder(categories=[rating_order])

In [109]:
# Assemble the preprocessing pipeline: each (transformer, columns) pair is
# applied to its own columns; every other column is passed through unchanged
# and output feature names are not prefixed with the transformer name.
preproc_steps = [
    (num_transformer, num_columns_selector),
    (cat_transformer, cat_columns_selector),
]
preproc_pipeline = make_column_transformer(
    *preproc_steps,
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [110]:
# Fit the preprocessing pipeline on `data` and transform it in a single step.
# The result is rebound to a DataFrame with proper column names in the next cell.
data_processed = preproc_pipeline.fit_transform(data)

In [111]:
# Re-wrap the transformed output in a DataFrame labelled with the pipeline's
# output feature names, then preview the first rows to sanity-check them
feature_names = preproc_pipeline.get_feature_names_out()
data_processed = pd.DataFrame(data_processed, columns=feature_names)

data_processed.head()

Unnamed: 0,samplesize,days_to_elec,poll_length,rating,next_elec_date,BRX_FC,CON_FC,GRE_FC,LAB_FC,LIB_FC,...,NAT_ACT,BRX_ACT,CON_ACT,GRE_ACT,LIB_ACT,LAB_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE
0,0.012657,0.026923,0.908189,3.0,2017-06-08,0.0,0.44,0.04,0.29,0.08,...,0.014415,0.0,0.423427,0.015909,0.073654,0.399893,0.005107,0.030356,0.018447,0.033062
1,0.014874,0.023077,0.908189,3.0,2017-06-08,0.0,0.42,0.04,0.31,0.1,...,0.014415,0.0,0.423427,0.015909,0.073654,0.399893,0.005107,0.030356,0.018447,0.033062
2,0.009707,0.019231,0.908189,3.0,2017-06-08,0.0,0.46,0.03,0.31,0.09,...,0.014415,0.0,0.423427,0.015909,0.073654,0.399893,0.005107,0.030356,0.018447,0.033062
3,0.009307,0.015385,0.908189,3.0,2017-06-08,0.0,0.46,0.04,0.32,0.08,...,0.014415,0.0,0.423427,0.015909,0.073654,0.399893,0.005107,0.030356,0.018447,0.033062
4,0.009716,0.011538,0.908189,3.0,2017-06-08,0.0,0.46,0.02,0.34,0.07,...,0.014415,0.0,0.423427,0.015909,0.073654,0.399893,0.005107,0.030356,0.018447,0.033062
