In [29]:
# Imports using Sklearn make shortcut functions
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
import pandas as pd
import numpy as np

# Imports
import os
from google.cloud import bigquery

In [52]:
# Configuration read from the process environment.
# Every variable is required: a missing one raises KeyError right here
# instead of surfacing later as a confusing BigQuery error.
(
    gcp_project_id,
    gcp_service_account_key,
    bq_source_dataset,
    bq_cleaned_dataset,
) = (
    os.environ[env_name]
    for env_name in (
        'GCP_PROJECT',
        'GCP_SERVICE_ACCOUNT_KEY',
        'BQ_SOURCE_DATASET',
        'BQ_CLEANED_DATASET',
    )
)

In [69]:
# Import data from BQ
# Build the BigQuery client directly from the service-account JSON key file.
# from_service_account_json is a classmethod, so it is called on the class:
# calling it on an instance first builds a throwaway client from ambient
# default credentials (which fails when none are configured) and then ignores
# the project given to that instance. Passing project= keeps the client
# pinned to gcp_project_id.
bq_client = bigquery.Client.from_service_account_json(
    gcp_service_account_key,  # used as the path to the JSON key file
    project=gcp_project_id,
)

# Table to read from the cleaned dataset (the query below uses bq_cleaned_dataset)
select_table = 'cleaned_full_polls_combined_national_results_2004_2019'

# SQL that fetches every row and column of the selected table.
# The table reference is fully qualified as `project.dataset.table`.
query = (
    "\n"
    "    SELECT *\n"
    f"    FROM `{gcp_project_id}.{bq_cleaned_dataset}.{select_table}`\n"
)

In [79]:
# Submit the query through the BQ client, then materialise the result as a DataFrame
query_job = bq_client.query(query)
data = query_job.to_dataframe()

In [47]:
# # Logic for dealing with specific NaN values in specified columns

# # Select column names including _FC and _ACT for NaN searching
# nan_search_columns = list(data.filter(regex='_FC|_ACT', axis=1).columns)

# if data.isna().any().any():
#     # Replace NaN with 0 values for _FC and _ACT columns
#     data.loc[:, nan_search_columns] = data[nan_search_columns].fillna(0)

# if data.isna().any().any():
#     raise(ValueError('NaN values outside of _FC and _ACT columns still present in the dataset'))

In [80]:
# Swap every NaN in the frame for 0 (rebinding `data` rather than mutating in place).
# NOTE(review): this applies to all columns, not only the numeric _FC/_ACT
# ones — confirm that a 0 in non-numeric columns (e.g. rating) is intended.
data = data.replace(to_replace=np.nan, value=0)

In [81]:
# Add 'poll_length': the whole number of days between a poll's start and end dates
data['poll_length'] = (
    pd.to_datetime(data['enddate']) - pd.to_datetime(data['startdate'])
).dt.days

In [82]:
# Rescale the forecast columns by 1/100 so percentages become 0-1 fractions.
# NOTE: not idempotent — re-running this cell divides the values by 100 again.
forecast_columns = [
    'BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC',
    'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC',
]
data[forecast_columns] = data[forecast_columns].div(100)

In [83]:
# Rescale the actual-result columns by 1/100 so percentages become 0-1 fractions.
# NOTE: not idempotent — re-running this cell divides the values by 100 again.
actual_columns = [
    'BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT',
    'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE',
]
data[actual_columns] = data[actual_columns].div(100)

In [99]:
# Discard the columns that are not needed by the preprocessing pipeline below
data = data.drop(['startdate', 'enddate', 'pollster'], axis='columns')

In [100]:
#TODO Build num transformer – refactor transformer
# Numeric columns that are min-max scaled by the pipeline below
num_col = [
    'samplesize',
    'days_to_elec',
    'poll_length',
]
# Single-step pipeline: MinMaxScaler with its default settings
num_transformer = make_pipeline(
    MinMaxScaler(),
)

In [101]:
# Show how many rows carry each rating value
data['rating'].value_counts()

rating
A-    2130
D+     613
B+     281
D       65
F       56
D-      52
B       49
C-      14
Name: count, dtype: int64

In [102]:
#TODO Build cat transformer (encoding and imputing etc) – refactor transformer

# Categorical column to encode. The explicit category list fixes the ordinal
# order: 'F' is encoded as the lowest value and 'A-' as the highest.
cat_col = ['rating']
cat_transformer = make_pipeline(
    OrdinalEncoder(
        categories=[
            ['F', 'D-', 'D', 'D+', 'C-', 'B', 'B+', 'A-'],
        ],
    ),
)

In [103]:
#TODO Check column transformer
# Combine both transformers into one preprocessor:
#   - num_col columns go through num_transformer
#   - cat_col columns go through cat_transformer
#   - all remaining columns are passed through unchanged
# verbose_feature_names_out=False keeps the original column names, without
# transformer-name prefixes.
preproc_mvp = make_column_transformer(
    (num_transformer, num_col),
    (cat_transformer, cat_col),
    verbose_feature_names_out=False,
    remainder='passthrough',
)

In [104]:
# Fit preproc_mvp to data and rebuild a labelled DataFrame from its output.
# fit_transform is evaluated before get_feature_names_out (arguments are
# evaluated left to right), so the transformer is fitted when the names are read.
#
# Because remainder='passthrough' carries mixed-type columns through, the
# transformed output comes back as one generic object-typed block.
# infer_objects() restores proper numeric dtypes where every value in a column
# allows it; columns that cannot be inferred stay as object.
# index=data.index keeps the processed rows aligned with the input rows.
data_processed = pd.DataFrame(
    preproc_mvp.fit_transform(data),
    columns=preproc_mvp.get_feature_names_out(),
    index=data.index,
).infer_objects()

In [83]:
# Preview the first five processed rows to sanity-check the feature names
data_processed.head(n=5)

Unnamed: 0,samplesize,rating,next_elec_date,days_to_elec,BRX_FC,CON_FC,GRE_FC,LAB_FC,LIB_FC,NAT_FC,...,BRX_ACT,CON_ACT,GRE_ACT,LIB_ACT,LAB_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE,poll_length
0,0.000343,0.268132,0.91067,3.0,2005-05-05,0.0,0.35,0.0,0.4,0.18,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
1,0.004539,0.26044,0.91067,3.0,2005-05-05,0.0,0.34,0.0,0.39,0.2,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
2,0.000476,0.248901,0.91067,3.0,2005-05-05,0.0,0.31,0.0,0.36,0.25,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
3,0.00453,0.241209,0.91067,3.0,2005-05-05,0.0,0.34,0.0,0.36,0.21,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838
4,0.000409,0.233516,0.91067,3.0,2005-05-05,0.0,0.34,0.0,0.36,0.22,...,0.0,0.0,0.323596,0.009491,0.220256,0.351872,0.00644,0.015186,0.022322,0.050838


In [None]:
#TODO Build union of transformers into full preprocessor pipeline

# not sure we need this

In [None]:
#TODO Export pipeline as a Pickle file for use in the API