In [1]:
# Imports using Sklearn make shortcut functions
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import make_pipeline

from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

# Imports
import os
from google.cloud import bigquery

In [2]:
# Required configuration, read from the process environment.
# Any variable that is not set raises KeyError here.
(
    gcp_project_id,
    gcp_service_account_key,
    bq_source_dataset,
    bq_cleaned_dataset,
) = (
    os.environ[env_name]
    for env_name in (
        'GCP_PROJECT',
        'GCP_SERVICE_ACCOUNT_KEY',
        'BQ_SOURCE_DATASET',
        'BQ_CLEANED_DATASET',
    )
)

In [3]:
# Build the BigQuery client and the query used to load the cleaned polls table.
#
# `from_service_account_json` is a factory classmethod, so it is called on the
# class with the project passed explicitly. Calling it on an already-constructed
# `bigquery.Client(project=...)` first runs a default-credentials lookup for a
# client that is then thrown away, and the `project` given to that throwaway
# client is not carried over to the client the factory returns.
bq_client = bigquery.Client.from_service_account_json(
    gcp_service_account_key,
    project=gcp_project_id,
)

# Table to read from the cleaned dataset
select_table = 'cleaned_full_polls_combined_national_results_2004_2019'

# Fetch every row and column of that table. The project and dataset identifiers
# come from the environment variables read earlier in the notebook.
query = f"""
    SELECT *
    FROM `{gcp_project_id}.{bq_cleaned_dataset}.{select_table}`
"""

In [4]:
# Submit the query, then materialise the job's result rows as a DataFrame
query_job = bq_client.query(query)
data = query_job.to_dataframe()



In [5]:
# Swap every NaN in the frame for 0 (mutates `data` in place)
data.replace(to_replace=np.nan, value=0, inplace=True)

In [6]:
# Derive `poll_length`: whole days between each poll's start and end date.
# The value is negative for any row whose enddate precedes its startdate.
poll_start = pd.to_datetime(data['startdate'])
poll_end = pd.to_datetime(data['enddate'])
data['poll_length'] = (poll_end - poll_start).dt.days

In [7]:
# Rescale the forecast columns onto a 0-1 scale by dividing each by 100.
# Not idempotent: re-running this cell divides the columns again.
forecast_columns = [
    'BRX_FC', 'CON_FC', 'GRE_FC', 'LAB_FC', 'LIB_FC',
    'NAT_FC', 'OTH_FC', 'PLC_FC', 'SNP_FC', 'UKI_FC',
]
for fc_col in forecast_columns:
    data[fc_col] = data[fc_col].div(100)

In [8]:
# Rescale the actual-result columns onto a 0-1 scale by dividing each by 100.
# Not idempotent: re-running this cell divides the columns again.
actual_columns = [
    'BRX_ACT', 'CON_ACT', 'GRE_ACT', 'LIB_ACT', 'LAB_ACT',
    'NAT_ACT', 'PLC_ACT', 'SNP_ACT', 'UKI_ACT', 'OTH_PERCENTAGE',
]
for act_col in actual_columns:
    data[act_col] = data[act_col].div(100)

In [9]:
# Order rows by `enddate`, ascending (in place; index labels keep their old values)
data.sort_values(by='enddate', inplace=True)

In [10]:
# Renumber rows from 0 in place; the previous index is kept as an `index` column
data.reset_index(drop=False, inplace=True)

In [31]:
# Show the rows whose computed poll length is exactly -365 days
data.loc[data['poll_length'] == -365]

Unnamed: 0,index,startdate,enddate,pollster,samplesize,rating,next_elec_date,days_to_elec,BRX_FC,CON_FC,...,BRX_ACT,CON_ACT,GRE_ACT,LIB_ACT,LAB_ACT,PLC_ACT,SNP_ACT,UKI_ACT,OTH_PERCENTAGE,poll_length
2568,170,2016-05-05,2015-05-06,IpsosMORI,1186,A-,2017-06-08,399,0.0,0.36,...,0.0,0.423427,0.015909,0.073654,0.399893,0.005107,0.030356,0.018447,0.033062,-365


In [11]:
# Group the rows by the `rating` column for per-rating summaries
data_by_rating = data.groupby('rating')

In [25]:
# Display summary statistics (count, mean, std, min, quartiles, max) per rating group
data_by_rating.describe()

Unnamed: 0_level_0,index,index,index,index,index,index,index,index,samplesize,samplesize,...,OTH_PERCENTAGE,OTH_PERCENTAGE,poll_length,poll_length,poll_length,poll_length,poll_length,poll_length,poll_length,poll_length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
A-,2130.0,1634.234272,821.600943,76.0,941.25,1756.5,2288.75,3101.0,2130.0,1925.728169,...,0.036821,0.043738,2130.0,0.884507,13.742816,-365.0,1.0,1.0,1.0,22.0
B,49.0,279.714286,241.409231,7.0,266.0,278.0,290.0,1332.0,49.0,1484.163265,...,0.036821,0.036821,49.0,3.142857,1.136515,1.0,3.0,3.0,3.0,6.0
B+,281.0,1699.245552,1231.376028,69.0,663.0,1243.0,3189.0,3259.0,281.0,1465.978648,...,0.038223,0.038223,281.0,1.800712,1.036546,0.0,1.0,2.0,2.0,6.0
C-,14.0,751.214286,165.767761,700.0,704.25,707.5,710.75,1327.0,14.0,1748.0,...,0.038223,0.038223,14.0,5.071429,1.979288,1.0,5.0,5.5,6.0,8.0
D,65.0,2491.846154,917.684267,169.0,2914.0,2930.0,2946.0,2962.0,65.0,1259.061538,...,0.032678,0.038223,65.0,2.830769,1.850416,0.0,1.0,3.0,4.0,9.0
D+,613.0,1469.619902,1014.382101,0.0,742.0,1285.0,2757.0,2910.0,613.0,1673.194127,...,0.038223,0.043738,613.0,1.895595,1.935782,0.0,1.0,2.0,2.0,38.0
D-,52.0,3071.153846,402.981262,223.0,3113.75,3126.5,3139.25,3152.0,52.0,2257.769231,...,0.032678,0.033062,52.0,2.557692,2.278746,0.0,2.0,2.0,2.0,12.0
F,56.0,1910.589286,976.181927,694.0,1127.75,1234.5,2973.25,2987.0,56.0,2014.660714,...,0.038223,0.038223,56.0,1.357143,0.772918,0.0,1.0,1.0,2.0,3.0
