# Predicting turnout

In [1]:
%load_ext black

# Libaries that may or may not be useful
import matplotlib
import numpy as np
from pathlib import Path
import pandas as pd

# import seaborn as sns
from sklearn import (
    ensemble,
    linear_model,
    metrics,
    model_selection,
    neural_network,
    tree,
)
from lightgbm import LGBMRegressor

import maven

# Config
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
DATA_DIR = Path("data/")

The black module is not an IPython extension.


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
df = pd.read_csv(
    "data/general-election/UK/2015/model/processed/general_election-uk-2015-model.csv"
)
df.head(15)

Unnamed: 0,Press Association Reference,Constituency Name,Region,Electorate,Votes,party,votes_last,pc_last,win_last,polls_now,swing_now,swing_forecast_pc,swing_forecast_win,actual_win_now,actual_pc_now,con,grn,lab,ld,ukip,Region_East Midlands,Region_Eastern,Region_London,Region_North East,Region_North West,Region_Northern Ireland,Region_Scotland,Region_South East,Region_South West,Region_Wales,Region_West Midlands,Region_Yorkshire and the Humber,won_here_last,turnout
0,1.0,Aberavon,Wales,50838.0,30958,con,4411.0,0.142483,lab,0.338182,-0.062021,0.133646,lab,lab,0.118707,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.608954
1,1.0,Aberavon,Wales,50838.0,30958,lab,16073.0,0.519187,lab,0.337273,0.1634,0.604022,lab,lab,0.48904,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0.608954
2,1.0,Aberavon,Wales,50838.0,30958,ld,5034.0,0.162607,lab,0.09,-0.609159,0.063554,lab,lab,0.044317,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.608954
3,1.0,Aberavon,Wales,50838.0,30958,ukip,489.0,0.015796,lab,0.127273,3.109344,0.06491,lab,lab,0.157694,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0.608954
4,1.0,Aberavon,Wales,50838.0,30958,grn,0.0,0.0,lab,0.048182,4.008203,0.0,lab,lab,0.022555,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.608954
5,2.0,Aberconwy,Wales,44593.0,29966,con,10734.0,0.358206,con,0.338182,-0.062021,0.33599,con,con,0.415052,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0.671989
6,2.0,Aberconwy,Wales,44593.0,29966,lab,7336.0,0.244811,con,0.337273,0.1634,0.284813,con,con,0.282407,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.671989
7,2.0,Aberconwy,Wales,44593.0,29966,ld,5786.0,0.193085,con,0.09,-0.609159,0.075466,con,con,0.046139,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.671989
8,2.0,Aberconwy,Wales,44593.0,29966,ukip,632.0,0.021091,con,0.127273,3.109344,0.086668,con,con,0.114999,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0.671989
9,2.0,Aberconwy,Wales,44593.0,29966,grn,0.0,0.0,con,0.048182,4.008203,0.0,con,con,0.024114,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0.671989


In [3]:
census = pd.read_csv("census_file.csv")

In [4]:
df = df.merge(
    census[
        [
            "c11Male",
            "c11EthnicityWhite",
            "c11EthnicityMixed",
            "c11EthnicityAsian",
            "c11EthnicityBlack",
            "c11EthnicityOther",
            "c11EthnicityWhiteBritish",
            "c11EthnicityWhiteIrish",
            "c11EthnicityWhiteTraveller",
            "c11EthnicityWhiteOther",
            "c11EthnicityMixedCaribbean",
            "c11EthnicityMixedAfrican",
            "c11EthnicityMixedAsian",
            "c11EthnicityMixedOther",
            "c11EthnicityIndian",
            "c11EthnicityPakistani",
            "c11EthnicityBangladeshi",
            "c11EthnicityChinese",
            "c11EthnicityOtherAsian",
            "c11EthnicityBlackAfrican",
            "c11EthnicityBlackCaribbean",
            "c11EthnicityBlackOther",
            "c11EthnicityArab",
            "c11EthnicityAnyOther",
            "c11Degree",
            "c11Age0to4",
            "c11Age5to7",
            "c11Age8to9",
            "c11Age10to14",
            "c11Age15",
            "c11Age16to17",
            "c11Age18to19",
            "c11Age20to24",
            "c11Age25to29",
            "c11Age30to44",
            "c11Age45to59",
            "c11Age60to64",
            "c11Age65to74",
            "c11Age75to84",
            "c11Age85to89",
            "c11Age90plus",
            "constituency_name",
        ]
    ].rename({"constituency_name": "Constituency Name"}, axis=1),
    on="Constituency Name",
)

In [5]:
df.columns

Index(['Press Association Reference', 'Constituency Name', 'Region',
       'Electorate', 'Votes', 'party', 'votes_last', 'pc_last', 'win_last',
       'polls_now', 'swing_now', 'swing_forecast_pc', 'swing_forecast_win',
       'actual_win_now', 'actual_pc_now', 'con', 'grn', 'lab', 'ld', 'ukip',
       'Region_East Midlands', 'Region_Eastern', 'Region_London',
       'Region_North East', 'Region_North West', 'Region_Northern Ireland',
       'Region_Scotland', 'Region_South East', 'Region_South West',
       'Region_Wales', 'Region_West Midlands',
       'Region_Yorkshire and the Humber', 'won_here_last', 'turnout',
       'c11Male', 'c11EthnicityWhite', 'c11EthnicityMixed',
       'c11EthnicityAsian', 'c11EthnicityBlack', 'c11EthnicityOther',
       'c11EthnicityWhiteBritish', 'c11EthnicityWhiteIrish',
       'c11EthnicityWhiteTraveller', 'c11EthnicityWhiteOther',
       'c11EthnicityMixedCaribbean', 'c11EthnicityMixedAfrican',
       'c11EthnicityMixedAsian', 'c11EthnicityMixedOther

In [6]:
USELESS = [
    "Press Association Reference",
    "con",
    "grn",
    "lab",
    "ld",
    "ukip",
    "Region_East Midlands",
    "Region_Eastern",
    "Region_London",
    "Region_North East",
    "Region_North West",
    "Region_Northern Ireland",
    "Region_Scotland",
    "Region_South East",
    "Region_South West",
    "Region_Wales",
    "Region_West Midlands",
    "Region_Yorkshire and the Humber",
    "won_here_last",
    "win_last",
    "actual_win_now"
]
CATEGORICAL = [
    "Constituency Name",
    "Region",
    "party",
#    "win_last",
    "swing_forecast_win",
#    "actual_win_now",
    "actual_pc_now",
]

df[CATEGORICAL] = df[CATEGORICAL].astype("category")

In [7]:
df.drop(USELESS, axis=1, inplace=True)

In [8]:
df.head(10)

Unnamed: 0,Constituency Name,Region,Electorate,Votes,party,votes_last,pc_last,polls_now,swing_now,swing_forecast_pc,swing_forecast_win,actual_pc_now,turnout,c11Male,c11EthnicityWhite,c11EthnicityMixed,c11EthnicityAsian,c11EthnicityBlack,c11EthnicityOther,c11EthnicityWhiteBritish,c11EthnicityWhiteIrish,c11EthnicityWhiteTraveller,c11EthnicityWhiteOther,c11EthnicityMixedCaribbean,c11EthnicityMixedAfrican,c11EthnicityMixedAsian,c11EthnicityMixedOther,c11EthnicityIndian,c11EthnicityPakistani,c11EthnicityBangladeshi,c11EthnicityChinese,c11EthnicityOtherAsian,c11EthnicityBlackAfrican,c11EthnicityBlackCaribbean,c11EthnicityBlackOther,c11EthnicityArab,c11EthnicityAnyOther,c11Degree,c11Age0to4,c11Age5to7,c11Age8to9,c11Age10to14,c11Age15,c11Age16to17,c11Age18to19,c11Age20to24,c11Age25to29,c11Age30to44,c11Age45to59,c11Age60to64,c11Age65to74,c11Age75to84,c11Age85to89,c11Age90plus
0,Aberavon,Wales,50838.0,30958,con,4411.0,0.142483,0.338182,-0.062021,0.133646,lab,0.118707,0.608954,48.766879,97.586681,0.733371,1.25051,0.296372,0.133065,96.322562,0.371978,0.133065,0.759076,0.332663,0.054436,0.190525,0.155747,0.234376,0.117944,0.320566,0.2374,0.340223,0.108872,0.170868,0.016633,0.061996,0.071069,8.289718,5.511621,3.16181,2.000514,5.750533,1.236901,2.683985,2.271181,6.010615,6.539852,19.132657,20.889722,6.494488,9.53533,6.270697,1.6739,0.836194
1,Aberavon,Wales,50838.0,30958,lab,16073.0,0.519187,0.337273,0.1634,0.604022,lab,0.48904,0.608954,48.766879,97.586681,0.733371,1.25051,0.296372,0.133065,96.322562,0.371978,0.133065,0.759076,0.332663,0.054436,0.190525,0.155747,0.234376,0.117944,0.320566,0.2374,0.340223,0.108872,0.170868,0.016633,0.061996,0.071069,8.289718,5.511621,3.16181,2.000514,5.750533,1.236901,2.683985,2.271181,6.010615,6.539852,19.132657,20.889722,6.494488,9.53533,6.270697,1.6739,0.836194
2,Aberavon,Wales,50838.0,30958,ld,5034.0,0.162607,0.09,-0.609159,0.063554,lab,0.044317,0.608954,48.766879,97.586681,0.733371,1.25051,0.296372,0.133065,96.322562,0.371978,0.133065,0.759076,0.332663,0.054436,0.190525,0.155747,0.234376,0.117944,0.320566,0.2374,0.340223,0.108872,0.170868,0.016633,0.061996,0.071069,8.289718,5.511621,3.16181,2.000514,5.750533,1.236901,2.683985,2.271181,6.010615,6.539852,19.132657,20.889722,6.494488,9.53533,6.270697,1.6739,0.836194
3,Aberavon,Wales,50838.0,30958,ukip,489.0,0.015796,0.127273,3.109344,0.06491,lab,0.157694,0.608954,48.766879,97.586681,0.733371,1.25051,0.296372,0.133065,96.322562,0.371978,0.133065,0.759076,0.332663,0.054436,0.190525,0.155747,0.234376,0.117944,0.320566,0.2374,0.340223,0.108872,0.170868,0.016633,0.061996,0.071069,8.289718,5.511621,3.16181,2.000514,5.750533,1.236901,2.683985,2.271181,6.010615,6.539852,19.132657,20.889722,6.494488,9.53533,6.270697,1.6739,0.836194
4,Aberavon,Wales,50838.0,30958,grn,0.0,0.0,0.048182,4.008203,0.0,lab,0.022555,0.608954,48.766879,97.586681,0.733371,1.25051,0.296372,0.133065,96.322562,0.371978,0.133065,0.759076,0.332663,0.054436,0.190525,0.155747,0.234376,0.117944,0.320566,0.2374,0.340223,0.108872,0.170868,0.016633,0.061996,0.071069,8.289718,5.511621,3.16181,2.000514,5.750533,1.236901,2.683985,2.271181,6.010615,6.539852,19.132657,20.889722,6.494488,9.53533,6.270697,1.6739,0.836194
5,Aberconwy,Wales,44593.0,29966,con,10734.0,0.358206,0.338182,-0.062021,0.33599,con,0.415052,0.671989,48.334663,97.837455,0.840202,0.879199,0.17194,0.271204,95.175042,0.758663,0.044314,1.859435,0.283613,0.143579,0.22689,0.186121,0.200301,0.060268,0.088629,0.276522,0.253479,0.101037,0.028361,0.042542,0.148897,0.122308,14.152319,4.956129,2.761677,1.777896,5.475494,1.123815,2.451476,2.148365,5.138704,4.729239,16.867854,20.485686,7.861384,12.005672,8.685633,2.338031,1.192945
6,Aberconwy,Wales,44593.0,29966,lab,7336.0,0.244811,0.337273,0.1634,0.284813,con,0.282407,0.671989,48.334663,97.837455,0.840202,0.879199,0.17194,0.271204,95.175042,0.758663,0.044314,1.859435,0.283613,0.143579,0.22689,0.186121,0.200301,0.060268,0.088629,0.276522,0.253479,0.101037,0.028361,0.042542,0.148897,0.122308,14.152319,4.956129,2.761677,1.777896,5.475494,1.123815,2.451476,2.148365,5.138704,4.729239,16.867854,20.485686,7.861384,12.005672,8.685633,2.338031,1.192945
7,Aberconwy,Wales,44593.0,29966,ld,5786.0,0.193085,0.09,-0.609159,0.075466,con,0.046139,0.671989,48.334663,97.837455,0.840202,0.879199,0.17194,0.271204,95.175042,0.758663,0.044314,1.859435,0.283613,0.143579,0.22689,0.186121,0.200301,0.060268,0.088629,0.276522,0.253479,0.101037,0.028361,0.042542,0.148897,0.122308,14.152319,4.956129,2.761677,1.777896,5.475494,1.123815,2.451476,2.148365,5.138704,4.729239,16.867854,20.485686,7.861384,12.005672,8.685633,2.338031,1.192945
8,Aberconwy,Wales,44593.0,29966,ukip,632.0,0.021091,0.127273,3.109344,0.086668,con,0.114999,0.671989,48.334663,97.837455,0.840202,0.879199,0.17194,0.271204,95.175042,0.758663,0.044314,1.859435,0.283613,0.143579,0.22689,0.186121,0.200301,0.060268,0.088629,0.276522,0.253479,0.101037,0.028361,0.042542,0.148897,0.122308,14.152319,4.956129,2.761677,1.777896,5.475494,1.123815,2.451476,2.148365,5.138704,4.729239,16.867854,20.485686,7.861384,12.005672,8.685633,2.338031,1.192945
9,Aberconwy,Wales,44593.0,29966,grn,0.0,0.0,0.048182,4.008203,0.0,con,0.024114,0.671989,48.334663,97.837455,0.840202,0.879199,0.17194,0.271204,95.175042,0.758663,0.044314,1.859435,0.283613,0.143579,0.22689,0.186121,0.200301,0.060268,0.088629,0.276522,0.253479,0.101037,0.028361,0.042542,0.148897,0.122308,14.152319,4.956129,2.761677,1.777896,5.475494,1.123815,2.451476,2.148365,5.138704,4.729239,16.867854,20.485686,7.861384,12.005672,8.685633,2.338031,1.192945


In [9]:
model = LGBMRegressor()

In [10]:
from sklearn.model_selection import GroupKFold, KFold
lgbm = LGBMRegressor(n_estimators=10000, learning_rate=0.01)
train_data = df.drop(["turnout", "Votes"], axis=1)
for train_idx, val_idx in GroupKFold(3).split(df, groups=df["Constituency Name"]):
    lgbm.fit(
        df.drop(["turnout", "Votes"], axis=1).iloc[train_idx],
        df.turnout.iloc[train_idx],
        eval_set=[
            (
                df.drop(["turnout", "Votes"], axis=1).iloc[train_idx],
                df.turnout.iloc[train_idx],
            ),
            (
                df.drop(["turnout", "Votes"], axis=1).iloc[val_idx],
                df.turnout.iloc[val_idx],
            ),
        ],
        eval_metric="mae",
        early_stopping_rounds=100,
        verbose=100,
    )
    break

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l1: 0.0189847	valid_0's l2: 0.000605654	valid_1's l1: 0.0340545	valid_1's l2: 0.00196634
[200]	valid_0's l1: 0.00903709	valid_0's l2: 0.000153685	valid_1's l1: 0.0312447	valid_1's l2: 0.00183519
[300]	valid_0's l1: 0.00495126	valid_0's l2: 5.0879e-05	valid_1's l1: 0.03036	valid_1's l2: 0.00177262
[400]	valid_0's l1: 0.00286126	valid_0's l2: 2.13553e-05	valid_1's l1: 0.0297539	valid_1's l2: 0.00173897
[500]	valid_0's l1: 0.00177398	valid_0's l2: 1.14453e-05	valid_1's l1: 0.0294223	valid_1's l2: 0.00170932
[600]	valid_0's l1: 0.00119447	valid_0's l2: 7.32663e-06	valid_1's l1: 0.0292841	valid_1's l2: 0.0016957
[700]	valid_0's l1: 0.000855574	valid_0's l2: 5.41513e-06	valid_1's l1: 0.0292423	valid_1's l2: 0.00169014
[800]	valid_0's l1: 0.000661562	valid_0's l2: 4.31765e-06	valid_1's l1: 0.0292391	valid_1's l2: 0.00168607
Early stopping, best iteration is:
[758]	valid_0's l1: 0.000732557	valid_0's l2: 4.71607e-06

In [11]:
model = LGBMRegressor(n_estimators=463)

# Some useful functions
def score_model(model, X, y, repeats=5):
    """Calculates a "5x5" (repeated 5-fold) cross-validated shuffled mean average error.
       Returns the mean across 5 (default) repeats."""
    mmae = []
    for i in range(repeats):
        mmae += [
            -np.mean(
                model_selection.cross_val_score(
                    model,
                    X=X,
                    y=y,
                    cv=model_selection.KFold(n_splits=5, shuffle=True),
                    scoring="neg_mean_absolute_error",
                )
            )
        ]
    return np.mean(mmae)


def score_features(model, features):
    """Helper function to run scoring function for a feature subset and using turnout.
       This equates to average turnout error per seat."""
    return score_model(model=model, X=df[features], y=df["turnout"])

In [12]:
# 0.3% average error per seat
score_features(model=model, features=train_data.columns)

0.000386788552185634

In [13]:
df.groupby("Constituency Name").turnout.std().describe()

count    373.0
mean       0.0
std        0.0
min        0.0
25%        0.0
50%        0.0
75%        0.0
max        0.0
Name: turnout, dtype: float64

In [14]:
from eli5 import explain_weights
explain_weights(lgbm)

Weight,Feature
0.3155,c11Age18to19
0.1442,c11Age90plus
0.0966,c11Degree
0.0846,c11Age20to24
0.0719,c11Age25to29
0.0484,c11Male
0.0200,c11Age45to59
0.0146,Electorate
0.0135,c11Age8to9
0.0131,c11EthnicityWhiteIrish
