In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn import cluster, preprocessing
%matplotlib inline

# Data

Load data from `nc_complete-2014.csv`.

In [2]:
# Read the source table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer='nc_complete-2014.csv', index_col=0)
# Preview the first five rows.
df.head(5)

Unnamed: 0_level_0,county_name,precinct_id,sen_red,sen_blue,con_districts,con_contested,con_red_votes,con_blue_votes,sldu_districts,sldu_contested,...,hispanic_pop,other_pop,median_age,median_income,education_pop,school_pop,diploma_pop,college_pop,graduate_pop,area_km2
shape_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2368,ALAMANCE,01,1071,295,2,Yes,1109,313.0,24,No,...,85,108,42,44870,3679,727,1518,1278,156,129
2366,ALAMANCE,02,1246,313,6,Yes,1285,314.0,24,No,...,203,102,41,49485,3525,601,1318,1361,245,77
2361,ALAMANCE,035,920,434,6,Yes,955,431.0,24,No,...,624,204,27,41560,3624,684,1173,1591,176,27
2374,ALAMANCE,03C,943,455,6,Yes,979,447.0,24,No,...,127,44,46,67658,2237,133,557,1167,381,5
2723,ALAMANCE,03N,862,586,6,Yes,881,582.0,24,No,...,85,20,37,67101,2792,92,691,1508,501,5


### Derived Values

Combine contested votes into single percentages, calculate population density, and use population rates instead of raw values.

In [3]:
# Denominators for the rate columns below. A zero denominator with a non-zero
# numerator would produce +/-inf, which `.dropna()` does not remove and which
# scikit-learn estimators reject. Masking zeros to NaN makes those rates NaN
# instead, so the `.dropna()` calls in later cells discard the affected rows.
# (`density` already guards its own denominator with `clip(lower=1)`.)
population_nonzero = df.population.mask(df.population == 0)
education_pop_nonzero = df.education_pop.mask(df.education_pop == 0)

# Residents per km2; areas below 1 km2 are treated as 1 km2.
df = df.assign(density=df.population / df.area_km2.clip(lower=1))

# For each race: blue share of the blue+red vote, and blue+red votes per resident.
# The `sen` columns have no `_votes` suffix, so column names are listed explicitly.
for race, blue_col, red_col in (
        ('sen', 'sen_blue', 'sen_red'),
        ('con', 'con_blue_votes', 'con_red_votes'),
        ('sldu', 'sldu_blue_votes', 'sldu_red_votes'),
        ('sldl', 'sldl_blue_votes', 'sldl_red_votes')):
    blue_plus_red = df[blue_col] + df[red_col]
    df = df.assign(**{race + '_pctblue': df[blue_col] / blue_plus_red})
    df = df.assign(**{race + '_turnout': blue_plus_red / population_nonzero})

# Each `<group>_pop` count as a fraction of total population.
for group in ('white', 'black', 'asian', 'hispanic', 'other'):
    df = df.assign(**{'pct_' + group: df[group + '_pop'] / population_nonzero})

# Each `<level>_pop` count as a fraction of `education_pop`.
for level in ('school', 'diploma', 'college', 'graduate'):
    df = df.assign(**{'pct_' + level: df[level + '_pop'] / education_pop_nonzero})

# Scale income and age by their maximum over all rows, so the largest value is 1.
df = df.assign(pct_income=df.median_income / df.median_income.max())
df = df.assign(pct_age=df.median_age / df.median_age.max())

# Learn

In [4]:
from sklearn import datasets, linear_model
import numpy as np

## SLDU Blue Votes

Pick out the complete list of independent variables plus the SLDU dependent variables: `sldu_pctblue` (SLDU Democratic vote percentage) and `sldu_turnout` (number of voters out of total population). After some trial and error with the full list of fields, I saw the best fit with `pct_black` (Race), `pct_income` (Income), `pct_age` (Age), and `pct_diploma` (Education).

In [5]:
# Candidate predictor columns shared by every model in this notebook.
indep_fields = (
    'pct_black', 'pct_income', 'pct_age', 'pct_diploma',
    'sen_pctblue', 'con_pctblue', 'sen_turnout', 'con_turnout',
)

# Rows where both `sldu_contested` and `con_contested` are "Yes", reduced to
# the predictors plus the `sldu_pctblue` target, with incomplete rows dropped.
sldu1_full_all = (
    df
    .query('sldu_contested == "Yes" and con_contested == "Yes"')
    .filter(items=indep_fields + ('sldu_pctblue',))
    .dropna()
)
sldu1_full_input = sldu1_full_all.filter(items=indep_fields)
sldu1_full_output = sldu1_full_all.filter(items=('sldu_pctblue',))

# Sanity check: neither frame contains a missing value.
assert sldu1_full_input.notnull().values.all()
assert sldu1_full_output.notnull().values.all()

Add `sen_pctblue` and `con_pctblue` values for statewide races.

In [6]:
# Model features: four demographic rates plus the blue vote share in the
# `sen` and `con` races.
test1_fields = (
    'pct_black', 'pct_income', 'pct_age',
    'pct_diploma', 'sen_pctblue', 'con_pctblue',
)

sldu1_limited_input = sldu1_full_all.filter(items=test1_fields)

# Positional 80/20 split: the leading rows train the model, the trailing rows score it.
# NOTE(review): the split is not shuffled; if rows are ordered by county (as the
# df.head() preview suggests), the hold-out set is not a random sample -- confirm intended.
training_size = len(sldu1_full_input) * 4 // 5
print('Training set length:', training_size)
sldu1_limited_train = sldu1_limited_input.iloc[:training_size]
sldu1_limited_test = sldu1_limited_input.iloc[training_size:]
sldu1_output_train = sldu1_full_output.iloc[:training_size]
sldu1_output_test = sldu1_full_output.iloc[training_size:]

sldu1_regr = linear_model.LinearRegression().fit(sldu1_limited_train, sldu1_output_train)

# `score` is evaluated on the held-out rows and printed as a percentage.
print('Linear regression score: {:.0f}%'.format(
    100 * sldu1_regr.score(sldu1_limited_test, sldu1_output_test)))

# Rank features by coefficient magnitude (the sign is discarded) and print
# only those whose magnitude exceeds 0.01.
coefficients = sorted(zip(map(abs, sldu1_regr.coef_[0]), test1_fields), reverse=True)
print('Coefficients:', ', '.join(
    '{1} ({0:.02f})'.format(magnitude, field)
    for magnitude, field in coefficients
    if magnitude > .01))

Training set length: 1179
Linear regression score: 95%
Coefficients: sen_pctblue (1.02), pct_income (0.11), con_pctblue (0.07), pct_diploma (0.06), pct_black (0.01)




In [7]:
# Rows where `sldu_contested` is not "Yes" but `con_contested` is: the fitted
# model fills in an `sldu_pctblue` value for them.
sldu1_real_input = (
    df
    .query('sldu_contested != "Yes" and con_contested == "Yes"')
    .filter(items=test1_fields)
    .dropna()
)
sldu1_real_output = sldu1_real_input.assign(
    sldu_pctblue=sldu1_regr.predict(sldu1_real_input),
)
# Persist the features together with the predicted column.
sldu1_real_output.to_csv('sldu1_real_output.csv')

## SLDU Voter Turnout

In [8]:
# Rows where both `sldu_contested` and `con_contested` are "Yes", reduced to
# the predictors plus the `sldu_turnout` target, with incomplete rows dropped.
sldu2_full_all = (
    df
    .query('sldu_contested == "Yes" and con_contested == "Yes"')
    .filter(items=indep_fields + ('sldu_turnout',))
    .dropna()
)
sldu2_full_input = sldu2_full_all.filter(items=indep_fields)
sldu2_full_output = sldu2_full_all.filter(items=('sldu_turnout',))

# Sanity check: neither frame contains a missing value.
assert sldu2_full_input.notnull().values.all()
assert sldu2_full_output.notnull().values.all()

Add `sen_turnout` and `con_turnout` values for statewide races.

In [9]:
# Model features: four demographic rates plus the turnout in the `sen` and
# `con` races.
test2_fields = (
    'pct_black', 'pct_income', 'pct_age',
    'pct_diploma', 'sen_turnout', 'con_turnout',
)

sldu2_limited_input = sldu2_full_all.filter(items=test2_fields)

# Positional 80/20 split: the leading rows train the model, the trailing rows score it.
# NOTE(review): the split is not shuffled; if rows are ordered by county (as the
# df.head() preview suggests), the hold-out set is not a random sample -- confirm intended.
training_size = len(sldu2_full_input) * 4 // 5
print('Training set length:', training_size)
sldu2_limited_train = sldu2_limited_input.iloc[:training_size]
sldu2_limited_test = sldu2_limited_input.iloc[training_size:]
sldu2_output_train = sldu2_full_output.iloc[:training_size]
sldu2_output_test = sldu2_full_output.iloc[training_size:]

sldu2_regr = linear_model.LinearRegression().fit(sldu2_limited_train, sldu2_output_train)

# `score` is evaluated on the held-out rows and printed as a percentage.
print('Linear regression score: {:.0f}%'.format(
    100 * sldu2_regr.score(sldu2_limited_test, sldu2_output_test)))

# Rank features by coefficient magnitude (the sign is discarded) and print
# only those whose magnitude exceeds 0.01.
coefficients = sorted(zip(map(abs, sldu2_regr.coef_[0]), test2_fields), reverse=True)
print('Coefficients:', ', '.join(
    '{1} ({0:.02f})'.format(magnitude, field)
    for magnitude, field in coefficients
    if magnitude > .01))

Training set length: 1179
Linear regression score: 100%
Coefficients: sen_turnout (1.02), pct_income (0.04), pct_black (0.02), pct_diploma (0.02), con_turnout (0.01)


In [10]:
# Rows where `sldu_contested` is not "Yes" but `con_contested` is: the fitted
# model fills in an `sldu_turnout` value for them.
sldu2_real_input = (
    df
    .query('sldu_contested != "Yes" and con_contested == "Yes"')
    .filter(items=test2_fields)
    .dropna()
)
sldu2_real_output = sldu2_real_input.assign(
    sldu_turnout=sldu2_regr.predict(sldu2_real_input),
)
# Persist the features together with the predicted column.
sldu2_real_output.to_csv('sldu2_real_output.csv')

## SLDL Blue Votes

Pick out the complete list of independent variables plus the SLDL dependent variables: `sldl_pctblue` (SLDL Democratic vote percentage) and `sldl_turnout` (number of voters out of total population).

In [11]:
# Rows where both `sldl_contested` and `con_contested` are "Yes", reduced to
# the predictors plus the `sldl_pctblue` target, with incomplete rows dropped.
sldl1_full_all = (
    df
    .query('sldl_contested == "Yes" and con_contested == "Yes"')
    .filter(items=indep_fields + ('sldl_pctblue',))
    .dropna()
)
sldl1_full_input = sldl1_full_all.filter(items=indep_fields)
sldl1_full_output = sldl1_full_all.filter(items=('sldl_pctblue',))

# Sanity check: neither frame contains a missing value.
assert sldl1_full_input.notnull().values.all()
assert sldl1_full_output.notnull().values.all()

Add `sen_pctblue` and `con_pctblue` values for statewide races.

In [None]:
# Model features: four demographic rates plus the blue vote share in the
# `sen` and `con` races.
test1_fields = (
    'pct_black', 'pct_income', 'pct_age',
    'pct_diploma', 'sen_pctblue', 'con_pctblue',
)

sldl1_limited_input = sldl1_full_all.filter(items=test1_fields)

# Positional 80/20 split: the leading rows train the model, the trailing rows score it.
# NOTE(review): the split is not shuffled; if rows are ordered by county (as the
# df.head() preview suggests), the hold-out set is not a random sample -- confirm intended.
training_size = len(sldl1_full_input) * 4 // 5
print('Training set length:', training_size)
sldl1_limited_train = sldl1_limited_input.iloc[:training_size]
sldl1_limited_test = sldl1_limited_input.iloc[training_size:]
sldl1_output_train = sldl1_full_output.iloc[:training_size]
sldl1_output_test = sldl1_full_output.iloc[training_size:]

sldl1_regr = linear_model.LinearRegression().fit(sldl1_limited_train, sldl1_output_train)

# `score` is evaluated on the held-out rows and printed as a percentage.
print('Linear regression score: {:.0f}%'.format(
    100 * sldl1_regr.score(sldl1_limited_test, sldl1_output_test)))

# Rank features by coefficient magnitude (the sign is discarded) and print
# only those whose magnitude exceeds 0.01.
coefficients = sorted(zip(map(abs, sldl1_regr.coef_[0]), test1_fields), reverse=True)
print('Coefficients:', ', '.join(
    '{1} ({0:.02f})'.format(magnitude, field)
    for magnitude, field in coefficients
    if magnitude > .01))

Training set length: 992
Linear regression score: 92%
Coefficients: sen_pctblue (1.05), con_pctblue (0.11), pct_income (0.10), pct_black (0.05), pct_diploma (0.04), pct_age (0.01)


In [None]:
# Rows where `sldl_contested` is not "Yes" but `con_contested` is: the fitted
# model fills in an `sldl_pctblue` value for them.
sldl1_real_input = (
    df
    .query('sldl_contested != "Yes" and con_contested == "Yes"')
    .filter(items=test1_fields)
    .dropna()
)
sldl1_real_output = sldl1_real_input.assign(
    sldl_pctblue=sldl1_regr.predict(sldl1_real_input),
)
# Persist the features together with the predicted column.
sldl1_real_output.to_csv('sldl1_real_output.csv')

## SLDL Voter Turnout

In [None]:
# Rows where both `sldl_contested` and `con_contested` are "Yes", reduced to
# the predictors plus the `sldl_turnout` target, with incomplete rows dropped.
sldl2_full_all = (
    df
    .query('sldl_contested == "Yes" and con_contested == "Yes"')
    .filter(items=indep_fields + ('sldl_turnout',))
    .dropna()
)
sldl2_full_input = sldl2_full_all.filter(items=indep_fields)
sldl2_full_output = sldl2_full_all.filter(items=('sldl_turnout',))

# Sanity check: neither frame contains a missing value.
assert sldl2_full_input.notnull().values.all()
assert sldl2_full_output.notnull().values.all()

Add `sen_turnout` and `con_turnout` values for statewide races.

In [None]:
# Model features: four demographic rates plus the turnout in the `sen` and
# `con` races.
test2_fields = (
    'pct_black', 'pct_income', 'pct_age',
    'pct_diploma', 'sen_turnout', 'con_turnout',
)

sldl2_limited_input = sldl2_full_all.filter(items=test2_fields)

# Positional 80/20 split: the leading rows train the model, the trailing rows score it.
# NOTE(review): the split is not shuffled; if rows are ordered by county (as the
# df.head() preview suggests), the hold-out set is not a random sample -- confirm intended.
training_size = len(sldl2_full_input) * 4 // 5
print('Training set length:', training_size)
sldl2_limited_train = sldl2_limited_input.iloc[:training_size]
sldl2_limited_test = sldl2_limited_input.iloc[training_size:]
sldl2_output_train = sldl2_full_output.iloc[:training_size]
sldl2_output_test = sldl2_full_output.iloc[training_size:]

sldl2_regr = linear_model.LinearRegression().fit(sldl2_limited_train, sldl2_output_train)

# `score` is evaluated on the held-out rows and printed as a percentage.
print('Linear regression score: {:.0f}%'.format(
    100 * sldl2_regr.score(sldl2_limited_test, sldl2_output_test)))

# Rank features by coefficient magnitude (the sign is discarded) and print
# only those whose magnitude exceeds 0.01.
coefficients = sorted(zip(map(abs, sldl2_regr.coef_[0]), test2_fields), reverse=True)
print('Coefficients:', ', '.join(
    '{1} ({0:.02f})'.format(magnitude, field)
    for magnitude, field in coefficients
    if magnitude > .01))

In [None]:
# Rows where `sldl_contested` is not "Yes" but `con_contested` is: the fitted
# model fills in an `sldl_turnout` value for them.
sldl2_real_input = (
    df
    .query('sldl_contested != "Yes" and con_contested == "Yes"')
    .filter(items=test2_fields)
    .dropna()
)
sldl2_real_output = sldl2_real_input.assign(
    sldl_turnout=sldl2_regr.predict(sldl2_real_input),
)
# Persist the features together with the predicted column.
sldl2_real_output.to_csv('sldl2_real_output.csv')