# Description of problem

Things.

stuff.

http://www.businessinsider.com/most-educated-places-map-2014-9

https://www.census.gov/geo/maps-data/data/tiger/char_encoding.html

# Setup

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import us

# Read data

In [None]:
def state_and_place(df):
    """Split the numeric ``id`` column into state and place FIPS columns.

    The ``id`` value is converted to text and matched as a 1-2 digit state
    code followed by a 5 digit place code (leading zeros of the state code
    are lost when the id has been parsed as a number, hence ``{1,2}``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` with integer ``state_fips`` and ``place_fips`` columns joined on.
    """
    columns = (
        df
        .id
        .astype(str)
        # Raw string: ``\d`` is a regex token, not a Python string escape.
        .str.extract(r'(?P<state_fips>\d{1,2})(?P<place_fips>\d{5})', expand=True)
        .astype(int)
    )
    return df.join(columns)

In [None]:
def read_acs(year):
    """Load the local ACS 5-year S1501 extract for ``year``.

    Only the id, geography name, and the bachelor's-or-higher percentage
    (estimate and margin of error) columns are read. The result is indexed
    by ``(state_fips, place_fips)``, both derived from the ``Id2`` column.
    """
    # The files on disk are named with the two-digit year.
    short_year = str(year)[-2:]
    path = '../data/ACS_{year}_5YR_S1501/ACS_{year}_5YR_S1501_with_ann.csv'.format(year=short_year)

    # Source column name -> name used in the rest of the notebook.
    renames = {
        'Id2': 'id',
        'Geography': 'place',
        "Total; Estimate; Percent bachelor's degree or higher": 'pct_bachelor_plus',
        "Total; Margin of Error; Percent bachelor's degree or higher": 'pct_bachelor_plus_moe'
    }
    # Markers in the file that should be read as missing values.
    missing_markers = ('**', '-', '+', '***', '*****', 'N', '(X)')

    raw = pd.read_csv(
        path,
        usecols=renames.keys(),
        na_values=missing_markers,
        encoding='ISO-8859-1',
        # The first line of the file is skipped; the second is used as header.
        skiprows=1
    )
    with_fips = state_and_place(raw.rename(columns=renames))

    # ``id`` is redundant once it has been split into the two FIPS columns.
    return with_fips.drop('id', axis=1).set_index(['state_fips', 'place_fips'])

In [None]:
def read_population(year, timeout=30):
    """Fetch place-level ``B01003_001E``/``B01003_001M`` from the Census API.

    Parameters
    ----------
    year : int or str
        Year substituted into the ACS 5-year endpoint URL.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Integer ``population`` and ``population_moe`` columns indexed by
        ``(state_fips, place_fips)``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url = 'http://api.census.gov/data/{}/acs5?get=B01003_001E,B01003_001M&for=place:*'.format(year)
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error rather than trying to parse an error page as JSON.
    r.raise_for_status()
    data = r.json()

    columns = {
        'B01003_001E': 'population',
        'B01003_001M': 'population_moe',
        'state': 'state_fips',
        'place': 'place_fips'
    }

    return (
        # The first row of the payload holds the column names; the rest are records.
        pd.DataFrame(data=data[1:], columns=data[0])
        .rename(columns=columns)
        # Every value is converted to int, including the FIPS codes.
        .astype(int)
        .set_index(['state_fips', 'place_fips'])
    )

In [None]:
def state(df):
    """Return a Series of state names aligned with ``df``'s index.

    The ``state_fips`` index level is rendered as text, left-padded with
    zeros to two characters, and looked up with ``us.states.lookup``.
    """
    fips_codes = (
        df.index
        .get_level_values('state_fips')
        .astype(str)
        .str.rjust(2, fillchar='0')
    )

    def to_name(code):
        # Resolve one padded FIPS code to the state's name.
        return us.states.lookup(code).name

    return pd.Series(data=fips_codes, index=df.index).apply(to_name)

In [None]:
def read_data(year):
    """Build the analysis frame for ``year``.

    Joins the ACS attainment data with the population data on their shared
    FIPS index, adds a ``state`` name column, re-indexes by
    ``(state, place)`` and sorts the columns alphabetically.
    """
    attainment = read_acs(year)
    people = read_population(year)

    combined = attainment.join(people)
    # ``state`` is passed as a callable, so it is evaluated on the joined frame.
    named = combined.assign(state=state)

    return named.set_index(['state', 'place']).sort_index(axis=1)

In [None]:
# Load the 2012 data into the notebook-wide frame used by the cells below.
df = read_data(2012)

# Let's check out the dataset

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Preview the first few rows.
df.head()

In [None]:
# Preview the last few rows.
df.tail()

In [None]:
# Summary statistics, computed only over rows with no missing values.
df.dropna().describe()

# Population vs. Attainment

In [None]:
# Joint distribution of population and attainment. x/y are passed by keyword
# because recent seaborn releases no longer accept them positionally.
sns.jointplot(x='population', y='pct_bachelor_plus', data=df);

# Recreating the original metric

In [None]:
def highest_attainment_orig(df):
    """Return each state's place(s) with the highest ``pct_bachelor_plus``.

    Only places with a population of at least 1,000 are considered. Ties
    within a state are all kept, and rows whose attainment is missing never
    match. The result is sorted by attainment, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``state`` index level and ``population`` and
        ``pct_bachelor_plus`` columns.

    Returns
    -------
    pandas.DataFrame
        The selected rows of ``df``.
    """
    # Only include places with at least 1,000 people (inclusive of exactly 1,000)
    subset = df.loc[df.population >= 1000]
    # Find the place with the highest attainment by state
    idx = (
        subset
        .groupby(level='state')
        .pct_bachelor_plus
        .transform('max')
        .eq(subset.pct_bachelor_plus)
    )

    return subset.loc[idx].sort_values('pct_bachelor_plus', ascending=False)

Note: these results don't match the original article's. TODO: probe into whether the original analysis dropped universities.

In [None]:
# Largest population among the per-state top-attainment places.
highest_attainment_orig(df)['population'].max()

In [None]:
# Scatter every place, then overlay the per-state top-attainment places in red
# on the same axes.
ax = df.plot.scatter(x='population', y='pct_bachelor_plus', xlim=(0, 700000))
highest_attainment_orig(df).plot.scatter(x='population', y='pct_bachelor_plus', ax=ax, c='red');

# The Problem: Uncertainty

TODO: describe the problem of error in these estimates. Lift language from *Bayesian Methods for Hackers* (and attribute appropriately).

# Doing it the right way

In [None]:
def lower_bound(df, column):
    """Return ``df[column]`` minus its ``<column>_moe`` column, floored at 0.

    Missing values stay missing; ``df`` itself is not modified.
    """
    margin = df['{}_moe'.format(column)]
    # Negative differences are raised to 0, the minimum allowed value.
    return df[column].sub(margin).clip(lower=0)

In [None]:
def add_lower_bounds(df):
    """Return ``df`` with ``population_lower`` and ``pct_bachelor_plus_lower`` added.

    Each new column is ``lower_bound`` applied to the corresponding base column.
    """
    new_columns = {
        'population_lower': lambda frame: lower_bound(frame, 'population'),
        'pct_bachelor_plus_lower': lambda frame: lower_bound(frame, 'pct_bachelor_plus'),
    }
    return df.assign(**new_columns)

In [None]:
# Replace the notebook-wide frame with one that carries the lower-bound columns.
df = add_lower_bounds(df)

In [None]:
# Preview the first few rows, now including the lower-bound columns.
df.head()

In [None]:
# Joint distribution of population and the attainment lower bound. x/y are
# passed by keyword because recent seaborn releases no longer accept them
# positionally.
sns.jointplot(x='population', y='pct_bachelor_plus_lower', data=df);

In [None]:
def highest_attainment(df):
    """Return each state's place(s) with the highest ``pct_bachelor_plus_lower``.

    Ties within a state are all kept. The result is sorted by the lower
    bound, highest first.
    """
    lower = df['pct_bachelor_plus_lower']
    # Broadcast each state's best lower bound back onto every row of that state.
    state_best = lower.groupby(level='state').transform('max')
    is_best = state_best.eq(lower)

    winners = df.loc[is_best]
    return winners.sort_values('pct_bachelor_plus_lower', ascending=False)

In [None]:
# Per-state top-attainment places, lowest first, drawn as point estimates with
# their margin of error as horizontal error bars; one row per place.
test = highest_attainment_orig(df).sort_values(by='pct_bachelor_plus')
plt.figure(figsize=(16, 10))
plt.errorbar(
    x=test['pct_bachelor_plus'],
    y=np.arange(len(test)),
    xerr=test['pct_bachelor_plus_moe'],
    fmt='o',
)
plt.xlim(0, 100)
plt.ylim(-1, len(test))
plt.yticks(np.arange(len(test)), test.index.get_level_values('place'));

In [None]:
# The ten New York places with the highest attainment lower bound (missing
# values sort first, so they never reach the tail), with error bars.
test2 = (
    df
    .loc['New York']
    .sort_values(by='pct_bachelor_plus_lower', na_position='first')
    .tail(10)
)
plt.errorbar(
    x=test2['pct_bachelor_plus'],
    y=np.arange(len(test2)),
    xerr=test2['pct_bachelor_plus_moe'],
    fmt='o',
)
plt.xlim(0, 100)
plt.ylim(-1, len(test2))
plt.yticks(np.arange(len(test2)), test2.index.get_level_values('place'));