# Percent College Graduates Prediction
---
* Input: `'../data/census_tract_feats.csv'`
* Output: Regression model that predicts the percentage of the age 25+ population that has a college degree in a given census tract

In [17]:
import pandas as pd

# Load the census-tract feature table from disk, then show its column index
# so the available feature names can be inspected.
DATA_PATH = '../data/census_tract_feats.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.columns

Index(['geoID', 'Total Population:', 'Population Density (Per Sq. Mile)',
       'Total Population: Male', 'Total Population: White Alone',
       'Total Population: Black or African American Alone',
       'Total Population: American Indian and Alaska Native Alone',
       'Total Population: Asian Alone',
       'Total Population: Native Hawaiian and Other Pacific Islander Alone',
       'Total Population: Some Other Race Alone',
       ...
       'Pct. Households: $100,000 to $124,999',
       'Pct. Households: $125,000 to $149,999',
       'Pct. Households: $150,000 to $199,999',
       'Pct. Households: $200,000 or More',
       'Pct. Families below poverty level',
       'Pct. Population for Whom Poverty Status Is Determined: Under 1.00 (Doing Poorly)',
       'Pct. Population for Whom Poverty Status Is Determined: 1.00 to 1.99 (Struggling)',
       'Pct. Population for Whom Poverty Status Is Determined: Under 2.00 (Poor or Struggling)',
       'Number of Accessible Universities',

In [19]:
# Select the model inputs (df_train) and the regression target (y) from df.
# Column names must match the CSV header exactly; a missing name makes the
# df[train_feats] selection below raise KeyError.
train_feats = [
    # demographics
    'Population Density (Per Sq. Mile)',
    'Pct. Male',
    'Pct. White Alone',
    'Pct. Black or African American Alone',
    'Pct. American Indian and Alaska Native Alone',
    'Pct. Asian Alone',
    'Pct. Native Hawaiian and Other Pacific Islander Alone',
    'Pct. Some Other Race Alone',
    'Pct. Two or More Races',
    'Pct. Under 5 Years',
    'Pct. 5 to 9 Years',
    'Pct. 10 to 14 Years',
    'Pct. 15 to 17 Years',
    'Pct. 18 to 24 Years',
    'Pct. 25 to 34 Years',
    'Pct. 35 to 44 Years',
    'Pct. 45 to 54 Years',
    'Pct. 55 to 64 Years',
    'Pct. 65 to 74 Years',
    'Pct. 75 to 84 Years',
    'Pct. 85 Years and Over',

    # education - omit most features (e.g. Pct. pop. high school graduates), for fear of leakage
    'Pct. Students enrolled in private school',

    # employment
    'Pct. Pop 16+ not in labor force',
    'Pct. Pop 16+ in armed forces',
    'Pct. Pop 16+ unemployed',

    # household income
    'Median Gross Rent',
    'Median Household Income (In 2017 Inflation Adjusted Dollars)',
    'Pct. Households: Less than $10,000',
    'Pct. Households: $10,000 to $14,999',
    'Pct. Households: $15,000 to $19,999',
    'Pct. Households: $20,000 to $24,999',
    'Pct. Households: $25,000 to $29,999',
    'Pct. Households: $30,000 to $34,999',
    'Pct. Households: $35,000 to $39,999',
    'Pct. Households: $40,000 to $44,999',
    'Pct. Households: $45,000 to $49,999',
    'Pct. Households: $50,000 to $59,999',
    'Pct. Households: $60,000 to $74,999',
    'Pct. Households: $75,000 to $99,999',
    'Pct. Households: $100,000 to $124,999',
    'Pct. Households: $125,000 to $149,999',
    'Pct. Households: $150,000 to $199,999',
    'Pct. Households: $200,000 or More',

    # poverty
    # NOTE(review): 'Under 2.00' looks like the sum of the two buckets listed
    # before it, which would make it redundant for linear models -- confirm
    # and consider dropping one of the three.
    'Pct. Families below poverty level',
    'Pct. Population for Whom Poverty Status Is Determined: Under 1.00 (Doing Poorly)',
    'Pct. Population for Whom Poverty Status Is Determined: 1.00 to 1.99 (Struggling)',
    'Pct. Population for Whom Poverty Status Is Determined: Under 2.00 (Poor or Struggling)',

    # education desert (row-filter flag; removed again before modelling)
    'Education Desert'
]

df_train = df[train_feats]
y = df["Pct. Population 25 Years and Over: Bachelor's Degree"]

# Sanity check: both objects must have the same number of rows.
print('df_train shape: ', df_train.shape)
print('y shape: ', y.shape)
df_train.head()

df_train shape:  (73745, 48)
y shape:  (73745,)


Unnamed: 0,Population Density (Per Sq. Mile),Pct. Male,Pct. White Alone,Pct. Black or African American Alone,Pct. American Indian and Alaska Native Alone,Pct. Asian Alone,Pct. Native Hawaiian and Other Pacific Islander Alone,Pct. Some Other Race Alone,Pct. Two or More Races,Pct. Under 5 Years,...,"Pct. Households: $75,000 to $99,999","Pct. Households: $100,000 to $124,999","Pct. Households: $125,000 to $149,999","Pct. Households: $150,000 to $199,999","Pct. Households: $200,000 or More",Pct. Families below poverty level,Pct. Population for Whom Poverty Status Is Determined: Under 1.00 (Doing Poorly),Pct. Population for Whom Poverty Status Is Determined: 1.00 to 1.99 (Struggling),Pct. Population for Whom Poverty Status Is Determined: Under 2.00 (Poor or Struggling),Education Desert
0,487.1106,0.487263,0.886721,0.052033,0.0,0.011924,0.0,0.0,0.049322,0.044444,...,0.180371,0.074271,0.079576,0.090186,0.02122,0.120287,0.106775,0.117073,0.223848,1
1,1684.013,0.537293,0.42035,0.54512,0.0,0.010129,0.006446,0.003223,0.014733,0.03361,...,0.139208,0.0894,0.040868,0.015326,0.0,0.182903,0.224138,0.220588,0.444726,1
2,1638.934,0.45288,0.613885,0.264697,0.005613,0.007386,0.003545,0.080355,0.02452,0.031905,...,0.04613,0.078968,0.039093,0.027365,0.0,0.100363,0.146529,0.28065,0.427179,1
3,1731.473,0.468948,0.80689,0.083431,0.004687,0.002344,0.0,0.074994,0.027654,0.054605,...,0.161807,0.104631,0.03259,0.038308,0.005146,0.014617,0.022967,0.182564,0.205531,1
4,2264.419,0.507175,0.784446,0.164375,0.0,0.030607,0.0,0.0,0.020572,0.055896,...,0.153314,0.178827,0.048164,0.054125,0.020029,0.084132,0.122349,0.133108,0.255457,1


In [24]:
# Intended filter: keep only non-education-desert tracts ("education oases"),
# which are the rows we want to regress on. Disabled until the GIS radius
# problem has been fixed. When re-enabling it, apply the same boolean mask to
# `y` as well, otherwise features and target no longer have matching rows:
# df_train = df_train[df_train['Education Desert'] == 0]

# Remove the flag so it is not used as a model feature. errors='ignore' keeps
# this cell re-runnable: df_train is overwritten here, so on a second run the
# column is already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns='Education Desert', errors='ignore')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# Feature matrix: the selected columns with the 'Education Desert' flag
# already dropped from df_train.
X = df_train

# Candidate regressors, keyed by the label printed next to their scores.
models = {
    'LinearRegression': LinearRegression(),
    # Fixed seed so the forest (and therefore its scores) is reproducible.
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    # GaussianProcessRegressor() is deliberately not evaluated on the full
    # data: an exact GP builds an n x n kernel matrix, which for the ~59k
    # training rows of each fold is tens of GB and does not finish. To try it,
    # fit on a small random subsample of X / y instead.
}

# Evaluate each model with 5-fold cross-validated R^2.
for name, model in models.items():
    # Standardisation lives inside the pipeline so the scaler is re-fit on the
    # training portion of every fold and never sees the held-out rows.
    pipeline = make_pipeline(StandardScaler(), model)
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')
    print(name, ': ', scores)

LinearRegression :  [0.74088996 0.72471103 0.70102107 0.6602488  0.68066959]
RandomForest :  [0.77457948 0.76949759 0.73778525 0.72594793 0.71749507]
