In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
raw_data = pd.read_csv('data/us-census-demographic-data/acs2015_census_tract_data.csv', engine='python')

raw_data.head()

Unnamed: 0,CensusTract,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001020100,Alabama,Autauga,1948,940,1008,0.9,87.4,7.7,0.3,...,0.5,2.3,2.1,25.0,943,77.1,18.3,4.6,0.0,5.4
1,1001020200,Alabama,Autauga,2156,1059,1097,0.8,40.4,53.3,0.0,...,0.0,0.7,0.0,23.4,753,77.0,16.9,6.1,0.0,13.3
2,1001020300,Alabama,Autauga,2968,1364,1604,0.0,74.5,18.6,0.5,...,0.0,0.0,2.5,19.6,1373,64.1,23.6,12.3,0.0,6.2
3,1001020400,Alabama,Autauga,4423,2172,2251,10.5,82.8,3.7,1.6,...,0.0,2.6,1.6,25.3,1782,75.7,21.2,3.1,0.0,10.8
4,1001020500,Alabama,Autauga,10763,4922,5841,0.7,68.5,24.8,0.0,...,0.0,0.6,0.9,24.8,5037,67.1,27.6,5.3,0.0,4.2


In [3]:
df = raw_data

df.shape

(74001, 37)

In [4]:
df.columns

Index(['CensusTract', 'State', 'County', 'TotalPop', 'Men', 'Women',
       'Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific', 'Citizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')

In [5]:
data = df.iloc[:, 3:]

data.head()

Unnamed: 0,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,Citizen,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1948,940,1008,0.9,87.4,7.7,0.3,0.6,0.0,1503,...,0.5,2.3,2.1,25.0,943,77.1,18.3,4.6,0.0,5.4
1,2156,1059,1097,0.8,40.4,53.3,0.0,2.3,0.0,1662,...,0.0,0.7,0.0,23.4,753,77.0,16.9,6.1,0.0,13.3
2,2968,1364,1604,0.0,74.5,18.6,0.5,1.4,0.3,2335,...,0.0,0.0,2.5,19.6,1373,64.1,23.6,12.3,0.0,6.2
3,4423,2172,2251,10.5,82.8,3.7,1.6,0.0,0.0,3306,...,0.0,2.6,1.6,25.3,1782,75.7,21.2,3.1,0.0,10.8
4,10763,4922,5841,0.7,68.5,24.8,0.0,3.8,0.0,7666,...,0.0,0.6,0.9,24.8,5037,67.1,27.6,5.3,0.0,4.2


In [6]:
data = data.dropna()

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72727 entries, 0 to 74000
Data columns (total 34 columns):
TotalPop           72727 non-null int64
Men                72727 non-null int64
Women              72727 non-null int64
Hispanic           72727 non-null float64
White              72727 non-null float64
Black              72727 non-null float64
Native             72727 non-null float64
Asian              72727 non-null float64
Pacific            72727 non-null float64
Citizen            72727 non-null int64
Income             72727 non-null float64
IncomeErr          72727 non-null float64
IncomePerCap       72727 non-null float64
IncomePerCapErr    72727 non-null float64
Poverty            72727 non-null float64
ChildPoverty       72727 non-null float64
Professional       72727 non-null float64
Service            72727 non-null float64
Office             72727 non-null float64
Construction       72727 non-null float64
Production         72727 non-null float64
Drive            

In [8]:
from sklearn import neighbors
from sklearn.model_selection import cross_val_score

# Build our model.
knn = neighbors.KNeighborsRegressor(n_neighbors=15)
X = data.drop(['Unemployment'], axis=1)
Y = data.Unemployment
knn.fit(X, Y)

from sklearn.model_selection import cross_val_score
score = cross_val_score(knn, X, Y, cv=70)
print("Unweighted Accuracy: %0.2f (+/- %0.2f)" % (score.mean(), score.std() * 2))


Unweighted Accuracy: 0.33 (+/- 0.48)


In [None]:
import matplotlib.pyplot as plt

data[['Men', 'Women']].plot.hist()
plt.show()

In [None]:
np.log(data[['Men', 'Women']]+1).plot.hist()
plt.show()

In [None]:
data['White'].plot.hist()
plt.show()

In [None]:
import seaborn as sns

g = sns.pairplot(np.log(data[['Hispanic', 'Black', 'Native',
       'Asian', 'Pacific']]+1))

In [None]:
data = np.log(data+1)

In [None]:
from sklearn import linear_model
# Suppress annoying harmless error.
import warnings
warnings.filterwarnings(action="ignore", module="seaborn", message="^internal gelsd")

lr = linear_model.LinearRegression()
y = data.Unemployment.values.reshape(-1, 1)
X = data.drop(['Unemployment'], axis=1)

# Instantiate our model.
lr = linear_model.LinearRegression()

# Fit our model to our data.
lr.fit(X, y)

# Inspect the results.
print('\nCoefficients: \n', lr.coef_)
print('\nIntercept: \n', lr.intercept_)
print('\nR-squared:')
print(lr.score(X, y))

In [None]:
# Extract predicted values.
predicted = lr.predict(X).ravel()
actual = np.log(data.Unemployment+1)

# Calculate the error, also called the residual.
residual = actual - predicted

# This looks a bit concerning.
plt.hist(residual)
plt.title('Residual counts')
plt.xlabel('Residual')
plt.ylabel('Count')
plt.show()

In [None]:
plt.scatter(predicted, residual)
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.axhline(y=0)
plt.title('Residual vs. Predicted')
plt.show()

Although R^2 is high for OLS, Residuals are not homscedastic
KNN gives a good estimate, but is time consuming