In [1]:
import os
import pandas as pd

In [2]:
# Resolve the 'data' directory one level above the current working directory
# and list what it contains.
working_dir = os.path.realpath('.')
datadir = os.path.join(working_dir, '..', 'data')
print(os.listdir(datadir))

['csv_pca.zip', 'ss15hca.csv', 'PUMSDataDict15.pdf', 'ACS2015_PUMS_README.pdf', 'csv_hca.zip', 'ss15pca.csv']


In [3]:
# Load the ss15hca.csv file from the data directory and report (rows, columns).
hca_path = os.path.join(datadir, 'ss15hca.csv')
hca = pd.read_csv(hca_path)
print(hca.shape)

(155218, 235)


In [4]:
# Basic filtering: keep only rows whose value equals the required code
# for every column listed in `filters`.
#   TYPE == 1 -> housing units only (no group quarters)
#   MV == 1   -> moved in the last year
filters = {'TYPE': 1, 'MV': 1}

# Row count before filtering
print(len(hca))

for column, required in filters.items():
    hca = hca[hca[column] == required]

# Row count after filtering
print(len(hca))

155218
15928


In [5]:
# Remove weighting variables: drop every column whose name begins with
# lowercase 'wgtp'. The prefix comparison is case-sensitive, so a column
# spelled with uppercase letters is not matched.
cols = list(hca.columns)
print(len(cols))

new_cols = [name for name in cols if name[:4] != 'wgtp']
print(len(new_cols))

hca = hca.loc[:, new_cols]
print(hca.shape)

235
155
(15928, 155)


In [6]:
# Recode tenure to binary and remove old variable.
# TEN codes 1 and 2 become tenure_own = 1, code 3 becomes tenure_own = 0.
# Rows with any other TEN value keep a null tenure_own and are dropped.
own_mask = hca['TEN'].isin([1, 2])
not_own_mask = hca['TEN'] == 3

hca.loc[own_mask, 'tenure_own'] = 1
hca.loc[not_own_mask, 'tenure_own'] = 0

hca = hca[hca['tenure_own'].notnull()].drop('TEN', axis=1)
print(hca.shape)

(15667, 155)


In [7]:
# Keep only the columns whose dtype is not 'object' (i.e. drop string columns).
non_object = hca.dtypes != 'object'
numeric_cols = non_object[non_object].index.tolist()

hca = hca.loc[:, numeric_cols]
print(hca.shape)

(15667, 154)


In [8]:
# Remove columns that have NAs (have to return to this):
# a column is kept only if every one of its values is non-null.
complete_columns = hca.notnull().all(axis=0)
hca = hca.loc[:, complete_columns]
print(hca.shape)

(15667, 117)


In [9]:
# Original (un-recoded) housing-unit columns used in this analysis.
housing_vars = ['ACCESS', 'BATH', 'RMSP', 'YBL', 'KIT']

In [10]:
def recode_binary(df, oldvar, newvar):
    """Write a 0/1 indicator column derived from a 1/2-coded column.

    Rows where ``df[oldvar]`` equals 1 get ``newvar`` = 1 and rows where it
    equals 2 get ``newvar`` = 0. Rows with any other value are not assigned.
    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    oldvar : str
        Name of the existing column holding the 1/2 codes.
    newvar : str
        Name of the indicator column to write.
    """
    # (source code, indicator value) pairs, applied in this order
    for code, indicator in ((1, 1), (2, 0)):
        df.loc[df[oldvar] == code, newvar] = indicator

In [11]:
# Recoding categorical variables into 0/1 indicator columns.
# Rows whose source code matches none of the listed values are not assigned.

# ACCESS: codes 1 and 2 -> 1, code 3 -> 0
access_codes = hca['ACCESS']
hca.loc[access_codes.isin([1, 2]), 'access_recode'] = 1
hca.loc[access_codes == 3, 'access_recode'] = 0

# BATH: code 1 -> 1, code 2 -> 0
recode_binary(hca, 'BATH', 'bath_recode')

# YBL: codes 1 through 6 -> 1, codes 7 through 19 -> 0
ybl_codes = hca['YBL']
hca.loc[ybl_codes.isin(list(range(1, 7))), 'before1990'] = 1
hca.loc[ybl_codes.isin(list(range(7, 20))), 'before1990'] = 0

# KIT: code 1 -> 1, code 2 -> 0
recode_binary(hca, 'KIT', 'kit_recode')

# Housing columns after recoding (RMSP is carried over unchanged)
housing_vars_recode = ['access_recode', 'bath_recode', 'RMSP', 'before1990', 'kit_recode']

In [12]:
# Original (un-recoded) household columns used in this analysis.
household_vars = [
    'FS', 'LAPTOP', 'VEH', 'HHL', 'HHT', 'HINCP', 'HUGCL',
    'HUPAC', 'LNGI', 'MULTG', 'NR', 'PARTNER', 'SSMC',
]

In [13]:
# Recode household variables into 0/1 indicator columns.
# Each rule is (source column, indicator column, source codes, value written).
# Rules are applied top to bottom; rows whose source code matches no rule
# for a given indicator are not assigned.
household_recode_rules = [
    ('FS', 'fs_recode', [1], 1),
    ('FS', 'fs_recode', [2], 0),
    ('LAPTOP', 'laptop_recode', [1], 1),
    ('LAPTOP', 'laptop_recode', [2], 0),
    ('HHL', 'english_hh', [1], 1),
    ('HHL', 'english_hh', [2, 3, 4, 5], 0),
    ('HHT', 'single_parent', [1], 0),
    ('HHT', 'single_parent', [4, 5, 6, 7], 0),
    ('HHT', 'single_parent', [2, 3], 1),
    ('HHT', 'nonfamily', [1], 0),
    ('HHT', 'nonfamily', [4, 5, 6, 7], 1),
    ('HHT', 'nonfamily', [2, 3], 0),
    ('HUPAC', 'children', [4], 0),
    ('HUPAC', 'children', [1, 2, 3], 1),
    ('LNGI', 'good_english_speaker', [1], 1),
    ('LNGI', 'good_english_speaker', [2], 0),
    ('MULTG', 'multigen', [1], 1),
    ('MULTG', 'multigen', [2], 0),
    ('PARTNER', 'unmarried_partner', [0], 0),
    ('PARTNER', 'unmarried_partner', [1, 2, 3, 4], 1),
    ('SSMC', 'samesex_marriage', [0], 0),
    ('SSMC', 'samesex_marriage', [1, 2], 1),
]

for source, target, codes, flag in household_recode_rules:
    hca.loc[hca[source].isin(codes), target] = flag

# Household columns after recoding; VEH, HINCP, HUGCL and NR are carried over unchanged.
household_vars_recode = [
    'fs_recode', 'laptop_recode', 'VEH', 'english_hh', 'single_parent',
    'nonfamily', 'HINCP', 'HUGCL', 'children', 'good_english_speaker',
    'multigen', 'NR', 'unmarried_partner', 'samesex_marriage',
]

In [14]:
# hca = hca[housing_vars + household_vars + ['tenure_own']]
# hca = hca[household_vars + ['tenure_own']]
hca = hca[household_vars_recode + ['tenure_own']]

In [15]:
hca[housing_vars + household_vars + ['tenure_own']].to_csv('./../hca_all.csv')
hca[household_vars + ['tenure_own']].to_csv('./../hca_household_vars.csv')
hca[housing_vars_recode + household_vars_recode + ['tenure_own']].to_csv('./../hca_all_recode.csv')
hca[household_vars_recode + ['tenure_own']].to_csv('./../hca_household_vars_recode.csv')

KeyError: "['ACCESS' 'BATH' 'RMSP' 'YBL' 'KIT' 'FS' 'LAPTOP' 'HHL' 'HHT' 'HUPAC'\n 'LNGI' 'MULTG' 'PARTNER' 'SSMC'] not in index"

In [16]:
# Prepare data for models.
# DataFrame.as_matrix() is deprecated and absent from newer pandas releases;
# .values returns the same 2-D NumPy array and works on old and new versions.
data = hca.values

# Every column except the last is a predictor; the last column is the target.
X = data[:, :-1]
y = data[:, -1]

In [17]:
# Display the predictor matrix (the notebook shows an abbreviated repr of the array)
X

array([[ 0.,  1.,  2., ...,  0.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  1.,  0.],
       [ 0.,  1.,  1., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  2., ...,  1.,  0.,  0.],
       [ 1.,  1.,  0., ...,  0.,  0.,  0.]])

In [18]:
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score
import numpy as np

In [19]:
from sklearn.tree import DecisionTreeClassifier
import itertools

In [20]:
# Sweep the maximum depth of a decision tree from 1 to 9 and print, for each
# depth, the mean cross-validated score.
# cross_val_score fits its own clones of the estimator on each training fold,
# so the estimator is no longer fitted on the full data first; that extra fit
# did not affect the reported scores.
for i in range(1,10):
    tree = DecisionTreeClassifier(max_depth=i)
    print('%s %s' % (i, cross_val_score(tree, X, y).mean()))

1 0.738175789508
2 0.745133654605
3 0.760771212502
4 0.763643308517
5 0.764664616351
6 0.764664653016
7 0.763196639871
8 0.759813749392
9 0.758473291261


In [22]:
# Print the cross-validated score of `tree` for each fold, one per line.
for fold_score in cross_val_score(tree, X, y):
    print(fold_score)

0.757419107792
0.756032171582
0.763309076982


In [66]:
# Fit a single decision tree with max_depth=5 on all of X, y.
# NOTE(review): depth 5 presumably comes from the depth sweep earlier in the
# notebook, where depths 5 and 6 had the best mean score — confirm.
tree = DecisionTreeClassifier(max_depth=5)
# Last expression in the cell, so the notebook echoes the fitted estimator.
tree.fit(X,y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [18]:
from sklearn.tree import export_graphviz
# The tree is fit on X, whose columns are the recoded household variables
# (household_vars_recode, in that order). The node labels must use those same
# names; housing_vars + household_vars has a different length and different
# names, so it mislabels the features.
export_graphviz(decision_tree=tree, feature_names=household_vars_recode)

In [67]:
from sklearn.ensemble import RandomForestClassifier

In [68]:
# Sweep the maximum depth of a random forest from 1 to 19 and print, for each
# depth, the mean cross-validated score.
# cross_val_score fits its own clones of the estimator on each training fold,
# so the estimator is no longer fitted on the full data first.
# Note: the name `tree` is kept, although it is bound to a random forest here.
for i in range(1,20):
    tree = RandomForestClassifier(max_depth=i)
    print('%s %s' % (i, cross_val_score(tree, X, y).mean()))

1 0.738175789508
2 0.738175789508
3 0.750686960196
4 0.756940663441
5 0.759813064992
6 0.76383465937
7 0.766260025632
8 0.765749646697
9 0.766324249221
10 0.765047256951
11 0.762557960414
12 0.76128144478
13 0.757196604526
14 0.750367101058
15 0.745516087441
16 0.742580000044
17 0.734856413776
18 0.732431169728
19 0.731728853311


In [81]:
# Sweep the LinearSVC regularisation parameter C from 1 to 9 and print, for
# each value, the mean cross-validated score.
# cross_val_score fits its own clones of the estimator on each training fold,
# so the estimator is no longer fitted on the full data first.
# NOTE(review): the recorded scores jump erratically from one C to the next,
# and X is passed in without rescaling — consider standardising the features
# before fitting (TODO confirm this is the cause).
for i in range(1,10):
    model = LinearSVC(C=i)
    print('%s %s' % (i, cross_val_score(model, X, y).mean()))

1 0.713600312321
2 0.67281136702
3 0.670769142437
4 0.730708156547
5 0.72885628066
6 0.734537483465
7 0.724580358425
8 0.682896963682
9 0.718579406473


In [82]:
from sklearn.neighbors import KNeighborsClassifier

In [83]:
# Grid over the number of neighbours (1, 6, 11, ..., 96) and the weighting
# scheme, printing the mean cross-validated score for each combination.
# cross_val_score fits its own clones of the estimator on each training fold,
# so the estimator is no longer fitted on the full data first.
for i, j in itertools.product(range(1,100,5), ['uniform', 'distance']):
    model = KNeighborsClassifier(n_neighbors=i, weights=j)
    print('%s %s %s' % (i, j, cross_val_score(model, X, y).mean()))

1 uniform 0.673645393736
1 distance 0.673645393736
6 uniform 0.737218277513
6 distance 0.712070141008
11 uniform 0.736771657752
11 distance 0.716984962685
16 uniform 0.745069113259
16 distance 0.71992102564
21 uniform 0.74360123455
21 distance 0.721644368799
26 uniform 0.74736712005
26 distance 0.722601721916
31 uniform 0.746792493082
31 distance 0.724069808389
36 uniform 0.746218159429
36 distance 0.724452778968
41 uniform 0.746026771912
41 distance 0.724835737324
46 uniform 0.744622579049
46 distance 0.724835712881
51 uniform 0.744814015452
51 distance 0.725027173728
56 uniform 0.74545236493
56 distance 0.725410168749
61 uniform 0.745260891862
61 distance 0.725346299581
66 uniform 0.746665023618
66 distance 0.725473989031
71 uniform 0.745835249958
71 distance 0.725601641816
76 uniform 0.746154229155
76 distance 0.725984600173
81 uniform 0.747622388957
81 distance 0.725729282381
86 uniform 0.747813923131
86 distance 0.726048396013
91 uniform 0.747686172574
91 distance 0.726112216295
9

In [84]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

In [85]:
# Logistic regression with built-in cross-validation, constructed with all default settings
model = LogisticRegressionCV()

In [86]:
# Fit on all of X, y; as the last expression, the notebook echoes the fitted estimator
model.fit(X, y)

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [87]:
# Cross-validated scores for the logistic regression model, one value per fold
cross_val_score(model, X, y)

array([ 0.74210224,  0.74894676,  0.7384144 ])

In [88]:
# Fitted coefficients, one per column of X (same order as the predictor columns of hca).
# X was passed in without rescaling, so magnitudes are not directly comparable across features.
model.coef_

array([[ -7.92149634e-02,  -1.24275410e-01,   2.31236445e-02,
         -9.25694448e-02,  -7.51622806e-02,  -1.96098075e-01,
          2.41771083e-06,  -2.28549411e-04,  -4.98295732e-02,
         -1.79444474e-01,  -2.24877849e-01,  -1.08959018e-01,
         -4.58386253e-02,   7.26379113e-04]])