In [1]:
import pandas as pd

In [2]:
acs = pd.read_csv('./acs_ny.csv')
acs.columns

Index(['Acres', 'FamilyIncome', 'FamilyType', 'NumBedrooms', 'NumChildren',
       'NumPeople', 'NumRooms', 'NumUnits', 'NumVehicles', 'NumWorkers',
       'OwnRent', 'YearBuilt', 'HouseCosts', 'ElectricBill', 'FoodStamp',
       'HeatingFuel', 'Insurance', 'Language'],
      dtype='object')

In [3]:
from patsy import dmatrices

In [5]:
response, predictors = dmatrices("""FamilyIncome ~ NumBedrooms + NumChildren + NumPeople +
                                 NumRooms + NumUnits + NumVehicles + NumWorkers + OwnRent +
                                 YearBuilt + ElectricBill + FoodStamp + HeatingFuel +
                                 Insurance + Language""", data= acs)

In [6]:
response.shape, predictors.shape

((22745, 1), (22745, 39))

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(predictors, response, random_state=0)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((17058, 39), (5687, 39), (17058, 1), (5687, 1))

In [10]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True).fit(X_train, y_train)

In [12]:
lr.coef_

array([[ 3.52265954e-11,  3.13564627e+04,  2.41836840e+04,
         2.83918617e+04,  7.22958631e+03,  1.29216900e+04,
         2.05779266e+04,  1.76483534e+04,  1.75688089e+04,
         2.55256637e+04,  2.98394397e+04,  3.01250228e+04,
         4.31864839e+04,  3.24203787e+04,  3.56206099e+04,
         3.71247034e+04,  3.03513293e+04,  7.36452937e+04,
         1.21871070e+04, -2.74571155e+04,  1.94655236e+04,
         2.58848230e+04,  2.53245174e+04,  2.53580285e+04,
         1.73453310e+04,  8.42499122e+03,  8.89800243e+02,
        -1.87362396e+04, -4.46333339e+03, -1.40946579e+04,
        -2.60334680e+04,  3.44393144e+03,  8.21572330e+03,
        -8.20382606e+03,  5.73549434e+03,  7.48453535e+03,
         2.28362982e+04,  9.33252376e+01,  3.09944119e+01]])

In [14]:
list(zip(predictors.design_info.column_names, lr.coef_[0]))

[('Intercept', 3.522659542873099e-11),
 ('NumUnits[T.Single attached]', 31356.462744870176),
 ('NumUnits[T.Single detached]', 24183.683986675245),
 ('OwnRent[T.Outright]', 28391.861681946193),
 ('OwnRent[T.Rented]', 7229.586310897469),
 ('YearBuilt[T.1940-1949]', 12921.689978379942),
 ('YearBuilt[T.1950-1959]', 20577.926622475392),
 ('YearBuilt[T.1960-1969]', 17648.353419795705),
 ('YearBuilt[T.1970-1979]', 17568.80889625331),
 ('YearBuilt[T.1980-1989]', 25525.663699356304),
 ('YearBuilt[T.1990-1999]', 29839.439721217997),
 ('YearBuilt[T.2000-2004]', 30125.022784558307),
 ('YearBuilt[T.2005]', 43186.48388720416),
 ('YearBuilt[T.2006]', 32420.3787491689),
 ('YearBuilt[T.2007]', 35620.60989305538),
 ('YearBuilt[T.2008]', 37124.70340049855),
 ('YearBuilt[T.2009]', 30351.329331279285),
 ('YearBuilt[T.2010]', 73645.2936886646),
 ('YearBuilt[T.Before 1939]', 12187.107009925863),
 ('FoodStamp[T.Yes]', -27457.115458398534),
 ('HeatingFuel[T.Electricity]', 19465.52355863531),
 ('HeatingFuel[T.G

In [15]:
model_coefs =pd.DataFrame(list(zip(predictors.design_info.column_names, lr.coef_[0])), columns=['variable','coef_lr'])

In [16]:
model_coefs

Unnamed: 0,variable,coef_lr
0,Intercept,3.52266e-11
1,NumUnits[T.Single attached],31356.46
2,NumUnits[T.Single detached],24183.68
3,OwnRent[T.Outright],28391.86
4,OwnRent[T.Rented],7229.586
5,YearBuilt[T.1940-1949],12921.69
6,YearBuilt[T.1950-1959],20577.93
7,YearBuilt[T.1960-1969],17648.35
8,YearBuilt[T.1970-1979],17568.81
9,YearBuilt[T.1980-1989],25525.66


In [17]:
lr.score(X_train, y_train)

0.2726140465638568

In [18]:
lr.score(X_test, y_test)

0.26976979568488124

In [19]:
from sklearn.linear_model import Lasso

In [20]:
lasso = Lasso(normalize=True, random_state=0).fit(X_test, y_test)

In [21]:
coefs_lasso = pd.DataFrame(list(zip(predictors.design_info.column_names, lasso.coef_)),
                          columns=['variable','coef_lasso'])

In [23]:
model_coefs = pd.merge(model_coefs, coefs_lasso, on='variable')

In [24]:
model_coefs

Unnamed: 0,variable,coef_lr,coef_lasso
0,Intercept,3.52266e-11,0.0
1,NumUnits[T.Single attached],31356.46,23847.097905
2,NumUnits[T.Single detached],24183.68,20278.620009
3,OwnRent[T.Outright],28391.86,30153.611697
4,OwnRent[T.Rented],7229.586,1440.140884
5,YearBuilt[T.1940-1949],12921.69,-6382.312453
6,YearBuilt[T.1950-1959],20577.93,-905.14203
7,YearBuilt[T.1960-1969],17648.35,-0.0
8,YearBuilt[T.1970-1979],17568.81,-1579.827129
9,YearBuilt[T.1980-1989],25525.66,7854.066748
