In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline

In [30]:
# Read the training split from the working directory and preview its
# first five rows as the cell output.
train = pd.read_csv('train.csv')
train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Because many models assume that the data are normally distributed, we should check the skewness of the data set provided.

Apart from this, MSSubClass cannot be used as a numeric feature, as its values are just codes for different types of houses.
A lot of categorical variables (ExterCond, etc.) have values such as excellent, good, average and so on, which can be converted into numerical values.

In [31]:
# Recode the ordered categorical columns of `train` as numbers, and replace the
# numeric MSSubClass codes with descriptive labels so that the column is
# treated as a category rather than a quantity.
#
# The recoding is written so that the cell can be re-run safely: mapping a
# column that has already been recoded would turn every value into NaN (and
# then into 0 after the fill), so such columns are left untouched.


def _recode(column, codes, missing=None):
    """Return `column` with its values translated through `codes`.

    Parameters
    ----------
    column : pandas.Series
        The column to translate.
    codes : dict
        Maps each original value to its replacement.
    missing : scalar, optional
        When given, every NaN left after the translation (values that were
        missing to begin with, or that have no entry in `codes`) is replaced
        by this value. When None, NaN values are left as they are.

    Returns
    -------
    pandas.Series
        The translated column. If every non-missing value of `column` is
        already one of the replacement values, the column is assumed to have
        been translated by an earlier run and is not mapped again.
    """
    targets = list(codes.values())
    if missing is not None:
        targets.append(missing)
    observed = column.dropna()
    # An empty selection is never treated as "already encoded", so an
    # all-missing column still goes through the normal map-and-fill path.
    already_encoded = len(observed) > 0 and observed.isin(targets).all()
    recoded = column if already_encoded else column.map(codes)
    return recoded if missing is None else recoded.fillna(missing)


# Shared scale for the columns graded Excellent / Good / Average-Typical /
# Fair / Poor. The 'NA' keys below are kept from the data description, but the
# literal NA in the csv is read as NaN, so the `missing=0` fill is what
# actually assigns 0 to those rows.
quality_codes = {'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'NA':0}
finish_type_codes = {'GLQ':6, 'ALQ':5, 'BLQ':4, 'Rec':3, 'LwQ':2, 'Unf':1, 'NA':0}

ordinal_codes = {}
for c in ['PoolQC', 'GarageCond', 'GarageQual', 'FireplaceQu', 'KitchenQual',
          'BsmtCond', 'BsmtQual', 'ExterCond', 'ExterQual', 'HeatingQC']:
    ordinal_codes[c] = quality_codes
ordinal_codes['Fence'] = {'GdPrv':4, 'MnPrv':3, 'GdWo':2, 'MnWw':1, 'NA':0}
ordinal_codes['GarageFinish'] = {'Fin':3, 'RFn':2, 'Unf':1, 'NA':0}
for c in ['BsmtFinType1', 'BsmtFinType2']:
    ordinal_codes[c] = finish_type_codes
ordinal_codes['BsmtExposure'] = {'Gd':4, 'Av':3, 'Mn': 2, 'No':1, 'NA':0}

for c, codes in ordinal_codes.items():
    train[c] = _recode(train[c], codes, missing=0)

# MSSubClass holds dwelling-type codes, not quantities. No fill is applied
# here: a code without a label stays NaN, exactly as a plain `.map` would
# leave it.
subclass_labels = {
    20: '1-STORY 1946 & NEWER ALL STYLES',
    30: '1-STORY 1945 & OLDER',
    40: '1-STORY W/FINISHED ATTIC ALL AGES',
    45: '1-1/2 STORY - UNFINISHED ALL AGES',
    50: '1-1/2 STORY FINISHED ALL AGES',
    60: '2-STORY 1946 & NEWER',
    70: '2-STORY 1945 & OLDER',
    75: '2-1/2 STORY ALL AGES',
    80: 'SPLIT OR MULTI-LEVEL',
    85: 'SPLIT FOYER',
    90: 'DUPLEX - ALL STYLES AND AGES',
    120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
    150: '1-1/2 STORY PUD - ALL AGES',
    160: '2-STORY PUD - 1946 & NEWER',
    180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
    190: '2 FAMILY CONVERSION - ALL STYLES AND AGES',
}
train['MSSubClass'] = _recode(train['MSSubClass'], subclass_labels)

In [33]:
# Despite all these attempts the score is not going up. Something is seriously off. The original solution of just applying
# the log of the values is performing much better than these improvements.
train.dtypes

Id                 int64
MSSubClass        object
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual          int64
ExterCond          int64
Foundation        object
                  ...   
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual        int64
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu      float64
GarageType        object
GarageYrBlt      float64


In [3]:
import pandas as pd
import numpy as np
from scipy.stats import skew
from sklearn.linear_model import LassoCV
from sklearn.cross_validation import cross_val_score
from sklearn.feature_selection import SelectFromModel


# Load both splits fresh from disk.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Stack the predictor columns (MSSubClass through SaleCondition) of both
# splits so that every transformation below is applied to them together.
feature_span = slice('MSSubClass', 'SaleCondition')
all_data = pd.concat((train.loc[:, feature_span],
                      test.loc[:, feature_span]))

# The target is modelled on the log1p scale.
train["SalePrice"] = np.log1p(train["SalePrice"])

# Columns whose dtype is not "object" are treated as numeric features.
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Skewness of each numeric feature, measured on the training rows only and
# ignoring missing values. Both skew directions are read from this one result.
train_skew = train[numeric_feats].apply(lambda col: skew(col.dropna()))

# Right-skewed features (skew > 0.8) are compressed with log1p.
skewed_feats = train_skew[train_skew > 0.8].index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# Left-skewed features (skew < -0.8) are squared.
skewed_feats_left = train_skew[train_skew < -0.8].index
all_data[skewed_feats_left] = np.square(all_data[skewed_feats_left])

# One-hot encode what is left as categorical, then fill the remaining gaps
# with the per-column mean of the stacked frame.
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())

# Split the stacked frame back by position: training rows come first.
n_train = train.shape[0]
X_train = all_data[:n_train]
X_test = all_data[n_train:]
y = train.SalePrice

In [4]:
# Cross-validated Lasso over a small grid of candidate regularisation
# strengths; `fit` hands back the fitted estimator, which is what gets bound
# to `model_lasso`.
lasso_cv = LassoCV(alphas=[1, 0.1, 0.001, 0.0005])
model_lasso = lasso_cv.fit(X_train, y)

In [9]:
# Wrap the fitted Lasso coefficients in a Series (default integer index) and
# show only the strictly positive ones.
a = pd.Series(model_lasso.coef_)
# Call print with parentheses: with a single argument this prints the same
# text under Python 2 and is the only form that parses under Python 3.
print(a[a > 0])

2      0.071826
3      0.053160
4      0.043027
5      0.001777
6      0.000722
8      0.009292
11     0.019208
12     0.030350
15     0.400009
16     0.025674
18     0.021307
19     0.013793
22     0.005079
23     0.022117
24     0.000029
25     0.038789
26     0.000051
27     0.004409
28     0.001631
29     0.003123
30     0.002501
31     0.009100
32     0.002705
37     0.008027
44     0.015922
46     0.001098
55     0.006653
56     0.027076
61     0.009816
66     0.041226
67     0.017969
69     0.105138
78     0.085717
79     0.071620
84     0.043437
85     0.112493
90     0.048968
105    0.019074
109    0.001516
131    0.038890
135    0.070464
140    0.013210
144    0.002614
160    0.003872
175    0.018121
178    0.016816
182    0.050350
191    0.038701
208    0.020436
212    0.027081
224    0.063709
234    0.072597
237    0.003146
241    0.006315
245    0.011446
246    0.001945
261    0.005346
279    0.021575
286    0.019666
dtype: float64


In [8]:
# NOTE(review): this cell's execution count (In [8]) is lower than that of the
# cell that assigns the coefficient Series to `a` (In [9]), and the recorded
# output is a single `True` -- so `a` was presumably bound to a scalar when this
# ran. Confirm by re-running on a fresh kernel before relying on this output.
a > 0

True