In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline, BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer, RobustScaler, StandardScaler, KernelCenterer, Normalizer
from sklearn.decomposition import PCA



In [3]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [17]:
class ProcessDataFrame(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_result = X.copy().drop(['Id'], axis=1)
        ordered_levels = {
            "Alley": ["Grvl", "Pave"],
            "BsmtCond": ["Po", "Fa", "TA", "Gd"],
            "BsmtExposure": ["No", "Mn", "Av", "Gd"],
            "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
            "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
            "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
            "CentralAir": ["N", "Y"],
            "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
            "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
            "ExterQual": ["Fa", "TA", "Gd", "Ex"],
            "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
            "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
            'Functional': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
            "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
            "GarageFinish": ["Unf", "RFn", "Fin"],
            "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
            "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
            "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
            "LotShape": ["IR3", "IR2", "IR1", "Reg"],
            "PavedDrive": ["N", "P", "Y"],
            "PoolQC": ["Fa", "Gd", "Ex"],
            "Street": ["Grvl", "Pave"],   
            "Utilities": ["NoSeWa", "AllPub"]
        }
        
        unordered_levels = {
            "Alley": ["Grvl", "Pave"],
            "BsmtCond": ["Po", "Fa", "TA", "Gd"],
            "BsmtExposure": ["No", "Mn", "Av", "Gd"],
            "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
            "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
            "BsmtQual": ["Fa", "TA", "Gd", "Ex"],
            "CentralAir": ["N", "Y"],
            "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
            "ExterCond": ["Po", "Fa", "TA", "Gd", "Ex"],
            "ExterQual": ["Fa", "TA", "Gd", "Ex"],
            "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
            "FireplaceQu": ["Po", "Fa", "TA", "Gd", "Ex"],
            'Functional': ['Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
            "GarageCond": ["Po", "Fa", "TA", "Gd", "Ex"],
            "GarageFinish": ["Unf", "RFn", "Fin"],
            "GarageQual": ["Po", "Fa", "TA", "Gd", "Ex"],
            "HeatingQC": ["Po", "Fa", "TA", "Gd", "Ex"],
            "KitchenQual": ["Fa", "TA", "Gd", "Ex"],
            "LotShape": ["IR3", "IR2", "IR1", "Reg"],
            "PavedDrive": ["N", "P", "Y"],
            "PoolQC": ["Fa", "Gd", "Ex"],
            "Street": ["Grvl", "Pave"],   
            "Utilities": ["NoSeWa", "AllPub"],
            'BldgType': ['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'],
            'Condition1': ['Norm', 'Feedr', 'PosN', 'Artery', 'RRAe', 'RRNn', 'RRAn', 'PosA', 'RRNe'],
            'Condition2': ['Norm', 'Artery', 'RRNn', 'Feedr', 'PosN', 'PosA', 'RRAn', 'RRAe'],
            'Exterior1st': ['VinylSd', 'MetalSd', 'Wd Sdng', 'HdBoard', 'BrkFace', 'WdShing', 'CemntBd', 'Plywood', 
                            'AsbShng','Stucco', 'BrkComm', 'AsphShn', 'Stone', 'ImStucc', 'CBlock'],
            'Exterior2nd': ['VinylSd', 'MetalSd', 'Wd Shng', 'HdBoard', 'Plywood', 'Wd Sdng', 'CmentBd', 'BrkFace', 
                            'Stucco', 'AsbShng', 'Brk Cmn', 'ImStucc', 'AsphShn', 'Stone', 'Other', 'CBlock'],
            'Foundation': ['PConc', 'CBlock', 'BrkTil', 'Wood', 'Slab', 'Stone'],
            'GarageType': ['Attchd', 'Detchd', 'BuiltIn', 'CarPort', 'Basment', '2Types'],
            'Heating': ['GasA', 'GasW', 'Grav', 'Wall', 'OthW', 'Floor'],
            'HouseStyle': ['2Story', '1Story', '1.5Fin', '1.5Unf', 'SFoyer', 'SLvl', '2.5Unf', '2.5Fin'],
            'LandContour': ['Lvl', 'Bnk', 'Low', 'HLS'],
            'LandSlope': ['Sev', 'Mod', 'Gtl'],
            'LotConfig': ['Inside', 'FR2', 'Corner', 'CulDSac', 'FR3'],
            'MSZoning': ['RL', 'RM', 'C (all)', 'FV', 'RH'],
            'MasVnrType': ['BrkFace', 'None', 'Stone', 'BrkCmn'],
            'MiscFeature': ['Shed', 'Gar2', 'Othr', 'TenC'],
            'Neighborhood': ['CollgCr', 'Veenker', 'Crawfor', 'NoRidge', 'Mitchel', 'Somerst', 'NWAmes', 
                             'OldTown', 'BrkSide', 'Sawyer', 'NridgHt', 'NAmes', 'SawyerW', 'IDOTRR', 
                             'MeadowV', 'Edwards', 'Timber', 'Gilbert', 'StoneBr', 'ClearCr', 'NPkVill', 
                             'Blmngtn', 'BrDale', 'SWISU', 'Blueste'],
            'RoofMatl': ['CompShg', 'WdShngl', 'Metal', 'WdShake', 'Membran', 'Tar&Grv', 'Roll', 'ClyTile'],
            'RoofStyle': ['Gable', 'Hip', 'Gambrel', 'Mansard', 'Flat', 'Shed'],
            'SaleCondition': ['Normal', 'Abnorml', 'Partial', 'AdjLand', 'Alloca', 'Family'],
            'SaleType': ['WD', 'New', 'COD', 'ConLD', 'ConLI', 'CWD', 'ConLw', 'Con', 'Oth']}
        
        X_result["HasFireplace"] = X_result["FireplaceQu"].isnull() * 1
        
        # skewed columns (>0.75)
        for c in ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF2', '1stFlrSF', 'GrLivArea', 'KitchenAbvGr',
                  'OpenPorchSF', 'PoolArea', 'MiscVal']:
            X_result[c] = np.log1p(X_result[c])
        
## MSSubClass     
## BsmtFinSF1
## BsmtUnfSF 
## TotalBsmtSF 
## 2ndFlrSF
## LowQualFinSF 
## BsmtHalfBath 
## WoodDeckSF 
## EnclosedPorch 
## 3SsnPorch
## ScreenPorch 

        
        for c in X_result.columns:
            #convert ordered categorical to integers
            if c in ordered_levels:
                X_result[c] = X_result[c].astype('category', categories=ordered_levels[c], ordered=True).cat.codes
            #apply levels to unordered categorical columns
            if c in unordered_levels:
                X_result[c] = X_result[c].astype('category', categories=unordered_levels[c])
        
        #convert unordered categorical to dummy columns        
        X_result = pd.get_dummies(X_result)
        return X_result
    
    def fit_transform(self, X, y=None):
        result = self.transform(X)
        return result

In [18]:
def load_data():
    train = pd.read_csv('../input/train.csv')
    test = pd.read_csv('../input/test.csv')            
    y = np.log(train['SalePrice'].values)
    X = train.drop(['SalePrice'], axis=1)
    X_submission = test
    ids_submission = test['Id'].values
    return y, X, X_submission, ids_submission

In [19]:
# load data
y, X, X_submission, ids_submission = load_data()

In [20]:
# parameters = {
#      'gradientboostingregressor__learning_rate': [0.1],
#      'gradientboostingregressor__max_depth': [4],
#      'gradientboostingregressor__max_features': [0.7],
#      'gradientboostingregressor__min_samples_leaf': [1],
#      'gradientboostingregressor__min_samples_split': [2],
#      'gradientboostingregressor__n_estimators': [250],
#      'gradientboostingregressor__subsample': [0.75],
# }

parameters = {
    'lasso__alpha': [0.0008, 0.0009],
    'lasso__l1_ratio': [0.4, 0.45, 0.5]
#     'estimator__lasso__copy_X': True,
#     'estimator__lasso__fit_intercept': True,
#     'estimator__lasso__max_iter': 1000,
#     'estimator__lasso__normalize': False,
#     'estimator__lasso__positive': False,
#     'estimator__lasso__precompute': False,
#     'estimator__lasso__random_state': 1773,
#     'estimator__lasso__selection': 'cyclic',
#     'estimator__lasso__tol': 0.0001,
#     'estimator__lasso__warm_start': False
}

pipeline = Pipeline([('process', ProcessDataFrame()),
                     ('impute', Imputer(strategy='most_frequent')),
                     ('scaler', KernelCenterer()),                    
                     ('lasso', ElasticNet(random_state=1773, max_iter=10000))])
#0.0005
#RobustScaler, StandardScaler, KernelCenterer, Normalizer

folds = KFold(10, random_state=42)

model = GridSearchCV(pipeline,
                     parameters,
                     
                     #n_iter=60, 
                     #random_state=1773,
                     scoring=make_scorer(rmse, greater_is_better=False), 
                     cv=folds, n_jobs=4)

In [21]:
pipeline.get_params()

{'impute': Imputer(axis=0, copy=True, missing_values='NaN', strategy='most_frequent',
     verbose=0),
 'impute__axis': 0,
 'impute__copy': True,
 'impute__missing_values': 'NaN',
 'impute__strategy': 'most_frequent',
 'impute__verbose': 0,
 'lasso': ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
       max_iter=10000, normalize=False, positive=False, precompute=False,
       random_state=1773, selection='cyclic', tol=0.0001, warm_start=False),
 'lasso__alpha': 1.0,
 'lasso__copy_X': True,
 'lasso__fit_intercept': True,
 'lasso__l1_ratio': 0.5,
 'lasso__max_iter': 10000,
 'lasso__normalize': False,
 'lasso__positive': False,
 'lasso__precompute': False,
 'lasso__random_state': 1773,
 'lasso__selection': 'cyclic',
 'lasso__tol': 0.0001,
 'lasso__warm_start': False,
 'process': ProcessDataFrame(),
 'scaler': KernelCenterer(),
 'steps': [('process', ProcessDataFrame()),
  ('impute',
   Imputer(axis=0, copy=True, missing_values='NaN', strategy='most_frequent',
       

In [22]:
#from scipy.stats import skew

In [23]:
# for c in X.columns:
#     if X[c].dtype != 'object':
#         xs = np.abs(skew(X[c], nan_policy='omit'))
#         if xs > 0.75:
#             print c

In [24]:
#-0.123892361064

In [25]:
%%time
model.fit(X,y);
print 'best score:', model.best_score_
print 'best parameters:', model.best_params_

best score: -0.128521033837
best parameters: {'lasso__l1_ratio': 0.4, 'lasso__alpha': 0.0009}
CPU times: user 3.69 s, sys: 225 ms, total: 3.91 s
Wall time: 1min 10s
