In [113]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
from scipy.stats import norm, skew #for some statistics
warnings.filterwarnings('ignore')
%matplotlib

Using matplotlib backend: TkAgg


In [114]:
# 加载训练集和测试集的数据
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [115]:
train.drop(train[(train.GrLivArea>4000)&(train.SalePrice<300000)].index, inplace=True)

In [116]:
train.drop(train[(train.TotalBsmtSF>3000)&(train.SalePrice<300000)].index, inplace=True)

In [117]:
# 拼接训练集和测试集数据，并删除Id,SalePrice
full = pd.concat([train,test],ignore_index=True)
full.drop(['Id','SalePrice'],axis=1,inplace=True)

In [118]:
full['LotFrontage'] = np.sqrt(full['LotArea'])

In [119]:
cols = ["PoolQC","MiscFeature","Alley","Fence","FireplaceQu"]
for col in cols:
    full[col].fillna('None',inplace=True)

for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    full[col].fillna('None',inplace=True)

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    full[col].fillna(0,inplace=True)
    
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    full[col].fillna(0,inplace=True)
    
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    full[col].fillna('None',inplace=True)

full["MasVnrType"].fillna("None",inplace=True)
full["MasVnrArea"].fillna(0,inplace=True)
full['MSZoning'].fillna(full['MSZoning'].mode()[0],inplace=True)
full.drop(['Utilities'], axis=1,inplace=True)
full["Functional"].fillna("Typ",inplace=True)
full['Electrical'].fillna(full['Electrical'].mode()[0],inplace=True)
full['KitchenQual'].fillna(full['KitchenQual'].mode()[0],inplace=True)
full['Exterior1st'].fillna(full['Exterior1st'].mode()[0],inplace=True)
full['Exterior2nd'].fillna(full['Exterior2nd'].mode()[0],inplace=True)
full['SaleType'].fillna(full['SaleType'].mode()[0],inplace=True)
full['MSSubClass'].fillna("None",inplace=True)

In [120]:
from sklearn.preprocessing import LabelEncoder

In [121]:
full['YearBuilt'] = LabelEncoder().fit_transform(full.YearBuilt)

In [122]:
# 增加新的特征，与SalePrice有较高的关系
full['TotalSF'] = full['TotalBsmtSF'] + full['1stFlrSF'] +full['2ndFlrSF']

In [123]:
full['YrBltAndRemod'] = full['YearBuilt'] + full['YearRemodAdd']

In [124]:
full['Total_sqr_footage'] = full['BsmtFinSF1']+full['BsmtFinSF2']+full['1stFlrSF']+full['2ndFlrSF']

In [125]:
full['Total_Bathrooms'] = full['FullBath'] + full['BsmtFullBath'] + 0.5*full['HalfBath'] + 0.5*full['BsmtHalfBath']

In [127]:
full = pd.get_dummies(full)

In [128]:
from sklearn.preprocessing import StandardScaler,RobustScaler

In [129]:
full = RobustScaler().fit_transform(full)

In [130]:
train_len = len(train)
train_x = full[:train_len]
test_x = full[train_len:]
y = np.log(train.SalePrice)

# 模型融合

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,cross_val_score
from sklearn.metrics import mean_squared_error

In [60]:
def rmse(model,X,y):
    rmse = np.sqrt(-cross_val_score(model,X,y,scoring='neg_mean_squared_error',cv=5))
    return rmse

In [66]:
def benchmark(model):
    pred = model.predict(train_x)
    rmse = np.sqrt(mean_squared_error(y,pred))
    return rmse

# 模型测试

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet,SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor 

# Ridge Regression

In [62]:
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV

In [131]:
kfolds = KFold(n_splits=10,shuffle=True,random_state=123)

In [132]:
alphas = [0.0001,0.0003,0.0005, 0.0009,
          0.01,0.03,0.05, 0.09,
          0.1,0.3,0.9,
          1,3,5,7,9,
          10,20,30,40,60,70,80,90]
rcv = RidgeCV(alphas=alphas,cv=kfolds)
rcv.fit(train_x,y)

RidgeCV(alphas=array([1.e-04, 3.e-04, 5.e-04, 9.e-04, 1.e-02, 3.e-02, 5.e-02, 9.e-02,
       1.e-01, 3.e-01, 9.e-01, 1.e+00, 3.e+00, 5.e+00, 7.e+00, 9.e+00,
       1.e+01, 2.e+01, 3.e+01, 4.e+01, 6.e+01, 7.e+01, 8.e+01, 9.e+01]),
        cv=KFold(n_splits=10, random_state=123, shuffle=True),
        fit_intercept=True, gcv_mode=None, normalize=False, scoring=None,
        store_cv_values=False)

In [133]:
benchmark(rcv)

0.0980596610917399

# Lasso Regression

In [134]:
from sklearn.linear_model import LassoCV

In [135]:
alphas2 = [0.00005,0.0001,0.0002,0.0005,0.0006,0.0007,0.0008,0.0009]

In [136]:
lcv = LassoCV(alphas=alphas2, random_state=123,cv=10).fit(train_x,y)

In [137]:
benchmark(lcv)

0.09923536068418425

# Elastic Net

In [138]:
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

elastic_cv = make_pipeline( 
                           ElasticNetCV( alphas=e_alphas, 
                                        cv=kfolds, l1_ratio=e_l1ratio))

In [139]:
elastic_model3=elastic_cv.fit(train_x,y)

In [140]:
benchmark(elastic_model3)

0.10098244195069919

# xgboost

In [141]:
import xgboost as xgb

In [142]:
#XGBoost

xg_reg=xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [143]:
xg_reg.fit(train_x,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.4603, gamma=0.0468, importance_type='gain',
             learning_rate=0.05, max_delta_step=0, max_depth=3,
             min_child_weight=1.7817, missing=None, n_estimators=2200, n_jobs=1,
             nthread=-1, objective='reg:linear', random_state=7,
             reg_alpha=0.464, reg_lambda=0.8571, scale_pos_weight=1, seed=None,
             silent=1, subsample=0.5213)

In [144]:
benchmark(xg_reg)

0.07799424675530847

In [145]:
from sklearn.ensemble import GradientBoostingRegressor

In [146]:
gbr = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [147]:
gbr.fit(train_x,y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.05, loss='huber', max_depth=4,
                          max_features='sqrt', max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=15, min_samples_split=10,
                          min_weight_fraction_leaf=0.0, n_estimators=3000,
                          n_iter_no_change=None, presort='auto', random_state=5,
                          subsample=1.0, tol=0.0001, validation_fraction=0.1,
                          verbose=0, warm_start=False)

In [148]:
benchmark(gbr)

0.05043606374001577

In [158]:
svr = SVR(C=20,epsilon=0.008,gamma=0.003)

In [159]:
svr.fit(train_x,y)

SVR(C=20, cache_size=200, coef0=0.0, degree=3, epsilon=0.008, gamma=0.003,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [160]:
benchmark(svr)

0.036080260234947766

In [172]:
from lightgbm import LGBMRegressor

In [173]:
lightgbm = LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       #min_data_in_leaf=2,
                                       #min_sum_hessian_in_leaf=11
                                       )

In [174]:
lightgbm.fit(train_x,y)

LGBMRegressor(bagging_fraction=0.75, bagging_freq=5, bagging_seed=7,
              boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              feature_fraction=0.2, feature_fraction_seed=7,
              importance_type='split', learning_rate=0.01, max_bin=200,
              max_depth=-1, min_child_samples=20, min_child_weight=0.001,
              min_split_gain=0.0, n_estimators=5000, n_jobs=-1, num_leaves=4,
              objective='regression', random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0, verbose=-1)

In [175]:
benchmark(lightgbm)

0.07530241932834864

In [226]:
rf = RandomForestRegressor()
linsvr = LinearSVR()
sgd = SGDRegressor(max_iter=1000,tol=1e-3)
ker = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
extra = ExtraTreesRegressor()

In [227]:
rf.fit(train_x,y)
linsvr.fit(train_x,y)
sgd.fit(train_x,y)
ker.fit(train_x,y)
extra.fit(train_x,y)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
                    max_features='auto', max_leaf_nodes=None,
                    min_impurity_decrease=0.0, min_impurity_split=None,
                    min_samples_leaf=1, min_samples_split=2,
                    min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                    oob_score=False, random_state=None, verbose=0,
                    warm_start=False)

In [149]:
from mlxtend.regressor import StackingCVRegressor

In [228]:
stack_gen = StackingCVRegressor(regressors=(rcv,lcv,gbr,svr,rf,linsvr,sgd,ker,extra,xg_reg,lightgbm),
                               meta_regressor=lcv,
                               use_features_in_secondary=True)

In [229]:
stack_gen.fit(train_x,y)

StackingCVRegressor(cv=5,
                    meta_regressor=LassoCV(alphas=[5e-05, 0.0001, 0.0002,
                                                   0.0005, 0.0006, 0.0007,
                                                   0.0008, 0.0009],
                                           copy_X=True, cv=10, eps=0.001,
                                           fit_intercept=True, max_iter=1000,
                                           n_alphas=100, n_jobs=None,
                                           normalize=False, positive=False,
                                           precompute='auto', random_state=123,
                                           selection='cyclic', tol=0.0001,
                                           verbose=False),
                    n_jobs=None, pre_dispatch='2*n_jobs', rando...
                                              max_depth=-1,
                                              min_child_samples=20,
                                              min_

In [231]:
benchmark(stack_gen)

0.08599336807687888

In [232]:
def mark(y,y_pred):
    print(np.sqrt(mean_squared_error(y,pred)))

In [264]:
pred = 0.1*stack_gen.predict(test_x)+\
       0.1*lightgbm.predict(test_x)+\
       0.1*xg_reg.predict(test_x)+\
       0.4*svr.predict(test_x)+\
       0.3*gbr.predict(test_x)

In [266]:
pred = np.exp(pred)

In [267]:
result = pd.DataFrame({'Id':test.Id,'SalePrice':pred})
result.to_csv('submission.csv',index=False)

In [268]:
print('a')

a
