In [14]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_log_error

from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgbm

import warnings
from sklearn.metrics import mean_squared_error

from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, Ridge
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import StackingRegressor

warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^.*LAPACK bug 0038.*")

In [15]:
def loading_data():
    # load training data split train_x and train_y
    train_pd = pd.read_csv("all/train.csv")

    # remove outlier from data analysis
    #drop outliers
#     query = train_pd[(train_pd['GrLivArea'] > 4000) & (train_pd['SalePrice'] < 200000)]
#     train_pd = train_pd.drop(query.index.values)

    train_x = train_pd.drop(['SalePrice'], axis=1)
    train_x['is_train'] = 1
    train_y = np.log1p(train_pd["SalePrice"])
    test_pd = pd.read_csv("all/test.csv")
    test_pd['is_train'] = 0
    
    # concate train and test
    all_data = pd.concat((train_x, test_pd)).reset_index(drop=True)
    all_data = all_data.drop(['Id'], axis=1)
    # There are 4 features has more than 80 percent nan value
    # drop these columns
    all_data = all_data.drop(['Alley', 'PoolQC', 'Fence','MiscFeature'], axis=1)

    # drop low variance feature
#     all_data = all_data.drop(["Heating", 'Exterior2nd', 'Condition2', 'RoofMatl', 'Utilities', 'Street'], axis=1)
    
    # fill lotfrontage with mean value corespond to overall quality
    lot_not_nan = all_data[all_data['LotFrontage'].notna()]
    lot_mean_map = dict(lot_not_nan.groupby('OverallQual').mean()['LotFrontage'])
    all_data['LotFrontage'] = all_data['LotFrontage'].fillna(all_data['OverallQual'].map(lot_mean_map))
    
    # MasVnrArea nan means no Masonry veneer area
    # just fill with 0
    all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(0)
    # BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF with 1 nan only. fill with 0
    all_data['BsmtFinSF1'] = all_data['BsmtFinSF1'].fillna(0)
    all_data['BsmtFinSF2'] = all_data['BsmtFinSF2'].fillna(0)
    all_data['BsmtUnfSF'] = all_data['BsmtUnfSF'].fillna(0)
    all_data['TotalBsmtSF'] = all_data['TotalBsmtSF'].fillna(0)
    # BsmtFullBath, BsmtHalfBath means basement bathrooms, value can be 0
    all_data['BsmtFullBath'] = all_data['BsmtFullBath'].fillna(0)
    all_data['BsmtHalfBath'] = all_data['BsmtHalfBath'].fillna(0)
    # fill GarageYrBlt with average yearbuilt+3
    all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(all_data['YearBuilt'] + 3)
    # GarageCars, GarageArea can be 0, fill with 0
    all_data['GarageCars'] = all_data['GarageCars'].fillna(0)
    all_data['GarageArea'] = all_data['GarageArea'].fillna(0)
    # MasVnrType has None value, fill with None
    all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')
    # BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 GarageType GarageFinish GarageQual GarageCond FireplaceQu have NA value, fill with NA
    fill_na_col = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'FireplaceQu']
    all_data[fill_na_col] = all_data[fill_na_col].fillna('NA')
    # fill remain nan feature with common value
    unfilled = all_data.columns[all_data.isna().any()].tolist()
    all_data[unfilled] = all_data[unfilled].fillna(all_data[unfilled].mode().iloc[0])
    
    # transform some feature to numerical
    level_map  = {
        'Ex':5,
        'Gd':4,
        'TA':3,
        'Fa':2,
        'Po':1,
        'NA':0,
        'Y':1,
        'N':0,
        'Reg':3,
        'IR1':2,
        'IR2':1,
        'IR3':0,
        "Unf" : 1,
        "LwQ": 2,
        "Rec" : 3,
        "BLQ" : 4,
        "ALQ" : 5,
        "GLQ" : 6
    }
    # assign value 
    all_data['LotShape'] = all_data['LotShape'].map(level_map)
    all_data['ExterQual'] = all_data['ExterQual'].map(level_map)
    all_data['ExterCond'] = all_data['ExterCond'].map(level_map)
    all_data['BsmtQual'] = all_data['BsmtQual'].map(level_map)
    all_data['BsmtCond'] = all_data['BsmtCond'].map(level_map)
    all_data['BsmtFinType1'] = all_data['BsmtFinType1'].map(level_map)
    all_data['BsmtFinType2'] = all_data['BsmtFinType2'].map(level_map)
    all_data['HeatingQC'] = all_data['HeatingQC'].map(level_map)
    all_data['CentralAir'] = all_data['CentralAir'].map(level_map)
    all_data['KitchenQual'] = all_data['KitchenQual'].map(level_map)
    all_data['FireplaceQu'] = all_data['FireplaceQu'].map(level_map)
    all_data['GarageQual'] = all_data['GarageQual'].map(level_map)
    all_data['GarageCond'] = all_data['GarageCond'].map(level_map)
    
    # build house age feature
    house_age = all_data['YrSold'] - all_data['YearBuilt']
    all_data['house_age'] = house_age
    
    all_data['YrBltAndRemod']=all_data['YearBuilt']+all_data['YearRemodAdd']
    all_data['TotalSF']=all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

    all_data['Total_sqr_footage'] = (all_data['BsmtFinSF1'] + all_data['BsmtFinSF2'] +
                                     all_data['1stFlrSF'] + all_data['2ndFlrSF'])

    all_data['Total_Bathrooms'] = (all_data['FullBath'] + (0.5 * all_data['HalfBath']) +
                                   all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath']))

    all_data['Total_porch_sf'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
                                  all_data['EnclosedPorch'] + all_data['ScreenPorch'] +
                                  all_data['WoodDeckSF'])

    # simplified features
    all_data['haspool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
    all_data['has2ndfloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
    all_data['hasgarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
    all_data['hasbsmt'] = all_data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
    all_data['hasfireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
    
    # one_hot other features
    all_data = pd.get_dummies(all_data)
    
    train_x = all_data[all_data['is_train'] == 1]
    test_x = all_data[all_data['is_train'] == 0]
    train_x = train_x.drop(['is_train'], axis=1)
    test_x = test_x.drop(['is_train'], axis=1)
    
    return train_x, train_y, test_x

def random_cross_validation(model):
    kf = KFold(n_splits=5, random_state=64, shuffle=True).get_n_splits(train_x)
    scores = np.sqrt(-cross_val_score(model, train_x, train_y, cv=kf, scoring = 'neg_mean_squared_error'))
    return scores.mean()

In [16]:
# loading data
train_x, train_y, test_x = loading_data()
print(train_x.shape, train_y.shape, test_x.shape)

(1460, 243) (1460,) (1459, 243)


# build model

In [17]:
# building model
# lgbm 
model_lgbm = lgbm.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,verbosity=0)
# xgb
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213,
                             random_state =7, nthread = -1)

# ridge
model_ridge = make_pipeline(RobustScaler(), Ridge(alpha =10, random_state=34))

# lasso
model_lasso = make_pipeline(RobustScaler(), Lasso(alpha =1, random_state=32 , max_iter = 1e6))

# elnet
model_ela = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3, max_iter = 1e6))

# sgd
model_sgd = make_pipeline(RobustScaler(), SGDRegressor(alpha = 1)) 

# svr
model_svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003))


# stack
kf = KFold(n_splits=5, random_state=64, shuffle=True).get_n_splits(train_x)
estimators = [("ridge",model_ridge), ("lasso",model_svr), ("ela",model_ela)]
model_stack = StackingRegressor(
    estimators=estimators,
    cv = kf
)

# eval model

In [18]:
score = random_cross_validation(model_lgbm)
print("lgbm avg score: " + str(score))

lgbm avg score: 0.12023570743586191


In [19]:
score = random_cross_validation(model_xgb)
print("xgb avg score: " + str(score))

xgb avg score: 0.12136233265070166


In [20]:
score = random_cross_validation(model_ridge)
print("ridge avg score: " + str(score))

ridge avg score: 0.13840988966240148


In [21]:
score = random_cross_validation(model_lasso)
print("lasso avg score: " + str(score))

lasso avg score: 0.39400141687467155


In [22]:
score = random_cross_validation(model_ela)
print("ela avg score: " + str(score))

ela avg score: 0.13465033409774954


In [23]:
score = random_cross_validation(model_sgd)
print("SGD avg score: " + str(score))

SGD avg score: 1016616197197361.2


In [24]:
score = random_cross_validation(model_svr)
print("SVR avg score: " + str(score))

SVR avg score: 0.18216422045126318


In [21]:
score = random_cross_validation(model_stack)
print("stack avg score: " + str(score))

stack avg score: 0.13502409344411337


In [None]:
stack avg score: 0.13610616260087877

# submission section

In [10]:
model_xgb.fit(train_x, train_y)
model_lgbm.fit(train_x, train_y)
model_stack.fit(train_x, train_y)
y_pred_xgb = model_xgb.predict(test_x)
y_pred_lgbm = model_lgbm.predict(test_x)
y_pred_stack = model_stack.predict(test_x)



In [33]:
print(y_pred_xgb.shape, y_pred_lgbm.shape)

(1459,) (1459,)


In [11]:
# y_pred = 0.4 * np.expm1(y_pred_xgb) + 0.6 *np.expm1(y_pred_lgbm) 0.11963  top 7%
# y_pred = 0.2 * np.expm1(y_pred_xgb) + 0.7 *np.expm1(y_pred_lgbm) + 0.1 * np.expm1(y_pred_stack) 0.11907 
# y_pred = 0.15 * np.expm1(y_pred_xgb) + 0.7 *np.expm1(y_pred_lgbm) + 0.05 * np.expm1(y_pred_stack) 0.11904
# y_pred = 0.2 * np.expm1(y_pred_xgb) + 0.5 *np.expm1(y_pred_lgbm) + 0.3 * np.expm1(y_pred_stack) 0.120...
# y_pred = 0.15 * np.expm1(y_pred_xgb) + 0.7 *np.expm1(y_pred_lgbm) + 0.15 * np.expm1(y_pred_stack) 0.118!!!
y_pred = 0.15 * np.expm1(y_pred_xgb) + 0.7 *np.expm1(y_pred_lgbm) + 0.15 * np.expm1(y_pred_stack)
print(y_pred.shape)

(1459,)


In [23]:
model_lgbm.fit(train_x, train_y)
y_pred = model_lgbm.predict(test_x)
print(y_pred.shape)

(1459,)


In [12]:
test_pd = pd.read_csv('all/test.csv')
sub = pd.concat([test_pd['Id'],pd.DataFrame(y_pred)],axis=1)
sub.columns=['Id','SalePrice']
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,124358.533544
1,1462,163599.278001
2,1463,184042.689237
3,1464,197572.74989
4,1465,194572.282451


In [13]:
sub.to_csv('submission.csv',index=False)