In [23]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

In [25]:
train = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test  = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# حفظ IDs
train_ID = train['Id']
test_ID = test['Id']

# حذف IDs
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

In [26]:
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

In [27]:
y_train = np.log1p(train['SalePrice'])
train.drop(['SalePrice'], axis=1, inplace=True)

In [28]:
ntrain = train.shape[0]
ntest = test.shape[0]
all_data = pd.concat((train, test)).reset_index(drop=True)

In [29]:
for col in ('PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
            'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
            'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
            'MasVnrType', 'MSSubClass'):
    all_data[col] = all_data[col].fillna('None')

In [30]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars',
            'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
            'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'):
    all_data[col] = all_data[col].fillna(0)

In [31]:
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)

In [32]:
for col in ('MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st',
            'Exterior2nd', 'SaleType', 'Functional'):
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

all_data = all_data.drop(['Utilities'], axis=1)

In [33]:
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)
all_data['OverallCond'] = all_data['OverallCond'].astype(str)
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

In [34]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']
all_data['Total_sqr_footage'] = (all_data['BsmtFinSF1'] + all_data['BsmtFinSF2'] +
                                 all_data['1stFlrSF'] + all_data['2ndFlrSF'])
all_data['Total_Bathrooms'] = (all_data['FullBath'] + (0.5 * all_data['HalfBath']) +
                               all_data['BsmtFullBath'] + (0.5 * all_data['BsmtHalfBath']))
all_data['Total_porch_sf'] = (all_data['OpenPorchSF'] + all_data['3SsnPorch'] +
                              all_data['EnclosedPorch'] + all_data['ScreenPorch'] +
                              all_data['WoodDeckSF'])
all_data['haspool'] = all_data['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['has2ndfloor'] = all_data['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['hasgarage'] = all_data['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
all_data['hasbsmt'] = all_data['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
all_data['hasfireplace'] = all_data['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [37]:
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1',
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond',
        'YrSold', 'MoSold')

In [38]:
for c in cols:
    lbl = LabelEncoder()
    lbl.fit(list(all_data[c].values))
    all_data[c] = lbl.transform(list(all_data[c].values))

In [39]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = all_data[numeric_feats].apply(lambda x: x.skew()).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew': skewed_feats})
skewness = skewness[abs(skewness) > 0.75]

In [40]:
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [41]:
all_data = pd.get_dummies(all_data)

In [42]:
train = all_data[:ntrain]
test = all_data[ntrain:]

In [43]:
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

def rmsle_cv(model):
    rmse = np.sqrt(-cross_val_score(
        model,
        train.values,
        y_train,
        scoring="neg_mean_squared_error",
        cv=kfolds
    ))
    return rmse

In [44]:
lasso = Lasso(alpha=0.0005, random_state=1)
ridge = Ridge(alpha=10, random_state=1)
elasticnet = ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=3)
gbr = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5
)

In [45]:
xgb = XGBRegressor(
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006
)

In [46]:
lgb = LGBMRegressor(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11
)

In [47]:
print("CV RMSLE (lower is better):")
models = [("Lasso", lasso), ("Ridge", ridge), ("ElasticNet", elasticnet),
          ("GBR", gbr), ("XGB", xgb), ("LGB", lgb)]
for name, m in models:
    score = rmsle_cv(m).mean()
    print(f"{name}: {score:.5f}")

CV RMSLE (lower is better):
Lasso: 0.11169
Ridge: 0.11262
ElasticNet: 0.11160
GBR: 0.11394
XGB: 0.11176
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003343 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1593
[LightGBM] [Info] Number of data points in the train set: 1312, number of used features: 182
[LightGBM] [Info] Start training from score 12.024654
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001021 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1599
[LightGBM] [Info] Number of data points in the train set: 1312, number of used features: 185
[LightGBM] [Info] Start training from score 12.022795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing 

In [48]:
print("\nTraining models...")
lasso.fit(train, y_train)
ridge.fit(train, y_train)
elasticnet.fit(train, y_train)
gbr.fit(train, y_train)
xgb.fit(train, y_train)
lgb.fit(train, y_train)


Training models...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1614
[LightGBM] [Info] Number of data points in the train set: 1458, number of used features: 189
[LightGBM] [Info] Start training from score 12.024015


In [49]:
def blended_predictions(X):
    return ((0.1 * elasticnet.predict(X)) +
            (0.1 * lasso.predict(X)) +
            (0.1 * ridge.predict(X)) +
            (0.2 * gbr.predict(X)) +
            (0.25 * xgb.predict(X)) +
            (0.25 * lgb.predict(X)))


In [50]:
print("\nMaking predictions...")
blended_pred = np.expm1(blended_predictions(test.values))


Making predictions...


In [51]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = blended_pred
sub.to_csv('submission.csv', index=False)

In [52]:
print("\nDone! Check submission.csv")


Done! Check submission.csv


In [None]:
best_name, best_model = min(models, key=lambda t: rmsle_cv(t[1]).mean())
print(f"Best single model by CV: {best_name} | RMSLE: {rmsle_cv(best_model).mean():.5f}")