In [1]:
import os
from datetime import datetime

# Third-party imports keep their original relative order on purpose.
import numpy as np  # linear algebra
import pandas as pd  # data processing
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from scipy import stats

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Load the training and test tables from ./Output/csv.
# NOTE(review): the "_engineer" suffix suggests these files come from an
# earlier feature-engineering step -- confirm where they are produced.
df_train = pd.read_csv('./Output/csv/train_engineer.csv')
df_test = pd.read_csv('./Output/csv/test_engineer.csv')

In [3]:
# Split the label off the training frame: `pop` returns the 'SalePrice'
# column and removes it from df_train in one step.
# Re-running this cell without reloading df_train raises KeyError, because
# the column is already gone.
target = df_train.pop('SalePrice')

In [4]:
# One-hot encode the categorical columns of the training frame.
final_features = pd.get_dummies(df_train).reset_index(drop=True)

# Encoding train and test independently can produce different column sets:
# a category level present in only one of the two frames yields a dummy
# column in only that frame. Align the test matrix to the training columns
# (same names, same order) so a model fitted on `final_features` can predict
# on `final_test`. Levels missing from test are filled with 0; levels that
# exist only in test are dropped.
final_test = (
    pd.get_dummies(df_test)
    .reindex(columns=final_features.columns, fill_value=0)
    .reset_index(drop=True)
)

In [5]:
# Collect near-constant columns: a column is flagged when its single most
# frequent value (value_counts() sorts by count, so .iloc[0] is the top one --
# not necessarily the value 0) covers more than 99.94% of the rows.
# NOTE(review): nothing in the visible cells uses `overfit` afterwards --
# confirm whether these columns were meant to be dropped.
overfit = [
    column
    for column in final_features.columns
    if final_features[column].value_counts().iloc[0] / len(final_features) * 100 > 99.94
]

In [6]:
# Shared 10-fold splitter: rows are shuffled with a fixed seed so every model
# is evaluated on the same folds.
kfolds = KFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
# make the function to get the score
def rmsle(y, y_pred):
    """Return the square root of the mean squared error of ``y_pred`` vs ``y``.

    NOTE(review): despite the name, no logarithm is applied here. The value
    is an RMSLE only if the inputs are already log-transformed -- confirm
    against the code that produces the target.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)
def cv_rmse(model, X=None):
    """Return the per-fold cross-validated RMSE of ``model``.

    Parameters
    ----------
    model : estimator
        Any estimator accepted by ``cross_val_score``.
    X : feature matrix, optional
        Features to evaluate on. When omitted, the notebook-level
        ``final_features`` is used.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold of the shared ``kfolds`` splitter, scored
        against the notebook-level ``target``.
    """
    if X is None:
        # Look the default up at call time. A default written as
        # `X=final_features` is evaluated once, when the function is defined,
        # and would keep pointing at the old frame if `final_features` is
        # reassigned in a later cell.
        X = final_features
    neg_mse = cross_val_score(model, X, target,
                              scoring="neg_mean_squared_error", cv=kfolds)
    # The scorer returns negated MSE, so flip the sign before the root.
    return np.sqrt(-neg_mse)

In [7]:
# Model setup: candidate hyperparameter grids for the regularised linear
# models. The values are hand-picked (per the original author, based on
# earlier experiments with this data).
alphas_alt = [
    14.5, 14.6, 14.7, 14.8, 14.9, 15,
    15.1, 15.2, 15.3, 15.4, 15.5,
]  # regularisation strengths tried by RidgeCV
alphas2 = [
    5e-05, 0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007, 0.0008,
]  # regularisation strengths tried by LassoCV
e_alphas = [
    0.0001, 0.0002, 0.0003, 0.0004,
    0.0005, 0.0006, 0.0007,
]  # regularisation strengths tried by ElasticNetCV
e_l1ratio = [
    0.8, 0.85, 0.9,
    0.95, 0.99, 1,
]  # L1/L2 mixing ratios tried by ElasticNetCV

# Ridge regression: robust feature scaling, then RidgeCV choosing its alpha
# from `alphas_alt` using the shared `kfolds` splitter.
ridge = make_pipeline(
    RobustScaler(),
    RidgeCV(cv=kfolds, alphas=alphas_alt),
)

# Lasso: robust feature scaling, then LassoCV choosing its alpha from
# `alphas2` using the shared `kfolds` splitter.
# max_iter is passed as an int: scikit-learn documents it as an integer, and
# releases that validate constructor parameters reject the float 1e7.
lasso = make_pipeline(RobustScaler(),
                      LassoCV(max_iter=int(1e7), alphas=alphas2,
                              random_state=42, cv=kfolds))

# Elastic net: robust feature scaling, then ElasticNetCV searching over
# `e_alphas` x `e_l1ratio` using the shared `kfolds` splitter.
elasticnet = make_pipeline(RobustScaler(),
                           ElasticNetCV(max_iter=int(1e7), alphas=e_alphas,
                                        cv=kfolds, l1_ratio=e_l1ratio))
                                        
# Support vector regression on robust-scaled features, with fixed
# (non-searched) hyperparameters.
svr = make_pipeline(
    RobustScaler(),
    SVR(C=20, epsilon=0.008, gamma=0.0003),
)


# scikit-learn gradient boosting with Huber loss; no scaler is applied in
# front of it. Seeded for repeatable fits.
gbr = GradientBoostingRegressor(
    loss='huber',
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    random_state=42,
)
                                   

# LightGBM regressor: many very small trees (4 leaves each) with a low
# learning rate, plus row bagging and column subsampling, both seeded.
# min_data_in_leaf and min_sum_hessian_in_leaf are not set; earlier
# commented-out experiments used 2 and 11 respectively.
lightgbm = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)
                                       

# XGBoost regressor: shallow trees (depth 3), low learning rate, with row and
# column subsampling at 0.7.
# NOTE(review): 'reg:linear', `nthread` and `seed` are older spellings that
# newer xgboost releases report as deprecated (in favour of
# 'reg:squarederror', `n_jobs`, `random_state`). Confirm against the
# installed version before renaming.
xgboost = XGBRegressor(
    objective='reg:linear',
    learning_rate=0.01,
    n_estimators=3460,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    reg_alpha=0.00006,
    scale_pos_weight=1,
    nthread=-1,
    seed=27,
)

# Stacked ensemble: six base regressors feed a meta-regressor, which also
# sees the original features (use_features_in_secondary=True).
# `xgboost` is used both as a base regressor and as the meta-regressor.
# `svr` is defined above but is not part of the stack.
stack_gen = StackingCVRegressor(
    regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)

In [8]:
# Ridge pipeline: mean of the per-fold RMSE values returned by cv_rmse.
# Linear regression; no feature importances are available from this model.
score = cv_rmse(ridge)
score.mean()

0.10409444557120766

In [9]:
# Lasso pipeline: mean of the per-fold RMSE values returned by cv_rmse.
# Linear regression; no feature importances are available from this model.
score = cv_rmse(lasso)
score.mean()

0.10324134187898312

In [10]:
# Elastic-net pipeline: mean of the per-fold RMSE values returned by cv_rmse.
# Linear model; no feature importances are available from this model.
score = cv_rmse(elasticnet)
score.mean()

0.10317876736244669

In [11]:
# SVR pipeline: mean of the per-fold RMSE values returned by cv_rmse.
score = cv_rmse(svr)
score.mean()

0.10834618828263567

In [14]:
# Random forest regression baseline, scored with the same CV helper as the
# other models. RandomForestRegressor is imported in the notebook's top
# import cell rather than in the middle of the analysis.
# oob_score=True makes each fit compute an out-of-bag score, but this cell
# never reads it.
forest_reg = RandomForestRegressor(n_estimators=40, random_state=42,
                                   min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
score = cv_rmse(forest_reg)
score.mean()

0.1302543918925113

In [24]:
# LightGBM model: mean of the per-fold RMSE values returned by cv_rmse.
score = cv_rmse(lightgbm)
score.mean()

0.10896411067233729

In [25]:
# Gradient boosting model: mean of the per-fold RMSE values returned by cv_rmse.
score = cv_rmse(gbr)
score.mean()

0.11047061960146405

In [26]:
# XGBoost model: mean of the per-fold RMSE values returned by cv_rmse.
score = cv_rmse(xgboost)
score.mean()

  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




0.10712744442462667

In [None]:
# Fit the stacked ensemble on all training rows. Features and target are
# converted to plain NumPy arrays before being passed to fit().
stack_gen_model = stack_gen.fit(np.array(final_features), np.array(target))



In [None]:
# Fit the elastic-net pipeline on all training rows (no hold-out).
# The feature matrix is `final_features`; the previous `final_feature`
# spelling is not defined in this notebook and raised NameError.
elastic_model_full_data = elasticnet.fit(final_features, target)

In [None]:
# Fit the lasso pipeline on all training rows (no hold-out).
# The feature matrix is `final_features`; the previous `final_feature`
# spelling is not defined in this notebook and raised NameError.
lasso_model_full_data = lasso.fit(final_features, target)

In [None]:
# Fit the ridge pipeline on all training rows (no hold-out).
# The feature matrix is `final_features`; the previous `final_feature`
# spelling is not defined in this notebook and raised NameError.
ridge_model_full_data = ridge.fit(final_features, target)

In [29]:
# Show every [column name, importance] row as a nested Python list.
# NOTE(review): `init_feature` is not defined in any visible cell. It is
# presumably the DataFrame returned by plot_feature_importance(...) from a
# cell that was since removed -- confirm and restore the defining cell,
# otherwise this fails on Restart & Run All.
init_feature.values.tolist()

[['MSSubClass', 0.0016349228130816357],
 ['LotFrontage', 0.0044197817823763644],
 ['LotArea', 0.00984627093082312],
 ['OverallQual', 0.26344680026712536],
 ['OverallCond', 0.007246163088317686],
 ['YearBuilt', 0.01809525578190701],
 ['YearRemodAdd', 0.007076926324877024],
 ['MasVnrArea', 0.0015510122402282716],
 ['BsmtFinSF1', 0.007337847061278609],
 ['BsmtFinSF2', 0.00010372698966913396],
 ['BsmtUnfSF', 0.0036360503211098644],
 ['TotalBsmtSF', 0.009585222246531919],
 ['1stFlrSF', 0.00941193753631238],
 ['2ndFlrSF', 0.004045206365407001],
 ['LowQualFinSF', 0.0],
 ['GrLivArea', 0.06993176318406105],
 ['BsmtFullBath', 0.00036409804467955735],
 ['BsmtHalfBath', 1.069448660745847e-05],
 ['FullBath', 0.010353285574276427],
 ['HalfBath', 0.0005704731856375642],
 ['BedroomAbvGr', 0.0005774649064151042],
 ['KitchenAbvGr', 0.0004964665573416421],
 ['TotRmsAbvGrd', 0.0012318142797127039],
 ['Fireplaces', 0.0026283178502128812],
 ['GarageYrBlt', 0.005485398136837366],
 ['GarageCars', 0.0169432374

In [12]:
# Plot feature importance
def plot_feature_importance(model, df, top_n=30):
    """Plot the ``top_n`` most important features of a fitted model.

    Bars show importance relative to the largest one (scaled to 100) and are
    drawn as a horizontal bar chart on a new 20x20 figure.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    df : pandas.DataFrame
        Frame whose columns label the importances by position.
        NOTE(review): assumed to be the frame the model was trained on, in
        the same column order -- confirm at the call site.
    top_n : int, default 30
        Maximum number of features to draw.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (not only the plotted ones), with
        ``columns_name`` and the raw ``feature_importance``.
    """
    importances = np.asarray(model.feature_importances_)

    # Pick the top_n features by importance. Slicing the raw array with
    # `[:top_n]` would keep the first columns by position instead, and could
    # leave the most important features out of the plot.
    n_shown = max(0, min(int(top_n), importances.shape[0]))
    ranked = np.argsort(importances)  # ascending order
    top_idx = ranked[ranked.shape[0] - n_shown:]
    top_values = importances[top_idx]

    # Express each bar as a percentage of the largest importance. Guard the
    # division so an all-zero importance vector gives zeros, not NaN.
    peak = top_values.max() if n_shown else 0.0
    if peak > 0:
        relative = 100.0 * (top_values / peak)
    else:
        relative = np.zeros(n_shown, dtype=float)

    plt.figure(figsize=(20, 20))  # figure size
    pos = np.arange(n_shown) + .5
    plt.barh(pos, relative, align='center')
    # Label the bars with the column names of `df` rather than bare indices.
    plt.yticks(pos, df.columns[top_idx], fontsize=15)
    plt.xlabel('Relative Importance', fontsize=20)
    plt.ylabel('Features', fontsize=20)
    plt.title('Variable Importance', fontsize=30)
    return pd.DataFrame({'columns_name': df.columns, 'feature_importance': model.feature_importances_})