In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats import skew
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score





from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR as SupportVectorRegression
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge, RidgeCV
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.ensemble import VotingRegressor

import itertools
from math import sqrt
import copy

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#reading csv file of train and test data
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

In [None]:
#saving the test id for submission
test_ID = test_df['Id']

In [None]:
print(train_df.head())

In [None]:
#summary statistics of the numeric columns; describe has to be called, otherwise only the bound method's repr is printed
print(train_df.describe())

In [None]:
train_df.info()

# EDA

In [None]:
#SalePrice histogram
fig = px.histogram(train_df, x="SalePrice", marginal="violin", hover_data=train_df.columns, width=800, height=500, template="plotly_dark")
fig.show()

In [None]:
#skewness of target
print(train_df['SalePrice'].skew())

In [None]:
#OverallQual vs SalePrice
fig = px.scatter(train_df, x="OverallQual", y='SalePrice',hover_data=train_df.columns, width=800, height=500, template="plotly_dark")
fig.show()

In [None]:
#OverallQual histogram
fig = px.histogram(train_df, x="OverallQual", nbins=15,marginal="violin", hover_data=train_df.columns, width=800, height=500, template="plotly_dark")
fig.show()

In [None]:
#OverallQual skewness (this cell closes the OverallQual scatter/histogram cells above)
print(train_df['OverallQual'].skew())

In [None]:
#GrLivArea vs SalePrice
fig = px.scatter(train_df, x="GrLivArea", y='SalePrice', hover_data=train_df.columns, width=800, height=500, template="plotly_dark")
fig.show()

In [None]:
#GrLivArea histogram
fig = px.histogram(train_df, x="GrLivArea", nbins=15,marginal="violin", hover_data=train_df.columns, width=800, height=500, template="plotly_dark")
fig.show()

In [None]:
#GrLivArea skewness
print(train_df['GrLivArea'].skew())

In [None]:
#pairplotting for other features
cols = ['TotalBsmtSF', '1stFlrSF', 'MasVnrArea', 'TotRmsAbvGrd', 'YearBuilt', 'SalePrice']
fig = px.scatter_matrix(train_df[cols])
fig.update_layout(height=600, width=1290, template="plotly_dark")
fig.show()

# Feature Engineering

In this section we start preparing the data. Based on the data description, we decide how each feature is going to be handled.

## Target Skewness

Dealing with the skewness of the target by applying the log(1+x) transformation.

In [None]:
fig = px.histogram(train_df, x="SalePrice", marginal="violin", hover_data=train_df.columns, width=800, height=500, template="plotly_dark")
fig.show()

In [None]:
print(train_df['SalePrice'].skew())

As we can see from the graph, the data is positively skewed; this is confirmed by the skewness coefficient, which should be as close to zero as possible. A way to deal with this is feature transformation. We will use the log(1+x) transformation, which is equivalent to using boxcox1p with lambda equal to zero.

In [None]:
train_df['SalePrice'] = np.log1p(train_df['SalePrice'])

#train_df['SalePrice'] = boxcox1p(train_df['SalePrice'], 0) 

In [None]:
fig = px.histogram(train_df, x="SalePrice", marginal="violin", hover_data=train_df.columns, width=800, height=500, template="plotly_dark")
fig.show()

In [None]:
print(train_df['SalePrice'].skew())

In [None]:
#combining train and test data into the same dataframe

#concatting train and test dataframes
train_rows = train_df.shape[0]
y_train = train_df['SalePrice']
all_data = pd.concat((train_df.drop(['SalePrice'], axis=1), test_df)).reset_index(drop=True)

In [None]:
#checking all data info
all_data.info()

Displaying information about only the columns that contain NaN values

In [None]:
all_data[['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']].info()

## Dealing with NaN Values


After carefully reading the data description and looking at the data in the EDA section, we examined each of the features which contain NaN values and decided on the steps that will be taken to deal with them.




### Numerical Features with NaN Values

'LotFrontage' has a lot of missing values, and since it is numerical data we will replace NaN values with the median of the neighborhood in which the house is located. Because the data is combined, we will perform the median replacement on the train rows with the medians of the train data, and the median replacement on the test rows with the medians of the test data.

In [None]:
def _impute_lot_frontage(part):
    """Return the 'LotFrontage' column of ``part`` with its NaN values filled in.

    Every missing value is replaced by the median 'LotFrontage' of the row's
    'Neighborhood', computed inside ``part`` only. If a neighborhood has no observed
    value at all in ``part``, its median is NaN too, so the median of the whole
    ``part`` is used as a fallback and no NaN is left for the models to choke on.
    The returned Series keeps the index of ``part``.
    """
    neighborhood_median = part.groupby('Neighborhood')['LotFrontage'].transform('median')
    return part['LotFrontage'].fillna(neighborhood_median).fillna(part['LotFrontage'].median())

#train and test rows are imputed separately, each from its own medians
LotFrontge_train = _impute_lot_frontage(all_data.iloc[:train_rows])
LotFrontge_test = _impute_lot_frontage(all_data.iloc[train_rows:])

#both parts still carry all_data's index, so the concatenation aligns row by row on assignment
LotFrontage = pd.concat((LotFrontge_train, LotFrontge_test))

all_data['LotFrontage'] = LotFrontage

For 'MasVnrArea' we will replace NaN values with 0, since a missing value most likely means that the house doesn't have masonry veneer.

In [None]:
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

'BsmtFinSF1' and 'BsmtFinSF2' NaN values will be replaced with 0, as they most likely mean that there is no basement.

In [None]:
all_data["BsmtFinSF1"] = all_data["BsmtFinSF1"].fillna(0)
all_data["BsmtFinSF2"] = all_data["BsmtFinSF2"].fillna(0)

'BsmtUnfSF' and 'TotalBsmtSF' NaN values will be replaced by 0, as they also indicate that there is no basement.

In [None]:
all_data["BsmtUnfSF"] = all_data["BsmtUnfSF"].fillna(0)
all_data["TotalBsmtSF"] = all_data["TotalBsmtSF"].fillna(0)

'BsmtFullBath' and 'BsmtHalfBath' NaN values will be replaced with 0, for having no basement. 

In [None]:
all_data["BsmtFullBath"] = all_data["BsmtFullBath"].fillna(0)
all_data["BsmtHalfBath"] = all_data["BsmtHalfBath"].fillna(0)

'GarageYrBlt', 'GarageCars' and 'GarageArea' NaN values will be replaced by 0, for having no garage.

In [None]:
all_data["GarageYrBlt"] = all_data["GarageYrBlt"].fillna(0)
all_data["GarageCars"] = all_data["GarageCars"].fillna(0)
all_data["GarageArea"] = all_data["GarageArea"].fillna(0)

### Categorical Features with NaN values

'MSZoning' NaN values will be replaced by the mode.

In [None]:
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

'Utilities' doesn't have any variance to it. So we will drop it.

In [None]:
all_data.drop('Utilities', axis=1, inplace=True)

'Exterior1st' and 'Exterior2nd' NaN values will be replaced with the mode. 

In [None]:
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])

'Electrical', 'KitchenQual' have one NaN value so we will be replace that with the mode.

In [None]:
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

'Functional' will be replace with Typical according to the data description.

In [None]:
all_data["Functional"] = all_data["Functional"].fillna("Typ")

'SaleType' only has one record with NaN so we will replace it with the mode.

In [None]:
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

The following list of features are feature which have NaN values that will be replaced with None. as None has a meaning for them. For example 'Alley' NaN values mean that there is no access to an alley. The same logic could be applied to the other features in the list.  

In [None]:
#categorical features whose missing values are filled with the literal label "None"
replace_na_none = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]

#all listed columns are filled in a single vectorised call
all_data[replace_na_none] = all_data[replace_na_none].fillna("None")

## Encoding Categorical Features

In [None]:
#printing all categorical features
all_data.select_dtypes('object').columns

After looking at the data alongside data description we divided encoding into two sections.

1 - Encoding non-ordinal features 

In [None]:
#categorical features without a natural order; each one is mapped to integer codes by its own LabelEncoder
non_ordinal_features = [
    'MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'Foundation',
    'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive',
    'SaleType', 'SaleCondition',
]

for feature in non_ordinal_features:
  #a fresh encoder per column, fitted and applied in one step
  all_data[feature] = LabelEncoder().fit_transform(all_data[feature].values)

2 - Encoding ordinal features:

These features have an order to them, also most of these features had NaN values in them, and as explained in the previous section of the notebook, we replaced the NaN values in these features with 'None' to be able to encode them correctly.

In [None]:
#category order handed to OrdinalEncoder for each feature:
#the position of a label in its list becomes the encoded value
quality_scale = ['Po','Fa','TA','Gd','Ex']
ordinal_categories = {
  **dict.fromkeys(['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond'], ['None'] + quality_scale),
  **dict.fromkeys(['BsmtFinType1', 'BsmtFinType2'], ['None','Unf','LwQ','Rec','BLQ','ALQ','GLQ']),
  **dict.fromkeys(['HeatingQC', 'ExterCond', 'ExterQual', 'KitchenQual'], quality_scale),
  'Alley': ['None','Pave','Grvl'],
  'BsmtExposure': ['None','No','Mn','Av','Gd'],
  'GarageType': ['None','Detchd','CarPort','BuiltIn','Basment','Attchd','2Types'],
  'GarageFinish': ['None','Unf','RFn','Fin'],
  'Fence': ['None','MnWw','GdWo','MnPrv','GdPrv'],
  'PoolQC': ['None','Fa','TA','Gd','Ex'],
  'MiscFeature': ['None','TenC','Shed','Othr','Gar2','Elev'],
  'MasVnrType': ['None','Stone','CBlock','BrkFace','BrkCmn'],
}

for col in all_data.columns:
  #columns without an entry in the table are left untouched
  if col not in ordinal_categories:
    continue
  oe = OrdinalEncoder(categories=[ordinal_categories[col]])
  all_data[col] = oe.fit_transform(all_data.loc[:,[col]])

In [None]:
#making sure all data is now numeric
all_data.dtypes.unique()

## Adding Feature To Represent House Area

In [None]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [None]:
#splitting the combined frame back into its train and test rows
#join returns a new frame, so train_df is already independent of all_data
train_df = all_data[:train_rows].join(y_train)
#.copy() gives test_df its own data, so the in-place column drops applied to it later never act on a slice of all_data
test_df = all_data[train_rows:].copy()

## Feature Selection

In [None]:
#correlation matrix for the target and the features
corr_matrix=train_df.corr()
target_corr_list = corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
#printing correlation matrix in descinding order
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(target_corr_list)

In [None]:
#obtaining features that have the lowest correlation
lowest_corr_matrix = round(abs(train_df.corr()), 2)
zero_target_corr_list = lowest_corr_matrix["SalePrice"].sort_values()

In [None]:
zero_target_corr_list.head(8)

We will remove the lowest-correlation columns ('Condition2', 'BsmtFinSF2', 'BsmtHalfBath' and 'MiscVal'), as they wouldn't make much of a difference when training our model. The 'Id' column will also be removed, as it doesn't carry any meaning for an ML model.

In [None]:
#removing columns with low correlation
train_df.drop(['Condition2', 'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal','Id'], axis=1, inplace=True)
test_df.drop(['Condition2', 'BsmtFinSF2', 'BsmtHalfBath', 'MiscVal','Id'], axis=1, inplace=True)

In [None]:
#remove 'TotalBsmtSF', '1stFlrSF' and '2ndFlrSF' since we added the 'TotalSF' feature to represent them
train_df.drop(['TotalBsmtSF', '1stFlrSF','2ndFlrSF'], axis=1,inplace=True)
test_df.drop(['TotalBsmtSF', '1stFlrSF','2ndFlrSF'], axis=1,inplace=True)

In [None]:
#removing 'PoolQC', 'PoolArea', 'Street' and '3SsnPorch' from the data since there is no much variance in them, so the models wouldn't be able to observe a pattern from them.
train_df.drop(['PoolQC', 'PoolArea', 'Street', '3SsnPorch'], axis=1,inplace=True)
test_df.drop(['PoolQC', 'PoolArea', 'Street', '3SsnPorch'], axis=1,inplace=True)

In [None]:
#creating the heatmap
#the style only applies to figures created afterwards, so it is set before the figure is opened
sns.set_style('white')

#the correlation matrix is computed once and reused for both the mask and the plot
corr = train_df.corr()
#boolean mask hiding the upper triangle (diagonal included), which only mirrors the lower one
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(100,100))
sns.heatmap(corr, mask=mask, annot=True, annot_kws={"size": 14})
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

### Colinearity
In this section we will look for features that have high correlation with each other and start to deal with them.

1 - 'GarageCars' and 'Garage Area'.

2 - 'TotalSF' with 'GrLivArea'

3 - 'GarageYrBlt' with 'YearBuilt'

4 - 'FireplaceQu' with 'Fireplaces'

5 - 'TotRmsAbvGrd' with 'GrLivArea'

6 - 'ExterQual' with 'OverallQual'

7 - 'BldgType' with 'MSSubClass'

Now that we know which features are highly correlated, we want to deal with them. This is done by looking at the VIF of the features, then removing the correlated features and checking whether removing them helps reduce the VIF.

In [None]:
#function to calculate VIF 
def calc_vif(X):
    """Compute the variance inflation factor (VIF) of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix, one column per feature. It must not contain the target.

    Returns
    -------
    pandas.DataFrame
        Two columns: "variables" (the column names of ``X``) and "VIF" (the
        corresponding variance inflation factors), in the column order of ``X``.
    """
    #variance_inflation_factor regresses one column on all the others and does not add an
    #intercept by itself; without a constant column the result is the uncentered VIF,
    #which is inflated for any feature whose mean is far from zero
    design = np.column_stack([np.ones(len(X)), X.values])

    vif = pd.DataFrame()
    vif["variables"] = X.columns
    #column 0 of the design matrix is the intercept, so feature i sits at position i + 1
    vif["VIF"] = [variance_inflation_factor(design, i + 1) for i in range(X.shape[1])]

    return vif

In [None]:
VIF = calc_vif(train_df.iloc[:, :-1])

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(VIF)

If VIF > 10, then the corresponding feature is highly correlated with other features in the dataset. We are aiming to reduce the VIF values while still retaining as much data as possible.

so from the previous list:

1 - 'GarageCars' and 'GarageArea'. We will remove 'GarageArea'

2 - 'TotalSF' with 'GrLivArea'. We will remove 'GrLivArea'

3 - 'GarageYrBlt' with 'YearBuilt'. We will remove 'GarageYrBlt'

4 - 'FireplaceQu' with 'Fireplaces'. We will remove 'Fireplaces'

5 - 'ExterQual' with 'OverallQual'. We will remove 'ExterQual'

6 - 'BldgType' with 'MSSubClass'. We will remove 'MSSubClass'

In [None]:
train_df.drop(['GarageArea', 'GrLivArea', 'GarageYrBlt', 'Fireplaces', 'ExterQual','MSSubClass'], axis=1, inplace=True)
test_df.drop(['GarageArea', 'GrLivArea', 'GarageYrBlt', 'Fireplaces', 'ExterQual','MSSubClass'], axis=1, inplace=True)

In [None]:
VIF = calc_vif(train_df.drop(['SalePrice'], axis=1))

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(VIF)

In [None]:
y_train = train_df['SalePrice']
all_data = pd.concat((train_df.drop(['SalePrice'], axis=1), test_df)).reset_index(drop=True)

## Feature Skewness

We have a lot of features and most of them are skewed. Since we don't want the models to be trained on skewed data, we will deal with them in this section.

In [None]:
#obtaining skewness for all features

skewed_features = all_data[all_data.columns].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_features})
skewness.head(10)

In [None]:
#keeping only the features whose absolute skewness exceeds 0.5
#the mask has to be a boolean Series built from the 'Skew' column: masking the frame with a boolean
#frame keeps every row and merely turns the non-matching values into NaN
skewness = skewness[skewness['Skew'].abs() > 0.5]
print(f'Number of features that have absloute skewness > 0.5: {skewness.shape[0]} features')

we will plot the histogram of 'LotArea' to show data before correcting skewness

In [None]:
fig = px.histogram(all_data, x="LotArea", marginal="violin", hover_data=all_data.columns, width=800, height=500, template="plotly_dark")
fig.show()

We will transform these features with the log(1+x) transformation (`np.log1p`), which is equivalent to boxcox1p with lambda equal to zero. The reason for using the 1+x variant is that some columns have 0 in them, so a plain log or a regular boxcox would fail, as they only accept positive values.

In [None]:
skewed_features = skewness.index

In [None]:
#log(1+x) applied to every selected column in one vectorised call
all_data[skewed_features] = np.log1p(all_data[skewed_features])

In [None]:
fig = px.histogram(all_data, x="LotArea", marginal="violin", hover_data=all_data.columns, width=800, height=500, template="plotly_dark")
fig.show()

# Data Modeling

In [None]:
train_df = all_data[:train_rows].join(y_train)
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']

In [None]:
test_df = all_data[train_rows:]

In order to be able to evaluate our models and tune their hyperparameters, we need held-out data of some sort. To get it, we will split the training data into train and test sets, so that we can see how the models actually perform. After obtaining the best models and the best hyperparameters, we will re-train the models on the entire training data set for the actual submitted predictions.

## Best Seed

In this section we will look for the best seed to split the data by.

In [None]:
#one zero-argument factory per candidate model; the dict order is the column order of df1
model_factories = {
  'LR': LinearRegression,
  'DTR': DecisionTreeRegressor,
  'SVR': SupportVectorRegression,
  'ARDR': ARDRegression,
  'BR': BayesianRidge,
  'RR': Ridge,
  'RR_CV': RidgeCV,
  'LGBM': LGBMRegressor,
  'XGB': lambda: XGBRegressor(verbosity = 0),
  'RFR': RandomForestRegressor,
  'GBR': GradientBoostingRegressor,
  'Ada': AdaBoostRegressor,
  'BaggingR': BaggingRegressor,
}

df1 = pd.DataFrame(columns=tuple(model_factories))

for i in range(0,20):
  #i is used as the split seed; row i of df1 holds the test score of every model for that split
  X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=i)
  df1.loc[i] = [make_model().fit(X_train, y_train).score(X_test, y_test)
                for make_model in model_factories.values()]

In [None]:
df1.sort_values(['LR', 'DTR', 'SVR', 'ARDR', 'BR', 'RR', 'RR_CV', 'LGBM', 'XGB', 'RFR', 'GBR', 'Ada', 'BaggingR'], ascending=False)

from looking at the above data frame, we will use seed 18

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=18)

We will write a class that we're going to use as a grid search to tune the hyperparameters of our models. This class works by training the model on the training set and evaluating it on the test set.

In [None]:
class GridSearch:
  """Exhaustive hyper-parameter search scored on a single held-out set.

  Every combination of the values in ``parameter_grid`` is set on ``model``, the model
  is fitted on the training data and evaluated with its own ``score`` method on the
  test data. After ``fit_test``:

  * ``best_model_`` is a deep copy of the model fitted with the best-scoring combination
  * ``best_params_`` is the result of ``get_params(deep=False)`` for that model
  """

  def __init__(self, model, parameter_grid):
    """Store the estimator to tune and the ``{parameter name: iterable of values}`` grid."""
    self.model = model
    self.parameter_grid = parameter_grid
    self.best_model_ = None
    self.best_params_ = dict()

  def generate_permutations(self):
    """Yield one ``{parameter name: value}`` dict per combination of the grid values."""
    keys = self.parameter_grid.keys()
    vals = self.parameter_grid.values()
    for instance in itertools.product(*vals):
        yield dict(zip(keys, instance))

  def fit_test(self, X_train, y_train, X_test, y_test):
    """Fit the model with every parameter combination and keep the best one.

    The model is trained on ``X_train``/``y_train`` and scored on ``X_test``/``y_test``.
    On ties the later combination wins. The scores of all combinations, in evaluation
    order, are plotted at the end. Returns nothing; results are stored on the instance.
    """
    #starting from -inf instead of 0.0: scores may be negative, and with a 0.0 start
    #no model would be recorded when every combination scores below zero
    best_score = float('-inf')
    scores = []
    for para in self.generate_permutations():
      self.model.set_params(**para)
      self.model.fit(X_train, y_train)
      score = self.model.score(X_test, y_test)
      scores.append(score)
      if score >= best_score:
        best_score = score
        self.best_params_ = self.model.get_params(deep=False)
        #deep copy, because self.model is re-configured and re-fitted on the next iteration
        self.best_model_ = copy.deepcopy(self.model)
    self.plot_curve(scores)

  def plot_curve(self,scores):
    """Show a line chart of ``scores`` against the index of the evaluated combination."""
    fig = px.line(
        pd.DataFrame({
            'Iterations': range(len(scores)),
            'Scores': scores
        }),
        x='Iterations',
        y='Scores',
        template="plotly_dark",
        width=800, height=500)
    fig.update_layout(title={
        'text': f'{type(self.model).__name__} Model Accuracy Graph',
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
    fig.show()

In [None]:
def reg_metrics(y_true, y_predict):
  """Print two regression metrics for ``y_predict`` against ``y_true``.

  The first line reports the R^2 score (labelled "accuracy score"), the second the
  square root of the mean squared error. Nothing is returned.
  """
  mse = mean_squared_error(y_true, y_predict)
  acc = r2_score(y_true, y_predict)
  print(f'The accuracy score: {acc}')
  print(f'The root mean squred error: {sqrt(mse)}')

1 - Linear Regression

In [None]:
LR = LinearRegression()
LR.fit(X_train, y_train)

In [None]:
reg_metrics(y_test, LR.predict(X_test))

2 - Decision tree Regressor

In [None]:
decision_tree_grid = {
        'max_depth': range(1,20),
        'min_samples_leaf': range(1,20)
    }
decision_tree_grid_search = GridSearch(DecisionTreeRegressor(), decision_tree_grid)

In [None]:
decision_tree_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
DTR = decision_tree_grid_search.best_model_

In [None]:
#parameters of the best decision tree; get_params has to be called to return the parameter dict
DTR.get_params()

In [None]:
reg_metrics(y_test, DTR.predict(X_test))

3 - Support Vector Regressor


In [None]:
SVR_parameter_grid = {
         'C': [0.1,1, 10, 100], 
         'gamma': [1,0.1,0.01,0.001]
         }
SVR_grid_search = GridSearch(SupportVectorRegression(), SVR_parameter_grid)

In [None]:
SVR_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
SVR = SVR_grid_search.best_model_

In [None]:
#parameters of the best support vector regressor; get_params has to be called to return the parameter dict
SVR.get_params()

In [None]:
reg_metrics(y_test, SVR.predict(X_test))

4 - ARD Regression

In [None]:
ARDR = ARDRegression()
ARDR.fit(X_train, y_train)

In [None]:
reg_metrics(y_test, ARDR.predict(X_test))

5 - Bayesian Ridge

In [None]:
BR = BayesianRidge()
BR.fit(X_train, y_train)

In [None]:
reg_metrics(y_test, BR.predict(X_test))

6 - Ridge Regression

In [None]:
RR_parameter_grid = {
    'alpha': [0.01, 0.1, 1, 10],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

RR_grid_search = GridSearch(Ridge(), RR_parameter_grid)

In [None]:
RR_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
RR = RR_grid_search.best_model_

In [None]:
#parameters of the best ridge model; get_params has to be called to return the parameter dict
RR.get_params()

In [None]:
reg_metrics(y_test, RR.predict(X_test))

7 - Ridge CV

In [None]:
RR_CV = RidgeCV()
RR_CV.fit(X_train, y_train)

In [None]:
reg_metrics(y_test, RR_CV.predict(X_test))

8 - LGBM

In [None]:
LGBM_parameter_grid = {
    'num_leaves': [7, 14, 21, 28, 31, 50],
    'learning_rate': [0.1, 0.03, 0.003],
    'max_depth': [-1, 3, 5],
    'n_estimators': [50, 100, 200, 500],
}

LGBM_grid_search = GridSearch(LGBMRegressor(), LGBM_parameter_grid)

In [None]:
LGBM_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
LGBM = LGBM_grid_search.best_model_

In [None]:
#parameters of the best LGBM model; get_params has to be called to return the parameter dict
LGBM.get_params()

In [None]:
reg_metrics(y_test, LGBM.predict(X_test))

9 - XGB

In [None]:
XGB_parameter_grid = {
 "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
 "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
 "gamma": [0.0, 0.1, 0.2 , 0.3, 0.4]
}

XGB_grid_search = GridSearch(XGBRegressor(verbosity=0), XGB_parameter_grid)

In [None]:
XGB_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
XGB = XGB_grid_search.best_model_

In [None]:
#parameters of the best XGB model; get_params has to be called to return the parameter dict
XGB.get_params()

In [None]:
reg_metrics(y_test, XGB.predict(X_test))

10 - Random Forest Regressor 

In [None]:
RFR_paramter_grid = {
 'max_depth': [10, 20, 30, 40, 50],
 'min_samples_leaf': [1, 2, 4],
 'n_estimators': [200, 400, 600, 800, 1000]
}
RFR_grid_search = GridSearch(RandomForestRegressor(), RFR_paramter_grid)

In [None]:
RFR_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
RFR = RFR_grid_search.best_model_

In [None]:
#parameters of the best random forest; get_params has to be called to return the parameter dict
RFR.get_params()

In [None]:
reg_metrics(y_test, RFR.predict(X_test))

11 - Gradient Boosting Regressor

In [None]:
GB_paramter_grid = {
    'n_estimators': [100,500,1000, 1500],
    'max_depth': [4,6,8,10]
}
GB_grid_search = GridSearch(GradientBoostingRegressor(), GB_paramter_grid)

In [None]:
GB_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
GB = GB_grid_search.best_model_

In [None]:
#parameters of the best gradient boosting model; get_params has to be called to return the parameter dict
GB.get_params()

In [None]:
reg_metrics(y_test, GB.predict(X_test))

12 - Ada Boost Regressor

In [None]:
Ada_paramter_grid = {
 'n_estimators': [50, 100],
 'learning_rate' : [0.01,0.05,0.1,0.3,1],
 'loss' : ['linear', 'square', 'exponential']
}
Ada_grid_search = GridSearch(AdaBoostRegressor(), Ada_paramter_grid)

In [None]:
Ada_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
Ada = Ada_grid_search.best_model_

In [None]:
#parameters of the best AdaBoost model; get_params has to be called to return the parameter dict
Ada.get_params()

In [None]:
reg_metrics(y_test, Ada.predict(X_test))

13 - Bagging Regressor

In [None]:
bagging_paramter_grid = {
  "max_samples": [0.5, 1.0],
  "max_features": [0.5, 1.0],
  'n_estimators': np.arange(10,150,10)
}
bagging_grid_search = GridSearch(BaggingRegressor(), bagging_paramter_grid)

In [None]:
bagging_grid_search.fit_test(X_train, y_train, X_test, y_test)

In [None]:
BaggingR = bagging_grid_search.best_model_

In [None]:
#parameters of the best bagging model; get_params has to be called to return the parameter dict
BaggingR.get_params()

In [None]:
reg_metrics(y_test, BaggingR.predict(X_test))

## Regression Ensemble

In this section we will combine some of the best performing models from the previous section, to form one regressor that depends on all models.

In [None]:
#creating the objects that will go into the ensemble, using the parameters obtained from grid search.
#Only the tuned hyper-parameters (plus the few settings that pin the original behaviour) are spelled out.
#The full parameter dumps pasted here before also carried arguments that newer library releases have
#deprecated or removed (Ridge normalize, LGBM silent, XGB silent/nthread/seed and objective='reg:linear',
#GradientBoosting loss='ls'/presort/min_impurity_split, Bagging base_estimator); passing those is rejected
#or warned about on current versions. Everything omitted is left at the library default.
SVR = SupportVectorRegression(C=10, gamma=0.001, kernel='rbf', epsilon=0.1)

RR = Ridge(alpha=0.01, solver='svd', tol=0.001)

LGBM = LGBMRegressor(learning_rate=0.03, max_depth=-1, n_estimators=200, num_leaves=28, n_jobs=-1)

#the objective is left at its default, which is squared error (the loss formerly named 'reg:linear')
XGB = XGBRegressor(base_score=0.5, gamma=0.0, learning_rate=0.15, max_depth=4, n_estimators=100,
                   n_jobs=1, random_state=0, verbosity=0)

#the default loss is squared error, formerly spelled 'ls'
GB = GradientBoostingRegressor(learning_rate=0.1, max_depth=4, n_estimators=1000)

BaggingR = BaggingRegressor(max_features=0.5, max_samples=1.0, n_estimators=40)

In [None]:
ensemble_estimators = [('SVR', SVR), ('RR', RR), ('LGBM', LGBM), ('XGB', XGB), ('GB', GB), ('BaggingR', BaggingR)]
#creating the ensemble with arbitrary weights, which will be optimized later
ensemble = VotingRegressor(estimators=ensemble_estimators, weights=[2,1,2,3,3,3])

In [None]:
ensemble.fit(X_train, y_train)

In [None]:
ensemble.score(X_test, y_test)

optimizing the weights of the ensemble

In [None]:
#Grid search over the ensemble weights.
#A VotingRegressor predicts the weighted average of its members' predictions, so the members are fitted
#once and every weight combination is scored on their stored test predictions. This avoids refitting all
#six models for each of the 252 combinations and compares the weights on identical fitted members.
base_ensemble = VotingRegressor(estimators=ensemble_estimators)
base_ensemble.fit(X_train, y_train)

#one column of test predictions per member, in the order of ensemble_estimators
member_preds = np.column_stack([member.predict(X_test) for member in base_ensemble.estimators_])

records = []
for w1, w2, w3, w4 in itertools.product(range(1,5), repeat=4):
  if len({w1, w2, w3, w4}) == 1:
    # skip if all weights are equal
    continue
  #w2 is shared by SVR, LGBM and GB; RR gets w1, XGB gets w4 and BaggingR gets w3
  weights = [w2, w1, w2, w4, w2, w3]
  blended = np.average(member_preds, axis=1, weights=weights)
  #r2_score is the metric VotingRegressor.score reports
  records.append((w1, w2, w3, w4, r2_score(y_test, blended)))

#built in one go from the collected rows instead of growing the frame row by row
df2 = pd.DataFrame(records, columns=['w1', 'w2','w3', 'w4','score'])

In [None]:
df2.sort_values('score', ascending=False)

# Creating The Used Ensemble & Making Predictions

We will create the ensemble using the best weights obtained. This ensemble will be trained on the entire train data set, and then we will make predictions and submit them.  

In [None]:
ensemble = VotingRegressor(estimators=ensemble_estimators, weights=[1,1,1,2,1,1])

In [None]:
#X represents the train features, y is the target
ensemble.fit(X, y)

In [None]:
#making predictions
pred = ensemble.predict(test_df)

In [None]:
#taking the inverse of log1p transformation
pred = np.expm1(pred)

Submission

In [None]:
#submission frame: the saved test ids next to the back-transformed predictions
sub = pd.DataFrame({'Id': test_ID, 'SalePrice': pred})

In [None]:
sub.to_csv('submission.csv', index=False)