**Read the data**

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Both files use the 'Id' column as the row index.
train_path = '/kaggle/input/home-data-for-ml-course/train.csv'
test_path = '/kaggle/input/home-data-for-ml-course/test.csv'
home_data = pd.read_csv(train_path, index_col='Id')
X_test_full = pd.read_csv(test_path, index_col='Id')

Target


In [None]:
# Histogram of the raw target, 50 bins.
home_data.SalePrice.hist(bins=50)

In [None]:
# Optional log1p transform of the target, currently disabled.
# If it is re-enabled, predictions come out on the log1p scale and must be
# mapped back with np.expm1 before computing MAE or writing a submission.
#home_data["SalePrice"] = np.log1p(home_data["SalePrice"])

In [None]:
# Histogram of the target again, 50 bins (shows the effect of any target
# transform applied between the two plots).
home_data.SalePrice.hist(bins=50)

**Numerical data**

In [None]:
# All non-object predictor columns (the target is excluded), as an independent copy.
non_target = home_data.drop(columns=['SalePrice'])
numerical_features = non_target.select_dtypes(exclude=['object']).copy()

In [None]:
plt.rcParams.update({'font.size': 12})

# Size the subplot grid from the number of numeric columns instead of
# hard-coding 9x4: add_subplot raises once the index exceeds rows * cols.
n_grid_cols = 4
n_numeric = len(numerical_features.columns)
n_grid_rows = max(1, (n_numeric + n_grid_cols - 1) // n_grid_cols)

# Two inches of height per row (18 inches for the 9 rows used originally).
fig = plt.figure(figsize=(12, 2 * n_grid_rows))
for i in range(n_numeric):
    fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # Missing values are dropped before plotting each column.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot(..., kde=True) when upgrading — confirm against
    # the installed seaborn version.
    sns.distplot(numerical_features.iloc[:, i].dropna())
    plt.xlabel(numerical_features.columns[i])
plt.tight_layout()
plt.show()

**Categorical data**

In [None]:
# Treat these integer-coded columns as categories by converting every value
# to its string form, in both the training and the test frame.
for frame in (home_data, X_test_full):
    for column in ('MSSubClass', 'MoSold'):
        frame[column] = frame[column].apply(str)

**Add new features (combined totals; boolean flags are added further below)**

In [None]:
def make_more_corr(tbl):
    """Add four aggregate columns to ``tbl`` in place.

    Adds 'TotalSF', 'Total_Bathrooms', 'Total_sqrt_footage' and
    'Total_porch_SF', each an element-wise sum of existing columns
    (half baths are weighted by 0.5). Returns None.
    """
    def _running_total(terms):
        # Left-to-right element-wise sum; a NaN in any term propagates.
        total = terms[0]
        for term in terms[1:]:
            total = total + term
        return total

    def _columns(*names):
        return [tbl[name] for name in names]

    tbl['TotalSF'] = _running_total(_columns('TotalBsmtSF', '1stFlrSF', '2ndFlrSF'))
    tbl['Total_Bathrooms'] = _running_total([
        tbl['FullBath'],
        0.5 * tbl['HalfBath'],
        tbl['BsmtFullBath'],
        0.5 * tbl['BsmtHalfBath'],
    ])
    tbl['Total_sqrt_footage'] = _running_total(
        _columns('BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF'))
    tbl['Total_porch_SF'] = _running_total(
        _columns('OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF'))

# Add the aggregate columns to the training frame first, then the test frame.
for frame in (home_data, X_test_full):
    make_more_corr(frame)

def remove_repeated_colls(tbl):
    """Drop, in place, the source columns that were folded into aggregate features.

    Raises KeyError (from ``DataFrame.drop``) if any listed column is absent.
    Returns None.
    """
    area_parts = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'BsmtFinSF1', 'BsmtFinSF2']
    bath_parts = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
    porch_parts = ['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF']
    tbl.drop(columns=area_parts + bath_parts + porch_parts, inplace=True)

**Add building age and garage age**

In [None]:
import datetime
now = datetime.datetime.now()
# NOTE(review): ages are measured from the year the notebook is executed, so
# the feature values shift with the run date — confirm this is intended.
current_year = now.year

# Years elapsed since construction.
building_age = current_year - home_data['YearBuilt']
home_data['building_age'] = building_age
X_test_full['building_age'] = current_year - X_test_full['YearBuilt']

# Years elapsed since the garage was built.
gar_age = current_year - home_data['GarageYrBlt']
home_data['garage_age'] = gar_age
X_test_full['garage_age'] = current_year - X_test_full['GarageYrBlt']

In [None]:
def expand(tbl):
    """Add 0/1 indicator columns to ``tbl`` in place.

    Each indicator is 1 where the source column is strictly greater than
    zero and 0 otherwise (missing values therefore map to 0).

    Parameters
    ----------
    tbl : pandas.DataFrame
        Must contain 'PoolArea', '2ndFlrSF', 'GarageArea', 'TotalBsmtSF'
        and 'Fireplaces'. Modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the source columns is missing.
    """
    indicator_sources = {
        'haspool': 'PoolArea',
        'has2ndFloor': '2ndFlrSF',
        'hasgarage': 'GarageArea',
        'hasbsmt': 'TotalBsmtSF',
        'hasfireplace': 'Fireplaces',
    }
    for flag, source in indicator_sources.items():
        # Vectorised comparison instead of a per-element lambda. The dtype is
        # pinned to int64 because columns are later selected by that dtype.
        tbl[flag] = tbl[source].gt(0).astype('int64')
    
# Add the indicator columns to the training frame first, then the test frame.
for frame in (home_data, X_test_full):
    expand(frame)

**Remove outliers**

In [None]:
# A training row is removed when it exceeds any of the thresholds below.
# Comparisons against missing values are False, so such rows are kept.
outlier_rows = (
    (home_data['LotFrontage'] > 200)
    | (home_data['LotArea'] > 100000)
    | (home_data['BsmtFinSF1'] > 4000)
    | (home_data['TotalBsmtSF'] > 6000)
    | (home_data['1stFlrSF'] > 4000)
    # Very large living area that nevertheless sold cheaply.
    | ((home_data['GrLivArea'] > 4000) & (home_data['SalePrice'] < 300000))
    | (home_data['LowQualFinSF'] > 550)
)
home_data = home_data.drop(index=home_data.index[outlier_rows])

**Drop repeated**

In [None]:
# Drop the folded-in source columns from the training frame, then the test frame.
for frame in (home_data, X_test_full):
    remove_repeated_colls(frame)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=[30,15])
# Heatmap lookup
# https://towardsdatascience.com/feature-selection-with-pandas-e3690ad8504b
# One of the assumptions of linear regression is that the independent variables
# need to be uncorrelated with each other. If these variables are correlated
# with each other, then we need to keep only one of them and drop the rest.
#
# The frame still contains object (string) columns. DataFrame.corr() raises on
# those in pandas >= 2.0, so the correlation is computed on non-object columns only.
sns.heatmap(home_data.select_dtypes(exclude='object').corr(), annot=True)

In [None]:
# Pairwise correlation between all non-object columns (target included).
correlation_num = home_data.select_dtypes(exclude='object').corr()
plt.figure(figsize=(30,15))
# The plotted matrix is boolean: a cell is True where the absolute correlation
# of the two columns exceeds 0.80. The title now says so.
plt.title('Feature pairs with |correlation| > 0.80')
sns.heatmap(data=abs(correlation_num)>0.80, annot=True)

**Drop correlated data (from heatmap lookup)**

In [None]:
# Columns chosen for removal; dropped in place from both frames.
marked_cols = ['MiscVal', 'YrSold', 'GarageYrBlt']
for frame in (home_data, X_test_full):
    frame.drop(columns=marked_cols, inplace=True)

In [None]:
# Keep only rows with a known target, then separate the target from the predictors.
X_full = home_data.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])


In [None]:
# 80/20 train/validation split with a fixed seed.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0)

train_dtypes = X_train_full.dtypes

# Object columns with fewer than 10 distinct values in the training split.
# NOTE(review): object columns with 10 or more distinct values are left out of
# the model entirely; that includes columns cast to str earlier (MSSubClass,
# MoSold) if they exceed the limit — verify this is intended.
categorical_cols = [
    name for name, dtype in train_dtypes.items()
    if dtype == "object" and X_train_full[name].nunique() < 10
]

# Columns stored as int64 or float64.
numerical_cols = [
    name for name, dtype in train_dtypes.items()
    if dtype in ['int64', 'float64']
]

# Restrict all three frames to the selected columns (categorical first).
my_cols = categorical_cols + numerical_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train_full, X_valid_full, X_test_full)
)

In [None]:
# Preprocessing for numerical data
# Numerical columns: fill missing values with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)


Voting regressor (mainly XGBoost)

In [None]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import VotingRegressor
def regressor_ctor():
    """Build a fresh, unfitted weighted voting ensemble.

    The ensemble combines four XGBoost regressors and one LightGBM regressor
    with fixed weights (0.25, 0.30, 0.30, 0.10, 0.05 in the order
    xgb1, xgb2, xgb3, lgbm, xgb4). Every call creates new estimator objects.

    Returns
    -------
    sklearn.ensemble.VotingRegressor
        The unfitted ensemble.
    """
    xgb1 = XGBRegressor(
        n_estimators=1387,
        learning_rate=.033965,
        max_depth=3,
        colsample_bytree=0.401,
        colsample_bylevel=0.656,
        subsample=0.71235,
        random_state=4,
        verbosity=0,
        n_jobs=-1
    )

    # xgb2 and xgb3 share every setting except the tree depth (4 vs 6).
    # random_state / n_jobs / verbosity are used throughout instead of the
    # legacy seed / nthread / silent spellings, so all models are configured
    # the same way.
    shared_deep_params = dict(
        n_estimators=7200,
        learning_rate=0.01,
        colsample_bytree=0.2,
        subsample=0.2,
        min_child_weight=1.5,
        reg_alpha=0.9,
        reg_lambda=0.6,
        random_state=42,
        verbosity=0,
        n_jobs=-1
    )
    xgb2 = XGBRegressor(max_depth=4, **shared_deep_params)
    xgb3 = XGBRegressor(max_depth=6, **shared_deep_params)

    xgb4 = XGBRegressor(
        colsample_bytree=0.4603,
        gamma=0.0468,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=1.7817,
        n_estimators=2200,
        reg_alpha=0.4640,
        reg_lambda=0.8571,
        subsample=0.5213,
        random_state=7,
        verbosity=0,
        n_jobs=-1
    )

    lgbm1 = LGBMRegressor(
        objective='regression',
        num_leaves=5,
        learning_rate=0.05,
        n_estimators=720,
        max_bin=55,
        bagging_fraction=0.8,
        bagging_freq=5,
        feature_fraction=0.2319,
        feature_fraction_seed=9,
        bagging_seed=9,
        min_data_in_leaf=6,
        min_sum_hessian_in_leaf=11
    )

    # The weights are positional and must stay aligned with the estimator order.
    final_regressor = VotingRegressor(
        estimators=[
            ('xgb1', xgb1),
            ('xgb2', xgb2),
            ('xgb3', xgb3),
            ('lgbm', lgbm1),
            ('xgb4', xgb4),
        ],
        weights=[0.25, 0.3, 0.3, 0.10, 0.05],
    )

    return final_regressor

In [None]:
model = regressor_ctor()

# Chain preprocessing and the ensemble so both are fitted together.
pipeline_steps = [('preprocessor', preprocessor), ('xgbmodel', model)]
clf = Pipeline(steps=pipeline_steps)

clf.fit(X_train, y_train)

# Score on the held-out validation split.
preds = clf.predict(X_valid)
validation_mae = mean_absolute_error(y_valid, preds)
print('MAE:', validation_mae)

In [None]:
from sklearn.base import clone

# Refit on all labelled rows (train + validation) for the final model.
model_full = regressor_ctor()
# A Pipeline fits the step objects it is given rather than copies of them.
# `preprocessor` is already part of the fitted `clf` pipeline, so fitting it
# again here would silently change `clf`. Use an unfitted clone instead.
clf_full = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                      ('xgbmodel', model_full)
                     ])

clf_full.fit(pd.concat([X_train, X_valid]), pd.concat([y_train, y_valid]))

In [None]:
# Predict on the test frame and write one row per test Id.
preds_test = clf_full.predict(X_test)

submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission1.csv', index=False)