In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python


# general packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

pd.set_option("display.max_columns", 100)

# package for graphs
import matplotlib
import matplotlib.pyplot as plt

# machine learning packages
import sklearn
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer

import xgboost
from xgboost import XGBRegressor


import warnings
warnings.filterwarnings('ignore')

In [2]:
# get package versions
print('numpy version is:{}'.format(np.__version__))
print('pandas version is:{}'.format(pd.__version__))
print('matplotlib version is:{}'.format(matplotlib.__version__))
print('xgboost version is:{}'.format(xgboost.__version__))
print('scikit-learn version is:{}'.format(sklearn.__version__))

numpy version is:1.26.4
pandas version is:2.2.3
matplotlib version is:3.7.2
xgboost version is:2.0.3
scikit-learn version is:1.2.2


In [3]:
# load training data
train_df= pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")

# look at training data
print(train_df.head())
print('shape of test dataframe is:{}'.format(train_df.shape))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities LotConfig LandSlope Neighborhood Condition1  \
0         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
1         Lvl    AllPub       FR2       Gtl      Veenker      Feedr   
2         Lvl    AllPub    Inside       Gtl      CollgCr       Norm   
3         Lvl    AllPub    Corner       Gtl      Crawfor       Norm   
4         Lvl    AllPub       FR2       Gtl      NoRidge       Norm   

  Condition2 BldgType HouseStyle  OverallQual  OverallCond  YearBuilt  \
0       Norm     1Fam     2Story            7          

In [4]:
# look at null values in training data
null_train_df= train_df.isnull().sum()[train_df.isnull().sum() > 50]
print(null_train_df.sort_values(ascending= False))

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
dtype: int64


In [5]:
# load test data
test_df= pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")

# look at test data
print(test_df.head())
print('shape of test dataframe is:{}'.format(test_df.shape))

     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities LotConfig LandSlope Neighborhood Condition1  \
0         Lvl    AllPub    Inside       Gtl        NAmes      Feedr   
1         Lvl    AllPub    Corner       Gtl        NAmes       Norm   
2         Lvl    AllPub    Inside       Gtl      Gilbert       Norm   
3         Lvl    AllPub    Inside       Gtl      Gilbert       Norm   
4         HLS    AllPub    Inside       Gtl      StoneBr       Norm   

  Condition2 BldgType HouseStyle  OverallQual  OverallCond  YearBuilt  \
0       Norm     1Fam     1Story           

In [6]:
# look at null values for test
null_test_df= test_df.isnull().sum()[test_df.isnull().sum() > 0]
print(null_test_df.sort_values(ascending= False))

PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
MasVnrType       894
FireplaceQu      730
LotFrontage      227
GarageCond        78
GarageYrBlt       78
GarageQual        78
GarageFinish      78
GarageType        76
BsmtCond          45
BsmtExposure      44
BsmtQual          44
BsmtFinType1      42
BsmtFinType2      42
MasVnrArea        15
MSZoning           4
BsmtFullBath       2
BsmtHalfBath       2
Functional         2
Utilities          2
GarageCars         1
GarageArea         1
TotalBsmtSF        1
KitchenQual        1
BsmtUnfSF          1
BsmtFinSF2         1
BsmtFinSF1         1
Exterior2nd        1
Exterior1st        1
SaleType           1
dtype: int64


In [7]:
# drop high null from train
all_col_set= set(train_df.columns.tolist())
null_col_set= set(null_train_df.index.tolist())
not_null_col_set= all_col_set - null_col_set

# look at training data after removing most of the null values
some_null_train_df= train_df.loc[:, list(not_null_col_set)]
print(some_null_train_df.head())
print('shape of some null train df is:{}'.format(some_null_train_df.shape))

   TotRmsAbvGrd  MSSubClass  MiscVal  TotalBsmtSF  WoodDeckSF LotConfig  \
0             8          60        0          856           0    Inside   
1             6          20        0         1262         298       FR2   
2             6          60        0          920           0    Inside   
3             7          70        0          756           0    Corner   
4             9          60        0         1145         192       FR2   

   BsmtFinSF2 BsmtCond BsmtQual  HalfBath KitchenQual  Id ExterQual SaleType  \
0           0       TA       Gd         1          Gd   1        Gd       WD   
1           0       TA       Gd         0          TA   2        TA       WD   
2           0       TA       Gd         1          Gd   3        Gd       WD   
3           0       Gd       TA         0          Gd   4        TA       WD   
4           0       TA       Gd         1          Gd   5        Gd       WD   

  HeatingQC SaleCondition ExterCond Electrical  Fireplaces PavedDriv

In [8]:
# drop the same columns as above from test also
test_drop_set= not_null_col_set - set(['SalePrice'])

test_drop_df= test_df.loc[:, list(test_drop_set)]

# look at test after dropping columns
print(test_drop_df.head())
print('shape of test df is:{}'.format(test_drop_df.shape))

   TotRmsAbvGrd  MSSubClass  MiscVal  TotalBsmtSF  WoodDeckSF LotConfig  \
0             5          20        0        882.0         140    Inside   
1             6          20    12500       1329.0         393    Corner   
2             6          60        0        928.0         212    Inside   
3             7          60        0        926.0         360    Inside   
4             5         120        0       1280.0           0    Inside   

   BsmtFinSF2 BsmtCond BsmtQual  HalfBath KitchenQual    Id ExterQual  \
0       144.0       TA       TA         0          TA  1461        TA   
1         0.0       TA       TA         1          Gd  1462        TA   
2         0.0       TA       Gd         1          TA  1463        TA   
3         0.0       TA       TA         1          Gd  1464        TA   
4         0.0       TA       Gd         0          Gd  1465        Gd   

  SaleType HeatingQC SaleCondition ExterCond Electrical  Fireplaces  \
0       WD        TA        Normal     

In [9]:
# find numeric null for train and categorical null for train
train_null_col= some_null_train_df.isnull().sum()[some_null_train_df.isnull().sum() > 0].index.tolist()
# all null training columns
train_col= sorted(some_null_train_df.columns.tolist())
# split numeric and categorical columns
train_num_col= some_null_train_df.select_dtypes(include= np.number).columns.tolist()
train_str_col= some_null_train_df.select_dtypes(exclude= np.number).columns.tolist()
# train null numeric
train_null_num_col= list(set(train_num_col).intersection(train_null_col))
print('numeric null training columns:{}'.format(train_null_num_col))
# train null categorical
train_null_str_col= list(set(train_str_col).intersection(train_null_col))
print('categorical null training columns:{}'.format(train_null_str_col))

numeric null training columns:['MasVnrArea']
categorical null training columns:['BsmtFinType1', 'BsmtCond', 'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'Electrical']


In [10]:
# impute the numeric columns with median

if len(train_null_num_col) > 0:
    for i in train_null_num_col:
        mid1= some_null_train_df[i].median()
        some_null_train_df[i].fillna(mid1, inplace= True)
        # impute the numeric column in test
        test_df[i].fillna(mid1, inplace= True)


In [11]:
# impute the categorical columns with simpleimputer
imputer= SimpleImputer(strategy= 'most_frequent')
# fit and transform training
train_impute= imputer.fit_transform(some_null_train_df[train_null_str_col])
train_imputed_df= pd.DataFrame(train_impute, columns= train_null_str_col)
train_good= some_null_train_df.drop(columns= train_null_str_col)

# recombine training
train_good= pd.concat([train_good, train_imputed_df], axis= 1)


# transform test
test_impute= imputer.transform(test_drop_df[train_null_str_col])
test_imputed_df= pd.DataFrame(test_impute, columns= train_null_str_col)
test_good= test_drop_df.drop(columns= train_null_str_col)
# recombine test
test_good= pd.concat([test_good, test_imputed_df], axis= 1)


# verify all null are gone
print("remaining null in training is:{}".format(train_good.isnull().sum().sum()))

remaining null in training is:0


In [12]:
# check for remaining null in good_test df
test_null= (test_good.isnull().sum()[test_good.isnull().sum() > 0]).index.tolist()
print('remaining null test col are: {}'.format(test_null))

# drop null test col from  training and test
train_good_col= set(train_good.columns.tolist()) - set(test_null)
train_good= train_good[sorted(list(train_good_col))]
test_good_col=  set(test_good.columns.tolist()) - set(test_null)
test_good= test_good[sorted(list(test_good_col))]

remaining null test col are: ['TotalBsmtSF', 'BsmtFinSF2', 'KitchenQual', 'SaleType', 'MasVnrArea', 'Exterior2nd', 'Exterior1st', 'BsmtFinSF1', 'GarageArea', 'BsmtUnfSF', 'GarageCars', 'BsmtHalfBath', 'Utilities', 'Functional', 'MSZoning', 'BsmtFullBath']


In [13]:
# verify all nulls are gone from training and test
print('remaining null in training:{}'.format(train_good.isnull().sum().sum()))
print('remaining null in test:{}'.format(test_good.isnull().sum().sum()))

remaining null in training:0
remaining null in test:0


In [14]:
# look at statistics for training numeric
train_good.describe(include= np.number).T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1stFlrSF,1460.0,1162.626712,386.587738,334.0,882.0,1087.0,1391.25,4692.0
2ndFlrSF,1460.0,346.992466,436.528436,0.0,0.0,0.0,728.0,2065.0
3SsnPorch,1460.0,3.409589,29.317331,0.0,0.0,0.0,0.0,508.0
BedroomAbvGr,1460.0,2.866438,0.815778,0.0,2.0,3.0,3.0,8.0
EnclosedPorch,1460.0,21.95411,61.119149,0.0,0.0,0.0,0.0,552.0
Fireplaces,1460.0,0.613014,0.644666,0.0,0.0,1.0,1.0,3.0
FullBath,1460.0,1.565068,0.550916,0.0,1.0,2.0,2.0,3.0
GrLivArea,1460.0,1515.463699,525.480383,334.0,1129.5,1464.0,1776.75,5642.0
HalfBath,1460.0,0.382877,0.502885,0.0,0.0,0.0,1.0,2.0
Id,1460.0,730.5,421.610009,1.0,365.75,730.5,1095.25,1460.0


id can be dropped.  some columns are skewed including target.  Some are zero except outliers: bsmthalfbath, screenporch, bsmtfinsf2, and 3ssnporch.

In [None]:
# find the shape of the num cols
train_num_df= train_good.loc[:, train_good.select_dtypes(include= np.number).columns.tolist()]

print("shape of train num df is:{}".format(train_num_df.shape))

count1= 0
fig1, ax1= plt.subplots(7, 4, figsize= (40, 60))
for i in range(7):
    for j in range(4):
        if count1 < 27:
            avg1= train_num_df.iloc[:, count1].mean()
            mid1= train_num_df.iloc[:, count1].median()
            ax1[i, j].hist(train_num_df.iloc[:, count1], edgecolor= 'k')
            ax1[i, j].axvline(avg1, color= 'green', linewidth= 2, label= 'mean')
            ax1[i, j].axvline(mid1, color= 'orange', linewidth= 2, label= 'median')
            ax1[i, j].set_title(train_num_df.iloc[:, count1].name, fontsize= 32)
            ax1[i, j].tick_params(axis= 'x', labelsize= 14)
            ax1[i, j].tick_params(axis= 'y', labelsize= 14)
            count1= count1 + 1

shape of train num df is:(1460, 27)


In [None]:
# look at statistics for training categorical
train_good.describe(exclude= np.number).T

In [None]:
# split the data into feature and target
train1= train_good.copy()

y1= train1['SalePrice']
x1= train1.drop(['SalePrice', 'Id'], axis= 1)

# split data into training and validation
xtrain, xval, ytrain, yval= train_test_split(x1, y1, test_size= 0.2, shuffle= True, random_state= 24)

xtrain= xtrain.reset_index(drop= True)
xval= xval.reset_index(drop= True)
ytrain= ytrain.reset_index(drop= True)
yval= yval.reset_index(drop= True)

# look at shape of new dataframes
print('size of xtrain is:{}'.format(xtrain.shape))
print('size of xval is:{}'.format(xval.shape))
print('size of ytrain is:{}'.format(ytrain.shape))
print('size of yval is:{}'.format(yval.shape))

size of xtrain is:(1168, 51)
size of xval is:(292, 51)
size of ytrain is:(1168,)
size of yval is:(292,)


In [None]:
# numeric column for training and validation
train_num_col= xtrain.select_dtypes(include= np.number).columns.tolist()
# categorical column for training and validation
train_str_col= xtrain.select_dtypes(exclude= np.number).columns.tolist()

#separate the categorical and numeric for training
xtrain_num= xtrain[train_num_col]
xtrain_str= xtrain[train_str_col]

#separate the categorical and numeric for validation
xval_num= xval[train_num_col]
xval_str= xval[train_str_col]

# use minmaxscaler to scale training and validation
scaler= MinMaxScaler()
xtrain_scaled= scaler.fit_transform(xtrain_num)
xval_scaled= scaler.transform(xval_num)
# make dataframe
xtrain_scaled= pd.DataFrame(data= xtrain_scaled, columns= xtrain_num.columns.tolist())
xval_scaled= pd.DataFrame(data= xval_scaled, columns= xval_num.columns.tolist())


In [None]:
# make the one hot encoder
encoder= OneHotEncoder(sparse_output= False, handle_unknown= 'ignore')

# one hot encode training
xtrain1_encode= encoder.fit_transform(xtrain_str)
# one hot encode validation
xval1_encode= encoder.transform(xval_str)

# make dataframe
xtrain_encode= pd.DataFrame(data= xtrain1_encode, columns= encoder.get_feature_names_out())
xval_encode= pd.DataFrame(data= xval1_encode, columns= encoder.get_feature_names_out())

# recombine encoded with numeric for unscaled numeric
xtrain_unscaled= pd.concat([xtrain_num, xtrain_encode], axis= 1)
xval_unscaled= pd.concat([xval_num, xval_encode], axis= 1)

# recombine encoded with numeric for scaled numeric
xtrain_scaled= pd.concat([xtrain_scaled, xtrain_encode], axis= 1)
xval_scaled= pd.concat([xval_scaled, xval_encode], axis= 1)

# look at train dataframes encoded unscaled
print('This is the training dataframe encoded but unscaled')
print(xtrain_unscaled.head())

print('_'*70)
print('This is the training dataframe encoded and scaled')
print(xtrain_scaled.head())

This is the training dataframe encoded but unscaled
   1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  EnclosedPorch  Fireplaces  \
0      1035       371          0             3              0           0   
1      2069       574          0             3              0           1   
2      1630         0          0             3              0           1   
3      1629         0          0             3              0           1   
4      1344         0          0             2              0           1   

   FullBath  GrLivArea  HalfBath  KitchenAbvGr  LotArea  LowQualFinSF  \
0         1       1406         0             1     7920             0   
1         2       2643         1             1    13693             0   
2         2       1630         0             1     9750             0   
3         2       1629         0             1    10655             0   
4         1       1344         0             1     9503             0   

   MSSubClass  MiscVal  MoSold  OpenPorchSF  O

In [None]:
# make function to fit, train and compare models
# function needs to inverse the transform 

def metric_fn(name1, model, xtrain, ytrain, xval, yval):
    # fit model
    model.fit(xtrain, ytrain)
    # predict xtrain
    y_pred_train= model.predict(xtrain)
    # predict xval
    y_pred_val= model.predict(xval)
    
    # evaluate model on training
    metric_train= np.sqrt(mean_squared_log_error(ytrain, y_pred_train))
    # evaluate model on validation
    metric_val= np.sqrt(mean_squared_log_error(yval, y_pred_val))

    # make dataframe to hold root mean squared log error for training and validation
    metric_df= pd.DataFrame([[name1, metric_train, metric_val]], columns= ['model name', 'RMSLE train', 'RMSLE val'])
    
    return metric_df

In [None]:
#reset all dataframe index before fitting

xtrain_scaled= xtrain_scaled.reset_index(drop= True)
xval_scaled= xval_scaled.reset_index(drop= True)
xtrain_unscaled= xtrain_unscaled.reset_index(drop= True)
xval_unscaled= xval_unscaled.reset_index(drop= True)
ytrain= ytrain.reset_index(drop= True)
yval= yval.reset_index(drop= True)

In [None]:
# for decision tree

dtree_model= DecisionTreeRegressor(random_state= 24)

dtree_df= metric_fn('Decision Tree', dtree_model, xtrain_unscaled, ytrain, xval_unscaled, yval)

metric_df= dtree_df

print(metric_df.head())

      model name  RMSLE train  RMSLE val
0  Decision Tree          0.0   0.228329


In [23]:
# random forest

rforest_model= RandomForestRegressor(random_state= 24)

rforest_df= metric_fn('Random Forest', rforest_model, xtrain_unscaled, ytrain, xval_unscaled, yval)

metric_df= pd.concat([metric_df, rforest_df], axis= 0)

print(metric_df.head())

      model name  RMSLE train  RMSLE val
0  Decision Tree     0.000000   0.228329
0  Random Forest     0.062784   0.145923


In [24]:
# xgboost

xg_model= XGBRegressor(random_state= 24)

xgb_df= metric_fn('XGBoost', xg_model, xtrain_unscaled, ytrain, xval_unscaled, yval)

metric_df= pd.concat([metric_df, xgb_df], axis= 0)

print(metric_df.head())

      model name  RMSLE train  RMSLE val
0  Decision Tree     0.000000   0.228329
0  Random Forest     0.062784   0.145923
0        XGBoost     0.013982   0.148825


In [25]:
# hyperparameter tuning for xgboost
params= {'eta':[0.01, 0.03, 0.05], 'n_estimators':[250, 750, 1200]}

rmsle= 10
for i in range(3):
    for j in range(3):
        xg_model2= XGBRegressor(random_state= 24, eta= params['eta'][i], n_estimators= params['n_estimators'][j])
        xg_df2= metric_fn('XG_opt', xg_model2, xtrain_unscaled, ytrain, xval_unscaled, yval)
        print(f"eta is: {params['eta'][i]}, estimators is:{params['n_estimators'][j]} and val rmsle is {xg_df2.loc[0, 'RMSLE val']}")
        if xg_df2.loc[0, 'RMSLE val'] < rmsle:
            rmsle= xg_df2.loc[0, 'RMSLE val']
            best_eta= params['eta'][i]
            best_estimator= params['n_estimators'][j]

print(f"best eta is: {best_eta} and best estimators is: {best_estimator}")

eta is: 0.01, estimators is:250 and val rmsle is 0.1657096547517683
eta is: 0.01, estimators is:750 and val rmsle is 0.13329727011536494
eta is: 0.01, estimators is:1200 and val rmsle is 0.13171934192052112
eta is: 0.03, estimators is:250 and val rmsle is 0.1341681279311982
eta is: 0.03, estimators is:750 and val rmsle is 0.1318837951216614
eta is: 0.03, estimators is:1200 and val rmsle is 0.13179275875845045
eta is: 0.05, estimators is:250 and val rmsle is 0.12902304457577574
eta is: 0.05, estimators is:750 and val rmsle is 0.1291633505634754
eta is: 0.05, estimators is:1200 and val rmsle is 0.12955014828991387
best eta is: 0.05 and best estimators is: 250


In [26]:
# optimized xgboost using hyperparameters
# xgboost

xg_opt= XGBRegressor(random_state= 24, eta= best_eta, n_estimators= best_estimator)

xg_opt_df= metric_fn('XGBoost_Opt', xg_opt, xtrain_unscaled, ytrain, xval_unscaled, yval)

metric_df= pd.concat([metric_df, xg_opt_df], axis= 0)

print(metric_df.head())

      model name  RMSLE train  RMSLE val
0  Decision Tree     0.000000   0.228329
0  Random Forest     0.062784   0.145923
0        XGBoost     0.013982   0.148825
0    XGBoost_Opt     0.040799   0.129023


In [27]:
# hyperparameter tuning for random forest
params= {'max_features':[0.5, 0.75, 1], 'n_estimators':[100, 350, 800]}

rmsle= 10
for i in range(3):
    for j in range(3):
        rf_model2= RandomForestRegressor(random_state= 24, max_features= params['max_features'][i], n_estimators= params['n_estimators'][j])
        rf_df2= metric_fn('Random Forest_opt', rf_model2, xtrain_unscaled, ytrain, xval_unscaled, yval)
        print(f"max features is: {params['max_features'][i]}, estimators is:{params['n_estimators'][j]}, and val rmsle is {rf_df2.loc[0, 'RMSLE val']}")
        if rf_df2.loc[0, 'RMSLE val'] < rmsle:
            rmsle= rf_df2.loc[0, 'RMSLE val']
            best_rf_feature= params['max_features'][i]
            best_rf_estimator= params['n_estimators'][j]


print(f"best max_features is: {best_rf_feature} and best estimators is: {best_rf_estimator}")

max features is: 0.5, estimators is:100, and val rmsle is 0.14599041836815574
max features is: 0.5, estimators is:350, and val rmsle is 0.14456812615575573
max features is: 0.5, estimators is:800, and val rmsle is 0.14443632130323492
max features is: 0.75, estimators is:100, and val rmsle is 0.14324285323874367
max features is: 0.75, estimators is:350, and val rmsle is 0.14295761822977618
max features is: 0.75, estimators is:800, and val rmsle is 0.14407657036858273
max features is: 1, estimators is:100, and val rmsle is 0.21278965714630046
max features is: 1, estimators is:350, and val rmsle is 0.20858104131739116
max features is: 1, estimators is:800, and val rmsle is 0.20813267276651845
best max_features is: 0.75 and best estimators is: 350


In [28]:
# optimized random forest using hyperparameters


rf_opt= RandomForestRegressor(random_state= 24, max_features= best_rf_feature, n_estimators= best_rf_estimator)

rf_opt_df= metric_fn('Random Forest_Opt', rf_opt, xtrain_unscaled, ytrain, xval_unscaled, yval)

metric_df= pd.concat([metric_df, rf_opt_df], axis= 0)

print(metric_df.head())

          model name  RMSLE train  RMSLE val
0      Decision Tree     0.000000   0.228329
0      Random Forest     0.062784   0.145923
0            XGBoost     0.013982   0.148825
0        XGBoost_Opt     0.040799   0.129023
0  Random Forest_Opt     0.060261   0.142958


In [29]:
# fit full training on everything

train2= train_good.copy().reset_index(drop= True)
# make y train
ytrain_final= train2['SalePrice']
# make x train and drop id column
xtrain2= train2.drop(['SalePrice', 'Id'], axis= 1)

# split columns into categorical and numeric
train2_str_col= sorted(xtrain2.select_dtypes(exclude= np.number).columns.tolist())
train2_num_col= sorted(xtrain2.select_dtypes(include= np.number).columns.tolist())

# encode categorical columns
encoder= OneHotEncoder(sparse_output= False, handle_unknown= 'ignore')
xtrain_encoded= encoder.fit_transform(xtrain2[train2_str_col])
xtrain_encoded_df= pd.DataFrame(data= xtrain_encoded, columns= encoder.get_feature_names_out())

# recombine training numeric and string
xtrain_final= pd.concat([xtrain2[train2_num_col], xtrain_encoded_df], axis= 1)

# look at x train and y train dataframes
print(xtrain_final.head())
print('-'*80)
print(ytrain_final.head())

   1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  EnclosedPorch  Fireplaces  \
0       856       854          0             3              0           0   
1      1262         0          0             3              0           1   
2       920       866          0             3              0           1   
3       961       756          0             3            272           1   
4      1145      1053          0             4              0           1   

   FullBath  GrLivArea  HalfBath  KitchenAbvGr  LotArea  LowQualFinSF  \
0         2       1710         1             1     8450             0   
1         2       1262         0             1     9600             0   
2         2       1786         1             1    11250             0   
3         1       1717         0             1     9550             0   
4         2       2198         1             1    14260             0   

   MSSubClass  MiscVal  MoSold  OpenPorchSF  OverallCond  OverallQual  \
0          60        0   

In [30]:
# need to encode test
test2= test_good.copy()

# split columns into categorical and numeric
test2_str_col= sorted(test2.select_dtypes(exclude= np.number).columns.tolist())
test2_num_col= sorted(test2.select_dtypes(include= np.number).columns.tolist())

# encode categorical columns
test_encoded= encoder.transform(test2[test2_str_col])
test_encoded_df= pd.DataFrame(data= test_encoded, columns= encoder.get_feature_names_out())

# recombine training numeric and string
test_final= pd.concat([test2[test2_num_col], test_encoded_df], axis= 1)
# get id column
test_id= test_final['Id']
# drop id column from test final
test_final.drop(['Id'], axis= 1, inplace= True)

# look at final test dataframe
print(test_final.head())
print("shape of test final is :{}".format(test_final.shape))

   1stFlrSF  2ndFlrSF  3SsnPorch  BedroomAbvGr  EnclosedPorch  Fireplaces  \
0       896         0          0             2              0           0   
1      1329         0          0             3              0           0   
2       928       701          0             3              0           1   
3       926       678          0             3              0           1   
4      1280         0          0             2              0           0   

   FullBath  GrLivArea  HalfBath  KitchenAbvGr  LotArea  LowQualFinSF  \
0         1        896         0             1    11622             0   
1         1       1329         1             1    14267             0   
2         2       1629         1             1    13830             0   
3         2       1604         1             1     9978             0   
4         2       1280         0             1     5005             0   

   MSSubClass  MiscVal  MoSold  OpenPorchSF  OverallCond  OverallQual  \
0          20        0   

In [31]:
# train with full dataset and predict
xg_final= XGBRegressor(random_state= 24, eta= best_eta, n_estimators= best_estimator)

# fit model
xg_final.fit(xtrain_final, ytrain_final)

# predict test
y_pred_test= xg_final.predict(test_final)
# convert to series
y_pred_test= pd.Series(y_pred_test, name= 'SalePrice')

# combine predictions and id to make submission
final_df= pd.concat([test_id, y_pred_test], axis= 1)

# export submission file
final_df.to_csv("/kaggle/working/submission.csv", index= False)