In [1]:
#pip install tabulate

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from feature_handler import FeatureHandler as FH
import pickle

pd.set_option('display.max_columns', None)

In [2]:
def same_cat(val, groups, col):
    for i in range(len(groups)):
        if val in groups[i]:
            val = f"{col}_group{i + 1}"
            break
    return val

def root_mean_squared_log_error(y_valid, y_preds):
    """Calculate root mean squared error of log(y_true) and log(y_pred)"""
    if len(y_preds)!=len(y_valid): return 'error_mismatch'
    y_preds_new = [math.log(x) for x in y_preds]
    y_valid_new = [math.log(x) for x in y_valid]
    return mean_squared_error(y_valid_new, y_preds_new, squared=False)

def outlier_removal(val, upper_band, lower_band):
    if val >= upper_band:
        val = upper_band
    elif val <= lower_band:
        val = lower_band
    return val

def outlier_finder(val, upper_band, lower_band):
    if val >= upper_band or val <= lower_band:
        val = 1
    else:
        val = 0
    return val

def outlier_handler(df, col):
    iqr = df[col].quantile(0.75) - df[col].quantile(0.25)
    upper_band = df[col].mean() + 3 * iqr
    lower_band = df[col].mean() - 3 * iqr
    df[f"{col}_OutHandler"] = df[col]
    df[col] = df[col].apply(lambda val: outlier_removal(val, upper_band, lower_band))
    df[f"{col}_OutHandler"] = df[f"{col}_OutHandler"].apply(lambda val: outlier_finder(val, upper_band, lower_band))
    return df[col], df[f"{col}_OutHandler"]

In [3]:
df = pd.read_csv("Data/TrainSet.csv")

df.drop(["Id", "PoolQC", "MiscFeature", "Alley", "Fence", "Street", "Utilities", "LandSlope", "Condition2", 
         "RoofMatl", "Heating", "LandContour", "LotConfig", "BldgType", "RoofStyle",
         "3SsnPorch", "BsmtFinSF2", "BsmtFinType2", "BsmtHalfBath", "ExterCond",
         "MSSubClass", "MiscVal", "MoSold", "PoolArea", "YrSold", "1stFlrSF", "GarageCond"], axis = 1, inplace = True)

missing_df = FH(df).missing_info
missing_cat = missing_df[missing_df["Data Type"] == "Object"]["Var Name"]
strategy = ["missing"] * len(missing_cat)
df = FH(df).simple_cat_imputer(cols = missing_cat, strategy = strategy, rand_seed = 42)
# df = FH(df).simple_num_imputer(cols = ["GarageYrBlt", "LotFrontage", "MasVnrArea"], 
#                                strategy = ["mean"] * 3, num_is_int = [True] * 3, rand_seed = 42)

num_missed_cols = ["GarageYrBlt", "LotFrontage", "MasVnrArea"]

for col in num_missed_cols:
    df[col].fillna(0, inplace = True)

In [4]:
# df = pd.read_csv("Data/TestSet.csv")

# df.drop(["Id", "PoolQC", "MiscFeature", "Alley", "Fence", "Street", "Utilities", "LandSlope", "Condition2", 
#          "RoofMatl", "Heating", "LandContour", "LotConfig", "BldgType", "RoofStyle",
#          "3SsnPorch", "BsmtFinSF2", "BsmtFinType2", "BsmtHalfBath", "ExterCond",
#          "MSSubClass", "MiscVal", "MoSold", "PoolArea", "YrSold", "1stFlrSF", "GarageCond"], axis = 1, inplace = True)

# missing_df = FH(df).missing_info
# missing_cat = missing_df[missing_df["Data Type"] == "Object"]["Var Name"]
# strategy = ["missing"] * len(missing_cat)
# df = FH(df).simple_cat_imputer(cols = missing_cat, strategy = strategy, rand_seed = 42)

# missed_num_cols = list(FH(df).missing_info["Var Name"])

# num_miss = len(missed_num_cols)

# df = FH(df).simple_num_imputer(cols = missed_num_cols, strategy = ["random"] * num_miss, num_is_int = [True] * num_miss, 
#                                rand_seed = 42)

In [5]:
ordered_cat = ['LotShape', 'ExterQual', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 
               'BsmtFinType1', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu', 
               'GarageFinish', 'GarageQual', 'PavedDrive', 'CentralAir']

dictLotShape = dict({'Reg':4,'IR1':3, 'IR2':2, 'IR3':1})
# dictExterCond = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictExterQual = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictBsmtQual = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictBsmtCond = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictBsmtExposure = dict({'Gd':4,'Av':3, 'Mn':2, 'No':1, 'Missing':0})
dictBsmtFinType1 = dict({'GLQ':6, 'ALQ':5,'BLQ':4, 'Rec':3,'LwQ':2 ,'Unf':1,'Missing':0})
# dictBsmtFinType2 = dict({'GLQ':6, 'ALQ':5,'BLQ':4, 'Rec':3,'LwQ':2 ,'Unf':1,'Missing':0})
dictHeatingQC = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictKitchenQual = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictFunctional = dict({'Typ':7, 'Min1':6,'Min2':5, 'Mod':4,'Maj1':3, 'Maj2':2, 'Sev':1})
dictFireplaceQu = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
# dictGarageCond = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictGarageFinish = dict({'Fin':3, 'RFn':2, 'Unf':1, 'Missing':0})
dictGarageQual = dict({'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'Missing':0})
dictPavedDrive = dict({'Y':3,'P':2,'N':1})
dictCentralAir = dict({'Y':1, 'N':0})

dicts = [dictLotShape, dictExterQual, dictBsmtQual, dictBsmtCond, dictBsmtExposure,
         dictBsmtFinType1, dictHeatingQC, dictKitchenQual, dictFunctional, dictFireplaceQu,
         dictGarageFinish, dictGarageQual, dictPavedDrive, dictCentralAir]

for i, j in zip(dicts, ordered_cat):
    df[j] = df[j].map(i)
    
MSZoning_group1 = ["FV", "RL"]
MSZoning_group2 = ["RH", "RM"]
MSZoning_groups = [MSZoning_group1, MSZoning_group2]


Neighborhood_group1 = ["Blmngtn", "ClearCr", "Crawfor", "SawyerW"] 
Neighborhood_group2 = ["Blueste", "BrkSide", "Mitchel", "NPKVill", "SWISU"]
Neighborhood_group3 = ["BrDale", "IDOTRR", "MeadowV"]
Neighborhood_group4 = ["NoRidge", "Timber", "Veenker"]
Neighborhood_groups = [Neighborhood_group1, Neighborhood_group2, Neighborhood_group3, Neighborhood_group4]

Condition1_group1 = ["Artery", "RRAn", "RRNn"]
Condition1_group2 = ["PosA", "PosN", "RRNe"]
Condition1_group3 = ["RRAe", "Feedr"]
Condition1_groups = [Condition1_group1, Condition1_group2, Condition1_group3]

HouseStyle_group1 = ["2.5Unf", "2.5Fin", "1Story"]
HouseStyle_groups = [HouseStyle_group1]

Exterior1st_group1 = ["AsphShn", "BrkComm", "CBlock"]
Exterior1st_group2 = ["ImStucc", "Stone"]
Exterior1st_groups = [Exterior1st_group1, Exterior1st_group2]

Exterior2nd_group1 = ["AsphShn", "Brk Cmn", "AsbShng", "CBlock"]
Exterior2nd_group2 = ["ImStucc", "Stone", "Stucco", "Wd Shng", "Other"]
Exterior2nd_groups = [Exterior2nd_group1, Exterior2nd_group2]

Foundation_group1 = ["Stone", "BrkTil"]
Foundation_group2 = ["Wood", "PConc"]
Foundation_groups = [Foundation_group1, Foundation_group2]

Electrical_group1 = ["Mix", "FuseP", "FuseF"]
Electrical_group2 = ["Missing", "SBrkr"]
Electrical_groups = [Electrical_group1, Electrical_group2]

GarageType_group1 = ["2Types", "CarPort", "Missing"]
GarageType_groups = [GarageType_group1]

SaleType_group1 = ["CWD", "New", "Con"]
SaleType_group2 = ["ConLD", "ConLw"]
SaleType_group3 = ["ConLI", "WD"]
SaleType_groups = [SaleType_group1, SaleType_group2, SaleType_group3]

SaleCondition_group1 = ["Alloca", "Family"]
SaleCondition_groups = [SaleCondition_group1]

#--------------------------------------------------------------------------------
to_be_changed_cols = ["MSZoning", "Neighborhood", "Condition1", "HouseStyle", 
                      "Exterior1st", "Exterior2nd", "Foundation", "Electrical", "GarageType", "SaleType", 
                      "SaleCondition"]
groups = [MSZoning_groups, Neighborhood_groups, Condition1_groups, 
          HouseStyle_groups, Exterior1st_groups, Exterior2nd_groups, Foundation_groups, Electrical_groups, 
          GarageType_groups, SaleType_groups, SaleCondition_groups]
#--------------------------------------------------------------------------------
group_numerator = 0
for col in to_be_changed_cols:
    df[col] = df[col].apply(lambda val: same_cat(val, groups[group_numerator], col))
    group_numerator += 1

In [6]:
feat_eng_df = df.copy()

In [7]:
feat_eng_df["Style_info"] = feat_eng_df["HouseStyle"] + "-" + feat_eng_df["GarageType"]

imp_lst = ['HouseStyle_group1-Attchd', '2Story-Attchd', 'HouseStyle_group1-Detchd', '2Story-Detchd', '1.5Fin-Detchd',
           '2Story-BuiltIn', '1.5Fin-Attchd', 'SLvl-Attchd']

feat_eng_df["Style_info"] = feat_eng_df["Style_info"].apply(lambda val: val if val in imp_lst else "Style_info_group1")

In [8]:
feat_eng_df["Sale_info"] = feat_eng_df["SaleType"] + "-" + feat_eng_df["SaleCondition"]

imp_lst = ['SaleType_group3-Normal', 'SaleType_group1-Partial', 'SaleType_group3-Abnorml']

feat_eng_df["Sale_info"] = feat_eng_df["Sale_info"].apply(lambda val: val if val in imp_lst else "Sale_info_group1")

In [9]:
feat_eng_df["Geography_info"] = (feat_eng_df["MSZoning"] + "-" + feat_eng_df["Neighborhood"]) 

imp_lst = ['MSZoning_group1-NAmes', 'MSZoning_group1-Neighborhood_group1', 'MSZoning_group1-CollgCr', 
           'MSZoning_group2-OldTown', 'MSZoning_group1-Neighborhood_group2', 'MSZoning_group1-Neighborhood_group4',
           'MSZoning_group1-Edwards', 'MSZoning_group1-Somerst', 'MSZoning_group1-Gilbert', 'MSZoning_group1-NridgHt', 
           'MSZoning_group1-NWAmes', 'MSZoning_group1-Sawyer', 'MSZoning_group2-Neighborhood_group3',
           'MSZoning_group2-Neighborhood_group2', 'MSZoning_group1-StoneBr', 'MSZoning_group1-OldTown']

feat_eng_df["Geography_info"] = feat_eng_df["Geography_info"].apply(lambda val: val if val in imp_lst 
                                                                                    else "Geography_info_group1")

In [10]:
feat_eng_df["Exterior_info"] = (feat_eng_df["Exterior1st"] + "-" + feat_eng_df["Exterior2nd"]) 

imp_lst = ['VinylSd-VinylSd', 'MetalSd-MetalSd', 'HdBoard-HdBoard', 'Wd Sdng-Wd Sdng', 'Plywood-Plywood', 'CemntBd-CmentBd',
            'Stucco-Exterior2nd_group2', 'BrkFace-BrkFace', 'HdBoard-Plywood', 'WdShing-Exterior2nd_group2',
            'AsbShng-Exterior2nd_group1', 'Wd Sdng-Exterior2nd_group2', 'BrkFace-Wd Sdng']

feat_eng_df["Exterior_info"] = feat_eng_df["Exterior_info"].apply(lambda val: val if val in imp_lst else "Exterior_info_group1")

In [11]:
feat_eng_df["Condition_info"] = feat_eng_df["Condition1"] + "-" + feat_eng_df["Electrical"]


imp_lst = ['Norm-Electrical_group2', 'Condition1_group3-Electrical_group2', 'Norm-FuseA', 'Condition1_group1-Electrical_group2',
           'Condition1_group2-Electrical_group2']
    
feat_eng_df["Condition_info"] = feat_eng_df["Condition_info"].apply(lambda val: val if val in imp_lst 
                                                                                    else "Condition_info_group1")

In [12]:
feat_eng_df["Material_info"] = feat_eng_df["Foundation"] + "-" + feat_eng_df["MasVnrType"]


imp_lst = ['CBlock-None', 'Foundation_group2-None', 'Foundation_group2-BrkFace', 'CBlock-BrkFace', 'Foundation_group1-None',
           'Foundation_group2-Stone', 'CBlock-Stone', 'Slab-None', 'CBlock-BrkCmn']

feat_eng_df["Material_info"] = feat_eng_df["Material_info"].apply(lambda val: val if val in imp_lst else "Material_info_group1")

In [13]:
cat_cols = ["HouseStyle", "GarageType", "SaleType", "SaleCondition", "MSZoning", "Neighborhood", "Exterior1st", "Exterior2nd",
            "Condition1", "Electrical", "Foundation", "MasVnrType"]

feat_eng_df.drop(cat_cols, axis = 1, inplace = True)

In [14]:
feat_eng_df["Bath_info"] = 0.5 * feat_eng_df["HalfBath"] + feat_eng_df["FullBath"]
feat_eng_df["Tot_Quality_info"] = (4 * feat_eng_df["OverallQual"] + feat_eng_df["ExterQual"] + 4 * feat_eng_df["KitchenQual"] - 
                                   feat_eng_df["LotShape"])
feat_eng_df["Kitchen_info"] = feat_eng_df["KitchenAbvGr"] * feat_eng_df["KitchenQual"]
feat_eng_df["BsmtFullBath"] = np.sign(feat_eng_df["BsmtFullBath"])
feat_eng_df["Modernity_info"] = feat_eng_df["YearRemodAdd"] - feat_eng_df["YearBuilt"]
feat_eng_df["Fireplace_info"] = feat_eng_df["FireplaceQu"] * feat_eng_df["Fireplaces"]
feat_eng_df['Geometry_info'] = feat_eng_df['LotArea'] / feat_eng_df['LotFrontage']
feat_eng_df['ValueProposition_info'] = feat_eng_df['YearBuilt'] * feat_eng_df['OverallQual']
feat_eng_df['FinishedBasement_info'] = np.sign(feat_eng_df['BsmtFinSF1'])
feat_eng_df['GarageValue_info'] = feat_eng_df['YearBuilt'] * feat_eng_df['GarageCars']
feat_eng_df['MiscValue_info'] = feat_eng_df['Fireplaces'] + feat_eng_df['OverallQual']

In [15]:
# feat_eng_df["EnclosedPorch"] = feat_eng_df["EnclosedPorch"].apply(lambda val: 1 if val > 0 else 0)
# feat_eng_df["ScreenPorch"]= feat_eng_df["ScreenPorch"].apply(lambda val: 1 if val > 0 else 0)
# feat_eng_df["Functional"]= feat_eng_df["Functional"].apply(lambda val: 1 if val == 7 else 0)

In [16]:
# outlier_cols = ["LotFrontage", "GarageArea", "OpenPorchSF", "WoodDeckSF", "TotalBsmtSF", "LotArea", "MasVnrArea",
#                   "BsmtFinSF1", "GrLivArea", "BsmtUnfSF", "2ndFlrSF"]

# for col in outlier_cols:
#     feat_eng_df[col], feat_eng_df[f"{col}_OutHandler"] = outlier_handler(feat_eng_df, col)

In [17]:
feat_eng_df.drop(["HalfBath", "FullBath", "OverallQual", "ExterQual", "LotShape", "KitchenAbvGr", "KitchenQual", 
                  "BsmtCond", "BsmtExposure", "YearRemodAdd", "YearBuilt", "Fireplaces", "FireplaceQu", "GarageQual", 
                  "BsmtFinSF1", "LowQualFinSF"], 
                  axis = 1, inplace = True)

# feat_eng_df.drop(["HalfBath", "FullBath", "OverallQual", "ExterQual", "LotShape", "KitchenAbvGr", "KitchenQual", 
#                   "BsmtCond", "BsmtExposure", "YearRemodAdd", "YearBuilt", "Fireplaces", "FireplaceQu", "GarageQual", 
#                   "BsmtFinSF1", "LowQualFinSF", "HeatingQC", "EnclosedPorch", "MasVnrArea", "BsmtUnfSF", 
#                   "WoodDeckSF", "LotFrontage", "PavedDrive", "GarageYrBlt", "OverallCond", "BsmtFinType1", 
#                   "OpenPorchSF"], 
#                   axis = 1, inplace = True)

In [18]:
X = feat_eng_df.copy()

categoric_cols = FH(X).categoric_cols
one_hot_cols = categoric_cols

for col in one_hot_cols:
    cat_encoded_df = pd.get_dummies(X[col])
    cat_encoded_cols = cat_encoded_df.columns
    for new_col in cat_encoded_cols:
        X[new_col] = cat_encoded_df[new_col]
    X = X.drop([col], axis = 1)

In [19]:
# X["LotFrontage"] = np.log(X["LotFrontage"])
X["LotArea"] = np.log(X["LotArea"])
X["GrLivArea"] = np.log(X["GrLivArea"])

In [20]:
X.drop("SalePrice", axis = 1, inplace = True)
y = df["SalePrice"]

In [21]:
len(X.columns)

95

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.05, random_state = 42)

xgb_reg = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.015, max_delta_step=0, max_depth=6,
             min_child_weight=0.0, monotone_constraints='()',
             n_estimators=3000, n_jobs=0, num_parallel_tree=2, random_state=0,
             reg_alpha=0.5, reg_lambda=1, scale_pos_weight=1, subsample=0.3,
             tree_method='exact', validate_parameters=1, verbosity=None)

xgb_reg.fit(X_train, y_train)
y_preds = xgb_reg.predict(X_test)

pickle.dump(xgb_reg, open("Model.pkl", "wb"))

model = pickle.load(open("Model.pkl", "rb"))

y_preds = model.predict(X_test)

model.score(X_test, y_test), model.score(X_train, y_train), root_mean_squared_log_error(y_test, y_preds)

(0.8860565082160374, 0.9994087479743873, 0.10613491742581124)

In [27]:
xgb_reg = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.015, max_delta_step=0, max_depth=6,
             min_child_weight=0.0, monotone_constraints='()',
             n_estimators=3000, n_jobs=0, num_parallel_tree=2, random_state=0,
             reg_alpha=0.5, reg_lambda=1, scale_pos_weight=1, subsample=0.3,
             tree_method='exact', validate_parameters=1, verbosity=None)

xgb_reg.fit(X, y)

pickle.dump(xgb_reg, open("salehouse_model.pkl", "wb"))

model = pickle.load(open("salehouse_model.pkl", "rb"))

model.score(X, y)

0.9993200156870617

In [24]:
# model = pickle.load(open("Model.pkl", "rb"))

# y_preds = model.predict(X)

# eval_df = pd.DataFrame({"Predicted Values": y_preds})

# df = pd.read_csv("Data/TestSet.csv")

# output = pd.DataFrame({'Id': df.Id,
#                        'SalePrice': y_preds})
# output.to_csv('submission8.csv', index=False)

# np.round(eval_df)