In [1]:
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, ElasticNetCV, RidgeCV, Ridge, LassoCV, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import scipy
from scipy import stats
from scipy.stats import skew, pearsonr

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

import warnings
warnings.filterwarnings('ignore')

from CrossFoldEncoder import CrossFoldEncoder
from category_encoders import MEstimateEncoder, TargetEncoder, BinaryEncoder, OrdinalEncoder, OneHotEncoder


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from Project1 import *


In [11]:
# Set random seed
seed_val = 1160
np.random.seed(seed_val)

# linear_numeric_feats = ['Lot_Frontage', 'Lot_Area', 'Year_Built', 'Year_Remod_Add',
#                         'Mas_Vnr_Area', 'Bsmt_Unf_SF', 'Total_Bsmt_SF', 'First_Flr_SF',
#                         'Second_Flr_SF', 'Gr_Liv_Area', 'Garage_Area', 'Wood_Deck_SF',
#                         'Open_Porch_SF', 'Latitude', 'Longitude']

# linear_numeric_feats = ["Lot_Frontage", "Lot_Area", "Year_Built", "Year_Remod_Add", "Mas_Vnr_Area", "BsmtFin_SF_2", "Bsmt_Unf_SF", "Total_Bsmt_SF", "First_Flr_SF", "Second_Flr_SF",
#                         "Low_Qual_Fin_SF", "Gr_Liv_Area", "Garage_Yr_Blt", "Garage_Area", "Wood_Deck_SF", "Open_Porch_SF", "Enclosed_Porch", "Three_season_porch", "Screen_Porch", "Misc_Val", "Longitude", "Latitude"]

target_fold_dir = "fold1"

In [12]:
# Load the dataframes
X_train, y_train, X_test, y_test = load_dataframe(target_fold_dir)

# Make copies for processing
X_train_processed = X_train.copy()
X_test_processed = X_test.copy()
y_train_processed = y_train.copy()

# Log transform the training target
X_train_processed, y_train_processed, y_train_mean = transform_training_target(
    X_train_processed, y_train_processed)

# Clean the data
X_train_processed = clean(X_train_processed)
X_test_processed = clean(X_test_processed)

# Preprocess numerical features
X_train_processed, skewed_feats = process_numeric_features(
    X_train_processed)
# Remarks: apply the same transformation (on training data) to the test data
X_test_processed[skewed_feats] = np.log1p(X_test_processed[skewed_feats])

# Delete outliers in linear numerical features
numeric_feats = X_train_processed.dtypes[X_train_processed.dtypes != "object"].index
for feature in numeric_feats:
    if feature in X_train_processed.columns:
        X_train_processed, y_train_processed = delete_outliers(
            X_train_processed, y_train_processed, feature)
        
# Preprocess categorical features
# separate the categorical and numerical features
X_train_cat_cols = X_train_processed.select_dtypes(exclude=['number'])
X_train_num_cols = X_train_processed.select_dtypes(include=['number'])
X_test_cat_cols = X_test_processed.select_dtypes(exclude=['number'])
X_test_num_cols = X_test_processed.select_dtypes(include=['number'])

# method 1: dummy encoding
X_train_cat_cols = pd.get_dummies(X_train_cat_cols)
X_test_cat_cols = pd.get_dummies(X_test_cat_cols)
# If testing data don't have the feature, fill it with the mean of the training data
# find the missing columns
missing_cols = set(X_train_cat_cols.columns) - \
    set(X_test_cat_cols.columns)
print(f"Missing columns: {missing_cols}")
X_train_mean = X_train_cat_cols.mean()
# fill the missing columns with the mean of the training data
X_test_cat_cols = X_test_cat_cols.reindex(
    columns=X_train_cat_cols.columns)
for col in missing_cols:
    X_test_cat_cols[col] = X_train_mean[col]
    
print("X_test_cat_cols.columns", X_test_cat_cols.columns)

Drop 19 rows
Dropping categorical features with missing values: Misc_Feature    1956
Mas_Vnr_Type    1222
dtype: int64
Dropping categorical features with missing values: Misc_Feature    849
Mas_Vnr_Type    542
dtype: int64
Log transformed 21 skewed features
The skewed features are Index(['Lot_Area', 'Mas_Vnr_Area', 'BsmtFin_SF_2', 'Bsmt_Unf_SF',
       'First_Flr_SF', 'Second_Flr_SF', 'Low_Qual_Fin_SF', 'Gr_Liv_Area',
       'Bsmt_Full_Bath', 'Bsmt_Half_Bath', 'Half_Bath', 'Kitchen_AbvGr',
       'TotRms_AbvGrd', 'Fireplaces', 'Wood_Deck_SF', 'Open_Porch_SF',
       'Enclosed_Porch', 'Three_season_porch', 'Screen_Porch', 'Pool_Area',
       'Misc_Val'],
      dtype='object')
Missing columns: {'Exterior_1st_ImStucc', 'Overall_Qual_Very_Poor', 'Bsmt_Cond_Excellent', 'Exterior_1st_CBlock', 'Exterior_2nd_PreCast', 'Condition_2_RRAn', 'Heating_QC_Poor', 'Neighborhood_Landmark', 'Exterior_1st_Stone', 'Condition_2_PosN', 'Heating_OthW', 'Sale_Type_VWD', 'MS_SubClass_One_Story_with_Finished_At

In [13]:
print(X_test_cat_cols.head())

Unnamed: 0,MS_SubClass_Duplex_All_Styles_and_Ages,MS_SubClass_One_Story_1945_and_Older,MS_SubClass_One_Story_1946_and_Newer_All_Styles,MS_SubClass_One_Story_PUD_1946_and_Newer,MS_SubClass_One_Story_with_Finished_Attic_All_Ages,MS_SubClass_One_and_Half_Story_Finished_All_Ages,MS_SubClass_One_and_Half_Story_Unfinished_All_Ages,MS_SubClass_PUD_Multilevel_Split_Level_Foyer,MS_SubClass_Split_Foyer,MS_SubClass_Split_or_Multilevel,...,Sale_Type_New,Sale_Type_Oth,Sale_Type_VWD,Sale_Type_WD,Sale_Condition_Abnorml,Sale_Condition_AdjLand,Sale_Condition_Alloca,Sale_Condition_Family,Sale_Condition_Normal,Sale_Condition_Partial
0,False,False,True,False,0.002071,False,False,False,False,False,...,False,False,0.000518,False,True,False,False,False,False,False
1,False,False,False,False,0.002071,False,False,False,False,False,...,False,False,0.000518,True,False,False,False,False,True,False
2,False,False,False,False,0.002071,False,False,False,False,False,...,False,False,0.000518,True,False,False,False,False,True,False
3,False,False,True,False,0.002071,False,False,False,False,False,...,False,False,0.000518,True,False,False,False,False,True,False
4,False,False,False,False,0.002071,True,False,False,False,False,...,False,False,0.000518,True,False,False,False,False,True,False


In [None]:

# combine the categorical and numerical features
X_train_processed = pd.concat([X_train_num_cols, X_train_cat_cols], axis=1)
X_test_processed = pd.concat([X_test_num_cols, X_test_cat_cols], axis=1)

#################
# cols in this stage
print(X_train_processed.columns)
print(len(X_train_processed.columns))

# Train models
model_params = {
    'X_train': X_train_processed,
    'y_train': y_train_processed,
    'X_test': X_test_processed,
    'y_test': y_test
}
full_model_train_rmse, full_model_test_rmse = full_model(**model_params)
ridge_model_train_rmse, ridge_model_test_rmse = ridge_model(**model_params)
lasso_model_train_rmse, lasso_model_test_rmse = lasso_model(**model_params)
elasticnet_model_train_rmse, elasticnet_model_test_rmse = elasticnet_model(
    **model_params)
xgboost_model_train_rmse, xgboost_model_test_rmse = xgboost_model(
    **model_params)

# Hyperparameter Tuning
# print("Xgboost rmse before tuning: ", xgboost_model_rmse)
# print("Starting hyperparameter tuning for XGBoost...")
# tuned_params = tune_xgboost_params(
#     X_train_processed, y_train_processed, n_trials=100)
# print(f"Tuned params: {tuned_params}")
# xgb_tuned = XGBRegressor(**tuned_params)
# xgb_tuned.fit(X_train_processed, y_train_processed)
# y_pred_xgb_tuned = xgb_tuned.predict(X_test_processed)
# xgboost_model_rmse = rmse(y_test_log, y_pred_xgb_tuned)

# Conclusion
print("Current fold target: ", target_fold_dir)
print("Training Error:")
print(f"Full model score: {full_model_train_rmse:.5f} RMSE")
print(f"Lasso score with optimal alpha: {lasso_model_train_rmse:.5f} RMSE")
print(f"Ridge score with optimal alpha: {ridge_model_train_rmse:.5f} RMSE")
print(f"Elasticnet score: {elasticnet_model_train_rmse:.5f} RMSE")
print(f"Boosting Tree score: {xgboost_model_train_rmse:.5f} RMSE")

print("Test Error:")
print(f"Full model score: {full_model_test_rmse:.5f} RMSE")
print(f"Lasso score with optimal alpha: {lasso_model_test_rmse:.5f} RMSE")
print(f"Ridge score with optimal alpha: {ridge_model_test_rmse:.5f} RMSE")
print(f"Elasticnet score: {elasticnet_model_test_rmse:.5f} RMSE")
print(f"Boosting Tree score: {xgboost_model_test_rmse:.5f} RMSE")
