In [None]:
import pandas as pd
import os
import sklearn
import math
import joblib
from sklearn import ensemble 
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import impute
from sklearn import pipeline
from sklearn import compose
from sklearn import feature_selection
from sklearn import neighbors
from sklearn import linear_model
from sklearn import svm
from sklearn import ensemble
from sklearn import metrics
from sklearn.compose import TransformedTargetRegressor
from feature_engine.selection import DropDuplicateFeatures, DropCorrelatedFeatures
import numpy as np
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from shutil import rmtree

In [None]:
# Silence every warning for the rest of the session. Note that this also hides
# any warnings raised while the models below are being fitted.
warnings.filterwarnings('ignore')
# Show estimators as diagrams when a cell displays them.
sklearn.set_config(display="diagram")
# Data directory, relative to the notebook's working directory.
# NOTE(review): the name `dir` shadows the Python builtin of the same name.
dir = "../data"
# Directory passed to the pipelines below as `memory=cachedir`; it is deleted
# again by the closing `rmtree(cachedir)` cell.
cachedir = os.path.join(dir, "pipeline_cache")

In [None]:
# Load the training data and report its dimensions and column summary.
house_train = pd.read_csv(os.path.join(dir, "train.csv"))
# A bare expression that is not the last line of a cell is never displayed,
# so the shape has to be printed explicitly (as is done for the test set).
print(house_train.shape)
house_train.info()

In [None]:
# Create new aggregate features from the raw area and bathroom columns.
# NOTE(review): TotalSF adds GrLivArea on top of 1stFlrSF and 2ndFlrSF; if
# GrLivArea already contains the floor areas they are counted twice — confirm
# this is intended. The test-set cell must use the same formula.
house_train['TotalSF'] = house_train['TotalBsmtSF'] + house_train['GrLivArea'] + house_train['1stFlrSF'] + house_train['2ndFlrSF']
# Half baths (above grade and basement) are weighted as half a bathroom each.
house_train['TotalBathRooms'] = house_train['FullBath'] + house_train['BsmtFullBath'] + 0.5 * house_train['HalfBath'] +  0.5 * house_train['BsmtHalfBath']
# Sum of the four porch area columns.
house_train['TotalPorchSF'] = house_train['OpenPorchSF'] + house_train['3SsnPorch'] + house_train['EnclosedPorch'] + house_train['ScreenPorch']

In [None]:
def cont_selector(df):
    """Return the labels of the numeric-dtype columns of ``df`` as an Index."""
    numeric_part = df.select_dtypes(include='number')
    return numeric_part.columns

def cat_selector(df):
    """Return the labels of the non-numeric columns of ``df`` as an Index."""
    non_numeric_part = df.select_dtypes(exclude='number')
    return non_numeric_part.columns

def cast_to_cat(df, features):
    """Convert the listed columns of ``df`` to the ``category`` dtype in place.

    Parameters
    ----------
    df : DataFrame that is modified directly.
    features : iterable of column labels to convert.

    Returns
    -------
    None
    """
    for column_name in features:
        as_category = df[column_name].astype('category')
        df[column_name] = as_category

In [None]:
# Show which columns are currently treated as numeric and which as non-numeric,
# before any dtype casting is applied.
print(cont_selector(house_train))
print(cat_selector(house_train))

In [None]:
# Columns to convert to the category dtype: MSSubClass (listed explicitly —
# presumably a numeric code that should not be treated as a quantity) plus
# every column that is currently non-numeric.
features_to_cast = ["MSSubClass"]
features_to_cast.extend(cat_selector(house_train))
# Casts in place; the same list is reused later for the test set.
cast_to_cat(house_train, features_to_cast)
# Print the selectors again to confirm the new numeric / non-numeric split.
print(cont_selector(house_train))
print(cat_selector(house_train))

In [None]:
def get_features_to_drop_on_missingdata(df, threshold):
    """List the columns whose fraction of missing values exceeds ``threshold``.

    Parameters
    ----------
    df : DataFrame to inspect.
    threshold : fraction of rows; a column is reported only when its share of
        nulls is strictly greater than this value.

    Returns
    -------
    list of column labels, in the column order of ``df``.
    """
    row_count = float(df.shape[0])
    missing_fraction = df.isnull().sum() / row_count
    return [label for label, fraction in missing_fraction.items() if fraction > threshold]

def drop_features(df, features):
    """Remove the given columns from ``df`` in place.

    Parameters
    ----------
    df : DataFrame that is modified directly.
    features : column labels to remove.

    Returns
    -------
    None (the in-place drop produces no new frame).
    """
    df.drop(columns=features, inplace=True)
    return None

In [None]:
# Columns with more than 25% missing values in the training data. The list is
# kept so that the same columns can be dropped from the test set later.
missing_data_features_to_drop = get_features_to_drop_on_missingdata(house_train, 0.25)
print(missing_data_features_to_drop)
# Drops in place, so house_train itself loses these columns.
drop_features(house_train, missing_data_features_to_drop)
house_train.info()

In [None]:
# Keep the target in its own Series before removing it, together with the Id
# column, from the feature frame (the drop happens in place).
target = house_train['SalePrice']
features_to_drop = ['Id', 'SalePrice']
drop_features(house_train, features_to_drop)
house_train.info()

In [None]:
# Define the training features, the target and the evaluation metric.
# X_train and y_train are additional names for house_train and target, not copies.
X_train = house_train
y_train = target
# Root mean squared log error wrapped as a scorer. greater_is_better=False makes
# the scorer report the negated error, so the grid searches below (which keep
# the highest score) select the lowest error; best_score_ is therefore negative.
scoring = metrics.make_scorer(metrics.root_mean_squared_log_error, greater_is_better=False)

In [None]:
# Preprocessing pipeline for the linear estimators.
# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode into a dense array; handle_unknown='ignore' keeps transform
# from failing on categories that were not present when fitting.
steps = [('imp', impute.SimpleImputer(strategy="most_frequent")), ('ohe', preprocessing.OneHotEncoder(sparse_output=False,  handle_unknown='ignore'))]
cat_linear_preprocessor = pipeline.Pipeline(steps)

# Continuous columns: impute with SimpleImputer's default strategy, then standardize.
steps = [('imp', impute.SimpleImputer()), ('scaler', preprocessing.StandardScaler())]
cont_linear_preprocessor = pipeline.Pipeline(steps)

# The column lists are computed here, from house_train's dtypes at the time this
# cell runs, and stored in the transformer as explicit column labels.
linear_preprocessor = compose.ColumnTransformer(
    [ ("categorical", cat_linear_preprocessor, cat_selector(house_train)), ("continuous", cont_linear_preprocessor, cont_selector(house_train)) ]
)

In [None]:
# Lasso pipeline I: preprocessing -> variance filter (default threshold) ->
# Lasso fitted directly on the untransformed target.
# memory=cachedir enables caching of the fitted transformer steps on disk.
lasso_pipeline1 = pipeline.Pipeline([  
                    ('linear_preprocessor', linear_preprocessor),
                    ('zv_filter', feature_selection.VarianceThreshold()),
                    ('lasso', linear_model.Lasso())
                ], memory=cachedir)
# Last expression of the cell: displays the pipeline.
lasso_pipeline1

In [None]:
# Grid search of lasso pipeline I over the regularization strength.
# 'lasso__alpha' addresses the alpha parameter of the pipeline step named 'lasso'.
# NOTE(review): the grid includes alpha=0; the scikit-learn documentation advises
# against alpha=0 with Lasso, and warnings are suppressed in this notebook — confirm.
lasso_pipeline_params = {
                'lasso__alpha':[0, 0.0001, 0.0005, 0.0007, 0.0008, 0.0009, 0.001, 0.005, 0.05, 0.5, 0.75, 1, 2, 5, 10, 20]
            }
# 10-fold cross-validation with KFold's default settings.
cv = model_selection.KFold(10)
lasso_pipeline_grid1 = model_selection.GridSearchCV(lasso_pipeline1, lasso_pipeline_params, cv=cv, scoring=scoring, n_jobs=4)
lasso_pipeline_grid1.fit(X_train, y_train)
# best_score_ is the negated error produced by `scoring` (closer to 0 is better).
print(lasso_pipeline_grid1.best_params_)
print(lasso_pipeline_grid1.best_score_)
print(lasso_pipeline_grid1.best_estimator_)

In [None]:
# Lasso pipeline II: same preprocessing as pipeline I, but the target is
# Box-Cox transformed before fitting (Box-Cox requires strictly positive values).
# target_transformer is reused by the later lasso and ridge cells.
target_transformer = preprocessing.PowerTransformer(method='box-cox')

# TransformedTargetRegressor fits the regressor on the transformed target and
# maps predictions back to the original scale.
lasso_with_trans_target = TransformedTargetRegressor(
    regressor = linear_model.Lasso(),
    transformer= target_transformer
)
lasso_pipeline2 = pipeline.Pipeline([  
                    ('linear_preprocessor', linear_preprocessor),
                    ('zv_filter', feature_selection.VarianceThreshold()),
                    ('lasso', lasso_with_trans_target)
                ], memory=cachedir)
# Last expression of the cell: displays the pipeline.
lasso_pipeline2

In [None]:
# Grid search of lasso pipeline II.
# 'lasso__regressor__alpha' reaches the alpha of the Lasso wrapped inside the
# TransformedTargetRegressor that sits in the 'lasso' step.
# This rebinds lasso_pipeline_params and cv from the previous grid-search cell.
lasso_pipeline_params = {
                'lasso__regressor__alpha':[0, 0.0001, 0.0005, 0.0007, 0.0008, 0.0009, 0.001, 0.005, 0.05, 0.5, 0.75, 1, 2, 5, 10, 20]
            }
cv = model_selection.KFold(10)
lasso_pipeline_grid2 = model_selection.GridSearchCV(lasso_pipeline2, lasso_pipeline_params, cv=cv, scoring=scoring, n_jobs=4)
lasso_pipeline_grid2.fit(X_train, y_train)
# best_score_ is the negated error produced by `scoring` (closer to 0 is better).
print(lasso_pipeline_grid2.best_params_)
print(lasso_pipeline_grid2.best_score_)
print(lasso_pipeline_grid2.best_estimator_)

In [None]:
# Preprocessing pipeline for the linear estimators, with skew handling.
# Categorical branch: identical to the one in linear_preprocessor.
steps = [('imp', impute.SimpleImputer(strategy="most_frequent")), ('ohe', preprocessing.OneHotEncoder(sparse_output=False,  handle_unknown='ignore'))]
cat_linear_preprocessor_skew = pipeline.Pipeline(steps)

# Continuous branch: instead of standard scaling, map each imputed feature to a
# normal output distribution with a quantile transform.
steps = [('imp', impute.SimpleImputer()), ('skew', preprocessing.QuantileTransformer(output_distribution='normal'))]
cont_linear_preprocessor_skew = pipeline.Pipeline(steps)

# Column lists are taken from house_train's dtypes at the time this cell runs.
linear_preprocessor_skew = compose.ColumnTransformer(
    [ ("categorical", cat_linear_preprocessor_skew, cat_selector(house_train)), ("continuous", cont_linear_preprocessor_skew, cont_selector(house_train)) ]
)

In [None]:
# Lasso pipeline III: Box-Cox transformed target (as in pipeline II) combined
# with the skew-handling preprocessor.
# Rebinds lasso_with_trans_target; target_transformer comes from the pipeline-II cell.
lasso_with_trans_target = TransformedTargetRegressor(
    regressor = linear_model.Lasso(),
    transformer= target_transformer
)
lasso_pipeline3 = pipeline.Pipeline([  
                    ('linear_preprocessor', linear_preprocessor_skew),
                    ('zv_filter', feature_selection.VarianceThreshold()),
                    ('lasso', lasso_with_trans_target)
                ], memory=cachedir)
# Last expression of the cell: displays the pipeline.
lasso_pipeline3

In [None]:
# Grid search of lasso pipeline III, using the same alpha grid, 10-fold split
# and scorer as the pipeline-II search.
lasso_pipeline_params = {
                'lasso__regressor__alpha':[0, 0.0001, 0.0005, 0.0007, 0.0008, 0.0009, 0.001, 0.005, 0.05, 0.5, 0.75, 1, 2, 5, 10, 20]
            }
cv = model_selection.KFold(10)
lasso_pipeline_grid3 = model_selection.GridSearchCV(lasso_pipeline3, lasso_pipeline_params, cv=cv, scoring=scoring, n_jobs=4)
lasso_pipeline_grid3.fit(X_train, y_train)
# best_score_ is the negated error produced by `scoring` (closer to 0 is better).
print(lasso_pipeline_grid3.best_params_)
print(lasso_pipeline_grid3.best_score_)
print(lasso_pipeline_grid3.best_estimator_)

In [None]:
# Recover the names of the features that reach the final estimator of the best
# lasso pipeline II. Step 0 is the preprocessor: its output names are printed first.
feature_names = lasso_pipeline_grid2.best_estimator_[0].get_feature_names_out()
print(len(feature_names), feature_names)
# Step 1 is the variance filter: keep only the names of the features it retained.
feature_names = lasso_pipeline_grid2.best_estimator_[1].get_feature_names_out(input_features = feature_names)
print(len(feature_names), feature_names)

In [None]:
def plot_feature_importances(model, feature_names):
    """Plot the absolute coefficients of a fitted model and return them.

    Parameters
    ----------
    model : object exposing a ``coef_`` attribute.
    feature_names : labels for the coefficients, in the same order as ``coef_``.

    Returns
    -------
    DataFrame indexed by ``feature`` with one ``importance`` column holding
    ``abs(coef_)``, sorted in ascending order (largest values last).
    """
    magnitudes = np.abs(model.coef_)
    ranked = (
        pd.DataFrame({'feature': feature_names, 'importance': magnitudes})
        .sort_values(by=['importance'], ascending=True)
        .set_index('feature')
    )
    # Horizontal bar chart on a 50x50 figure.
    ranked.plot(kind='barh', figsize=(50, 50))
    return ranked
    
# Step 2 of the best lasso pipeline II is the TransformedTargetRegressor;
# regressor_ is the fitted Lasso inside it, whose coefficients are plotted.
tmp = plot_feature_importances(lasso_pipeline_grid2.best_estimator_[2].regressor_, feature_names)
# The frame is sorted ascending, so the tail holds the 50 largest magnitudes.
tmp.tail(50)

In [None]:
# Ridge pipeline: same preprocessing, variance filter and Box-Cox target
# transform as lasso pipeline II, with Ridge as the regressor.
# target_transformer comes from the lasso pipeline-II cell.
ridge_with_trans_target = TransformedTargetRegressor(
    regressor = linear_model.Ridge(),
    transformer= target_transformer
)
ridge_pipeline = pipeline.Pipeline([  
                    ('linear_preprocessor', linear_preprocessor),
                    ('zv_filter', feature_selection.VarianceThreshold()),
                    ('ridge', ridge_with_trans_target)
                ], memory=cachedir)
# Last expression of the cell: displays the pipeline.
ridge_pipeline

In [None]:
# Grid search of the ridge pipeline over the regularization strength.
# 'ridge__regressor__alpha' reaches the alpha of the Ridge wrapped inside the
# TransformedTargetRegressor that sits in the 'ridge' step.
ridge_pipeline_params = {
                'ridge__regressor__alpha':[0, 0.0001, 0.0005, 0.0007, 0.0008, 0.0009, 0.001, 0.005, 0.05, 0.5, 0.75, 1, 2, 5, 10, 20]
            }
# 10-fold cross-validation with KFold's default settings.
cv = model_selection.KFold(10)
ridge_pipeline_grid = model_selection.GridSearchCV(ridge_pipeline, ridge_pipeline_params, cv=cv, scoring=scoring, n_jobs=4)
ridge_pipeline_grid.fit(X_train, y_train)
# best_score_ is the negated error produced by `scoring` (closer to 0 is better).
print(ridge_pipeline_grid.best_params_)
print(ridge_pipeline_grid.best_score_)
print(ridge_pipeline_grid.best_estimator_)

In [None]:
# Derive the feature names from the fitted ridge pipeline itself (preprocessor
# output filtered by the variance step) rather than reusing names computed from
# a different model's pipeline, so the labels always match these coefficients.
ridge_best = ridge_pipeline_grid.best_estimator_
ridge_feature_names = ridge_best[1].get_feature_names_out(
    input_features=ridge_best[0].get_feature_names_out()
)
# Step 2 is the TransformedTargetRegressor; regressor_ is the fitted Ridge inside it.
tmp = plot_feature_importances(ridge_best[2].regressor_, ridge_feature_names)
# The frame is sorted ascending, so the tail holds the 50 largest magnitudes.
tmp.tail(50)

In [None]:
# Persist the best lasso (pipeline II) and ridge pipelines into the data directory.
joblib.dump(lasso_pipeline_grid2.best_estimator_, os.path.join(dir, "houseprice_lasso.pkl"))
joblib.dump(ridge_pipeline_grid.best_estimator_, os.path.join(dir, "houseprice_ridge.pkl"))

In [None]:
# Load the test data, print its dimensions and display the first rows.
house_test = pd.read_csv(os.path.join(dir, "test.csv"))
print(house_test.shape)
house_test.head()

In [None]:
# Create the same engineered features on the test set as on the training set.
# Every term must come from house_test: adding a house_train column would mix
# unrelated training rows into the test rows through index alignment.
house_test['TotalSF'] = house_test['TotalBsmtSF'] + house_test['GrLivArea'] + house_test['1stFlrSF'] + house_test['2ndFlrSF']
# Half baths (above grade and basement) are weighted as half a bathroom each.
house_test['TotalBathRooms'] = house_test['FullBath'] + house_test['BsmtFullBath'] + 0.5 * house_test['HalfBath'] +  0.5 * house_test['BsmtHalfBath']
# Sum of the four porch area columns.
house_test['TotalPorchSF'] = house_test['OpenPorchSF'] + house_test['3SsnPorch'] + house_test['EnclosedPorch'] + house_test['ScreenPorch']

In [None]:
# Apply to the test set the same category casts that were applied to the
# training set (features_to_cast was built from house_train).
cast_to_cat(house_test, features_to_cast)

In [None]:
# Column summary of the test set after feature creation and casting.
house_test.info()

In [None]:
# Drop, in place, the same high-missing-data columns that were removed from the
# training set (the list was computed on house_train).
drop_features(house_test, missing_data_features_to_drop)

In [None]:
# Predict with the best ridge pipeline and store the result, rounded to two
# decimals, as a new SalePrice column.
# NOTE(review): house_test still contains Id here (it is needed for the
# submission file); presumably the preprocessor ignores it because it selects
# columns by label — confirm.
house_test['SalePrice'] = np.round(ridge_pipeline_grid.best_estimator_.predict(house_test), 2)
house_test.head()

In [None]:
# Write the submission file: only the Id and SalePrice columns, without the index.
house_test.to_csv(os.path.join(dir, "house_prices_submit.csv"), columns=["Id", "SalePrice"], index=False)

In [None]:
# Remove the on-disk pipeline cache. Guarded so that re-running this cell, or
# running it when no cache directory was created, does not raise.
if os.path.isdir(cachedir):
    rmtree(cachedir)