#### Importing the libraries

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer

from xgboost import XGBRegressor

#### Obtaining the dataset

In [2]:
def get_train_test_datasets_from_path(path):
    """Load the training and test CSV files found in a directory.

    Parameters
    ----------
    path : str or os.PathLike
        Directory expected to contain ``train.csv`` and ``test.csv``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` frames, each read with the first CSV column
        used as the index.

    Raises
    ------
    FileNotFoundError
        If ``train.csv`` or ``test.csv`` is not listed in ``path``.
    """
    contents = os.listdir(path)

    for file_name in ['test.csv', 'train.csv']:
        if file_name not in contents:
            raise FileNotFoundError(f"The required file {file_name} was not found")

    # os.path.join accepts both plain strings and path-like objects,
    # whereas `path + '/train.csv'` fails for pathlib.Path arguments.
    pd_train = pd.read_csv(os.path.join(path, 'train.csv'), index_col=0)
    pd_test = pd.read_csv(os.path.join(path, 'test.csv'), index_col=0)

    return pd_train, pd_test

In [3]:
# Load train.csv and test.csv from the current working directory.
df, df_test = get_train_test_datasets_from_path('.')

In [4]:
df.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
#df['BsmtExposure'].value_counts()

In [6]:
#df['BsmtExposure'].isna().sum()

In [7]:
#quality_dict = {category: i+1 for i, category in enumerate(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])}

In [8]:
#df['BsmtCond'] = df['BsmtCond'].fillna('NA')
#df['BsmtCond'].apply(lambda x: quality_dict[x])

#### Transforming the dataset

In [9]:
# Let's take a look at which columns have NA in them
def print_column_names_and_types(df):
    """Print name, dtype and distinct-value count of every column that has NaN.

    All entries are written on one line, each one followed by ', '.
    Columns without any missing value are not listed.
    """
    has_missing = df.isna().any()

    for name in df.columns[has_missing].tolist():
        series = df[name]
        print(name, series.dtype, series.nunique(), end = ', ')

print_column_names_and_types(df)

LotFrontage float64 110, Alley object 2, MasVnrType object 3, MasVnrArea float64 327, BsmtQual object 4, BsmtCond object 4, BsmtExposure object 4, BsmtFinType1 object 6, BsmtFinType2 object 6, Electrical object 5, FireplaceQu object 5, GarageType object 6, GarageYrBlt float64 97, GarageFinish object 3, GarageQual object 5, GarageCond object 5, PoolQC object 3, Fence object 4, MiscFeature object 4, 

In [49]:
# Custom transformer for this dataset which memorizes a fill value per column
# (mode for discrete columns, mean for float columns) and uses it to fill NA values

df_object_encoded = []

class CustomTransformer(BaseEstimator, TransformerMixin):
    """Impute missing values and encode the object columns of this dataset.

    ``fit`` memorizes one fill value per column (the mode for int64/bool/object
    columns, the mean for float columns) and, when ``use_categorial`` is true,
    fits a one-hot encoder on the object columns that are not quality-graded.
    ``transform`` maps the quality-graded columns to integers, fills the
    remaining missing values with the memorized ones and returns a NumPy array.

    Parameters
    ----------
    use_categorial : bool, default=False
        If False, object columns that remain after the quality encoding are
        dropped. If True, they are one-hot encoded and appended after the
        other columns.
    """

    def __init__(self, use_categorial=False):
        # Columns graded on the Po/Fa/TA/Gd/Ex scale; a missing value in them
        # is encoded as its own 'NA' level instead of being imputed.
        self.specific_categories = ['BsmtCond', 'BsmtQual', 'ExterCond', 'ExterQual', 'HeatingQC',
                                    'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']

        self.column_name_to_fill_value = {}
        self.use_categorial = use_categorial
        self.enc = OneHotEncoder(handle_unknown='ignore')

    def memorize_fill_values(self, df):
        """Store the fill value of every int64/bool/object/float column of ``df``.

        Raises
        ------
        Exception
            If a discrete column outside ``specific_categories`` has no mode,
            or if the mean of a float column is NaN.
        """
        discrete_cols = df.select_dtypes(include=['int64', 'bool', 'object']).columns

        for col in discrete_cols:
            mode_value = df[col].mode()
            if mode_value.empty:
                if col in self.specific_categories:
                    # transform() fills these columns with the explicit 'NA'
                    # level, so a learned value is never needed for them.
                    continue
                raise Exception("Unable to find any mode")
            self.column_name_to_fill_value[col] = mode_value.iloc[0]

        float_cols = df.select_dtypes(include=['float64', 'float16', 'float32']).columns

        for col in float_cols:
            mean_value = df[col].mean()
            # Test for NaN explicitly: a plain truthiness check rejects a
            # legitimate mean of 0.0 and lets a NaN mean through.
            if pd.isna(mean_value):
                raise Exception("Unable to find any mean " + str(mean_value))
            self.column_name_to_fill_value[col] = mean_value

    def transform_specific_columns(self, df):
        """Encode the quality-graded columns of ``df`` in place as integers.

        Missing values become 'NA' first; the mapping is NA=1, Po=2, Fa=3,
        TA=4, Gd=5, Ex=6. A label outside that list raises ``KeyError``.
        """
        quality_dict = {category: i + 1 for i, category in enumerate(['NA', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])}

        # Use the single list defined in __init__ so that this method and
        # fit() always agree on which columns are quality-graded.
        for col in self.specific_categories:
            df[col] = df[col].fillna('NA').apply(lambda x: quality_dict[x])

    def fit(self, X, y=None):
        """
        Learn fill values (and one-hot categories) from the training data.
        Return self for method chaining.
        """
        # Start from a clean slate so a refit keeps nothing from an earlier fit.
        self.column_name_to_fill_value = {}
        self.memorize_fill_values(X)

        if self.use_categorial:
            # Inspect the data passed in, not a notebook-level frame.
            self.object_cols = X.select_dtypes(include=['object']).columns

            # Keep the frame's column order; going through a set would make
            # the feature order depend on string hashing.
            self.ohe_cols_list = [col for col in self.object_cols
                                  if col not in self.specific_categories]

            self.enc.fit(X[self.ohe_cols_list])

        return self

    def transform(self, X):
        """
        Apply the learned transformation to new data and return a NumPy array.
        """
        df = X.copy()

        # Quality-graded object columns ---> integers (NA is its own level).
        self.transform_specific_columns(df)

        # Fill the remaining gaps with what was learned in fit(). The filled
        # column is assigned back because calling fillna(inplace=True) on a
        # column selection may act on a temporary and leave `df` unchanged.
        for col, fill_value in self.column_name_to_fill_value.items():
            if col in self.specific_categories or col not in df.columns:
                continue
            df[col] = df[col].fillna(fill_value)

        # Remaining object columns are either dropped ...
        if not self.use_categorial:
            object_cols = df.select_dtypes(include=['object']).columns
            return df.drop(object_cols, axis=1).to_numpy()

        # ... or one-hot encoded and appended after the other columns.
        encoded = self.enc.transform(df[self.ohe_cols_list])
        remaining = df.drop(self.ohe_cols_list, axis=1)

        return np.concatenate([remaining.to_numpy(), encoded.toarray()], axis=1)

#### Train test split

In [50]:
# Target vector: the SalePrice column.
y = df['SalePrice']

In [51]:
# Feature matrix: every column except the target.
X = df.drop('SalePrice', axis=1)

In [52]:
print(X.shape)

(1460, 79)


In [53]:
# Hold out 10% of the rows for local evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#### Transforming categorical features

Here I'll make an extensive study of how to process the data in the best way:

Categorical features are mostly ordered, as described in data_description.txt

In [54]:
def print_object_column_names_and_nunique(df):
    """Print each object-dtype column of ``df`` with its number of distinct values.

    One line is printed per column, in the form ``<name> <nunique>``.
    """
    object_cols = df.select_dtypes(include=['object']).columns

    for column_name in object_cols:
        print(column_name, df[column_name].nunique())

In [55]:
print_object_column_names_and_nunique(df)

MSZoning 5
Street 2
Alley 2
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 8
BldgType 5
HouseStyle 8
RoofStyle 6
RoofMatl 8
Exterior1st 15
Exterior2nd 16
MasVnrType 3
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 4
BsmtCond 4
BsmtExposure 4
BsmtFinType1 6
BsmtFinType2 6
Heating 6
HeatingQC 5
CentralAir 2
Electrical 5
KitchenQual 4
Functional 7
FireplaceQu 5
GarageType 6
GarageFinish 3
GarageQual 5
GarageCond 5
PavedDrive 3
PoolQC 3
Fence 4
MiscFeature 4
SaleType 9
SaleCondition 6


#### Model fitting and prediction

In [56]:
def my_custom_loss_func(ground_truth, predictions):
    """Root-mean-squared error between the natural logs of two value sequences.

    Parameters
    ----------
    ground_truth : array-like
        True target values.
    predictions : array-like
        Predicted values, in the same order as ``ground_truth``.

    Returns
    -------
    float
        ``sqrt(mean((log(ground_truth) - log(predictions)) ** 2))``.
    """
    # Work on plain arrays so the subtraction is positional. Two pandas Series
    # with different indexes would otherwise be aligned by label, producing
    # NaNs that the mean silently skips.
    log_truth = np.log(np.asarray(ground_truth))
    log_pred = np.log(np.asarray(predictions))

    return np.sqrt(np.mean((log_truth - log_pred) ** 2))

In [57]:
# greater_is_better=False declares the metric a loss (lower is better) for model selection.
custom_scorer = make_scorer(my_custom_loss_func,  greater_is_better=False)

In [58]:
def evaluate_results(model, X=None, y=None):
    """Score a fitted model with the log-RMSE metric (lower is better).

    Parameters
    ----------
    model : object
        Anything exposing ``predict``.
    X : optional
        Features to predict on; defaults to the notebook-level ``X_test``.
    y : optional
        True targets for ``X``; defaults to the notebook-level ``y_test``.

    Returns
    -------
    float
        Value of ``my_custom_loss_func`` for the predictions.
    """
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_predicted = model.predict(X)

    # Call the metric directly, in (ground_truth, predictions) order, instead
    # of reaching into the scorer's private `_score_func` attribute.
    return my_custom_loss_func(y, y_predicted)

In [59]:
# Baseline: CustomTransformer with one-hot encoding, followed by an
# XGBoost regressor with n_estimators=5000 and a fixed random_state.
pipeline = Pipeline([
    ('preprocessor', CustomTransformer(use_categorial=True)),
    ('regressor', XGBRegressor(random_state=42, n_estimators=5000))
])

In [60]:
df['MSSubClass'].dtype

dtype('int64')

In [61]:
# Fit the baseline pipeline on the training part of the split only.
pipeline.fit(X_train, y_train)


In [62]:
evaluate_results(pipeline)

0.11378346562200682

In [63]:
pipeline['preprocessor'].transform(X_train).shape

(1314, 262)

#### Grid search

In [64]:
# Candidate preprocessing steps for the search (currently a single option:
# CustomTransformer with one-hot encoding, despite the 'none' key).
preprocessors = {
    'none': CustomTransformer(use_categorial=True),
}

# Pipeline
# NOTE: this rebinds the name `pipeline` used for the baseline model above.
# The StandardScaler is only a placeholder for the 'preprocessor' step:
# param_grid['preprocessor'] supplies the step for every grid candidate.
pipeline = Pipeline([
    ('preprocessor', StandardScaler()),
    ('regressor', XGBRegressor(random_state=42))
])

# Parameter grid
param_grid = {
    'preprocessor': list(preprocessors.values()), 
 
    'regressor__n_estimators': range(100, 1100, 500),  # 100, 600
    'regressor__learning_rate': [1e-3, 1e-2, 1e-1, 1],
    # max_depth values tried: 3, 6, 9
    'regressor__max_depth': range(3, 11, 3)#,
    #'regressor__subsample': [i / 10 for i in range(5, 11, 3)],
    #'regressor__colsample_bytree':  [i / 10 for i in range(5, 11, 5)] 
}

In [65]:
# 3-fold cross-validated search over param_grid, scored with custom_scorer,
# using only the training part of the split.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring=custom_scorer, verbose=0)
grid_search.fit(X_train, y_train)

In [76]:
grid_search.best_params_

{'preprocessor': CustomTransformer(use_categorial=True),
 'regressor__learning_rate': 0.1,
 'regressor__max_depth': 3,
 'regressor__n_estimators': 600}

#### Evaluate results

In [66]:
evaluate_results(grid_search)

0.1007343711561113

#### Refitting on the whole dataset provided for training

In [68]:
# NOTE: no copy is made here, so this refits the estimator object stored on
# grid_search in place, now on all rows (including the former hold-out rows).
best_model = grid_search.best_estimator_
best_model.fit(X, y)

#### Preparing the submissions file

In [70]:
# Predict on the frame loaded from test.csv.
y_predicted = best_model.predict(df_test)

In [72]:
# Build the submission table: the test frame's index as 'Id', paired with the predictions.
df_test_index = df_test.index
answer_df = pd.DataFrame({'Id': df_test_index, 'SalePrice': y_predicted})

In [73]:
answer_df

Unnamed: 0,Id,SalePrice
0,1461,119939.585938
1,1462,159399.484375
2,1463,184224.687500
3,1464,190247.562500
4,1465,177405.828125
...,...,...
1454,2915,78192.593750
1455,2916,71693.187500
1456,2917,165701.125000
1457,2918,122187.648438


In [74]:
# Write the submission file without the positional row index.
answer_df.to_csv('submission.csv', index=False)