In [1]:
# importing libraries 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# extreme gradient boosting library
from xgboost import XGBRegressor

In [2]:
# load the training and test CSV files, using the 'Id' column as the row index
X, X_test_full = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('Data/train.csv', 'Data/test.csv')
)

In [3]:
# (rows, columns) of the training frame as loaded
X.shape

(1460, 80)

In [4]:
# preview the first rows of the training frame (the SalePrice column is still present here)
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# drop rows that have no value in the target column 'SalePrice'
# (X is rebound to a new frame instead of being mutated in place)
X = X.dropna(axis=0, subset=['SalePrice'])

# set target
y = X['SalePrice']

# remove the target from the feature set
X = X.drop(columns=['SalePrice'])

# hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible
X_train_full, X_valid_full, y_train_full, y_valid_full = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [6]:
# categorical (object-dtype) features with fewer than 10 distinct values in the training split;
# the dtype test runs first so nunique() is only computed for object columns
low_cardinality_cols = [card_col for card_col in X_train_full.columns if
                        X_train_full[card_col].dtype == 'object' and
                        X_train_full[card_col].nunique() < 10]
# numeric features, decided from the training split's dtypes so that feature
# selection does not depend on the contents of the test file
numeric_cols = [n_col for n_col in X_train_full.columns if
                X_train_full[n_col].dtype in ['int64', 'float64']]

In [7]:
# imputation function for categorical features
def imputation_cat_features(train_cat, valid_cat, test_cat, strategy='most_frequent'):
    """Fill missing values in three categorical subsets with one shared imputer.

    The imputer is fitted on ``train_cat`` only; ``valid_cat`` and ``test_cat``
    are transformed with the statistics learned from the training subset.

    Parameters
    ----------
    train_cat, valid_cat, test_cat : DataFrame
        Subsets to impute. Their column labels are copied onto the results.
    strategy : str, default 'most_frequent'
        Passed through to ``SimpleImputer(strategy=...)``.

    Returns
    -------
    list
        ``[train, valid, test]`` as new DataFrames. Each result is rebuilt from
        the imputer output, so the row index of the inputs is not carried over.
    """
    imputer = SimpleImputer(strategy=strategy)
    sources = (train_cat, valid_cat, test_cat)

    # fit once on the training subset, then reuse the fitted imputer for the rest
    filled = [imputer.fit_transform(train_cat)]
    filled.extend(imputer.transform(subset) for subset in (valid_cat, test_cat))

    results = []
    for values, source in zip(filled, sources):
        frame = pd.DataFrame(values)
        frame.columns = source.columns  # restore the original column labels
        results.append(frame)
    return results

In [8]:
# imputation function for numerical features
def imputation_num_features(train, valid, test, strategy):
    """Fill missing values in three numerical subsets with one shared imputer.

    The imputer is fitted on ``train`` only and then applied unchanged to
    ``valid`` and ``test``.

    Parameters
    ----------
    train, valid, test : DataFrame
        Subsets to impute. Their column labels are copied onto the results.
    strategy : str
        Passed through to ``SimpleImputer(strategy=...)``.

    Returns
    -------
    list
        ``[train, valid, test]`` as new DataFrames. Each result is rebuilt from
        the imputer output, so the row index of the inputs is not carried over.
    """
    imputer = SimpleImputer(strategy=strategy)

    def _relabel(values, source):
        # wrap the imputer output in a frame and restore the source column labels
        frame = pd.DataFrame(values)
        frame.columns = source.columns
        return frame

    # statistics come from the training subset only
    train_values = imputer.fit_transform(train)
    valid_values = imputer.transform(valid)
    test_values = imputer.transform(test)

    return [
        _relabel(train_values, train),
        _relabel(valid_values, valid),
        _relabel(test_values, test),
    ]

In [10]:
# take the low-cardinality categorical columns from the train, validation and test frames
X_train_cat, X_valid_cat, X_test_cat = (
    subset[low_cardinality_cols]
    for subset in (X_train_full, X_valid_full, X_test_full)
)

# impute the categorical subsets (default strategy of imputation_cat_features)
imp_X_train_cat, imp_X_valid_cat, imp_X_test_cat = imputation_cat_features(
    train_cat=X_train_cat, valid_cat=X_valid_cat, test_cat=X_test_cat
)

In [13]:
# shapes of the imputed categorical subsets: train, validation, test
print(*(subset.shape for subset in (imp_X_train_cat, imp_X_valid_cat, imp_X_test_cat)))

(1168, 40) (292, 40) (1459, 40)


In [11]:
# take the numeric columns from the train, validation and test frames
X_train_num, X_valid_num, X_test_num = (
    subset[numeric_cols]
    for subset in (X_train_full, X_valid_full, X_test_full)
)

# impute the numerical subsets with the column mean
imp_X_train_num, imp_X_valid_num, imp_X_test_num = imputation_num_features(
    train=X_train_num, valid=X_valid_num, test=X_test_num, strategy='mean'
)

In [15]:
# put the imputed numerical and categorical columns side by side for each subset
X_train = pd.concat((imp_X_train_num, imp_X_train_cat), axis='columns')
X_valid = pd.concat((imp_X_valid_num, imp_X_valid_cat), axis='columns')
X_test = pd.concat((imp_X_test_num, imp_X_test_cat), axis='columns')

In [16]:
# one-hot encode the categorical columns; each subset is encoded separately,
# so the resulting column sets can differ between subsets
X_train, X_valid, X_test = (
    pd.get_dummies(subset) for subset in (X_train, X_valid, X_test)
)

In [18]:
# shapes of the encoded subsets before their columns are aligned
print(
    f"X_train: {X_train.shape}\n"
    f"X_valid: {X_valid.shape}\n"
    f"X_test: {X_test.shape}"
)

X_train: (1168, 227)
X_valid: (292, 208)
X_test: (1459, 217)


In [21]:
# give the validation and test sets exactly the training columns, in training order:
# columns unknown to the training set are dropped, and a dummy column that is missing
# from a subset (category never seen there) is filled with 0 instead of NaN
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [22]:
# shapes of the subsets once their columns have been aligned to the training set
print(
    f"X_train: {X_train.shape}\n"
    f"X_valid: {X_valid.shape}\n"
    f"X_test: {X_test.shape}"
)

X_train: (1168, 227)
X_valid: (292, 227)
X_test: (1459, 227)
