In [None]:
# box cox transform
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

The data, descriptions of the variables and some examples can be found here:
-> link to kaggle

In [None]:
# Load the training and test tables.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Set the row identifiers aside and remove them from the feature tables.
train_ID = train['Id']
test_ID = test['Id']
train = train.drop(columns='Id')
test = test.drop(columns='Id')

# Separate the target from the training features.
SalePrice = train['SalePrice']
train = train.drop(columns='SalePrice')

# Stack train on top of test with a fresh 0..n-1 index, so that the
# preprocessing steps below are applied to both parts at once.
data = pd.concat((train, test)).reset_index(drop=True)

# Categorical and numerical variables: columns with object dtype are treated
# as categorical, all remaining columns as numerical.
is_object_column = data.dtypes == 'O'
categorical = data.columns[is_object_column].tolist()
numerical = data.columns[~is_object_column].tolist()

# Missing values.
# For categorical data a missing value often means the absence of a feature,
# so it is turned into a level of its own ('None'). Categorical data has no
# metric; in a later step each level becomes a variable on its own, with ones
# for the respective category and zeros otherwise (one-hot / dummy coding).
data[categorical] = data[categorical].fillna('None')

# Numeric data: remember where values were missing BEFORE imputing them.
# Data is sometimes not missing at random, so the fact that a value is missing
# may itself carry information. One boolean '<name>_na' column per variable.
na_indicators = data[numerical].isna().add_suffix('_na')
variables_na = na_indicators.columns.tolist()

# Replace missing numeric values with the column mean. The result is assigned
# back explicitly: the chained form `data[val].fillna(..., inplace=True)`
# operates on a temporary and leaves `data` unchanged under pandas
# Copy-on-Write (and triggers a FutureWarning in pandas 2.x).
data[numerical] = data[numerical].fillna(data[numerical].mean())

# Append all indicator columns in one go, in the same order as `numerical`.
data = pd.concat([data, na_indicators], axis=1)

## The Box-Cox transform is variance stabilizing; it is meant to make a
## variable more normally distributed. Each numeric variable is shifted by +1
## before the transform, and the transformed copy is only kept when the fitted
## exponent is moderate (|lambda| < 8).
box_cox = []
for column in numerical:
    transformed, fitted_lambda = boxcox(data[column] + 1)
    if not (np.abs(fitted_lambda) < 8):
        # extreme (or undefined) exponent: keep only the original variable
        continue
    data[column + '_box_cox'] = transformed
    box_cox.append(column)

# One-hot encode (dummy-code) the categorical data. The missing-value
# indicators are handled as a separate frame so they can be added to, or left
# out of, the data sets built further below.
feature_columns = [name for name in data.columns if name not in variables_na]

# Every variable is cast to float32; the notebook relies on this uniform
# dtype for the interaction terms computed in the next step.
data_base = pd.get_dummies(data[feature_columns]).astype(np.float32)
data_na = pd.get_dummies(data[variables_na]).astype(np.float32)

data = pd.concat([data_base, data_na], axis=1)
# The number of variables is quite high. We want to add interaction terms only
# for the most important variables, so we first compute a variable-importance
# measure with the help of gradient boosted trees.
from itertools import combinations

N_TOP_FEATURES = 55  # how many of the most important variables get interactions

# random_state is fixed so that the importance ranking, and therefore the set
# of interaction terms, is the same on every run of the notebook.
gbm = GradientBoostingRegressor(n_estimators=32, max_depth=4, random_state=0)
# only the training rows (the first len(train_ID) rows) have a target
gbm.fit(data[:len(train_ID)].values, SalePrice.values)

# column positions sorted by ascending importance; the last entries are the
# most important variables
indizes = np.argsort(gbm.feature_importances_)
top_features = data.columns[indizes[-N_TOP_FEATURES:]]

# Build all "n over 2" pairwise products first and attach them in a single
# concat. Inserting well over a thousand columns one at a time fragments the
# DataFrame and makes pandas emit PerformanceWarnings.
interaction_columns = {
    first + '_x_' + second: data[first] * data[second]
    for first, second in combinations(top_features, 2)
}
interactions = list(interaction_columns)

data_interactions = pd.DataFrame(interaction_columns, index=data.index)
data = pd.concat([data, data_interactions], axis=1)

# We now assemble four nested data sets.

## 1. base: missing values imputed by the mean, no other feature engineering;
##    the box-cox transformed variables are removed.
is_box_cox_column = data_base.columns.str.endswith('_box_cox')
base = data_base.loc[:, ~is_box_cox_column]

## 2. box-cox variables admitted; the original versions of the transformed
##    variables are removed.
is_replaced_original = data_base.columns.isin(box_cox)
with_box_cox = data_base.loc[:, ~is_replaced_original]

## 3. the variables indicating formerly missing values are included as well.
with_na = pd.concat([with_box_cox, data_na], axis=1)

## 4. all interaction terms of the 55 most important variables are added.
with_interactions = pd.concat([with_na, data_interactions], axis=1)

## The target variable is log-transformed.
y = np.log1p(SalePrice)

## Since we want to try elastic net, we search a grid over lambda (the amount
## of regularization) and over the ratio with which lasso and ridge are mixed.
# 15 lambda values, evenly spaced on a log10 scale between 10**-1 and 10**0.2
lamb = np.power(10, np.linspace(-1, 0.2, 15))
# 10 mixing ratios between 0 (pure ridge) and 1 (pure lasso)
ratio = np.linspace(0, 1, 10)



  x = um.multiply(x, x, out=x)
  tmp2 = (x - v) * (fx - fw)
  return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)
  tmp1 = (x - w) * (fx - fv)


In [None]:
error = []
best_parameters = []
# Run the same grid search on each of the four data sets.
for d in [base, with_box_cox, with_na, with_interactions]:
    # Standardize the variables: the scaler is fitted on all rows of the data
    # set and then applied to the training rows only.
    scaler = StandardScaler().fit(d)
    X_train = scaler.transform(d[:len(train_ID)])

    # For every (lambda, ratio) pair, cross_val_score fits the model in a
    # 5-fold cross validation. The per-fold negative MSE is turned into an
    # RMSE and averaged over the folds.
    get_results = []
    for alpha in lamb:
        for l1_ratio in ratio:
            fold_scores = cross_val_score(
                ElasticNet(alpha=alpha, l1_ratio=l1_ratio),
                X_train, y,
                scoring='neg_mean_squared_error',
                cv=5, n_jobs=-1,
            )
            get_results.append((alpha, l1_ratio, np.mean(np.sqrt(-fold_scores))))

    # Smallest cross-validated error over the whole grid.
    least_error = np.min([score for _, _, score in get_results])
    error.append(least_error)

    # All parameter pairs that reach this smallest error.
    parameters = [(alpha, l1_ratio)
                  for alpha, l1_ratio, score in get_results
                  if score == least_error]
    best_parameters.append(parameters)
    print(f'least error is: {least_error}, best parameters are: {parameters}')


least error is: 0.1377372220579576, best parameters are: [(0.3981071705534972, 0.0)]
least error is: 0.13051112935526582, best parameters are: [(0.1, 0.1111111111111111)]
least error is: 0.13048976504373572, best parameters are: [(0.2682695795279726, 0.0)]


  sqr = np.multiply(arr, arr, out=arr)


least error is: 0.12082892004020285, best parameters are: [(0.2682695795279726, 0.0)]
[0.1377372220579576, 0.13051112935526582, 0.13048976504373572, 0.12082892004020285]
[[(0.3981071705534972, 0.0)], [(0.1, 0.1111111111111111)], [(0.2682695795279726, 0.0)], [(0.2682695795279726, 0.0)]]


We conclude:
1. The error for the base data set (only missing values imputed) is 0.1377 (cross-validated RMSE of the log-transformed sale price). The corresponding lambda, i.e. the amount of regularization, is 0.3981; the l1_ratio is 0.0, so the regularization is a pure ridge (l2) penalty.
2. With some of the numeric variables box-cox transformed, the error is 0.1305 (RMSE); the amount of regularization is much smaller than before (0.1); we have 11% l1-penalty and 89% l2-penalty.
3. Indicator variables for formerly missing values are included in the data-set; The error (0.13049) shrinks by an insignificant amount. The lambda parameter is 0.2683; no l1-penalty is used
4. adding the interaction terms has the most pronounced effect. The error drops to 0.1208; The best parameters are as before.

One additional note: By including the interaction terms, we have __more variables (1831) than observations (1460)__ in the training set. This situation is not admissible in classical statistics. For machine learning algorithms with regularization, it poses no problem.

In [None]:
# In a final step we compute bootstrap confidence intervals for the
# predictions. This cell prepares the scaled training matrix and the bootstrap
# resamples; the models are fitted in the next cell.
# NOTE(review): `choices` is no longer used in this cell; the import is kept in
# case later cells rely on the name.
from random import Random, choices
from sklearn.linear_model import Ridge

N_BOOTSTRAP = 200    # number of bootstrap resamples
BOOTSTRAP_SEED = 0   # fixed seed so the resamples are reproducible

# Standardize the richest data set: the scaler is fitted on all rows and then
# applied to the training rows only.
scaler = StandardScaler()
scaler.fit(with_interactions)

X_train = scaler.transform(with_interactions[:len(train_ID)])
indices = np.arange(0, X_train.shape[0])

# Draw the resamples (row positions, with replacement) up front into a list.
# A generator would be used up after the first pass, so re-running the cell
# that consumes `sampler` would silently see no resamples at all.
bootstrap_rng = Random(BOOTSTRAP_SEED)
sampler = [bootstrap_rng.choices(indices, k=len(indices))
           for _ in range(N_BOOTSTRAP)]


  sqr = np.multiply(arr, arr, out=arr)


1460

In [None]:

# Bootstrap 95% percentile intervals: refit the ridge model on every resample,
# predict all training rows, and take the 2.5% / 97.5% percentiles per row.
ENET_ALPHA = 0.2683  # ElasticNet alpha selected by the grid search (l1_ratio = 0)

# ElasticNet divides the squared loss by 2 * n_samples while Ridge does not, so
# an ElasticNet with l1_ratio = 0 corresponds to Ridge(alpha = n_samples * alpha).
# Passing 0.2683 to Ridge unchanged would apply far less regularization than
# the cross-validated model.
bootstrap_predictions = np.array([
    Ridge(alpha=ENET_ALPHA * len(drew), fit_intercept=True)
    .fit(X_train[drew, :], y.values[drew])
    .predict(X_train)
    for drew in sampler
])

if bootstrap_predictions.size == 0:
    # happens when `sampler` is a generator that has already been consumed
    raise RuntimeError('sampler produced no bootstrap samples; '
                       're-run the cell that creates it')

CIS = np.percentile(bootstrap_predictions, [2.5, 97.5], axis=0)

  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)


  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)


  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)


  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)
  overwrite_a=False)


In [None]:
# Inspect the container type of the target `y` (the result of np.log1p on SalePrice).
type(y)

pandas.core.series.Series