The approach we will take is the following: impute missing data (the mean for numerical columns, the most frequent value for string columns); one-hot encode the categorical variables; then fit a gradient-boosted regression model.

We will also iterate over different hyper-parameter values, scoring each combination with cross-validation, to find the best-performing model before generating our predictions on the test set.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Load the training and test sets, using the 'Id' column as the row index.
X = pd.read_csv('../input/train.csv', index_col='Id')
X_test = pd.read_csv('../input/test.csv', index_col='Id')

# Split off the target: pop() returns the 'SalePrice' column and removes it
# from X in place, so X is left holding only the feature columns.
y = X.pop('SalePrice')

In [None]:
# Group the feature columns by dtype so that each group can be preprocessed
# differently: 'object' columns are treated as categorical, 'int64' and
# 'float64' columns as numerical.
categorical_cols = [name for name, kind in X.dtypes.items() if kind == 'object']
numerical_cols = [name for name, kind in X.dtypes.items()
                  if kind == 'int64' or kind == 'float64']
# Columns containing at least one missing value.
null_cols = X.columns[X.isnull().any()].tolist()  # not needed for now

In [None]:
# Numerical features: fill missing entries with the column mean
# ('mean' is SimpleImputer's default strategy, spelled out here).
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical features: fill missing entries with the most frequent value,
# then one-hot encode. handle_unknown='ignore' keeps the encoder from raising
# on categories that were not present when it was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

def get_scores(n_estimators, learning_rate, n_iter_no_change):
    """Cross-validate a gradient-boosted regressor on the training data.

    Builds a pipeline from the module-level ``preprocessor`` and a
    ``GradientBoostingRegressor`` configured with the given hyper-parameters,
    then scores it on the module-level ``X`` and ``y`` with 5-fold
    cross-validation.

    Args:
        n_estimators: Passed to ``GradientBoostingRegressor(n_estimators=...)``.
        learning_rate: Passed to ``GradientBoostingRegressor(learning_rate=...)``.
        n_iter_no_change: Early-stopping patience passed to the regressor.
            ``None`` (or ``0``, which is translated to ``None``) disables
            early stopping.

    Returns:
        Array with one mean-absolute-error value per fold (lower is better).
    """
    # scikit-learn documents n_iter_no_change as either None or an integer
    # >= 1, so 0 is not a valid way to say "no early stopping". Map it to
    # None, which is the documented way to turn early stopping off.
    if not n_iter_no_change:
        n_iter_no_change = None

    model = GradientBoostingRegressor(n_estimators=n_estimators,
                                      learning_rate=learning_rate,
                                      n_iter_no_change=n_iter_no_change,
                                      random_state=0)
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('model', model),
    ])

    # The 'neg_mean_absolute_error' scorer yields negated MAE so that higher
    # is better; flip the sign to get plain MAE values.
    scores = -1 * cross_val_score(pipeline, X, y,
                                  cv=5,
                                  scoring='neg_mean_absolute_error')

    return scores

In [None]:
from itertools import product

# Hyper-parameter grid; every combination is evaluated.
list_n_estimators = [100, 1000, 2000]
list_learning_rate = [0.5, 0.1, 0.05]
# None means "no early stopping". scikit-learn documents n_iter_no_change as
# None or an integer >= 1, so 0 must not be used for that purpose.
list_n_iter_no_change = [None, 5, 15]

# Maps "n_estimators,learning_rate,n_iter_no_change" -> mean cross-validated MAE.
results = {}
# Start from +inf so the first evaluated score always becomes the best one,
# whatever the scale of the target.
best_score = float('inf')
best_params = None

# product() iterates in the same order as three nested loops
# (the right-most list varies fastest).
for n_estimators, learning_rate, n_iter_no_change in product(
        list_n_estimators, list_learning_rate, list_n_iter_no_change):
    score = get_scores(n_estimators, learning_rate, n_iter_no_change).mean()
    if score < best_score:
        best_score = score
        # Remember which combination produced the best score so it can be
        # reused later instead of being read off the printed log.
        best_params = (n_estimators, learning_rate, n_iter_no_change)
    print(f"Best score so far: {best_score:f}")

    results[f"{n_estimators},{learning_rate},{n_iter_no_change}"] = score

print(f"Best parameters (n_estimators, learning_rate, n_iter_no_change): {best_params}")

In [None]:
# Print one "<parameter key>:<score>" line for every evaluated combination.
for run, score in results.items():
    print(f"{run}:{score}")

In [None]:
# Final model with hand-picked hyper-parameters.
# random_state=0 mirrors the seed used in get_scores: with n_iter_no_change
# set, the regressor holds out a random validation split for early stopping,
# so without a fixed seed the submission would differ from run to run.
model = GradientBoostingRegressor(n_estimators=2000,
                                  learning_rate=0.1,
                                  n_iter_no_change=15,
                                  random_state=0)
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model),
])

# Retrain model on all of X and y, then predict on the test features.
pipeline.fit(X, y)
predictions = pipeline.predict(X_test)

# Save test predictions to file, one row per test Id.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
