In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# for TargetEncoder
!pip install category_encoders

DATA PREPROCESSING

In [None]:
# Numerical columns must be separated from categorical ones
# Categorical ones can be one hot encoded or ordinal encoded
# First, delete columns with >80% null values if its correlation to target variable is <10%
# Second, null values in numerical columns must be replaced with the median value using SimpleImputer median strategy
# Third, null values in categorical columns must be replaced with the most common value, adding another column to say if the value was missing
# Fourth, non null entries in categorical columns must be one hot encoded if unique values are <= 3
# Otherwise, non-null entries in categorical columns must be ordinal encoded if a ranking exists
# Else, apply frequency encoding and normalize the values
# Numerical values must be normalized to the range 0 to 1, either by dividing each value by the column maximum or by using MinMaxScaler
# Put all of this in a sklearn pipeline
# Finally, all the columns must be concatenated

In [None]:
# Load the competition training set and print its column summary.
train_csv_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
housing = pd.read_csv(train_csv_path)
housing.info()

In [None]:
# Split the frame into the prediction target (SalePrice) and the predictor columns.
target_name = "SalePrice"
y = housing[target_name]
X = housing.drop(columns=[target_name])

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

split_options = {"train_size": 0.8, "test_size": 0.2, "random_state": 0}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

In [None]:
# Remove the Id column, which provides no useful information and may pollute the model.
# Reassigning (instead of inplace=True) avoids mutating the frames returned by
# train_test_split, and errors="ignore" keeps this cell safe to re-run once Id is gone.
X_train = X_train.drop(columns="Id", errors="ignore")
X_valid = X_valid.drop(columns="Id", errors="ignore")

In [None]:
# Keep only the columns whose non-null count exceeds 30% of the training rows
# (i.e. discard columns that are at least 70% missing), then apply the same
# column selection to the validation set.
min_non_null = 0.3 * len(X_train)
non_null_counts = X_train.count()
kept_columns = non_null_counts[non_null_counts > min_non_null].index.tolist()
X_train = X_train[kept_columns]
X_valid = X_valid[kept_columns]

In [None]:
# separate between numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=["int64","float64"]).columns
categorical_cols = X_train.select_dtypes(include=["object"]).columns

In [None]:
# separate between high and low cardinality categorical columns
ohe_cols = [col for col in categorical_cols if X_train[col].nunique() <= 3]
ordinal_cols = [col for col in categorical_cols if 4 <= X_train[col].nunique() < 10]
high_cardinality_cols = [col for col in categorical_cols if X_train[col].nunique() >= 10]

In [None]:
# drop columns that dont fit the criteria
# Final feature list: numerical columns first, followed by each categorical group.
final_columns = [
    *numerical_cols,
    *ohe_cols,
    *ordinal_cols,
    *high_cardinality_cols,
]

In [None]:
# Restrict both splits to the selected features, in the same column order.
X_train, X_valid = (split[final_columns] for split in (X_train, X_valid))

In [None]:
# pipeline for numerical columns
# Numerical features: missing values are imputed with the median strategy.
from sklearn.impute import SimpleImputer

median_strategy = "median"
numerical_transformer = SimpleImputer(strategy=median_strategy)

In [None]:
# pipelines for categorical columns
# Categorical features: every group is first imputed with the most frequent value,
# then passed to the encoder chosen for that group.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder

# One-hot group: categories unseen during fit are ignored; output is a dense array.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe_cols_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", one_hot_encoder),
])

# Ordinal group: categories unseen during fit are encoded as -1.
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
ordinal_cols_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal", ordinal_encoder),
])

# High-cardinality group: target encoding with the encoder's default settings.
target_encoder = TargetEncoder()
high_cardinality_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("target", target_encoder),
])

In [None]:
# bundle preprocessing
# Bundle the per-group transformers; each entry is (name, transformer, columns it applies to).
from sklearn.compose import ColumnTransformer

column_groups = [
    ("num", numerical_transformer, numerical_cols),
    ("ohe", ohe_cols_transformer, ohe_cols),
    ("ord", ordinal_cols_transformer, ordinal_cols),
    ("hcc", high_cardinality_transformer, high_cardinality_cols),
]
preprocessor = ColumnTransformer(transformers=column_groups)

In [None]:
# gradient boosting (XGBoost)
# Gradient-boosted regressor (XGBoost); hyperparameters are gathered in one dict
# so they are easy to read and tweak.
from xgboost import XGBRegressor

xgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.05,
    "n_jobs": 4,
    "random_state": 0,
}
model = XGBRegressor(**xgb_params)

In [None]:
# train the model
# Train the full pipeline and report the mean absolute error on the validation split.
from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the model so both are fitted and applied together.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model),
])

# Fit on the training split, then predict the held-out validation rows.
predictions = pipeline.fit(X_train, y_train).predict(X_valid)

# Mean absolute error between the true and the predicted sale prices.
score = mean_absolute_error(y_valid, predictions)
print(f"Score: {score}")

In [None]:
# get the predictions on the test dataset
# Load the competition test set, select the same feature columns used for training
# and run them through the fitted pipeline.
test_csv_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
housing_test = pd.read_csv(test_csv_path)
X_test = housing_test.loc[:, final_columns]
final_predictions = pipeline.predict(X_test)

In [None]:
# submit the predictions
# Build the submission table (Id, SalePrice) and write it to submission.csv
# without the index column.
submission = housing_test[["Id"]].assign(SalePrice=final_predictions)
submission.to_csv("submission.csv", index=False)