# INITIALIZE

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm, skew

pd.set_option("display.max_rows", 60)
pd.set_option("display.float_format", lambda x: "{:.3f}".format(x))

In [2]:
INSTALL_NOT_PRESENT_MODULES = True
IS_LOCAL = False
SEED = 14
if not IS_LOCAL:
    GOOGLE_DRIVE_PATH = "./drive/MyDrive/kaggle/house_price_prediction/"
    from google.colab import drive
    drive.mount('/content/drive')

PATH_TO_DATA = "./data/"
PATH_TO_MODELS = "./models/"
PATH_TO_SUBMISSIONS = "./submissions/"
if not IS_LOCAL:
    PATH_TO_DATA = GOOGLE_DRIVE_PATH + PATH_TO_DATA
    PATH_TO_MODELS = GOOGLE_DRIVE_PATH + PATH_TO_MODELS
    PATH_TO_SUBMISSIONS = GOOGLE_DRIVE_PATH + PATH_TO_SUBMISSIONS

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# LOAD DATA

In [3]:
# TODO: maybe use Id columns as index?
train = pd.read_csv(os.path.join(PATH_TO_DATA, "./train.csv"))
test = pd.read_csv(os.path.join(PATH_TO_DATA, "./test.csv"))
print("train shape:", train.shape)
print("test shape:", test.shape)

train shape: (1460, 81)
test shape: (1459, 80)


In [4]:
train_ID = train["Id"]
test_ID = test["Id"]

train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

print("\nThe train data size after dropping Id feature is : {} ".format(train.shape))
print("The test data size after dropping Id feature is : {} ".format(test.shape))


The train data size after dropping Id feature is : (1460, 80) 
The test data size after dropping Id feature is : (1459, 79) 


# Prepare data

In [5]:
# delete outliers
train = train.drop(
    train[(train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)].index
)

train["SalePrice"] = np.log1p(train["SalePrice"])

ntrain = train.shape[0]
ntest = test.shape[0]
y_train = train["SalePrice"].values

all_data = pd.concat((train, test)).reset_index(drop=True)
all_data.drop(["SalePrice"], axis=1, inplace=True)

# TODO: think about it
# can delete all data like these
all_data = all_data.drop(["Utilities"], axis=1)

mb_drop = [
    col
    for col in all_data.columns
    if (all_data[col].value_counts(normalize=True) > 0.95).any()
]
all_data.drop(mb_drop, axis=1, inplace=True)

print("all_data size is : {}".format(all_data.shape))

all_data size is : (2917, 67)


## Change types
Some columns in numerical type, but they sense categorical

In [6]:
to_change_type_to_str = {
    col: str for col in ["MSSubClass", "OverallCond", "YrSold", "MoSold"]
}
all_data = all_data.astype(to_change_type_to_str)

## Fill nans with zero

In [7]:
# Fill NaN values in these columns by 0 for numerical types of data
# or with "None" string/object types of data

# at first, situations when some object just can to not exists
# garage
garages_cols = [col for col in all_data.columns if col.startswith("Garage")]

# basement
basement_cols = [col for col in all_data.columns if col.startswith("Bsmt")] + [
    "TotalBsmtSF"
]

# and other columns with a similar property, but which we cant group like below

# some of these cols hasnt nan values, added just for generalization understanding
objects_can_not_to_exists = [
    "Alley",
    "Fence",
    "PoolQC",
    "PoolArea",
    "MasVnrType",
    "MasVnrArea",
    "Fireplaces",
    "FireplaceQu",
    "MSSubClass",
    "MiscFeature",
]

cols_fillna_zero = garages_cols + basement_cols + objects_can_not_to_exists

In [8]:
na_value_depends_type = lambda df, col: 0 if df[col].dtype != "object" else "None"

value_fillna_zero = {
    col: na_value_depends_type(all_data, col)
    for col in cols_fillna_zero
    if col in all_data.columns
}
# "if" statement can delete in future it depends to mb_drop

## Some cols need to be filled with mode

In [9]:
cols_fillna_mode = [
    all_data.columns[idx] for idx, val in enumerate(all_data.isnull().sum()) if val
]
value_fillna_mode = {
    col: all_data[col].mode()[0] for col in cols_fillna_mode if col in all_data.columns
}
# "if" statement can delete in future it depends to mb_drop

In [10]:
values_fillna = {**value_fillna_zero, **value_fillna_mode}

all_data.fillna(value=values_fillna, inplace=True)

In [11]:
# # Filling 'MSZoning' according to MSSubClass.

all_data["MSZoning"] = all_data.groupby("MSSubClass")["MSZoning"].apply(
    lambda x: x.fillna(x.mode()[0])
)

# Filling 'LotFrontage' according to Neighborhood.

all_data["LotFrontage"] = all_data.groupby(["Neighborhood"])["LotFrontage"].apply(
    lambda x: x.fillna(x.median())
)

## Process some features with LabelEncoder

In [13]:
from sklearn.preprocessing import LabelEncoder

## full list of categorical data
# num_types = [np.int64, np.float64]
# categorical_cols = [
#     col for col in all_data.columns if all_data[col].dtype not in num_types
# ]

# with processing only these cols have better result
cols = (
    "FireplaceQu",
    "BsmtQual",
    "BsmtCond",
    "GarageQual",
    "GarageCond",
    "ExterQual",
    "ExterCond",
    "HeatingQC",
    "PoolQC",
    "KitchenQual",
    "BsmtFinType1",
    "BsmtFinType2",
    "Functional",
    "Fence",
    "BsmtExposure",
    "GarageFinish",
    "LandSlope",
    "LotShape",
    "PavedDrive",
    "Street",
    "Alley",
    "CentralAir",
    "MSSubClass",
    "OverallCond",
    "YrSold",
    "MoSold",
)

In [14]:
for col in cols:
    if col in all_data.columns:
        all_data[col] = LabelEncoder().fit_transform(all_data[col])

In [15]:
print("Shape all_data: {}".format(all_data.shape))

Shape all_data: (2917, 67)


In [16]:
# Adding total sqfootage feature
all_data["TotalSF"] = (
    all_data["TotalBsmtSF"] + all_data["1stFlrSF"] + all_data["2ndFlrSF"]
)

## Box Cox Transformation of (highly) skewed features


In [17]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = (
    all_data[numeric_feats]
    .apply(lambda x: skew(x.dropna()))
    .sort_values(ascending=False)
)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({"Skew": skewed_feats})

skewness = skewness[abs(skewness) > 0.75]
print(
    "There are {} skewed numerical features to Box Cox transform".format(
        skewness.shape[0]
    )
)


Skew in numerical features: 

There are 51 skewed numerical features to Box Cox transform


## Apply Cox Box transformation and create cleaned train & test data

In [18]:
from scipy.special import boxcox1p

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    # depends of mb_drop
    if feat in all_data.columns:
        all_data[feat] = boxcox1p(all_data[feat], lam)

all_data = pd.get_dummies(all_data)
print(all_data.shape)

train = all_data[:ntrain]
test = all_data[ntrain:]

(2917, 189)


# Train model

In [19]:
try:
    import h2o
    from h2o.automl import H2OAutoML
except ModuleNotFoundError:
    if INSTALL_NOT_PRESENT_MODULES:
        print('h2o is installing!')
        !pip install h2o
        print("h2o is installed! h2o version is f'{h2o.__version__}'")
        import h2o
        from h2o.automl import H2OAutoML
    else:
        print("You need to install h2o!")

In [25]:
%%time

h2o.init()

LOAD_MODEL = True

train["SalePrice"] = y_train
htrain = h2o.H2OFrame(train)
htest = h2o.H2OFrame(test)
x = htrain.columns
y = "SalePrice"
x.remove(y)

if LOAD_MODEL:
    lb = h2o.import_file(path=os.path.join(PATH_TO_MODELS, "aml_leaderboard.h2o"))
    best_model_name = lb[0, 0]
    best_model = h2o.load_model(os.path.join(PATH_TO_MODELS, best_model_name, best_model_name))
else:
    aml = H2OAutoML(max_runtime_secs=20, seed=SEED)
    aml.train(x=x, y=y, training_frame=htrain)
    lb = aml.leaderboard
    best_model = aml.leader

print(lb)
print("generate predictions")

# if SAVE_MODEL:
#     h2o.export_file(lb, os.path.join(PATH_TO_MODELS, "aml_leaderboard.h2o"), force=FORCE)
#     model_ids = lb['model_id'].as_data_frame()['model_id'].tolist()
#     for m_id in model_ids:
#         mdl = h2o.get_model(m_id)
#         h2o.save_model(model=mdl, path=os.path.join(PATH_TO_MODELS, f'{m_id}'), force=FORCE)

# test_y = aml.leader.predict(htest)
# test_y = test_y.as_data_frame()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,45 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,29 days
H2O_cluster_name:,H2O_from_python_unknownUser_1yhdu0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.177 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
GLM_1_AutoML_20201217_134734,0.0135223,0.116285,0.0135223,0.0799886,0.0090628
GBM_grid__1_AutoML_20201217_134734_model_14,0.0144202,0.120084,0.0144202,0.0818738,0.00937156
GBM_grid__1_AutoML_20201217_134734_model_23,0.0146046,0.120849,0.0146046,0.083408,0.00942221
GBM_grid__1_AutoML_20201217_134734_model_8,0.0147604,0.121492,0.0147604,0.0825519,0.00947184
GBM_grid__1_AutoML_20201217_134734_model_22,0.0149217,0.122154,0.0149217,0.0846175,0.00952103
GBM_grid__1_AutoML_20201217_134734_model_28,0.0150048,0.122494,0.0150048,0.0845943,0.00955038
GBM_grid__1_AutoML_20201217_134734_model_3,0.0150177,0.122547,0.0150177,0.0836764,0.00956237
GBM_grid__1_AutoML_20201217_134734_model_24,0.015363,0.123947,0.015363,0.0846162,0.00967979
GBM_grid__1_AutoML_20201217_134734_model_25,0.0154721,0.124387,0.0154721,0.0843125,0.00969252
GBM_3_AutoML_20201217_134734,0.0154992,0.124496,0.0154992,0.0853455,0.00970789



generate predictions
CPU times: user 609 ms, sys: 48.4 ms, total: 657 ms
Wall time: 1.56 s


## SAVE MODEL

In [26]:
SAVE_MODEL = False
FORCE = True

if SAVE_MODEL:
    h2o.export_file(lb, os.path.join(PATH_TO_MODELS, "aml_leaderboard.h2o"), force=FORCE)
    model_ids = lb['model_id'].as_data_frame()['model_id'].tolist()
    for m_id in model_ids:
        mdl = h2o.get_model(m_id)
        h2o.save_model(model=mdl, path=os.path.join(PATH_TO_MODELS, f'{m_id}'), force=FORCE)

# CREATE SUMBISSION

In [27]:
test_y = best_model.predict(htest).as_data_frame()

glm prediction progress: |████████████████████████████████████████████████| 100%


In [28]:
ss = pd.read_csv(os.path.join(PATH_TO_DATA, "sample_submission.csv"), index_col="Id")
ss["SalePrice"] = np.expm1(test_y['predict'].values)

In [29]:
import datetime

df_submission = ss.copy()
submission_filename = "submission_{}.csv".format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)
df_submission.to_csv(os.path.join(PATH_TO_SUBMISSIONS, submission_filename))
print("Submission saved to {}".format(submission_filename))

Submission saved to submission_2020-12-17_15-18-08.csv
