In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.linear_model import LassoCV
from scipy.stats import skew

# Load the dataset

In [None]:
# Read the training CSV of the house-prices dataset and preview its first five rows.
train_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/train.csv"
train_df = pd.read_csv(filepath_or_buffer=train_file_path)
train_df.head(n=5)

In [None]:
# Read the test CSV of the house-prices dataset and preview its first five rows.
test_file_path = "/kaggle/input/house-prices-advanced-regression-techniques/test.csv"
test_df = pd.read_csv(filepath_or_buffer=test_file_path)
test_df.head(n=5)

In [None]:
# `Id` is a row identifier, not a feature: remove it from both frames and keep the
# test Ids for the submission file.
# Both steps tolerate `Id` already being gone, so re-running this cell no longer
# raises KeyError (the original `drop('Id')` / `pop("Id")` failed on a second run).
train_df = train_df.drop(columns='Id', errors='ignore')
if "Id" in test_df.columns:
    # `pop` removes the column from `test_df` in place and returns it.
    test_ids = test_df.pop("Id")

In [None]:
# Print the (rows, columns) shape of the training frame.
print(train_df.shape)

In [None]:
# Per-column summary of the training frame (dtypes and non-null counts).
train_df.info()

# Data Analysis

In [None]:
# Summary statistics of the target, followed by its distribution.
print(train_df['SalePrice'].describe())

fig, ax = plt.subplots(figsize=(9, 8))
# `sns.distplot` is deprecated (seaborn >= 0.11). `histplot` with a KDE overlay on a
# density scale is the documented replacement and draws the same kind of figure.
sns.histplot(
    train_df['SalePrice'],
    bins=100,
    kde=True,
    stat="density",
    kde_kws={"cut": 3},
    color='g',
    alpha=0.4,
    ax=ax,
)
ax.set_title("Distribution of SalePrice");

In [None]:
# Keep only the float64 / int64 columns of the training frame for numeric exploration.
numeric_dtypes = ['float64', 'int64']
df_num = train_df.select_dtypes(include=numeric_dtypes)
df_num.head()

In [None]:
# One 50-bin histogram per numeric column, with small tick labels.
df_num.hist(
    bins=50,
    figsize=(16, 20),
    xlabelsize=8,
    ylabelsize=8,
)

# Data processing

In [None]:
# Stack the feature columns (MSSubClass .. SaleCondition) of train and test so that
# later preprocessing is applied to both splits at once. Train rows come first.
feature_span = slice('MSSubClass', 'SaleCondition')
all_data = pd.concat((train_df.loc[:, feature_span], test_df.loc[:, feature_span]))

In [None]:
# Work on an independent (deep) copy so `train_df` keeps the untransformed values.
train = train_df.copy(deep=True)

In [None]:
# Log-transform the target. The raw prices are read from `train_df` (which is never
# transformed) instead of from `train`, so re-running this cell no longer compounds
# the log -- the original `np.log1p(train["SalePrice"])` did on every execution.
train["SalePrice"] = np.log1p(train_df["SalePrice"])

# Numeric feature columns. `select_dtypes` is used instead of `dtypes != "object"`
# so string columns stored with a non-object dtype are not mistaken for numeric ones.
numeric_feats = all_data.select_dtypes(include=[np.number]).columns

# Skewness is measured on the training rows only, ignoring missing values;
# features with skew above 0.75 are selected for a log1p transform.
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

# NOTE: this modifies `all_data` in place, so it must run exactly once per freshly
# built `all_data`; rebuild `all_data` before re-running this cell.
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

In [None]:
# One-hot encode the categorical columns, then fill every remaining missing value
# with the mean of its column (computed over the stacked train + test rows).
encoded = pd.get_dummies(all_data)
column_means = encoded.mean()
all_data = encoded.fillna(column_means)

In [None]:
# `all_data` holds the training rows first, then the test rows: cut it back apart.
n_train_rows = train.shape[0]
X_train = all_data[:n_train_rows]
X_test = all_data[n_train_rows:]
# Target used by the scikit-learn / XGBoost models.
y = train["SalePrice"]

In [None]:
# Hold out 30% of the labelled rows for validation; the fixed seed makes the split repeatable.
train_ds_pd, valid_ds_pd = train_test_split(
    train_df,
    test_size=0.30,
    random_state=42,
)

In [None]:
# Convert both pandas splits into TF-DF regression datasets with `SalePrice` as the label.
label = 'SalePrice'
regression_task = tfdf.keras.Task.REGRESSION
train_ds, valid_ds = (
    tfdf.keras.pd_dataframe_to_tf_dataset(frame, label=label, task=regression_task)
    for frame in (train_ds_pd, valid_ds_pd)
)

# Model training

In [None]:
def rmse_cv(model, X=None, target=None, cv=5):
    """Return the cross-validated RMSE scores of `model`.

    Parameters
    ----------
    model : estimator
        Any estimator accepted by `cross_val_score`.
    X : array-like, optional
        Feature matrix. Defaults to the notebook-level `X_train`.
    target : array-like, optional
        Regression target. Defaults to the notebook-level `y`.
    cv : int or CV splitter, optional
        Forwarded to `cross_val_score`; defaults to 5, as before.

    Returns
    -------
    numpy.ndarray
        Square root of the mean squared error reported for each CV split.
    """
    features = X_train if X is None else X
    labels = y if target is None else target
    # The "neg_mean_squared_error" scorer returns negated MSE values,
    # so flip the sign before taking the square root.
    neg_mse = cross_val_score(model, features, labels,
                              scoring="neg_mean_squared_error", cv=cv)
    return np.sqrt(-neg_mse)

## Random forest

In [None]:
# Show the model classes returned by TF-DF's `get_all_models()`.
tfdf.keras.get_all_models()

In [None]:
# TF-DF random forest configured for regression; MSE is reported when evaluating.
regression_task = tfdf.keras.Task.REGRESSION
model_tf = tfdf.keras.RandomForestModel(task=regression_task)
model_tf.compile(metrics=["mse"])

In [None]:
# Fit the random-forest model on the TF-DF training dataset `train_ds`.
model_tf.fit(x=train_ds)

In [None]:
# Render tree 0 of the trained forest, with the plot's `max_depth` capped at 3.
tfdf.model_plotter.plot_model_in_colab(
    model_tf,
    tree_idx=0,
    max_depth=3,
)

In [None]:
# Score the forest on the held-out validation dataset and print each metric to 4 decimals.
evaluation = model_tf.evaluate(x=valid_ds, return_dict=True)

for metric_name, metric_value in evaluation.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
inspector = model_tf.make_inspector()
plt.figure(figsize=(12, 4))

# Variable-importance metric to plot. NUM_AS_ROOT is a structural importance
# (presumably the number of trees whose root node splits on the feature -- see the
# TF-DF docs). The previous comment and title mentioned "AUC" and "class 1 vs the
# others", which describe a classification metric and do not apply to this
# regression model.
variable_importance_metric = "NUM_AS_ROOT"
variable_importances = inspector.variable_importances()[variable_importance_metric]

# Extract the feature name and importance values.
#
# `variable_importances` is a list of <feature, importance> tuples.
feature_names = [vi[0].name for vi in variable_importances]
feature_importances = [vi[1] for vi in variable_importances]
# The features are ordered by decreasing importance value.
feature_ranks = range(len(feature_names))

bar = plt.barh(feature_ranks, feature_importances, label=[str(x) for x in feature_ranks])
plt.yticks(feature_ranks, feature_names)
# Put the most important feature at the top of the chart.
plt.gca().invert_yaxis()

# TODO: Replace with "plt.bar_label()" when available.
# Label each bar with its importance value.
for importance, patch in zip(feature_importances, bar.patches):
    plt.text(patch.get_x() + patch.get_width(), patch.get_y(), f"{importance:.4f}", va="top")

plt.xlabel(variable_importance_metric)
plt.title(f"Variable importance ({variable_importance_metric})")
plt.tight_layout()
plt.show()

## XGBoost

In [None]:
# Shallow gradient-boosted trees fitted on the full training matrix and the `y` target.
xgb_params = {"n_estimators": 360, "max_depth": 2, "learning_rate": 0.1}
model_xgb = xgb.XGBRegressor(**xgb_params)
model_xgb.fit(X_train, y)

In [None]:
# Mean cross-validated RMSE of the XGBoost model.
np.mean(rmse_cv(model_xgb))

## Lasso 

In [None]:
# Lasso regression; LassoCV picks the regularisation strength from this grid by cross-validation.
lasso_alphas = [1, 0.1, 0.001, 0.0005]
model_lasso = LassoCV(alphas=lasso_alphas).fit(X_train, y)

In [None]:
# Mean cross-validated RMSE of the Lasso model.
np.mean(rmse_cv(model_lasso))

# Submission

In [None]:
# Predict the test set with the TF-DF forest (no label is passed for the test frame)
# and pair each prediction with its house Id.
regression_task = tfdf.keras.Task.REGRESSION
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df, task=regression_task)

preds_tf = model_tf.predict(test_ds)
output = pd.DataFrame(
    {
        'Id': test_ids,
        'SalePrice': preds_tf.squeeze(),
    }
)

output.head()

In [None]:
# Both models were fitted on log1p(SalePrice); expm1 maps their predictions back to prices.
lasso_preds = np.expm1(model_lasso.predict(X_test))
xgb_preds = np.expm1(model_xgb.predict(X_test))

# Weighted blend: 70% Lasso, 30% XGBoost.
lasso_weight, xgb_weight = 0.7, 0.3
preds = lasso_weight*lasso_preds + xgb_weight*xgb_preds

In [None]:
# Use the competition's sample submission as the template for the output file.
sample_submission_df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')

# `preds` is a plain array in test-set row order, so the assignment below is purely
# positional. Check that the template lists the same Ids in the same order as the
# test set; otherwise prices would be silently attached to the wrong houses.
if not np.array_equal(sample_submission_df['Id'].to_numpy(), test_ids.to_numpy()):
    raise ValueError(
        "sample_submission.csv Ids do not match the test set Ids (or their order); "
        "refusing to write misaligned predictions."
    )

sample_submission_df['SalePrice'] = preds
sample_submission_df.to_csv('submission.csv', index=False)
sample_submission_df.head()