In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for fname in filenames:
        full_path = os.path.join(dirname, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Import data

In [None]:
# Read the competition's train and test CSVs, using the `id` column as the index of each frame.
train, test = (
    pd.read_csv(csv_path, index_col = "id")
    for csv_path in (
        "/kaggle/input/playground-series-s3e6/train.csv",
        "/kaggle/input/playground-series-s3e6/test.csv",
    )
)
# Show the first rows of the training frame as the cell output.
train.head()

# EDA

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

No missing values. Hooray!

In [None]:
# Box plot of every column of the training frame on one shared axis.
train.plot.box()

In [None]:
# Same box plot, but with the first and the last column sliced away (positions 1..-1 only).
inner_columns = train.iloc[:, 1:-1]
inner_columns.plot.box()

In [None]:
# One histogram per column; plt.show() flushes each figure so the plots appear one after another.
for col in train:
    print(f"Histplot for {col}")
    sns.histplot(train, x = col)
    plt.show()

In [None]:
# Columns to inspect individually for outliers. This list is reused later as the
# RobustScaler branch of the ColumnTransformer.
outlier_check = [
    "squareMeters",
    "floors",
    "cityCode",
    "made",
    "basement",
    "attic",
    "garage",
]

# Draw a separate box plot for each of those columns.
for col in outlier_check:
    single_column = train[[col]]
    single_column.plot.box()
    plt.show()

It seems that the extreme values in all of these columns really are outliers.

In [None]:
# Heatmap of the pairwise Spearman rank correlations between all columns.
spearman_corr = train.corr(method = "spearman")
sns.heatmap(spearman_corr)

In [None]:
# Pair plot restricted to the first 200 rows of the training frame.
sns.pairplot(train.iloc[:200])

In [None]:
# Correlation matrix (pandas default method) between living area and the target.
train.loc[:, ["squareMeters", "price"]].corr()

# Machine Learning

In [None]:
from sklearn.preprocessing import StandardScaler, PowerTransformer, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor, AdaBoostRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor

import xgboost as xgb

# Single random seed, passed as `random_state` to the data splits and the seeded estimators below.
seed = 19

In [None]:
# Separate the predictors from the target column.
features = train.drop(columns = ["price"])
target = train["price"]

# First split: hold out 20% of the rows as a test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    features, target, test_size = .2, random_state = seed
)

# Second split: 20% of the remaining 80% becomes the validation set,
# i.e. roughly 64% train / 16% validation / 20% test overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size = .2, random_state = seed
)

In [None]:
# Columns to exclude when collecting the remaining ("normal and binary")
# features: every outlier-prone column plus the target `price`.
# A new list is built instead of overwriting the last element: replacing
# "garage" with "price" removed "garage" from the exclusion list, so that
# column was routed to both the StandardScaler and the RobustScaler branch
# of the ColumnTransformer and appeared twice in the feature matrix.
outlier_check_copy = outlier_check + ["price"]
outlier_check_copy

In [None]:
# Names of all columns that are not listed in `outlier_check_copy`; these are
# the columns handed to the StandardScaler branch of the ColumnTransformer.
remaining_frame = train.drop(columns = outlier_check_copy)
normal_and_binary = remaining_frame.columns.to_numpy()

In [None]:
# Individual preprocessing building blocks.
zscaler = StandardScaler()
rscaler = RobustScaler(quantile_range = (0, 99))
ptransformer = PowerTransformer()  # created here but not used elsewhere in the visible notebook
kbest = SelectKBest(score_func = f_regression)

# Scale the two column groups with different scalers.
col_transformer = ColumnTransformer(transformers = [
    ("normal_and_binary", zscaler, normal_and_binary),
    ("outlier", rscaler, outlier_check),
])

# Shared preprocessing: column-wise scaling followed by univariate feature selection.
# The number of kept features (`selector__k`) is tuned in the grid searches below.
pre_proc_pipe = Pipeline([
    ("transformer", col_transformer),
    ("selector", kbest),
])

In [None]:
# --- Bare estimators -------------------------------------------------------
linreg = LinearRegression()
dtree = DecisionTreeRegressor(random_state = seed)
xgbr = xgb.XGBRegressor()

rfr = RandomForestRegressor(random_state = seed)
gbr = GradientBoostingRegressor(random_state = seed)
svr = LinearSVR(random_state = seed)

knn = KNeighborsRegressor()
# NOTE(review): `base_estimator` was renamed to `estimator` in newer scikit-learn
# releases; the AdaBoost grid key "estimator__base_estimator__max_depth" depends
# on this name, so both must be changed together if the environment is upgraded.
ada = AdaBoostRegressor(
    base_estimator = DecisionTreeRegressor(random_state = seed),
    random_state = seed
)


def _with_preproc(estimator):
    """Return a Pipeline that runs the shared `pre_proc_pipe` and then `estimator`."""
    return Pipeline(steps = [
        ("pre_proc", pre_proc_pipe),
        ("estimator", estimator)
    ])


# --- One pipeline per model, all sharing the same preprocessing step --------
lr_pipe = _with_preproc(linreg)
dt_pipe = _with_preproc(dtree)
xgb_pipe = _with_preproc(xgbr)
rfr_pipe = _with_preproc(rfr)
gbr_pipe = _with_preproc(gbr)
svr_pipe = _with_preproc(svr)
knn_pipe = _with_preproc(knn)
ada_pipe = _with_preproc(ada)

In [None]:
# Linear regression: only the number of features kept by SelectKBest is searched.
params = {
    "pre_proc__selector__k": [6, 8, 10],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
lr_gridcv = GridSearchCV(
    estimator = lr_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {lr_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {lr_gridcv.score(X_val, y_val)}")
lr_gridcv.best_params_

In [None]:
# k-nearest neighbours: only the number of features kept by SelectKBest is searched.
params = {
    "pre_proc__selector__k": [6, 8, 10],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
knn_gridcv = GridSearchCV(
    estimator = knn_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {knn_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {knn_gridcv.score(X_val, y_val)}")
knn_gridcv.best_params_

In [None]:
# AdaBoost: search the selector size, the ensemble size, the depth of the wrapped
# decision tree, the learning rate and the loss function.
# NOTE(review): the "base_estimator" key matches the `base_estimator` argument used
# when `ada` was constructed; newer scikit-learn releases rename it to `estimator`.
params = {
    "pre_proc__selector__k": [7, 8, 9],
    "estimator__n_estimators": [80, 90, 100, 110],
    "estimator__base_estimator__max_depth": [6, 7, 8, 9],
    "estimator__learning_rate": [.5, .75, 1.0],
    "estimator__loss": ["linear", "square"],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
ada_gridcv = GridSearchCV(
    estimator = ada_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {ada_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {ada_gridcv.score(X_val, y_val)}")
ada_gridcv.best_params_

In [None]:
# Decision tree: search the selector size, the maximum depth and the minimum split size.
params = {
    "pre_proc__selector__k": [8, 9, 10],
    "estimator__max_depth": [10, 11, 12],
    "estimator__min_samples_split": [8, 10, 12],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
dt_gridcv = GridSearchCV(
    estimator = dt_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {dt_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {dt_gridcv.score(X_val, y_val)}")
dt_gridcv.best_params_

In [None]:
# XGBoost: search the selector size, the number of trees and the row/column subsampling rates.
params = {
    "pre_proc__selector__k": [6, 8],
    "estimator__n_estimators": [40, 50, 60, 70],
    "estimator__subsample": [.7, .8, .9],
    "estimator__colsample_bytree": [.6, .8, 1.],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
xgb_gridcv = GridSearchCV(
    estimator = xgb_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {xgb_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {xgb_gridcv.score(X_val, y_val)}")

xgb_gridcv.best_params_

In [None]:
# Gradient boosting: search the selector size, the number of trees, the row
# subsampling rate, the tree depth and the learning rate.
params = {
    "pre_proc__selector__k": [4, 6],
    "estimator__n_estimators": [80, 90, 100],
    "estimator__subsample": [.7, .8, .9],
    "estimator__max_depth": [2, 3, 4],
    "estimator__learning_rate": [.6, .8, 1.0],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
gbr_gridcv = GridSearchCV(
    estimator = gbr_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {gbr_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {gbr_gridcv.score(X_val, y_val)}")

gbr_gridcv.best_params_

In [None]:
# Random forest: search the selector size, the number of trees and the maximum depth.
params = {
    "pre_proc__selector__k": [2, 4, 6, 8],
    "estimator__n_estimators": [110, 130, 150, 170],
    "estimator__max_depth": [8, 10, 12, 14],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
rfr_gridcv = GridSearchCV(
    estimator = rfr_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {rfr_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {rfr_gridcv.score(X_val, y_val)}")

rfr_gridcv.best_params_

In [None]:
# Linear SVR: only the number of features kept by SelectKBest is searched.
params = {
    "pre_proc__selector__k": [2, 4, 6, 8],
}

# 5-fold grid search scored by negative MSE (values closer to 0 are better).
svr_gridcv = GridSearchCV(
    estimator = svr_pipe,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
).fit(X_train, y_train)

# Score of the refit best model on the training and validation splits.
print(f"Train-Score: {svr_gridcv.score(X_train, y_train)}")
print(f"Val-Score: {svr_gridcv.score(X_val, y_val)}")

svr_gridcv.best_params_

In [None]:
# Fit the shared preprocessing pipeline on the training split so it can be used
# directly (outside of a model pipeline) to transform the validation split below.
pre_proc_pipe.fit(X_train, y_train)

# Keyword arguments prepared for the `fit` call of the early-stopping search.
# They are only consumed by the commented-out `fit` call at the end of this cell.
fit_params = {
    #"early_stopping_rounds": 10,
    #"eval_metric": "rmse",
    "eval_set": [[pre_proc_pipe.transform(X_val), y_val]],
    "verbose": False,
}

# Single-point grid: one fixed configuration with a large tree budget.
params = {
    "n_estimators": [500],
    "subsample": [.8],
    "colsample_bytree": [.8],
    "max_depth": [4],
}

# XGBoost regressor configured with an MSE evaluation metric and early stopping after 9 rounds.
xgbr_es = xgb.XGBRegressor(
    early_stopping_rounds = 9,
    eval_metric = mean_squared_error,
)

# The search object is built here but never fitted: the fit/score calls below are disabled.
xgb_gridcv_es = GridSearchCV(
    estimator = xgbr_es,
    param_grid = params,
    scoring = "neg_mean_squared_error",
    cv = 5,
    n_jobs = -1,
)

#xgb_gridcv_es.fit(pre_proc_pipe.transform(X_train), 
#                  y_train, **fit_params)

#print(f"Train-Score: {xgb_gridcv_es.score(pre_proc_pipe.transform(X_train), y_train)}")
#print(f"Val-Score: {xgb_gridcv_es.score(pre_proc_pipe.transform(X_val), y_val)}")
#xgb_gridcv_es.best_params_

In [None]:
# Base learners for the two ensembles.
# Each entry is the refit winner of its grid search (`best_estimator_`) rather
# than the GridSearchCV object itself: an ensemble clones and refits every base
# learner, so passing a GridSearchCV would re-run the complete hyper-parameter
# search on each internal fit (and on each of the stacking CV folds).
# Gradient boosting uses its tuned pipeline too, consistent with the other
# entries; it previously used the untuned `gbr_pipe`.
estimators = [("dt", dt_gridcv.best_estimator_),
              ("xgb", xgb_gridcv.best_estimator_),
              ("gb", gbr_gridcv.best_estimator_),
              ("rf", rfr_gridcv.best_estimator_),
              #("knn", knn_gridcv.best_estimator_),
              ("ada", ada_gridcv.best_estimator_)]

# Stacking: out-of-fold predictions of the base learners (5-fold CV) are the
# inputs of an XGBoost meta-regressor.
stackingr = StackingRegressor(
    estimators = estimators,
    #final_estimator = RandomForestRegressor(random_state = seed),
    final_estimator = xgb.XGBRegressor(),
    cv = 5,
    n_jobs = -1
)

# Voting: plain average of the base learners' predictions.
votingr = VotingRegressor(
    estimators = estimators,
    n_jobs = -1
)

In [None]:
# Fit the stacking ensemble (base learners and meta-regressor) on the training split.
stackingr.fit(X_train, y_train)

In [None]:
# NOTE: `StackingRegressor.score` reports R^2 (the scikit-learn regressor default),
# so these values are not on the same scale as the negative-MSE scores printed
# for the grid searches.
print(f"Train-Score: {stackingr.score(X_train, y_train)}")
print(f"Val-Score: {stackingr.score(X_val, y_val)}")

In [None]:
# Fit the voting ensemble (every base learner) on the training split.
votingr.fit(X_train, y_train)

In [None]:
# NOTE: `VotingRegressor.score` reports R^2 (the scikit-learn regressor default),
# so these values are not on the same scale as the negative-MSE scores printed
# for the grid searches.
print(f"Train-Score: {votingr.score(X_train, y_train)}")
print(f"Val-Score: {votingr.score(X_val, y_val)}")

In [None]:
def submit(model, test_set, pre_proc = None, path = "submission.csv"):
    """Predict prices for `test_set` and write them to a submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test_set : pandas.DataFrame
        Rows to predict. Its index supplies the ``id`` column of the submission.
    pre_proc : object, optional
        Fitted transformer exposing ``transform``. When given, `test_set` is
        transformed before it is passed to ``model.predict``.
    path : str, default "submission.csv"
        Destination of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``id`` and ``price`` that was written to `path`.
    """
    # Capture the ids first: the transformed data may no longer carry the index.
    test_index = test_set.index
    # Identity comparison is the idiomatic None check and cannot be affected
    # by a custom `__ne__` on the transformer object.
    if pre_proc is not None:
        test_set = pre_proc.transform(test_set)
    y_pred = model.predict(test_set)

    submit_df = pd.DataFrame({
        "id": test_index,
        "price": y_pred
    })
    submit_df.to_csv(path, index = False)
    return submit_df

In [None]:
# Write the submission from the voting ensemble. It is a full pipeline, so no separate
# preprocessing object is passed. The early-stopping XGBoost variant is kept for reference.
#submit(xgb_gridcv_es, test, pre_proc_pipe)
submit(model = votingr, test_set = test)