<a href="https://www.kaggle.com/code/mmellinger66/ps3e6-paris-housing-models?scriptVersionId=119594655" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 6 - Paris Housing Prices</h1>
</div>

## Problem Type

Regression

## Evaluation Metric


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
# Column holding the value the models are trained to predict.
TARGET = "price"
# Row-identifier column.
ID = "id"

In [3]:
class Config:
    """Notebook-wide settings, accessed as class attributes (e.g. ``Config.seed``)."""

    # Directory the train/test/sample-submission CSVs are read from.
    path: str = "../input/playground-series-s3e6/"
    # When True, the LightGBM Optuna objective requests the GPU device.
    gpu: bool = True
    optimize: bool = False
    n_optuna_trials: int = 5
    fast_render: bool = False
    # Selects predict_proba + mean (True) or predict + mode (False) downstream.
    calc_probability: bool = False
    # Enables extra diagnostic prints in the tree-model training loop.
    debug: bool = False
    seed: int = 42
    N_ESTIMATORS: int = 500  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS: int = 2000  # Want models to run fast during dev
    N_FOLDS: int = 5

In [4]:
class clr:
    """Terminal escape codes used to highlight printed section headers."""

    # Start marker: bright style followed by the light-red foreground code.
    S = "".join((Style.BRIGHT, Fore.LIGHTRED_EX))
    # End marker: resets all styling.
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Load ``train.csv``, ``test.csv`` and ``sample_submission.csv`` from ``path``.

    Args:
        path: Directory containing the three CSV files.
        analyze: When True, also print/display shapes, the first rows, the
            column names, the feature split (``eval_features``) and the
            skewness (``check_skew``) of the training frame.

    Returns:
        The ``(train, test, submission)`` data frames, in that order.
    """
    root = Path(path)
    train, test, submission_df = (
        pd.read_csv(root / file_name)
        for file_name in ("train.csv", "test.csv", "sample_submission.csv")
    )

    if not analyze:
        return train, test, submission_df

    print(f"{clr.S}=== Shape of Data ==={clr.E}")
    for label, frame in (("train data", train), ("test data ", test)):
        n_rows, n_cols = frame.shape[0], frame.shape[1]
        print(f" {label}: Rows={n_rows}, Columns={n_cols}")

    print(f"{clr.S}\n=== Train Data: First 5 Rows ===\n{clr.E}")
    display(train.head())

    print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
    display(train.columns)

    print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
    eval_features(train)

    print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
    check_skew(train)

    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    """Write ``preds`` into the global ``sample_submission`` frame and save it as CSV.

    The file is named ``submission_{model_name}_k{nfolds}_s{seed}.csv`` when
    ``model_name`` is non-empty, otherwise ``submission.csv``.

    Args:
        model_name: Tag used in the output file name; may be empty.
        target: Name of the column that receives the predictions.
        preds: Predictions assigned to that column.
        seed: Seed value embedded in the file name.
        nfolds: Fold count embedded in the file name.

    Returns:
        The (mutated) global ``sample_submission`` frame.
    """
    sample_submission[target] = preds

    has_name = len(model_name) > 0
    fname = (
        f"submission_{model_name}_k{nfolds}_s{seed}.csv"
        if has_name
        else "submission.csv"
    )
    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    """Print accuracy, precision, recall, ROC AUC and F1 for the given predictions.

    Args:
        ground_truth: True labels.
        yhat: Predicted labels.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("ROC", metrics.roc_auc_score),
        ("f1", metrics.f1_score),
    )
    # Compute every score before printing so a failing metric prints nothing.
    results = [(label, scorer(ground_truth, yhat)) for label, scorer in scorers]

    for label, value in results:
        print(f"{label}: {value:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Integer-encode ``columns`` in both frames with one shared mapping per column.

    Each column is cast to ``str`` and encoded by a single ``LabelEncoder``
    fitted on the union of the train and test values, so a given category gets
    the same integer in both frames. (Fitting separate encoders, as before,
    could map the same category to different codes in train and test.)

    Args:
        train: Training frame; modified in place.
        test: Test frame; modified in place.
        columns: Names of the columns to encode; must exist in both frames.

    Returns:
        The same ``(train, test)`` frames, with the listed columns encoded.
    """
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)

        # One encoder per column, fitted on every value seen in either frame.
        encoder = preprocessing.LabelEncoder()
        encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))

        train[col] = encoder.transform(train[col])
        test[col] = encoder.transform(test[col])
    return train, test

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    """Add a ``fold`` column assigning each row to a stratified validation fold.

    Args:
        df: Frame to split; modified in place (a ``fold`` column is added or reset).
        TARGET: Name of the column whose values are used for stratification.
        n_folds: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        ``df`` with ``fold`` holding values ``0 .. n_folds - 1``.
    """
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1
    fold_col = df.columns.get_loc("fold")

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold, (_, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        # split() yields positional indices, so write positionally; label-based
        # .loc is only equivalent when df carries a default 0..n-1 index.
        df.iloc[valid_idx, fold_col] = fold

    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    """Add a ``fold`` column assigning each row to a shuffled K-fold validation fold.

    Args:
        df: Frame to split; modified in place (a ``fold`` column is added or reset).
        n_folds: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        ``df`` with ``fold`` holding values ``0 .. n_folds - 1``.
    """
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1
    fold_col = df.columns.get_loc("fold")

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for fold, (_, valid_idx) in enumerate(kf.split(df)):
        # split() yields positional indices, so write positionally; label-based
        # .loc is only equivalent when df carries a default 0..n-1 index.
        df.iloc[valid_idx, fold_col] = fold

    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    """Print and return the mean and standard deviation of the per-fold scores.

    The printed "Adjusted" figure is the mean minus the standard deviation.

    Args:
        scores: One score per fold.

    Returns:
        ``(mean, std)`` of ``scores``.
    """
    cv_score = np.mean(scores)  # Used in filename
    std_dev = np.std(scores)
    adjusted = cv_score - std_dev
    print(
        f"Scores -> Adjusted: {adjusted:.8f} , mean: {cv_score:.8f}, std: {std_dev:.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    """Split the columns of ``df`` into continuous and categorical by dtype.

    NOTE: a function with the same name is defined again later in this file;
    that later definition is the one in effect once the whole cell has run.

    Args:
        df: Frame to inspect.
        display: When True, print both lists.

    Returns:
        ``(continuous, categorical)`` column-name lists: int64/float64/uint8
        columns first, object/bool columns second.
    """
    numeric_dtypes = ['int64', 'float64', 'uint8']
    label_dtypes = ['object', 'bool']

    continuous_features = df.select_dtypes(include=numeric_dtypes).columns.tolist()
    categorical_features = df.select_dtypes(include=label_dtypes).columns.tolist()

    if not display:
        return continuous_features, categorical_features

    print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
    print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    """Print the number of distinct values of each column listed in ``features``.

    NOTE: a function with the same name is defined again later in this file;
    that later definition is the one in effect once the whole cell has run.
    """
    print("=== Cardinality ===")
    distinct_counts = df[features].nunique()
    print(distinct_counts)

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:
    """Combine per-fold test predictions into one prediction per row.

    The fold predictions are stacked as columns; each row is then reduced
    with the mean (``calc_probability=True``) or with the mode
    (``calc_probability=False``).

    Args:
        final_test_predictions: One equal-length prediction array per fold.
        calc_probability: Chooses mean (True) or mode (False) aggregation.

    Returns:
        A 1-D array with one combined value per test row.
    """
    if not calc_probability:
        print("Mode")
        per_row_mode = mode(np.column_stack(final_test_predictions), axis=1)
        # Index 0 holds the modal values; ravel flattens them to 1-D.
        return per_row_mode[0].ravel()

    print("Mean")
    return np.column_stack(final_test_predictions).mean(axis=1)

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    """Display a colour-graded table of descriptive statistics for ``X``.

    Args:
        X: Frame to summarise.
        enhanced: When True, append variance, skewness and kurtosis rows
            (numeric columns only) to the ``describe()`` output.
    """
    desc = X.describe()
    if enhanced:
        extra_rows = (("var", X.var), ("skew", X.skew), ("kurt", X.kurtosis))
        for row_label, stat_fn in extra_rows:
            desc.loc[row_label] = stat_fn(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        # One row per feature, one column per statistic.
        styled = desc.transpose().style.background_gradient(cmap="coolwarm")
    display(styled)
    
def show_missing_features(df:pd.DataFrame) -> None:
    """Print the missing-value count of every column that has any, largest first."""
    na_counts = df.isna().sum()
    ranked = na_counts.sort_values(ascending=False)
    print(ranked.loc[ranked > 0])


def show_duplicate_records(df:pd.DataFrame) -> None:
    """Print how many rows of ``df`` repeat an earlier row."""
    n_duplicates = df.duplicated().sum()
    print(n_duplicates)


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    """Print and return the categorical/continuous column split of ``df``.

    Columns of dtype category/object count as categorical, numeric dtypes as
    continuous. For each categorical column the cardinality is printed, along
    with the distinct values when there are fewer than ten.

    Returns:
        ``(all_features, categorical_features, continuous_features)`` where
        ``all_features`` lists the categorical names followed by the
        continuous ones.
    """
    ## Separate Categorical and Numerical Features
    categorical_features = df.select_dtypes(include=["category", "object"]).columns.tolist()
    continuous_features = df.select_dtypes(include=["number"]).columns.tolist()

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        # Only list the actual values for low-cardinality columns.
        values_suffix = f", {df[feature].unique()}" if cardinality < 10 else ""
        print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}{values_suffix}")

    return (
        categorical_features + continuous_features,
        categorical_features,
        continuous_features,
    )


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    """Plot a horizontal bar chart of feature importances across folds.

    The per-fold items are concatenated column-wise, sorted ascending by the
    ``0_importance`` column, and the first 40 rows are plotted.

    Args:
        feature_importance_lst: Per-fold importance tables accepted by
            ``pd.concat``; one of them must provide a ``0_importance`` column.
    """
    fis_df = pd.concat(feature_importance_lst, axis=1)
    to_plot = fis_df.sort_values("0_importance", ascending=True).head(40)
    to_plot.plot(
        kind="barh",
        figsize=(12, 12),
        title="Feature Importance Across Folds",
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    """Display a feature-vs-target contingency table (with margins) per feature.

    Args:
        df: Frame holding the features and the target.
        feature_lst: Columns to cross-tabulate against ``target``.
        target: Name of the target column.
    """
    target_values = df[target] if feature_lst else None
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        table = pd.crosstab(df[feature], target_values, margins=True)
        display(table)  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    """Print a highlighted header and the distinct-value count of each listed column."""
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    distinct_counts = df[features].nunique()
    print(distinct_counts)


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    """Print each listed column name with its sorted distinct non-missing values."""
    for col in features:
        distinct = df[col].dropna().unique()
        print(col, sorted(distinct))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    """Split the columns of ``df`` into continuous and categorical by dtype.

    Args:
        df: Frame to inspect.
        display: When True, print both lists with highlighted labels.

    Returns:
        ``(continuous, categorical)`` column-name lists: int64/float64/uint8
        columns first, object/bool columns second.
    """
    numeric_dtypes = ["int64", "float64", "uint8"]
    label_dtypes = ["object", "bool"]

    continuous_features = df.select_dtypes(include=numeric_dtypes).columns.tolist()
    categorical_features = df.select_dtypes(include=label_dtypes).columns.tolist()

    if not display:
        return continuous_features, categorical_features

    print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
    print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: use summary_statistics() instead.

    Displays the ``describe()`` table of ``X`` extended with variance,
    skewness and kurtosis rows, transposed and colour-graded.
    """
    desc = X.describe()
    extras = {
        'var': X.var(numeric_only=True),
        'skew': X.skew(numeric_only=True),
        'kurt': X.kurtosis(numeric_only=True),
    }
    for row_label, values in extras.items():
        desc.loc[row_label] = values.tolist()

    with pd.option_context('display.precision', 2):
        styled = desc.transpose().style.background_gradient(cmap='coolwarm')
    display(styled)
  

def check_skew(df:pd.DataFrame) -> None:
    """Print the skewness of each numeric column, most positive first."""
    per_column = df.skew(skipna=True, numeric_only=True)
    print(per_column.sort_values(ascending=False))
    
def gpu_ify_lgbm(lgbm_dict):
    """Set the GPU-related LightGBM keys on ``lgbm_dict`` in place and return it.

    Overwrites ``device``, ``boosting_type``, ``gpu_platform_id`` and
    ``gpu_device_id``; every other key is left untouched.
    """
    lgbm_dict.update(
        device="gpu",
        boosting_type="gbdt",
        gpu_platform_id=0,
        gpu_device_id=0,
    )
    return lgbm_dict

def gpu_ify_cb(params):
    """Set ``task_type`` to ``"GPU"`` on ``params`` in place and return it."""
    params.update(task_type="GPU")
    return params


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit an ``XGBClassifier`` and return validation accuracy.

    Hyperparameters are sampled from ``trial``. The deprecated
    ``suggest_loguniform`` calls and the positional ``step`` argument of
    ``suggest_int`` are expressed through ``suggest_float(..., log=True)`` and
    ``step=``; the sampled distributions are unchanged.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.

    Returns:
        Accuracy of the hard class predictions on the validation set.
    """
    xgb_params = {
        "eval_metric": "auc",  # auc, rmse, mae
        "objective": "binary:logistic",
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100, log=True),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-1, 1e3, log=True),
    }

    # Model loading and training
    model = xgb.XGBClassifier(**xgb_params)
    # NOTE(review): the patience (5000) is not smaller than the largest
    # n_estimators that can be sampled (5000), so early stopping looks like it
    # can never trigger here — confirm this is intended.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.accuracy_score(y_valid, oof)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit an ``LGBMClassifier`` and return validation ROC AUC.

    Hyperparameters are sampled from ``trial``. The deprecated
    ``suggest_loguniform`` / ``suggest_uniform`` calls are expressed through
    ``suggest_float``; the sampled distributions are unchanged. When
    ``Config.gpu`` is true, ``device_type`` is set to ``"gpu"``.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.

    Returns:
        ``roc_auc_score`` of ``model.predict`` output against ``y_valid``.
    """
    # NOTE(review): LightGBM documents colsample_bytree/feature_fraction and
    # subsample/bagging_fraction as aliases of each other; both members of each
    # pair are sampled here — confirm which one is meant to take effect.
    params = {
        "boosting_type": "gbdt",
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_float("learning_rate", 1e-2, 0.25, log=True),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
    }
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        callbacks=[
            lgb.log_evaluation(500),
            lgb.early_stopping(500, False, True),
        ],
    )

    # NOTE(review): predict() is used rather than predict_proba(), so the AUC
    # below is computed from predicted labels, not scores — confirm intent.
    oof = model.predict(X_valid)

    return metrics.roc_auc_score(y_valid, oof)

def objective_cb(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit a ``CatBoostClassifier`` and return validation accuracy.

    Hyperparameters are sampled from ``trial``. The deprecated
    ``suggest_loguniform`` calls are expressed through
    ``suggest_float(..., log=True)``; the sampled distributions are unchanged.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        X_train, y_train: Training features and labels.
        X_valid, y_valid: Validation features and labels.

    Returns:
        Accuracy of the class predictions on the validation set.
    """
    cb_params = {
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 1.0, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 100, log=True),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.1, 20.0, log=True
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    # NOTE(review): with iterations fixed at 10, a patience of 500 rounds looks
    # like it can never trigger — confirm once iterations is raised.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=500,
        verbose=False,
    )

    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to generate predictions for submission. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
# Load the competition CSVs and print the exploratory summary (analyze=True).
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=22730, Columns=18
 test data : Rows=15154, Columns=17
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2
1,1,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0
2,2,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1
3,3,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2
4,4,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8



[1m[91m=== Train Column Names ===[0m



Index(['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors',
       'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt',
       'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom',
       'hasGuestRoom', 'price'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

floors               85.122328
squareMeters         79.253314
made                 66.934411
basement              3.336637
attic                 2.809963
garage                1.378759
cityCode              0.245796
hasPool               0.189863
hasStormProtector     0.160490
hasStorageRoom        0.153740
price                 0.131400
isNewBuilt            0.128385
numberOfRooms         0.115080
hasYard               0.096555
id                    0.000000
numPrevOwners        -0.077110
cityPartRange        -0.093056
hasGuestRoom         -0.117772
dtype: float64

In [8]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2
1,1,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0
2,2,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1
3,3,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2
4,4,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8


In [9]:
# Load the ParisHousing.csv dataset from a separate Kaggle input directory.
original = pd.read_csv("../input/paris-housing-price-prediction/ParisHousing.csv")

# Preview its first rows.
original.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [10]:
# Flag row provenance: 0 for competition rows (train and test), 1 for rows
# from the `original` frame.
train['origin']    = 0
test['origin']     = 0
original['origin'] = 1
# Stack the competition training rows and the `original` rows under a fresh 0..n-1 index.
# NOTE(review): the preview of `original` shows no `id` column, and `id` is
# displayed as float after this concat — the appended rows presumably carry
# NaN ids; confirm the id-keyed out-of-fold bookkeeping further down copes.
combined = pd.concat([train, original], ignore_index=True)
# From here on `train` refers to the combined frame.
train = combined

In [11]:
# Preview the first rows of the combined frame.
combined.head()

Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,origin
0,0.0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2,0
1,1.0,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0,0
2,2.0,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1,0
3,3.0,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2,0
4,4.0,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8,0


In [12]:
# Descriptive statistics (plus variance/skew/kurtosis rows) for every column except the id.
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
squareMeters,32730.0,47589.55,44252.7,89.0,21581.0,46132.0,72545.0,6071330.0,1958301742.77,77.09,10489.2
numberOfRooms,32730.0,48.89,28.42,1.0,25.0,48.0,75.0,100.0,807.94,0.09,-1.16
hasYard,32730.0,0.49,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.06,-2.0
hasPool,32730.0,0.47,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.14,-1.98
floors,32730.0,48.21,42.92,1.0,25.0,46.0,72.0,6000.0,1842.06,81.51,11298.59
cityCode,32730.0,50078.47,29704.41,3.0,23446.0,50452.0,76229.0,491100.0,882351901.11,0.17,1.43
cityPartRange,32730.0,5.56,2.78,1.0,3.0,6.0,8.0,10.0,7.73,-0.06,-1.17
numPrevOwners,32730.0,5.59,2.76,1.0,3.0,6.0,8.0,10.0,7.61,-0.05,-1.14
made,32730.0,2007.24,99.16,1990.0,2000.0,2006.0,2014.0,10000.0,9833.46,80.0,6446.0
isNewBuilt,32730.0,0.48,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.09,-1.99


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [13]:
# Split the columns by dtype and report the cardinality of the categorical ones.
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

# The target and the row id are not model inputs.
cont_features.remove(TARGET)
cont_features.remove(ID)
FEATURES = cont_features + cat_features
# Bare expression: echoes the feature list as the cell output.
FEATURES

[1m[91mContinuous Features=[0m['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price', 'origin']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['squareMeters',
 'numberOfRooms',
 'hasYard',
 'hasPool',
 'floors',
 'cityCode',
 'cityPartRange',
 'numPrevOwners',
 'made',
 'isNewBuilt',
 'hasStormProtector',
 'basement',
 'attic',
 'garage',
 'hasStorageRoom',
 'hasGuestRoom',
 'origin']

In [14]:
# Columns that must never be used as model inputs.
excluded_features = [TARGET, ID, "fold"]

In [15]:
# Recompute the dtype-based split, this time filtering through
# `excluded_features` instead of removing names one at a time.
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
# Bare expression: echoes the feature list as the cell output.
FEATURES

[1m[91mContinuous Features=[0m['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price', 'origin']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['squareMeters',
 'numberOfRooms',
 'hasYard',
 'hasPool',
 'floors',
 'cityCode',
 'cityPartRange',
 'numPrevOwners',
 'made',
 'isNewBuilt',
 'hasStormProtector',
 'basement',
 'attic',
 'garage',
 'hasStorageRoom',
 'hasGuestRoom',
 'origin']

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [16]:
# Assign every training row to one of Config.N_FOLDS shuffled folds (adds a "fold" column).
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [17]:
# Empty, typed score table: one row per model is meant to be added later.
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

# Out-of-fold frame: id, target and fold of every training row, indexed by id.
# Per-model prediction columns are added to it by save_oof_predictions().
oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,price,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3436795.2,2
1.0,9519958.0,3
2.0,9276448.1,3
3.0,9725732.2,1
4.0,6181908.8,1


In [18]:
def show_tree_model_fi(model, features:List[str]) -> None:
    """Print each feature with its share of total importance, largest first.

    Args:
        model: Fitted estimator exposing a ``feature_importances_`` array.
        features: Feature names, in the same order as the importances.
    """
    print("\n=== Model Feature Importance ===")
    importances = model.feature_importances_
    shares = importances / importances.sum()
    for position in importances.argsort()[::-1]:
        print(features[position], shares[position])

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    """Attach a model's out-of-fold predictions to ``oof`` as ``pred_{model_name}``.

    The predictions are first turned into an id-indexed frame (and written to
    CSV) by ``process_valid_predictions``; its first rows are displayed.

    Args:
        model_name: Tag used in the new column name.
        final_valid_predictions: Mapping of row id to validation prediction.
        oof: Id-indexed out-of-fold frame; modified in place.

    Returns:
        ``oof`` with the new prediction column.
    """
    pred_col = f"pred_{model_name}"
    valid_df = process_valid_predictions(final_valid_predictions, ID, model_name)
    display(valid_df.head())

    # Column assignment aligns the predictions to oof on the id index.
    oof[pred_col] = valid_df[pred_col]
    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    """Merge per-fold test predictions and write ``submission_{model_name}.csv``.

    The merged predictions are also stored on ``submission_df`` in a
    ``target_{model_name}`` column (in-place side effect).

    Args:
        model_name: Tag used in the column and file names.
        final_test_predictions: One test-prediction array per fold.
        submission_df: Frame providing the id column; modified in place.
        result_field: Name of the prediction column in the written CSV.
    """
    model_col = f"target_{model_name}"
    submission_df[model_col] = merge_test_predictions(
        final_test_predictions, Config.calc_probability
    )

    ss = (
        submission_df[[ID, model_col]]
        .copy()
        .reset_index(drop=True)
        .rename(columns={model_col: result_field})
    )
    # Can submit the individual model
    ss.to_csv(f"submission_{model_name}.csv", index=False)
    print("=== Target Value Counts ===")

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    """Turn an ``{id: prediction}`` mapping into an id-indexed, id-sorted frame.

    The frame has a single ``pred_{model_name}`` column and is also written to
    ``train_pred_{model_name}.csv`` in the working directory.

    Args:
        final_valid_predictions: Mapping of row id to validation prediction.
        train_id: Name given to the id index.
        model_name: Tag used in the column and file names.

    Returns:
        The id-indexed prediction frame.
    """
    pred_col = f"pred_{model_name}"

    frame = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    frame.columns = [train_id, pred_col]
    frame = frame.set_index(train_id).sort_index()

    frame.to_csv(f"train_pred_{model_name}.csv", index=True)
    return frame

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float) -> pd.DataFrame:
    """Return a copy of ``score_df`` with one result row appended.

    The row is built from the ``score`` and ``std`` arguments (the previous
    body read the globals ``cv_score``/``std_dev`` instead) and appended with
    ``pd.concat``, since ``DataFrame.append`` no longer exists in pandas 2.x.

    Args:
        score_df: Existing score table; not modified.
        model_name: Value for the ``Model`` column.
        score: Value for the ``Score`` column.
        std: Value for the ``StdDev`` column.

    Returns:
        A new frame with a fresh 0..n-1 index; columns of ``score_df`` that the
        new row does not set are left missing for that row.
    """
    new_row = pd.DataFrame([{"Model": model_name, "Score": score, "StdDev": std}])
    return pd.concat([score_df, new_row], ignore_index=True)

In [19]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):
    """Cross-validate a scikit-learn style model on standardized features.

    For each fold the rows with ``df.fold == fold`` form the validation set and
    all other rows the training set. A ``StandardScaler`` is fitted on the
    training rows only and applied to the validation and test rows, the model
    is (re)fitted, and the fold is scored with mean absolute error.

    Args:
        df: Training frame containing ``FEATURES``, ``TARGET``, ``rowid`` and a
            ``fold`` column.
        test: Test frame containing ``FEATURES``.
        get_model_fn: The estimator instance itself (not a factory); the same
            object is refitted on every fold.
        FEATURES: Names of the input columns.
        TARGET: Name of the target column.
        calc_probability: When True use ``predict_proba(...)[:, 1]``, otherwise
            ``predict``.
        rowid: Name of the id column used to key the out-of-fold predictions.
        params: Accepted for signature parity with ``train_xgb_model``; unused.
        n_folds: Number of folds to iterate over.
        seed: Accepted for signature parity; unused.

    Returns:
        ``(model, feature_importance_lst, fold_scores,
        final_valid_predictions, final_test_predictions)`` where ``model`` is
        the estimator as fitted on the last fold, ``feature_importance_lst``
        holds one empty placeholder list per fold, ``fold_scores`` one MAE per
        fold, ``final_valid_predictions`` maps row id to validation prediction
        and ``final_test_predictions`` holds one test-prediction array per fold.
    """
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    # Bound once, outside the loop, so the return value is defined even when
    # n_folds is 0; the same instance is refitted on every fold.
    model = get_model_fn

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(drop=True)  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)

        # Ids of everything in the validation fold (honours the rowid argument).
        valid_ids = xvalid[rowid].values.tolist()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        # Fit the scaler on the training rows only to avoid leaking
        # validation/test statistics into the features.
        scaler = preprocessing.StandardScaler()
        xtrain = scaler.fit_transform(xtrain[FEATURES])
        xvalid = scaler.transform(xvalid[FEATURES])
        xtest = scaler.transform(test)

        model.fit(
            xtrain,
            ytrain,
        )
        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        fold_score = metrics.mean_absolute_error(yvalid, preds_valid)  # Validation Set Score
        fold_scores.append(fold_score)

        # Placeholder: no per-fold feature importance is computed for these models.
        fi = []
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):
    """Cross-validate a gradient-boosting style model on the raw features.

    For each fold the rows with ``df.fold == fold`` form the validation set and
    all other rows the training set. The model is fitted with the validation
    set as ``eval_set``, the fold is scored with mean absolute error, and the
    model's ``feature_importances_`` are recorded.

    Args:
        df: Training frame containing ``FEATURES``, ``TARGET``, ``rowid`` and a
            ``fold`` column.
        test: Test frame containing ``FEATURES``.
        get_model_fn: The estimator instance itself (not a factory); the same
            object is refitted on every fold.
        FEATURES: Names of the input columns.
        TARGET: Name of the target column.
        calc_probability: When True use ``predict_proba(...)[:, 1]``, otherwise
            ``predict``.
        rowid: Name of the id column used to key the out-of-fold predictions.
        params: Only printed; the estimator is expected to be configured already.
        n_folds: Number of folds to iterate over.
        seed: Accepted for signature parity; unused.

    Returns:
        ``(model, feature_importance_lst, fold_scores,
        final_valid_predictions, final_test_predictions)`` where ``model`` is
        the estimator as fitted on the last fold, ``feature_importance_lst``
        holds one ``{fold}_importance`` frame per fold, ``fold_scores`` one MAE
        per fold, ``final_valid_predictions`` maps row id to validation
        prediction and ``final_test_predictions`` holds one test-prediction
        array per fold.
    """
    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    # Bound once, outside the loop, so the return value is defined even when
    # n_folds is 0; the same instance is refitted on every fold.
    model = get_model_fn

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(drop=True)  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)

        # Ids of everything in the validation fold (honours the rowid argument).
        valid_ids = xvalid[rowid].values.tolist()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            verbose=0,
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(test)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(test)

        final_test_predictions.append(test_preds)

        if Config.debug:
            # The hard predictions are only needed for this diagnostic output.
            preds_valid_class = model.predict(xvalid)
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")

        # Store the same kind of prediction that is scored below and collected
        # for the test set, matching train_cv_model. With calc_probability
        # False this equals the predict() output that was stored before.
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        fold_score = metrics.mean_absolute_error(yvalid, preds_valid)  # Validation Set Score
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )

In [20]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> Tuple[float, float, pd.DataFrame]:
    """Cross-validate one estimator via ``train_cv_model`` and persist its predictions.

    Args:
        model_dict: Mapping of model name to estimator instance.
        model_name: Key of the estimator to run; also passed to the save helpers.
        features: Feature column names handed to ``train_cv_model``.
        oof: Out-of-fold frame that ``save_oof_predictions`` extends.

    Returns:
        ``(cv_score, std_dev, oof)`` where the first two values come from
        ``show_fold_scores`` and ``oof`` is whatever ``save_oof_predictions``
        returns.
    """
    # The fitted model and the feature-importance list are not needed here,
    # so only the scores and predictions are kept.
    (
        _,
        _,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False,  # probabilities are never requested on this path (not Config.calc_probability)
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> Tuple[float, float, pd.DataFrame]:
    """Cross-validate one estimator via ``train_xgb_model``, report it and persist predictions.

    Args:
        model_dict: Mapping of model name to estimator instance.
        model_name: Key of the estimator to run; also passed to the save helpers.
        features: Feature column names handed to ``train_xgb_model`` and to
            the feature-importance report.
        params: Parameter object forwarded unchanged to ``train_xgb_model``.
        oof: Out-of-fold frame that ``save_oof_predictions`` extends.

    Returns:
        ``(cv_score, std_dev, oof)`` where the first two values come from
        ``show_fold_scores`` and ``oof`` is whatever ``save_oof_predictions``
        returns.
    """
    # The per-fold feature-importance list is not used here; importance is
    # reported from the returned model instead.
    (
        model,
        _,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [21]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:
    """Run every model named in ``model_lst`` and append its CV summary to ``all_cv_scores``.

    Args:
        model_dict: Mapping of model name to estimator instance.
        model_lst: Names of the models to run, in order.
        target: Target column copied from ``train`` into the out-of-fold frame.
        feature_lst: Feature column names passed to the runner.
        all_cv_scores: Existing score table; it is not modified in place.
        linear_models: When True use ``run_linear_model``, otherwise
            ``run_tree_model`` (called with an empty params dict).

    Returns:
        A score table with one extra row (Model, Score, StdDev, RunTime) per
        model run, re-indexed from 0. When ``model_lst`` is empty the input
        table is returned unchanged.
    """
    # Out-of-fold frame keyed by ID; each runner returns an updated version.
    oof = train[[ID, target, "fold"]].copy().reset_index(drop=True)
    oof = oof.set_index(ID)

    score_rows = []
    for model in model_lst:
        start_time = time.time()

        print(f"Model={model}")

        if linear_models:
            cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)
        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, {}, oof)

        run_time = time.time() - start_time

        score_rows.append(
            {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        )
        print(f"Model Run Time: {run_time:.2f}")

    if not score_rows:
        return all_cv_scores

    # DataFrame.append was removed in pandas 2.0; a single concat gives the
    # same table and avoids re-copying the frame once per model.
    return pd.concat([all_cv_scores, pd.DataFrame(score_rows)], ignore_index=True)




CPU times: user 13 µs, sys: 2 µs, total: 15 µs
Wall time: 18.4 µs


In [22]:
# First LightGBM parameter set.
# NOTE(review): `lgbm_params` is rebound by a later cell before the estimator
# dictionary is built, so these values look unused further down - confirm.
# NOTE(review): LightGBM lists both `n_estimators` and `num_rounds` as aliases
# of `num_iterations`; confirm which of the two is meant to win.
lgbm_params = dict(
    n_estimators=Config.N_ESTIMATORS,
    num_rounds=404,
    learning_rate=0.19,
    num_leaves=17,
    max_depth=8,
    min_data_in_leaf=36,
    lambda_l1=0.96,
    lambda_l2=0.01,
    min_gain_to_split=11.32,
    bagging_fraction=0.6,
    feature_fraction=0.9,
)

# Device-specific keys are delegated to gpu_ify_lgbm (defined elsewhere);
# presumably it adds the GPU settings when Config.gpu is on - TODO confirm.
lgbm_params = gpu_ify_lgbm(lgbm_params)

In [23]:
# Parameters for the tuned XGBoost regressor.
xgb_params = dict(
    n_estimators=Config.N_ESTIMATORS,
    max_depth=10,
    objective="reg:squarederror",
    n_jobs=8,
    seed=Config.seed,
    tree_method="hist",
    subsample=0.9,
    colsample_bytree=0.7,
    use_label_encoder=False,
    learning_rate=0.01,
)

# Switch the histogram tree builder to its GPU variant when a GPU is configured.
xgb_params["tree_method"] = "gpu_hist" if Config.gpu else "hist"

In [24]:
# Parameters for the tuned CatBoost regressor.
cb_params = dict(
    learning_rate=0.05,
    l2_leaf_reg=3.1572972266001518,
    bagging_temperature=0.6799604234141348,
    random_strength=1.99590400593318,
    depth=10,
    min_data_in_leaf=93,
    n_estimators=Config.N_ESTIMATORS,
    use_best_model=True,
    random_seed=Config.seed,
)

# Device-specific keys are delegated to gpu_ify_cb (defined elsewhere);
# presumably it sets the GPU task type when Config.gpu is on - TODO confirm.
cb_params = gpu_ify_cb(cb_params)

In [25]:
# Second LightGBM parameter set; this rebinding is the one the estimator
# dictionary below picks up for "lgbm3".
lgbm_params = dict(
    n_estimators=Config.GPU_N_ESTIMATORS,
    max_depth=9,
    learning_rate=0.01,
    min_data_in_leaf=36,
    num_leaves=100,
    feature_fraction=0.8,
    bagging_fraction=0.89,
    bagging_freq=5,
    lambda_l2=28,
    seed=Config.seed,
    objective="regression",
    n_jobs=-1,
    metric="rmse",
    verbose=-1,
)

# GPU training settings are only added when a GPU is configured.
if Config.gpu:
    lgbm_params.update(
        device="gpu",
        boosting_type="gbdt",
        gpu_platform_id=0,
        gpu_device_id=0,
    )

In [26]:
# Registry of candidate estimators, keyed by the short name used in score
# tables and prediction file names. Names ending in 1 use library defaults.
# Each key must be unique: a repeated key silently replaces the earlier
# estimator (the original listed "lgbm1" twice).
model_estimator_dict = {
    "xgb2": xgb.XGBRegressor(**xgb_params),

    "cat2": cb.CatBoostRegressor(**cb_params),

    "xgb1": xgb.XGBRegressor(),
    "lgbm1": lgb.LGBMRegressor(),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.1,
        max_depth=10,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),

    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

## Tree Models

In [27]:
%%time

# Tree-based models to cross-validate in this pass.
model_lst = ["xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
all_cv_scores = run_models4features(
    model_dict=model_estimator_dict,
    model_lst=model_lst,
    target=TARGET,
    feature_lst=FEATURES,
    all_cv_scores=all_cv_scores,
    linear_models=False,
)

# Show the accumulated score table, highest score first.
all_cv_scores.sort_values(by=["Score"], ascending=False)

Model=xgb1
{}
fold: 1, Score: 15677.753552219981, Run Time: 4.27
fold: 2, Score: 17241.61817187978, Run Time: 3.15
fold: 3, Score: 15608.95733567613, Run Time: 3.59
fold: 4, Score: 15685.165887594765, Run Time: 3.28
fold: 5, Score: 16417.40473676076, Run Time: 3.56
Scores -> Adjusted: 15494.97265336 , mean: 16126.17993683, std: 631.20728346

=== Model Feature Importance ===
squareMeters 0.9884608
cityCode 0.0030258032
numberOfRooms 0.0027894522
hasYard 0.0013670647
basement 0.0007639131
hasGuestRoom 0.0006733019
hasStormProtector 0.0006299667
attic 0.00051491364
floors 0.00046935648
isNewBuilt 0.0004657685
garage 0.0003627945
numPrevOwners 0.00026509762
made 8.053128e-05
hasPool 5.6212346e-05
cityPartRange 4.2638992e-05
hasStorageRoom 2.6788844e-05
origin 5.6216822e-06


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,3428836.25
1.0,9523482.0
2.0,9277940.0
3.0,9731493.0
4.0,6178096.0


Mode
=== Target Value Counts ===
Model Run Time: 18.34
Model=xgb2
{}
fold: 1, Score: 204154.6867025187, Run Time: 15.46
fold: 2, Score: 207018.27373542168, Run Time: 15.52
fold: 3, Score: 210274.72191872602, Run Time: 15.59
fold: 4, Score: 208200.01026258882, Run Time: 15.58
fold: 5, Score: 211200.17989779066, Run Time: 16.39
Scores -> Adjusted: 205677.10751088 , mean: 208169.57450341, std: 2492.46699253

=== Model Feature Importance ===
squareMeters 0.88587743
origin 0.019808166
made 0.013330605
garage 0.008865632
hasGuestRoom 0.008756373
numberOfRooms 0.008130222
basement 0.0067814854
floors 0.0067378106
attic 0.0058341
cityCode 0.0057191458
hasStorageRoom 0.0056243152
cityPartRange 0.0051012333
isNewBuilt 0.0046826275
hasStormProtector 0.00467953
numPrevOwners 0.004046494
hasPool 0.003140449
hasYard 0.002884478


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,3499491.0
1.0,9025952.0
2.0,8940477.0
3.0,9377347.0
4.0,6075912.0


Mode
=== Target Value Counts ===
Model Run Time: 79.22
Model=lgbm1
{}
fold: 1, Score: 22611.40656041205, Run Time: 0.79
fold: 2, Score: 21714.162652559568, Run Time: 0.86
fold: 3, Score: 24539.451285051764, Run Time: 0.98
fold: 4, Score: 22686.03478636097, Run Time: 1.16
fold: 5, Score: 24699.65708985631, Run Time: 1.31
Scores -> Adjusted: 22079.77117528 , mean: 23250.14247485, std: 1170.37129957

=== Model Feature Importance ===
squareMeters 0.48933333333333334
cityCode 0.15933333333333333
garage 0.09
basement 0.051666666666666666
floors 0.04566666666666667
attic 0.037333333333333336
numberOfRooms 0.03133333333333333
numPrevOwners 0.029333333333333333
hasGuestRoom 0.021333333333333333
made 0.018333333333333333
cityPartRange 0.011333333333333334
hasStorageRoom 0.0036666666666666666
hasStormProtector 0.0033333333333333335
isNewBuilt 0.0026666666666666666
hasPool 0.0026666666666666666
hasYard 0.0023333333333333335
origin 0.0003333333333333333


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,3456161.0
1.0,9530641.0
2.0,9287803.0
3.0,9742008.0
4.0,6181994.0


Mode
=== Target Value Counts ===
Model Run Time: 5.59
Model=lgbm2
{}
fold: 1, Score: 132861.90525455517, Run Time: 0.47
fold: 2, Score: 135824.28324714638, Run Time: 0.58
fold: 3, Score: 135422.4093747816, Run Time: 0.75
fold: 4, Score: 133312.08695455687, Run Time: 0.87
fold: 5, Score: 136223.1051328132, Run Time: 1.20
Scores -> Adjusted: 133357.15329940 , mean: 134728.75799277, std: 1371.60469337

=== Model Feature Importance ===
squareMeters 0.291
numberOfRooms 0.113
cityCode 0.11
garage 0.091
floors 0.075
basement 0.067
made 0.066
attic 0.057
hasGuestRoom 0.047
origin 0.029
cityPartRange 0.02
numPrevOwners 0.017
hasYard 0.006
hasStormProtector 0.004
isNewBuilt 0.003
hasPool 0.003
hasStorageRoom 0.001


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,3484311.0
1.0,9268219.0
2.0,9115622.0
3.0,9518307.0
4.0,6238731.0


Mode
=== Target Value Counts ===
Model Run Time: 4.36
Model=cat1
{}
fold: 1, Score: 25061.14226563662, Run Time: 5.83
fold: 2, Score: 30553.459575677152, Run Time: 6.96
fold: 3, Score: 23532.826993237948, Run Time: 6.04
fold: 4, Score: 31724.243313266637, Run Time: 6.52
fold: 5, Score: 28086.94060068197, Run Time: 6.20
Scores -> Adjusted: 24668.47665151 , mean: 27791.72254970, std: 3123.24589819

=== Model Feature Importance ===
squareMeters 0.9888872175269282
cityCode 0.005979827297795195
garage 0.0012881081323308455
made 0.0008353739777722523
floors 0.0006588609926145709
numPrevOwners 0.0005520180555096735
basement 0.0005424694245041897
hasGuestRoom 0.00044726866236198813
numberOfRooms 0.0002287417888964674
cityPartRange 0.0002144879101075843
attic 0.00012250399882071487
isNewBuilt 0.00011862233192912664
hasYard 5.0871015450449244e-05
hasStorageRoom 2.20444907706587e-05
hasStormProtector 2.197096774144499e-05
origin 1.723879450045907e-05
hasPool 1.2374631966220996e-05


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,3424681.0
1.0,9493447.0
2.0,9213559.0
3.0,9675429.0
4.0,6202617.0


Mode
=== Target Value Counts ===
Model Run Time: 32.25
Model=cat2
{}
fold: 1, Score: 30430.679926842953, Run Time: 93.17
fold: 2, Score: 28250.151697177393, Run Time: 12.41
fold: 3, Score: 32010.943172404845, Run Time: 12.07
fold: 4, Score: 31230.10872669618, Run Time: 10.76
fold: 5, Score: 30224.365652985667, Run Time: 12.78
Scores -> Adjusted: 29169.60091227 , mean: 30429.24983522, std: 1259.64892295

=== Model Feature Importance ===
squareMeters 0.9950209064678252
garage 0.001022216556804149
attic 0.0009744396224884201
numberOfRooms 0.0006505984814233498
cityCode 0.0005734770564457671
numPrevOwners 0.00037232693986742946
hasGuestRoom 0.00036270982002706536
hasStormProtector 0.00016385478649483427
made 0.0001529651678845163
cityPartRange 0.0001381369911547242
basement 0.00012684362909380558
isNewBuilt 0.0001172462413410128
hasYard 0.00010623727124916963
hasPool 0.00010072427217502258
hasStorageRoom 6.698074955757577e-05
origin 4.345631056021892e-05
floors 6.879635607831954e-06


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,3458130.0
1.0,9527846.0
2.0,9256632.0
3.0,9708726.0
4.0,6186779.0


Mode
=== Target Value Counts ===
Model Run Time: 141.76
CPU times: user 5min 28s, sys: 15.1 s, total: 5min 43s
Wall time: 4min 41s


Unnamed: 0,Model,Score,StdDev,RunTime
1,xgb2,208169.574503,2492.466993,79.220202
3,lgbm2,134728.757993,1371.604693,4.363957
5,cat2,30429.249835,1259.648923,141.760629
4,cat1,27791.72255,3123.245898,32.248047
2,lgbm1,23250.142475,1170.3713,5.586618
0,xgb1,16126.179937,631.207283,18.343693


## Linear Models

In [28]:
# Linear models to cross-validate in this pass.
# Full candidate list, kept for reference: ["lin_reg", "lasso", "ridge", "ridge_25", "ridge_50"]
# (the original assigned it and immediately overwrote it with the subset below).
model_lst = ["lasso", "ridge",  "ridge_50"]
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)

# Displays only the first five rows of the accumulated table.
all_cv_scores.head()

Model=lasso
fold: 1, Score: 101668.01857014652, Run Time: 0.23
fold: 2, Score: 1555960.381932136, Run Time: 0.35
fold: 3, Score: 1545943.3884681908, Run Time: 0.53
fold: 4, Score: 1542973.866259576, Run Time: 0.68
fold: 5, Score: 1564210.2398360632, Run Time: 0.80
Scores -> Adjusted: 681860.89140941 , mean: 1262151.17901322, std: 580290.28760381


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,4326000.0
1.0,6872513.0
2.0,6543563.0
3.0,6966207.0
4.0,5129541.0


Mode
=== Target Value Counts ===
Model Run Time: 3.11
Model=ridge
fold: 1, Score: 101725.83405533405, Run Time: 0.21
fold: 2, Score: 1555995.6662928977, Run Time: 0.31
fold: 3, Score: 1545978.7169589894, Run Time: 0.44
fold: 4, Score: 1543009.239798425, Run Time: 0.59
fold: 5, Score: 1564246.0213334684, Run Time: 0.76
Scores -> Adjusted: 681909.75507459 , mean: 1262191.09568782, std: 580281.34061323


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,4326020.0
1.0,6872445.0
2.0,6543496.0
3.0,6966133.0
4.0,5129523.0


Mode
=== Target Value Counts ===
Model Run Time: 2.81
Model=ridge_50
fold: 1, Score: 101697.53879253323, Run Time: 0.20
fold: 2, Score: 1555977.7997797143, Run Time: 0.31
fold: 3, Score: 1545960.81448352, Run Time: 0.43
fold: 4, Score: 1542991.2942600911, Run Time: 0.59
fold: 5, Score: 1564227.9024945346, Run Time: 0.76
Scores -> Adjusted: 681885.59563395 , mean: 1262171.06996208, std: 580285.47432813


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,4326013.0
1.0,6872483.0
2.0,6543529.0
3.0,6966173.0
4.0,5129531.0


Mode
=== Target Value Counts ===
Model Run Time: 2.81


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb1,16126.179937,631.207283,18.343693
1,xgb2,208169.574503,2492.466993,79.220202
2,lgbm1,23250.142475,1170.3713,5.586618
3,lgbm2,134728.757993,1371.604693,4.363957
4,cat1,27791.72255,3123.245898,32.248047


In [29]:
# Preview the submission frame; the recorded output shows one target_<model>
# column per model run so far, next to the id and price columns.
sample_submission.head(20)

Unnamed: 0,id,price,target_xgb1,target_xgb2,target_lgbm1,target_lgbm2,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,22730,4634456.897,4749634.0,4647858.0,4750601.0,4644946.0,4691995.0,4723559.0,4760710.0,4760714.0,4760711.0
1,22731,4634456.897,6185594.0,6004880.0,6133394.0,6155582.0,6184954.0,6175971.0,5255679.0,5255655.0,5255665.0
2,22732,4634456.897,9053127.0,8742800.0,9037301.0,8836637.0,9037440.0,9056976.0,6526611.0,6526554.0,6526586.0
3,22733,4634456.897,1635051.0,1659628.0,1630426.0,1583464.0,1637741.0,1624769.0,1650430.0,1650567.0,1650501.0
4,22734,4634456.897,6758168.0,6528512.0,6746451.0,6584292.0,6681462.0,6707776.0,5336714.0,5336686.0,5336699.0
5,22735,4634456.897,111452.7,466231.8,106769.2,363188.4,109251.8,97006.74,112137.7,112325.4,112232.6
6,22736,4634456.897,9906826.0,9307672.0,9861614.0,9559080.0,9896072.0,9824252.0,6451685.0,6451618.0,6451652.0
7,22737,4634456.897,5257102.0,5232446.0,5254784.0,5182166.0,5255094.0,5251186.0,4744833.0,4744832.0,4744834.0
8,22738,4634456.897,5552873.0,5408779.0,5567969.0,5455231.0,5525384.0,5547176.0,4821712.0,4821698.0,4821703.0
9,22739,4634456.897,9822446.0,9150266.0,9790663.0,9263228.0,9782829.0,9736031.0,6257426.0,6257362.0,6257395.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [30]:
# Empty, typed score table for blended models (same leading columns as the
# per-model CV table: Model, Score, StdDev).
all_blend_scores = pd.DataFrame(
    {
        column: pd.Series(dtype=kind)
        for column, kind in (("Model", "str"), ("Score", "float"), ("StdDev", "float"))
    }
)

In [31]:
# Relative weight of each model's test predictions in the blend. Add or remove
# entries here; the normaliser below is derived from these weights, so it can
# never drift out of step with the terms (the original divided by a literal 5).
blend_weights = {
    "target_xgb1": 2,
    "target_lgbm1": 1,
    "target_cat1": 1,
    "target_cat2": 1,
}

# Weighted average of the selected prediction columns.
sample_submission[TARGET] = sum(
    sample_submission[column] * weight for column, weight in blend_weights.items()
) / sum(blend_weights.values())

In [32]:
# Write the blended submission (ID and target columns only, no index column),
# then display its last eight rows as a quick sanity check.
sample_submission[[ID, TARGET]].to_csv("submission_wt_avg.csv", index=False)
sample_submission[[ID, TARGET]].tail(8)

Unnamed: 0,id,price
15146,37876,6840892.0
15147,37877,2179012.0
15148,37878,3071161.0
15149,37879,8619113.0
15150,37880,4525703.0
15151,37881,7334043.0
15152,37882,7743583.0
15153,37883,1778920.0


In [33]:
# Show every cross-validated model ordered by Score, highest first.
# NOTE(review): the scores appear to be error values (lower is better), in
# which case this order lists the weakest models first - confirm intent.
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
7,ridge,1262191.0,580281.340613,2.811826
8,ridge_50,1262171.0,580285.474328,2.80922
6,lasso,1262151.0,580290.287604,3.108472
1,xgb2,208169.6,2492.466993,79.220202
3,lgbm2,134728.8,1371.604693,4.363957
5,cat2,30429.25,1259.648923,141.760629
4,cat1,27791.72,3123.245898,32.248047
2,lgbm1,23250.14,1170.3713,5.586618
0,xgb1,16126.18,631.207283,18.343693


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [34]:
# Models whose saved predictions feed the level-1 stack. Both lookup tables
# below are derived from this single list so they cannot drift apart.
STACK_MODEL_NAMES = ["cat1", "cat2", "lgbm1", "lgbm2", "xgb1"]

# Column name -> CSV file holding that model's out-of-fold (train) predictions.
train_oof_dict = {
    f"train_pred_{model_name}": f"train_pred_{model_name}.csv"
    for model_name in STACK_MODEL_NAMES
}

# Column name -> CSV file holding that model's test predictions.
test_pred_dict = {
    f"submission_{model_name}": f"submission_{model_name}.csv"
    for model_name in STACK_MODEL_NAMES
}

In [35]:
def _load_prediction_columns(file_dict, base_dir, label_prefix=""):
    """Read the second column of each CSV in ``file_dict`` into one frame, one column per key."""
    columns = []
    for name, fname in file_dict.items():
        print(f"{label_prefix}{name}, {fname}")
        df = pd.read_csv(os.path.join(base_dir, fname))
        print(df.head())
        # Column 0 is the id; column 1 holds the predictions.
        columns.append(df.iloc[:, 1].rename(name))

    # pd.concat rejects an empty list, so keep the original empty-frame result.
    if not columns:
        return pd.DataFrame()
    return pd.concat(columns, axis=1)


def blend_results(train_oof_dict, test_pred_dict, base_dir="../working/"):
    """Collect saved per-model predictions into two side-by-side frames.

    Args:
        train_oof_dict: Mapping of output column name to the CSV file holding
            that model's out-of-fold predictions.
        test_pred_dict: Mapping of output column name to the CSV file holding
            that model's test predictions.
        base_dir: Directory the file names are resolved against.

    Returns:
        ``(oof_df, test_preds_df)``, each with one column per dictionary
        entry, taken from the second column of the corresponding file. Rows
        are aligned by position in the file, not by id.
    """
    oof_df = _load_prediction_columns(train_oof_dict, base_dir, label_prefix="Processing ")
    test_preds_df = _load_prediction_columns(test_pred_dict, base_dir)

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# Build the level-1 inputs: one frame of per-model out-of-fold predictions
# and one frame of per-model test predictions.
(oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

Processing train_pred_cat1, train_pred_cat1.csv
    id     pred_cat1
0  0.0  3.424681e+06
1  1.0  9.493447e+06
2  2.0  9.213559e+06
3  3.0  9.675429e+06
4  4.0  6.202617e+06
Processing train_pred_cat2, train_pred_cat2.csv
    id     pred_cat2
0  0.0  3.458130e+06
1  1.0  9.527846e+06
2  2.0  9.256632e+06
3  3.0  9.708726e+06
4  4.0  6.186779e+06
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id    pred_lgbm1
0  0.0  3.456161e+06
1  1.0  9.530641e+06
2  2.0  9.287803e+06
3  3.0  9.742008e+06
4  4.0  6.181994e+06
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id    pred_lgbm2
0  0.0  3.484311e+06
1  1.0  9.268219e+06
2  2.0  9.115622e+06
3  3.0  9.518307e+06
4  4.0  6.238731e+06
Processing train_pred_xgb1, train_pred_xgb1.csv
    id  pred_xgb1
0  0.0  3428836.2
1  1.0  9523482.0
2  2.0  9277940.0
3  3.0  9731493.0
4  4.0  6178096.0
submission_cat1, submission_cat1.csv
      id         price
0  22730  4.691995e+06
1  22731  6.184954e+06
2  22732  9.037440e+06
3  22733  1.637

In [36]:
# Peek at the stacked out-of-fold frame: one column per train_oof_dict entry.
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,3424681.0,3458130.0,3456161.0,3484311.0,3428836.2
1,9493447.0,9527846.0,9530641.0,9268219.0,9523482.0
2,9213559.0,9256632.0,9287803.0,9115622.0,9277940.0
3,9675429.0,9708726.0,9742008.0,9518307.0,9731493.0
4,6202617.0,6186779.0,6181994.0,6238731.0,6178096.0


In [37]:
def run_lr(useful_features:List[str], train_df:pd.DataFrame, test_df:pd.DataFrame) -> Tuple[List[float], List[np.ndarray]]:
    """Fit a ``LinearRegression`` per K-fold split and score it with RMSE.

    Args:
        useful_features: Columns used as model inputs; must exist in both frames.
        train_df: Training frame containing ``useful_features`` and ``TARGET``.
        test_df: Test frame containing ``useful_features``.

    Returns:
        ``(scores, final_predictions)``: the validation RMSE of each fold and
        the test-set predictions made by each fold's model.
    """
    final_predictions = []
    scores = []

    # KFold is reached through the already-imported model_selection module;
    # the bare name is not among the names the import cell brings in.
    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    # The test features are the same for every fold, so select them once.
    xtest = test_df[useful_features]

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        fold_train = train_df.iloc[train_idx]
        fold_valid = train_df.iloc[valid_idx]

        model = linear_model.LinearRegression()
        model.fit(fold_train[useful_features], fold_train[TARGET])

        # Regressors expose predict(); predict_proba() exists only on
        # classifiers and raised AttributeError here.
        preds_valid = model.predict(fold_valid[useful_features])
        final_predictions.append(model.predict(xtest))

        # Take the root explicitly: the `squared=False` shortcut is gone from
        # recent scikit-learn releases.
        score = float(np.sqrt(metrics.mean_squared_error(fold_valid[TARGET], preds_valid)))
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions
