<a href="https://www.kaggle.com/code/mmellinger66/ps3e6-paris-housing-models?scriptVersionId=119685424" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

 <div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 6 - Paris Housing Prices</h1>
</div>

## Problem Type

Regression

## Evaluation Metric


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import datasets

import xgboost as xgb
import catboost as cb
import lightgbm as lgb

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 5)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
# Name of the column the models are trained to predict.
TARGET="price"
# Name of the row-identifier column; it is removed from the feature list further down.
ID="id"

In [3]:
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.seed``)."""

    # Directory passed to read_data(); must hold train.csv, test.csv and
    # sample_submission.csv.
    path: str = "../input/playground-series-s3e6/"
    # When True, objective_lgbm adds device_type="gpu" to its parameters.
    gpu: bool = True
    # Not read by the code in this section; presumably toggles Optuna tuning.
    optimize: bool = False
    # Not read by the code in this section; presumably the Optuna trial budget.
    n_optuna_trials: int = 5
    # Not read by the code in this section.
    fast_render: bool = False
    # True  -> fold predictions come from predict_proba and are merged by mean.
    # False -> fold predictions come from predict and are merged by mode.
    calc_probability: bool = False
    # Enables the extra diagnostic prints inside train_xgb_model.
    debug: bool = False
    # Random seed handed to the training helpers.
    seed: int = 42
    # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    N_ESTIMATORS: int = 2000
    # Want models to run fast during dev
    GPU_N_ESTIMATORS: int = 2000
    # Number of cross-validation folds.
    N_FOLDS: int = 5

In [4]:
class clr:
    """Colorama escape sequences used to highlight headings in printed output."""

    # S opens a highlighted span (bright style + light-red foreground).
    S = Style.BRIGHT + Fore.LIGHTRED_EX
    # E resets all styling; print it after the highlighted text.
    E = Style.RESET_ALL

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

In [5]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Load the three competition CSV files from ``path``.

    Args:
        path: Directory containing ``train.csv``, ``test.csv`` and
            ``sample_submission.csv``.
        analyze: When True, also print shapes, the first rows, the column
            names, a feature-type summary and per-column skewness of the
            training data.

    Returns:
        ``(train, test, submission_df)`` as DataFrames, in that order.
    """
    root = Path(path)
    train, test, submission_df = (
        pd.read_csv(root / file_name)
        for file_name in ("train.csv", "test.csv", "sample_submission.csv")
    )

    # Guard clause: nothing more to do when no report was requested.
    if not analyze:
        return train, test, submission_df

    n_train_rows, n_train_cols = train.shape
    n_test_rows, n_test_cols = test.shape
    print(clr.S + "=== Shape of Data ==="+clr.E)
    print(f" train data: Rows={n_train_rows}, Columns={n_train_cols}")
    print(f" test data : Rows={n_test_rows}, Columns={n_test_cols}")

    print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
    display(train.head())

    print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
    display(train.columns)

    print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
    eval_features(train)

    print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
    check_skew(train)

    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5) -> pd.DataFrame:
    """Write ``preds`` into the global ``sample_submission`` frame and save it as CSV.

    The file is named ``submission_<model>_k<nfolds>_s<seed>.csv`` when
    ``model_name`` is non-empty, otherwise ``submission.csv``.

    Note: this mutates the module-level ``sample_submission`` DataFrame and
    returns that same object.
    """
    sample_submission[target] = preds

    fname = (
        "submission.csv"
        if len(model_name) == 0
        else f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    )
    sample_submission.to_csv(fname, index=False)

    return sample_submission

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    """Print accuracy, precision, recall, ROC AUC and F1 for the given predictions.

    Every metric is computed before anything is printed, so a failing metric
    produces no partial report.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("ROC", metrics.roc_auc_score),
        ("f1", metrics.f1_score),
    )
    results = [(label, scorer(ground_truth, yhat)) for label, scorer in scorers]

    for label, value in results:
        print(f"{label}: {value:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Integer-encode ``columns`` in both frames using one shared vocabulary.

    Each column is cast to ``str``; the sorted union of the labels seen in
    ``train`` and ``test`` defines the codes (0..n-1). Encoding both frames
    from the same vocabulary guarantees that a given label maps to the same
    integer in train and test. Fitting one encoder per frame does not, and
    silently mis-aligns the feature whenever the two label sets differ.

    Args:
        train: Training frame; the listed columns are overwritten in place.
        test: Test frame; the listed columns are overwritten in place.
        columns: Names of the columns to encode.

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    for col in columns:
        train_labels = train[col].astype(str)
        test_labels = test[col].astype(str)

        # Sorted union: the same ordering rule a label encoder fitted on the
        # combined data would use.
        vocabulary = sorted(set(train_labels) | set(test_labels))
        code_of = {label: code for code, label in enumerate(vocabulary)}

        train[col] = train_labels.map(code_of).astype("int64")
        test[col] = test_labels.map(code_of).astype("int64")
    return train, test

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    """Add a ``fold`` column assigning every row to a stratified validation fold.

    Args:
        df: Frame to annotate; the ``fold`` column is added in place.
        TARGET: Name of the column whose values are used for stratification.
        n_folds: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        The same ``df`` object with the ``fold`` column set to 0..n_folds-1.
    """
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    # The splitter yields *positional* row indices. Collect them in a plain
    # array and assign the whole column at once, so the result is correct even
    # when df.index is not 0..n-1 (label-based .loc would hit the wrong rows).
    fold_of_row = np.full(len(df), -1, dtype="int64")
    for fold, (_, valid_pos) in enumerate(kf.split(df, df[TARGET])):
        fold_of_row[valid_pos] = fold
    df["fold"] = fold_of_row

    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    """Add a ``fold`` column assigning every row to a shuffled K-fold validation fold.

    Args:
        df: Frame to annotate; the ``fold`` column is added in place.
        n_folds: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        The same ``df`` object with the ``fold`` column set to 0..n_folds-1.
    """
    print(f"n_folds={n_folds}, seed={seed}")

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    # The splitter yields *positional* row indices. Collect them in a plain
    # array and assign the whole column at once, so the result is correct even
    # when df.index is not 0..n-1 (label-based .loc would hit the wrong rows).
    fold_of_row = np.full(len(df), -1, dtype="int64")
    for fold, (_, valid_pos) in enumerate(kf.split(df)):
        fold_of_row[valid_pos] = fold
    df["fold"] = fold_of_row

    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    """Print a one-line summary of the fold scores and return ``(mean, std)``.

    The "Adjusted" figure in the printout is the mean minus one standard
    deviation.
    """
    cv_score, std_dev = np.mean(scores), np.std(scores)
    adjusted = cv_score - std_dev
    print(
        f"Scores -> Adjusted: {adjusted:.8f} , mean: {cv_score:.8f}, std: {std_dev:.8f}"
    )
    return cv_score, std_dev


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    """Split the columns of ``df`` into continuous and categorical by dtype.

    Continuous: int64, float64, uint8. Categorical: object, bool.

    NOTE: a function with the same name is defined again further down in this
    file; that later definition is the one in effect once the cell has run.

    Returns:
        ``(continuous_features, categorical_features)`` as lists of column names.
    """
    numeric_kinds = ['int64', 'float64', 'uint8']
    label_kinds = ['object', 'bool']

    continuous_features = df.select_dtypes(include=numeric_kinds).columns.tolist()
    categorical_features = df.select_dtypes(include=label_kinds).columns.tolist()

    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")

    return continuous_features, categorical_features

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    """Print the number of distinct values in each of the given columns.

    NOTE: a function with the same name is defined again further down in this
    file; that later definition is the one in effect once the cell has run.
    """
    print("=== Cardinality ===")
    distinct_counts = df[features].nunique()
    print(distinct_counts)

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:
    """Combine per-fold test predictions into one prediction per test row.

    Args:
        final_test_predictions: One 1-D array of test predictions per fold.
        calc_probability: True -> average across folds; False -> take the
            most frequent value across folds.

    Returns:
        A 1-D array with one merged value per test row.

    NOTE(review): when predictions are continuous, the values in a row are
    usually all distinct; scipy's ``mode`` then returns the smallest of them.
    Confirm that the "Mode" branch is intended for regression runs.
    """
    if calc_probability:
        print("Mean")
        stacked = np.column_stack(final_test_predictions)  # rows = samples, cols = folds
        return np.mean(stacked, axis=1)

    print("Mode")
    stacked = np.column_stack(final_test_predictions)  # rows = samples, cols = folds
    per_row_mode = mode(stacked, axis=1)
    return per_row_mode[0].ravel()

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    """Display ``X.describe()`` (transposed) as a colour-graded table.

    With ``enhanced`` set, variance, skewness and kurtosis of the numeric
    columns are appended as extra rows before the table is transposed.
    """
    desc = X.describe()

    if enhanced:
        extra_rows = (("var", X.var), ("skew", X.skew), ("kurt", X.kurtosis))
        for row_label, statistic in extra_rows:
            desc.loc[row_label] = statistic(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        style = desc.T.style.background_gradient(cmap="coolwarm")
    display(style)
    
def show_missing_features(df:pd.DataFrame) -> None:
    """Print the NaN count of every column that has at least one missing value,
    largest count first."""
    ranked = df.isna().sum().sort_values(ascending=False)
    has_missing = ranked > 0
    print(ranked[has_missing])


def show_duplicate_records(df:pd.DataFrame) -> None:
    """Print how many rows of ``df`` are exact duplicates of an earlier row."""
    print(df.duplicated().sum())


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    """Report the categorical and numeric columns of ``df``.

    Categorical columns are those of dtype ``category`` or ``object``;
    continuous columns are every numeric dtype. For each categorical column
    the cardinality is printed, together with the distinct values when there
    are fewer than 10 of them.

    Returns:
        ``(all_features, categorical_features, continuous_features)`` where
        ``all_features`` lists the categorical names first.
    """
    categorical_features = df.select_dtypes(include=["category", "object"]).columns.tolist()
    continuous_features = df.select_dtypes(include=["number"]).columns.tolist()

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        line = f"{clr.S}{feature}{clr.E}: cardinality={cardinality}"
        if cardinality < 10:
            # Few enough distinct values to list them all.
            line += f", {df[feature].unique()}"
        print(line)

    return categorical_features + continuous_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[str]) -> None:
    """Plot per-fold feature importances as a horizontal bar chart.

    ``feature_importance_lst`` is concatenated column-wise with ``pd.concat``,
    so despite the annotation it is expected to hold one DataFrame per fold;
    the column ``"0_importance"`` must be present because it is used for the
    sort order. Only the first 40 rows of the ascending sort are plotted.
    """
    fis_df = pd.concat(feature_importance_lst, axis=1)
    shown = fis_df.sort_values("0_importance", ascending=True).head(40)

    shown.plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    """Display a feature-vs-target contingency table (with margins) for each feature."""
    target_values = df[target]
    for feature in feature_lst:
        print(f"\n=== {feature} vs {target} ===\n")
        table = pd.crosstab(df[feature], target_values, margins=True)
        display(table)  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    """Print a highlighted header followed by the distinct-value count of each
    of the given columns."""
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    distinct_counts = df[features].nunique()
    print(distinct_counts)


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    """Print, for each column, its name and the sorted distinct non-null values."""
    for col in features:
        distinct = df[col].dropna().unique()
        print(col, sorted(distinct))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    """Split the columns of ``df`` into continuous and categorical by dtype.

    Continuous: int64, float64, uint8. Categorical: object, bool.
    When ``display`` is True both lists are printed with a highlighted label.

    Returns:
        ``(continuous_features, categorical_features)`` as lists of column names.
    """
    numeric_kinds = ["int64", "float64", "uint8"]
    label_kinds = ["object", "bool"]

    continuous_features = df.select_dtypes(include=numeric_kinds).columns.tolist()
    categorical_features = df.select_dtypes(include=label_kinds).columns.tolist()

    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")

    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics().

    Kept as a thin alias; it shows the same table as
    ``summary_statistics(X, enhanced=True)``.
    """
    summary_statistics(X, enhanced=True)
  

def check_skew(df:pd.DataFrame) -> None:
    """Print the skewness of every numeric column, most positively skewed first."""
    per_column = df.skew(skipna=True, numeric_only=True)
    print(per_column.sort_values(ascending=False))
    
def gpu_ify_lgbm(lgbm_dict):
    """Add GPU settings to a LightGBM parameter dict, in place, and return it.

    Sets ``device``, ``boosting_type``, ``gpu_platform_id`` and
    ``gpu_device_id``, overwriting any existing values for those keys.
    """
    lgbm_dict.update(
        {
            "device": "gpu",
            "boosting_type": "gbdt",
            "gpu_platform_id": 0,
            "gpu_device_id": 0,
        }
    )
    return lgbm_dict

def gpu_ify_cb(params):
    """Set ``task_type`` to ``"GPU"`` in a CatBoost parameter dict, in place, and return it."""
    params.update(task_type="GPU")
    return params


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Optuna Hyperparameter Optimization Library</h1>
</div>

In [6]:
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit an XGBClassifier with sampled parameters and
    return its accuracy on the validation split.

    NOTE(review): this trains a binary classifier scored by accuracy, while the
    notebook states the problem type is regression — presumably carried over
    from a classification template; confirm before using it for tuning.
    NOTE(review): ``suggest_loguniform`` and the positional ``step`` argument of
    ``suggest_int`` are flagged as deprecated in recent Optuna releases —
    confirm against the installed version.
    """

    xgb_params = {
        #         "objective": trial.suggest_categorical("objective", ["multi:softmax"]),
        #         "eval_metric": "mlogloss",
        #         "objective": "multi:softmax",
        "eval_metric": "auc",  # auc, rmse, mae
        "objective": "binary:logistic",
        #         "enable_categorical": trial.suggest_categorical("use_label_encoder", [True]),
        # Single-choice categoricals (here and below) pin a value while still
        # recording it among the trial's parameters.
        "use_label_encoder": trial.suggest_categorical("use_label_encoder", [False]),
        "n_estimators": trial.suggest_int("n_estimators", 1000, 5000, 100),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "max_depth": trial.suggest_int("max_depth", 1, 20),  # 10
        "gamma": trial.suggest_float("gamma", 0, 100, step=0.1),
        "booster": trial.suggest_categorical("booster", ["gbtree"]),
        "tree_method": trial.suggest_categorical(
            "tree_method", ["hist"]
        ),  # hist, gpu_hist
#         "predictor": "gpu_predictor",
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 100),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }

    # Model loading and training
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # The patience equals the upper bound of n_estimators (5000), so early
        # stopping cannot cut training short; it only tracks the best iteration.
        early_stopping_rounds=5000,
        verbose=0,
    )

    print(f"Number of boosting rounds: {model.best_iteration}")
    #     oof = model.predict_proba(X_valid)[:, 1] # Probability
    oof = model.predict(X_valid)  # Classification: 0,1

    return metrics.accuracy_score(y_valid, oof)


def objective_lgbm(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit an LGBMClassifier with sampled parameters and
    return the ROC AUC of its validation predictions.

    NOTE(review): the AUC is computed from ``model.predict`` (hard class
    labels), not from predicted probabilities — confirm this is intended.
    NOTE(review): this is a classifier objective in a notebook whose stated
    problem type is regression — presumably template code.
    NOTE(review): LightGBM documents ``colsample_bytree``/``feature_fraction``
    and ``subsample``/``bagging_fraction`` as aliases of the same parameter;
    sampling both of each pair looks redundant — confirm which one wins.
    """

    params = {
        "boosting_type": "gbdt",
        # "objective": trial.suggest_categorical("objective", ["mae", "rmse"]),
        #         "objective": trial.suggest_categorical("objective", ["multi:softprob"]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [1_000]),
        #         "n_estimators": trial.suggest_categorical("n_estimators", [5000]),
        "n_estimators": trial.suggest_int("n_estimators", 700, 1000),
        "importance_type": "gain",
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 10.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1, step=0.01),
        "num_leaves": trial.suggest_int("num_leaves", 2, 1000),
        "feature_fraction": trial.suggest_uniform("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_uniform("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 1, 300),
        "subsample": trial.suggest_float("subsample", 0.1, 1, step=0.01),
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-2, 0.25),
        "max_depth": trial.suggest_int("max_depth", 1, 100),
        "random_state": trial.suggest_categorical("random_state", [42]),
        "n_jobs": trial.suggest_categorical("n_jobs", [4]),
        #         'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-1, 1e3),
        # "min_child_weight": trial.suggest_categorical("min_child_weight", [256]),
    }
    # GPU training is switched on from the notebook-wide Config flag.
    # (gpu_ify_lgbm in this file sets the key "device" instead of "device_type".)
    if Config.gpu:
        params["device_type"] = "gpu"

    # Model loading and training
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="mae",
        callbacks=[
            lgb.log_evaluation(500),
            # Positional arguments; per LightGBM's callback signature these are
            # presumably stopping_rounds=500, first_metric_only=False,
            # verbose=True — confirm for the installed version.
            lgb.early_stopping(500, False, True),
        ],
    )

    #     print(f"Number of boosting rounds: {model.best_iteration}")
    oof = model.predict(X_valid)

    #     return accuracy_score(y_valid, oof)
    return metrics.roc_auc_score(y_valid, oof)

def objective_cb(trial, X_train, X_valid, y_train, y_valid):
    """Optuna objective: fit a CatBoostClassifier with sampled parameters and
    return its accuracy on the validation split.

    NOTE(review): this is a classifier objective in a notebook whose stated
    problem type is regression — presumably template code.
    """

    cb_params = {
        # Only 10 boosting iterations, so the 500-round early-stopping patience
        # passed to fit() below can never trigger.
        "iterations": 10,  # 1000
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.1, 1.0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1, 100),
        "bagging_temperature": trial.suggest_loguniform(
            "bagging_temperature", 0.1, 20.0
        ),
        "random_strength": trial.suggest_float("random_strength", 1.0, 2.0),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 300),
        "use_best_model": True,
        #         "task_type": "GPU",
        "random_seed": 42,
    }

    # Model loading and training
    model = cb.CatBoostClassifier(**cb_params)
    model.fit(
        X_train,
        y_train,
        # NOTE(review): the training split is included in eval_set alongside the
        # validation split; confirm which one use_best_model/early stopping
        # track in the installed CatBoost version.
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        # eval_metric="accuracy",
        early_stopping_rounds=500,
        verbose=False,
    )

    # print(f"Number of boosting rounds: {model.best_iteration}")
    # oof = model.predict_proba(X_valid)[:, 1]
    oof = model.predict(X_valid)  # Classification

    return metrics.accuracy_score(y_valid, oof)

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to generate the predictions we submit. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
# Load train/test/sample-submission from Config.path and print the analysis report.
train, test, sample_submission = read_data(Config.path, analyze=True)                                

[1m[91m=== Shape of Data ===[0m
 train data: Rows=22730, Columns=18
 test data : Rows=15154, Columns=17
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2
1,1,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0
2,2,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1
3,3,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2
4,4,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8



[1m[91m=== Train Column Names ===[0m



Index(['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors',
       'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt',
       'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom',
       'hasGuestRoom', 'price'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityCode', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

floors               85.12233
squareMeters         79.25331
made                 66.93441
basement              3.33664
attic                 2.80996
garage                1.37876
cityCode              0.24580
hasPool               0.18986
hasStormProtector     0.16049
hasStorageRoom        0.15374
price                 0.13140
isNewBuilt            0.12838
numberOfRooms         0.11508
hasYard               0.09656
id                    0.00000
numPrevOwners        -0.07711
cityPartRange        -0.09306
hasGuestRoom         -0.11777
dtype: float64
CPU t

In [8]:
# Preview the first rows of the competition training data.
train.head()

Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2
1,1,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0
2,2,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1
3,3,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2
4,4,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8


In [9]:
# Load the external ParisHousing.csv dataset; it is appended to the training
# data in the next cell. Its preview shows no `id` column.
original = pd.read_csv("../input/paris-housing-price-prediction/ParisHousing.csv")

original.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [10]:
# Tag every row with its source: 0 = competition data, 1 = external dataset.
# The test frame gets the column too so it has the same feature set as train.
train['origin']    = 0
test['origin']     = 0
original['origin'] = 1
# Stack the external rows under the competition rows with a fresh 0..n-1 index.
# NOTE(review): `original` has no `id` column, so its rows get NaN ids here (and
# `id` becomes float, as the preview below shows). Later code uses `id` as the
# key for out-of-fold predictions — confirm NaN/duplicate ids are handled.
combined = pd.concat([train, original], ignore_index=True)
# From here on `train` refers to the combined frame.
train = combined

In [11]:
# Preview the combined frame (note the new `origin` column).
combined.head()

Unnamed: 0,id,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price,origin
0,0.0,34291,24,1,0,47,35693,2,1,2000,0,1,8,5196,369,0,3,3436795.2,0
1,1.0,95145,60,0,1,60,34773,1,4,2000,0,1,729,4496,277,0,6,9519958.0,0
2,2.0,92661,45,1,1,62,45457,4,8,2020,1,1,7473,8953,245,1,9,9276448.1,0
3,3.0,97184,99,0,0,59,15113,1,1,2000,0,1,6424,8522,256,1,9,9725732.2,0
4,4.0,61752,100,0,0,57,64245,8,4,2018,1,0,7151,2786,863,0,7,6181908.8,0


In [12]:
# Summary statistics (plus var/skew/kurt) for every column except the row id.
summary_statistics(train.drop(columns=[ID], axis=1), enhanced=True)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
squareMeters,32730.0,47589.55,44252.7,89.0,21581.0,46132.0,72545.0,6071330.0,1958301742.77,77.09,10489.2
numberOfRooms,32730.0,48.89,28.42,1.0,25.0,48.0,75.0,100.0,807.94,0.09,-1.16
hasYard,32730.0,0.49,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.06,-2.0
hasPool,32730.0,0.47,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.14,-1.98
floors,32730.0,48.21,42.92,1.0,25.0,46.0,72.0,6000.0,1842.06,81.51,11298.59
cityCode,32730.0,50078.47,29704.41,3.0,23446.0,50452.0,76229.0,491100.0,882351901.11,0.17,1.43
cityPartRange,32730.0,5.56,2.78,1.0,3.0,6.0,8.0,10.0,7.73,-0.06,-1.17
numPrevOwners,32730.0,5.59,2.76,1.0,3.0,6.0,8.0,10.0,7.61,-0.05,-1.14
made,32730.0,2007.24,99.16,1990.0,2000.0,2006.0,2014.0,10000.0,9833.46,80.0,6446.0
isNewBuilt,32730.0,0.48,0.5,0.0,0.0,0.0,1.0,1.0,0.25,0.09,-1.99


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Feature Engineering</h1>
</div>

## Categorical/Numerical Variables

In [13]:
# Remove `cityCode` from both frames, in place, so it is not used as a feature.
# (The reason is not stated in the notebook — presumably it is treated as an
# uninformative identifier; confirm.)
train.drop(['cityCode'], axis=1, inplace=True)
test.drop(['cityCode'], axis=1, inplace=True)


In [14]:
# Classify the columns by dtype and report the cardinality of the categorical ones.
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

# The target and the row id are not model inputs.
cont_features.remove(TARGET)
cont_features.remove(ID)
FEATURES = cont_features + cat_features
# Bare expression: shows the final feature list as the cell output.
FEATURES

[1m[91mContinuous Features=[0m['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price', 'origin']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['squareMeters',
 'numberOfRooms',
 'hasYard',
 'hasPool',
 'floors',
 'cityPartRange',
 'numPrevOwners',
 'made',
 'isNewBuilt',
 'hasStormProtector',
 'basement',
 'attic',
 'garage',
 'hasStorageRoom',
 'hasGuestRoom',
 'origin']

In [15]:
# Columns that must never be used as model inputs: the target, the row id and
# the cross-validation fold marker.
excluded_features = [TARGET, ID, "fold"]

In [16]:
# Rebuild the feature lists, this time filtering through `excluded_features`
# (which also covers the "fold" column). This overwrites the FEATURES value
# computed in the earlier cell.
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
# Bare expression: shows the final feature list as the cell output.
FEATURES

[1m[91mContinuous Features=[0m['id', 'squareMeters', 'numberOfRooms', 'hasYard', 'hasPool', 'floors', 'cityPartRange', 'numPrevOwners', 'made', 'isNewBuilt', 'hasStormProtector', 'basement', 'attic', 'garage', 'hasStorageRoom', 'hasGuestRoom', 'price', 'origin']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['squareMeters',
 'numberOfRooms',
 'hasYard',
 'hasPool',
 'floors',
 'cityPartRange',
 'numPrevOwners',
 'made',
 'isNewBuilt',
 'hasStormProtector',
 'basement',
 'attic',
 'garage',
 'hasStorageRoom',
 'hasGuestRoom',
 'origin']

<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Train Models with Cross Validation</h1>
</div>

In [17]:
# Assign every training row to one of Config.N_FOLDS shuffled folds
# (seed left at create_folds' default of 42).
train = create_folds(train, Config.N_FOLDS)
# train = create_strat_folds(train, TARGET, Config.N_FOLDS)

n_folds=5, seed=42


In [18]:
# Empty, typed table that collects one row of CV results per model.
all_cv_scores = pd.DataFrame(
    {
        "Model": pd.Series(dtype="str"),
        "Score": pd.Series(dtype="float"),
        "StdDev": pd.Series(dtype="float"),
        "RunTime": pd.Series(dtype="float"),
    }
)

# Out-of-fold frame: target and fold per row, indexed by id; model prediction
# columns are added to it later by save_oof_predictions().
oof = train[[ID, TARGET, "fold"]].copy().reset_index(drop=True).copy()
oof.set_index(ID, inplace=True)
oof.head()

Unnamed: 0_level_0,price,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3436795.2,2
1.0,9519958.0,3
2.0,9276448.1,3
3.0,9725732.2,1
4.0,6181908.8,1


In [19]:
def show_tree_model_fi(model, features:List[str]) -> None:
    """Print each feature with its share of the model's total importance,
    most important first.

    ``features[i]`` must name the feature behind ``model.feature_importances_[i]``.
    """
    print("\n=== Model Feature Importance ===")
    importances = model.feature_importances_
    total = importances.sum()
    descending = importances.argsort()[::-1]
    for position in descending:
        print(features[position], importances[position] / total)

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    """Attach a model's out-of-fold predictions to ``oof`` as ``pred_<model_name>``.

    The predictions are first turned into an id-indexed frame by
    ``process_valid_predictions`` (which also writes them to CSV); the new
    column is then aligned to ``oof`` by index.

    Returns:
        The same ``oof`` object, with the extra column.
    """
    pred_col = f"pred_{model_name}"
    valid_df = process_valid_predictions(final_valid_predictions, ID, model_name)
    display(valid_df.head())

    oof[pred_col] = valid_df[pred_col]
    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    """Merge per-fold test predictions and write ``submission_<model_name>.csv``.

    Side effects:
        * adds the column ``target_<model_name>`` to ``submission_df`` in place;
        * writes a two-column CSV (id, ``result_field``) that can be submitted
          for this model on its own.

    How the folds are merged (mean or mode) is decided by
    ``Config.calc_probability``.
    """
    model_col = f"target_{model_name}"
    submission_df[model_col] = merge_test_predictions(
        final_test_predictions, Config.calc_probability
    )

    ss = submission_df[[ID, model_col]].copy().reset_index(drop=True)
    ss = ss.rename(columns={model_col: result_field})
    ss.to_csv(f"submission_{model_name}.csv", index=False)  # Can submit the individual model

    print("=== Target Value Counts ===")

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    """Turn an ``{id: prediction}`` mapping into an id-indexed, id-sorted frame.

    The frame has a single column ``pred_<model_name>`` and is also written to
    ``train_pred_<model_name>.csv`` in the working directory.

    Args:
        final_valid_predictions: Mapping from row id to validation prediction.
        train_id: Name to give the index (the id column).
        model_name: Used in the column name and the CSV file name.
    """
    pred_col = f"pred_{model_name}"

    frame = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
    frame.columns = [train_id, pred_col]
    frame = frame.set_index(train_id).sort_index()

    frame.to_csv(f"train_pred_{model_name}.csv", index=True)
    return frame

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float):
    """Return a copy of ``score_df`` with one result row appended.

    Args:
        score_df: Results table with (at least) the columns ``Model``,
            ``Score`` and ``StdDev``. It is not modified.
        model_name: Value for the ``Model`` column.
        score: Cross-validation score, stored in ``Score``.
        std: Standard deviation of the fold scores, stored in ``StdDev``.

    Returns:
        A new DataFrame; columns of ``score_df`` not supplied here (for
        example ``RunTime``) are left as NaN in the new row.
    """
    # Use the function's own arguments (not same-named globals), and
    # pd.concat because DataFrame.append no longer exists in pandas >= 2.0.
    new_row = pd.DataFrame([{"Model": model_name, "Score": score, "StdDev": std}])
    return pd.concat([score_df, new_row], ignore_index=True)

In [20]:
def train_cv_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid,
    params,
    n_folds:int=5,
    seed:int=42,
):
    """Cross-validate a scikit-learn style estimator on standardised features.

    For each fold the features are standardised with a scaler fitted on that
    fold's training part only, the estimator is refitted, and predictions are
    made for the validation part and for the whole test set. Each fold is
    scored with mean absolute error.

    Args:
        df: Training frame containing ``FEATURES``, ``TARGET``, the ``rowid``
            column and a ``fold`` column with values 0..n_folds-1.
        test: Test frame containing ``FEATURES``.
        get_model_fn: Despite the name, an estimator *instance* exposing
            ``fit``/``predict`` (and ``predict_proba`` when
            ``calc_probability`` is True). The same object is refitted on
            every fold.
        FEATURES: Names of the input columns.
        TARGET: Name of the target column.
        calc_probability: Use ``predict_proba(...)[:, 1]`` instead of
            ``predict`` for validation and test predictions.
        rowid: Name of the id column used to key the out-of-fold predictions.
        params: Unused; kept for signature compatibility with the other trainers.
        n_folds: Number of folds to iterate over.
        seed: Unused; kept for signature compatibility with the other trainers.

    Returns:
        ``(model, feature_importance_lst, fold_scores, final_valid_predictions,
        final_test_predictions)`` where ``model`` is the estimator as fitted on
        the last fold, ``feature_importance_lst`` holds one empty placeholder
        per fold, ``final_valid_predictions`` maps row id -> validation
        prediction, and ``final_test_predictions`` holds one array per fold.
    """
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    # Bound before the loop so the return statement is valid even for n_folds == 0.
    model = get_model_fn

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(drop=True)  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)

        # Key OOF predictions by the caller-supplied id column.
        valid_ids = xvalid[rowid].values.tolist()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        # Fit the scaler on the training part only to avoid leaking validation
        # or test statistics into the model.
        scaler = preprocessing.StandardScaler()
        xtrain = scaler.fit_transform(xtrain[FEATURES])
        xvalid = scaler.transform(xvalid[FEATURES])
        xtest = scaler.transform(test)

        model.fit(
            xtrain,
            ytrain,
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        final_test_predictions.append(test_preds)
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        fold_score = metrics.mean_absolute_error(yvalid, preds_valid)  # Validation Set Score
        fold_scores.append(fold_score)

        # Feature importance is not extracted for these models; an empty
        # placeholder keeps one entry per fold in the returned list.
        feature_importance_lst.append([])

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )


def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):
    """Cross-validate a gradient-boosting style estimator on the raw features.

    For each fold the estimator is refitted (with the validation part passed
    as ``eval_set``), predictions are made for the validation part and for the
    whole test set, the fold is scored with mean absolute error, and the
    model's ``feature_importances_`` are recorded.

    Args:
        df: Training frame containing ``FEATURES``, ``TARGET``, the ``rowid``
            column and a ``fold`` column with values 0..n_folds-1.
        test: Test frame containing ``FEATURES``.
        get_model_fn: Despite the name, an estimator *instance*; the same
            object is refitted on every fold.
        FEATURES: Names of the input columns.
        TARGET: Name of the target column.
        calc_probability: Use ``predict_proba(...)[:, 1]`` instead of
            ``predict`` for validation and test predictions.
        rowid: Name of the id column used to key the out-of-fold predictions.
        params: Only printed for the log; the estimator is already configured.
        n_folds: Number of folds to iterate over.
        seed: Unused; kept for signature compatibility with the other trainers.

    Returns:
        ``(model, feature_importance_lst, fold_scores, final_valid_predictions,
        final_test_predictions)`` where ``model`` is the estimator as fitted on
        the last fold, ``feature_importance_lst`` holds one single-column
        DataFrame (``<fold>_importance``) per fold, ``final_valid_predictions``
        maps row id -> validation prediction, and ``final_test_predictions``
        holds one array per fold.
    """
    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of Validation Set
    feature_importance_lst = []

    test = test[FEATURES].copy()

    # Bound before the loop so the return statement is valid even for n_folds == 0.
    model = get_model_fn

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        xtrain = df[df.fold != fold].reset_index(drop=True)  # Everything not in validation fold
        xvalid = df[df.fold == fold].reset_index(drop=True)
        xtest = test.copy()

        # Key OOF predictions by the caller-supplied id column.
        valid_ids = xvalid[rowid].values.tolist()

        ytrain = xtrain[TARGET]
        yvalid = xvalid[TARGET]

        xtrain = xtrain[FEATURES]
        xvalid = xvalid[FEATURES]

        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            verbose=0,
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)

        final_test_predictions.append(test_preds)

        if Config.debug:
            # Hard predictions are only needed for this diagnostic output.
            preds_valid_class = model.predict(xvalid)
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")

        # Store the same predictions that are scored below and that match the
        # type of the test predictions (as train_cv_model does), so OOF and
        # test outputs stay comparable when calc_probability is True.
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid)))

        fold_score = metrics.mean_absolute_error(yvalid, preds_valid)  # Validation Set Score
        fold_scores.append(fold_score)

        # Feature importance
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )

In [21]:
def run_linear_model(model_dict, model_name:str, features:List[str], oof:pd.DataFrame) -> Tuple[float, float, pd.DataFrame]:
    """Cross-validate one linear estimator and persist its predictions.

    Reads the notebook globals ``train``, ``test``, ``sample_submission``,
    ``TARGET``, ``ID`` and ``Config``.

    Args:
        model_dict: Mapping of model name to estimator instance.
        model_name: Key of the estimator to run; also passed to the helpers
            that save the out-of-fold and test predictions.
        features: Feature column names forwarded to ``train_cv_model``.
        oof: Out-of-fold frame handed to ``save_oof_predictions``.

    Returns:
        ``(cv_score, std_dev, oof)`` where the first two values come from
        ``show_fold_scores`` and ``oof`` is the frame returned by
        ``save_oof_predictions``.
    """
    (
        _model,
        _feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_cv_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        False,  # probability output is deliberately off here (not Config.calc_probability)
        ID,
        {},
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof


def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> Tuple[float, float, pd.DataFrame]:
    """Cross-validate one tree-based estimator and persist its predictions.

    Reads the notebook globals ``train``, ``test``, ``sample_submission``,
    ``TARGET``, ``ID`` and ``Config``.

    Args:
        model_dict: Mapping of model name to estimator instance.
        model_name: Key of the estimator to run; also passed to the helpers
            that save the out-of-fold and test predictions.
        features: Feature column names forwarded to ``train_xgb_model`` and
            used to label the feature-importance report.
        params: Extra parameters forwarded to ``train_xgb_model``.
        oof: Out-of-fold frame handed to ``save_oof_predictions``.

    Returns:
        ``(cv_score, std_dev, oof)`` where the first two values come from
        ``show_fold_scores`` and ``oof`` is the frame returned by
        ``save_oof_predictions``.
    """
    (
        model,
        _feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        ID,
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    # Importances are reported for the single model object returned by training.
    show_tree_model_fi(model, features)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [22]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:
    """Run every named model on one feature list and collect the CV scores.

    Reads the notebook globals ``train`` and ``ID``.

    Args:
        model_dict: Mapping of model name to estimator instance.
        model_lst: Names of the models to run, in order.
        target: Name of the target column in ``train``.
        feature_lst: Feature column names passed to each model run.
        all_cv_scores: Existing score table; new rows are added after it.
        linear_models: When True each model goes through ``run_linear_model``,
            otherwise through ``run_tree_model`` with an empty params dict.

    Returns:
        ``all_cv_scores`` extended with one row per model holding
        ``Model``, ``Score``, ``StdDev`` and ``RunTime``.
    """
    # Out-of-fold frame indexed by ID; each model run receives it and hands
    # back the frame to use for the next run.
    oof = train[[ID, target, "fold"]].copy().set_index(ID)

    score_rows = []
    for model in model_lst:
        start_time = time.time()

        print(f"Model={model}")

        if linear_models:
            cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)
        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, {}, oof)

        run_time = time.time() - start_time

        score_rows.append({"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time})
        print(f"Model Run Time: {run_time:.2f}")

    if not score_rows:
        return all_cv_scores

    # DataFrame.append was removed in pandas 2.0; a single concat is the
    # supported replacement and avoids re-copying the table per model.
    return pd.concat([all_cv_scores, pd.DataFrame(score_rows)], ignore_index=True)




CPU times: user 10 µs, sys: 1 µs, total: 11 µs
Wall time: 14.1 µs


In [23]:
# First LightGBM parameter set. A later cell rebinds ``lgbm_params`` to a
# different dict before the estimator registry is built.
# NOTE(review): LightGBM documents both n_estimators and num_rounds as aliases
# of num_iterations — confirm which of the two values is meant to win.
# Device settings are delegated to gpu_ify_lgbm (defined elsewhere in this
# notebook); presumably it adds the GPU keys when Config.gpu is set — confirm.
lgbm_params = gpu_ify_lgbm(
    dict(
        n_estimators=Config.N_ESTIMATORS,
        num_rounds=404,
        learning_rate=0.19,
        num_leaves=17,
        max_depth=8,
        min_data_in_leaf=36,
        lambda_l1=0.96,
        lambda_l2=0.01,
        min_gain_to_split=11.32,
        bagging_fraction=0.6,
        feature_fraction=0.9,
    )
)

In [24]:
# Hand-set XGBoost regressor parameters. The histogram tree builder is chosen
# once, up front: the GPU variant when Config.gpu is set, the CPU one otherwise.
xgb_params = dict(
    n_estimators=Config.N_ESTIMATORS,
    max_depth=10,
    objective="reg:squarederror",
    n_jobs=8,
    seed=Config.seed,
    tree_method="gpu_hist" if Config.gpu else "hist",
    subsample=0.9,
    colsample_bytree=0.7,
    use_label_encoder=False,
    learning_rate=0.01,
)

In [25]:
# Hand-set CatBoost regressor parameters, post-processed by gpu_ify_cb
# (defined elsewhere in this notebook; presumably it switches the task type
# to GPU when Config.gpu is set — confirm).
cb_params = gpu_ify_cb(
    dict(
        learning_rate=0.05,
        l2_leaf_reg=3.1572972266001518,
        bagging_temperature=0.6799604234141348,
        random_strength=1.99590400593318,
        depth=10,
        min_data_in_leaf=93,
        n_estimators=Config.N_ESTIMATORS,
        use_best_model=True,
        random_seed=Config.seed,
    )
)

In [26]:
# Second LightGBM parameter set; this rebinding replaces the dict built in the
# earlier cell and is the one unpacked into the "lgbm3" estimator.
lgbm_params = dict(
    n_estimators=Config.GPU_N_ESTIMATORS,
    max_depth=9,
    learning_rate=0.01,
    min_data_in_leaf=36,
    num_leaves=100,
    feature_fraction=0.8,
    bagging_fraction=0.89,
    bagging_freq=5,
    lambda_l2=28,
    seed=Config.seed,
    objective="regression",
    n_jobs=-1,
    metric="rmse",
    verbose=-1,
)

# GPU-only keys are added on top of the shared settings.
if Config.gpu:
    lgbm_params.update(
        device="gpu",
        boosting_type="gbdt",
        gpu_platform_id=0,
        gpu_device_id=0,
    )

In [27]:
# Registry of estimator instances keyed by the short model name used in the
# run cells, score tables and prediction file names. Each key must be unique:
# a repeated key in a dict literal silently keeps only the last value.
model_estimator_dict = {
    # Estimators built from the hand-set parameter dicts above.
    "xgb2": xgb.XGBRegressor(**xgb_params),
    "cat2": cb.CatBoostRegressor(**cb_params),

    # Library-default and lightly customised boosters.
    "xgb1": xgb.XGBRegressor(),
    "lgbm1": lgb.LGBMRegressor(),
    "lgbm2": lgb.LGBMRegressor(
        learning_rate=0.05,
        max_depth=15,
        num_leaves=11,
        feature_fraction=0.3,
        subsample=0.1,
        n_jobs=-1,
    ),
    "lgbm3": lgb.LGBMRegressor(**lgbm_params),

    "cat1": cb.CatBoostRegressor(),

    # Linear baselines.
    "lin_reg": linear_model.LinearRegression(),
    "lasso": linear_model.Lasso(),
    "ridge": linear_model.Ridge(max_iter=7000),
    "ridge_25": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.25, max_iter=7000),
    "ridge_50": linear_model.Ridge(fit_intercept=True, solver='auto', alpha=0.5, max_iter=7000),
}

## Tree Models

In [28]:
%%time

# Cross-validate the tree-based estimators and add their scores to the
# running score table. Shrink the list (e.g. to ["lgbm1"]) for a quick run.
model_lst = ["xgb1", "xgb2", "lgbm1", "lgbm2", "cat1", "cat2"]
all_cv_scores = run_models4features(
    model_estimator_dict,
    model_lst,
    TARGET,
    FEATURES,
    all_cv_scores,
    linear_models=False,
)

# Displayed with the largest Score first.
all_cv_scores.sort_values(by="Score", ascending=False)

Model=xgb1
{}
fold: 1, Score: 17577.262720405353, Run Time: 3.96
fold: 2, Score: 17193.8903175734, Run Time: 2.87
fold: 3, Score: 16674.26346570186, Run Time: 3.32
fold: 4, Score: 16629.915791579544, Run Time: 3.07
fold: 5, Score: 18430.308658984697, Run Time: 3.28
Scores -> Adjusted: 16636.85652382 , mean: 17301.12819085, std: 664.27166703

=== Model Feature Importance ===
squareMeters 0.98882294
numPrevOwners 0.004286385
numberOfRooms 0.0022135295
hasYard 0.0012227069
basement 0.0007427436
attic 0.00073237886
garage 0.00073111686
floors 0.00043429388
hasGuestRoom 0.00029682546
cityPartRange 0.00015362339
made 0.00014366602
hasPool 9.519807e-05
isNewBuilt 7.3293064e-05
hasStorageRoom 2.269525e-05
origin 1.7690812e-05
hasStormProtector 1.0897483e-05


Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,3429240.0
1.0,9518231.0
2.0,9268216.0
3.0,9764527.0
4.0,6179530.0


Mode
=== Target Value Counts ===
Model Run Time: 16.97
Model=xgb2
{}
fold: 1, Score: 110331.1001748444, Run Time: 73.76
fold: 2, Score: 112522.32160556513, Run Time: 75.23
fold: 3, Score: 111950.6805344509, Run Time: 75.10
fold: 4, Score: 109067.5183204602, Run Time: 74.27
fold: 5, Score: 112114.86319558632, Run Time: 75.65
Scores -> Adjusted: 109897.18455238 , mean: 111197.29676618, std: 1300.11221380

=== Model Feature Importance ===
squareMeters 0.894993
origin 0.023949524
made 0.013308596
numberOfRooms 0.008259555
garage 0.008191497
hasGuestRoom 0.00781933
floors 0.007475444
basement 0.006396409
attic 0.0057558366
cityPartRange 0.004070074
numPrevOwners 0.004029697
hasStormProtector 0.003719851
isNewBuilt 0.0036359918
hasStorageRoom 0.0031936984
hasPool 0.0030200565
hasYard 0.0021814404


Unnamed: 0_level_0,pred_xgb2
id,Unnamed: 1_level_1
0.0,3562294.75
1.0,9344837.0
2.0,9189840.0
3.0,9593472.0
4.0,6160285.0


Mode
=== Target Value Counts ===
Model Run Time: 375.80
Model=lgbm1
{}
fold: 1, Score: 23557.021301756893, Run Time: 0.75
fold: 2, Score: 22927.351267612314, Run Time: 0.83
fold: 3, Score: 24380.1547534273, Run Time: 0.96
fold: 4, Score: 24043.87862298305, Run Time: 1.18
fold: 5, Score: 26139.38098780234, Run Time: 1.29
Scores -> Adjusted: 23127.83395444 , mean: 24209.55738672, std: 1081.72343227

=== Model Feature Importance ===
squareMeters 0.5136666666666667
garage 0.105
basement 0.08366666666666667
attic 0.06366666666666666
floors 0.056666666666666664
numberOfRooms 0.043666666666666666
numPrevOwners 0.03933333333333333
made 0.034
hasGuestRoom 0.022333333333333334
cityPartRange 0.017
hasStorageRoom 0.006
hasPool 0.004
hasStormProtector 0.0036666666666666666
isNewBuilt 0.0036666666666666666
hasYard 0.0023333333333333335
origin 0.0013333333333333333


Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,3455080.0
1.0,9529130.0
2.0,9290590.0
3.0,9745980.0
4.0,6179610.0


Mode
=== Target Value Counts ===
Model Run Time: 5.50
Model=lgbm2
{}
fold: 1, Score: 888525.0002754522, Run Time: 0.43
fold: 2, Score: 901793.4857514688, Run Time: 0.54
fold: 3, Score: 899351.9110854829, Run Time: 0.67
fold: 4, Score: 894175.8862485467, Run Time: 0.83
fold: 5, Score: 905186.0213285397, Run Time: 0.99
Scores -> Adjusted: 891940.12853707 , mean: 897806.46093790, std: 5866.33240083

=== Model Feature Importance ===
squareMeters 0.197
made 0.104
numberOfRooms 0.101
garage 0.1
basement 0.096
attic 0.092
floors 0.089
origin 0.085
numPrevOwners 0.037
hasGuestRoom 0.035
cityPartRange 0.028
hasStormProtector 0.016
hasYard 0.007
hasPool 0.005
hasStorageRoom 0.004
isNewBuilt 0.004


Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,3815410.0
1.0,7602610.0
2.0,7711020.0
3.0,7880600.0
4.0,5627590.0


Mode
=== Target Value Counts ===
Model Run Time: 4.04
Model=cat1
{}
fold: 1, Score: 30484.07902015963, Run Time: 6.44
fold: 2, Score: 28842.31460952947, Run Time: 5.46
fold: 3, Score: 35862.65781742862, Run Time: 6.05
fold: 4, Score: 27648.643057464924, Run Time: 6.20
fold: 5, Score: 33776.75252158439, Run Time: 6.01
Scores -> Adjusted: 28257.34273838 , mean: 31322.88940523, std: 3065.54666686

=== Model Feature Importance ===
squareMeters 0.9958598446136466
made 0.0007300467794965642
garage 0.0006798975653166463
basement 0.0005897760660228577
numberOfRooms 0.0004976590497504062
numPrevOwners 0.00042791358611680676
attic 0.00038972243189940085
floors 0.0002123419882391154
hasStormProtector 0.00018459250476685467
cityPartRange 0.000181817685942552
hasGuestRoom 0.0001419370936794741
hasStorageRoom 3.44907722972048e-05
hasPool 3.3933422254825555e-05
hasYard 1.569740050087824e-05
origin 1.3572601526945236e-05
isNewBuilt 6.756438542821153e-06


Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,3432650.0
1.0,9510990.0
2.0,9281030.0
3.0,9681020.0
4.0,6125380.0


Mode
=== Target Value Counts ===
Model Run Time: 30.73
Model=cat2
{}
fold: 1, Score: 29719.48382289648, Run Time: 116.41
fold: 2, Score: 28814.88524723147, Run Time: 30.48
fold: 3, Score: 31117.239359758543, Run Time: 43.02
fold: 4, Score: 30756.170532706692, Run Time: 28.49
fold: 5, Score: 30343.624652389863, Run Time: 28.88
Scores -> Adjusted: 29336.84844092 , mean: 30150.28072300, std: 813.43228207

=== Model Feature Importance ===
squareMeters 0.9961055084059857
garage 0.000966378326638056
attic 0.0006523441239625887
hasGuestRoom 0.0004991788237923899
numberOfRooms 0.00032562583926993704
hasYard 0.0002632553097360402
numPrevOwners 0.00023201356636480883
cityPartRange 0.00020920603986457424
basement 0.00017529091836569473
hasStormProtector 0.00017391266094047043
hasStorageRoom 0.000117649362097847
origin 8.570372665422757e-05
hasPool 8.078745207039244e-05
isNewBuilt 4.665237096261061e-05
floors 4.394588245531039e-05
made 2.2547190839444937e-05


Unnamed: 0_level_0,pred_cat2
id,Unnamed: 1_level_1
0.0,3469370.0
1.0,9504820.0
2.0,9268390.0
3.0,9688800.0
4.0,6168380.0


Mode
=== Target Value Counts ===
Model Run Time: 247.85
CPU times: user 12min 23s, sys: 42.9 s, total: 13min 6s
Wall time: 11min 20s


Unnamed: 0,Model,Score,StdDev,RunTime
3,lgbm2,897806.46094,5866.3324,4.04268
1,xgb2,111197.29677,1300.11221,375.79771
4,cat1,31322.88941,3065.54667,30.72667
5,cat2,30150.28072,813.43228,247.85097
2,lgbm1,24209.55739,1081.72343,5.49647
0,xgb1,17301.12819,664.27167,16.97112


## Linear Models

In [29]:
# Cross-validate the linear baselines and add their scores to the running
# score table. "lin_reg" and "ridge_25" are also registered in
# model_estimator_dict but are left out of this run.
model_lst = ["lasso", "ridge", "ridge_50"]
all_cv_scores = run_models4features(model_estimator_dict, model_lst, TARGET, FEATURES, all_cv_scores, linear_models=True)

# NOTE(review): head() shows the first five rows of the table, which may not
# include the models trained in this cell — confirm this is the intended view.
all_cv_scores.head()

Model=lasso
fold: 1, Score: 101625.26299581607, Run Time: 0.19
fold: 2, Score: 1556084.7606959804, Run Time: 0.30
fold: 3, Score: 1545878.3858512293, Run Time: 0.47
fold: 4, Score: 1543038.2759907155, Run Time: 0.60
fold: 5, Score: 1564073.158003011, Run Time: 0.75
Scores -> Adjusted: 681834.37298968 , mean: 1262139.96870735, std: 580305.59571767


Unnamed: 0_level_0,pred_lasso
id,Unnamed: 1_level_1
0.0,4335780.0
1.0,6881170.0
2.0,6546510.0
3.0,6984780.0
4.0,5121890.0


Mode
=== Target Value Counts ===
Model Run Time: 2.83
Model=ridge
fold: 1, Score: 101683.15383446707, Run Time: 0.21
fold: 2, Score: 1556120.0506844728, Run Time: 0.31
fold: 3, Score: 1545913.716666436, Run Time: 0.43
fold: 4, Score: 1543073.64435113, Run Time: 0.61
fold: 5, Score: 1564108.9079791966, Run Time: 0.92
Scores -> Adjusted: 681883.27912793 , mean: 1262179.89470314, std: 580296.61557521


Unnamed: 0_level_0,pred_ridge
id,Unnamed: 1_level_1
0.0,4335800.0
1.0,6881110.0
2.0,6546450.0
3.0,6984710.0
4.0,5121880.0


Mode
=== Target Value Counts ===
Model Run Time: 3.09
Model=ridge_50
fold: 1, Score: 101654.73414884658, Run Time: 0.20
fold: 2, Score: 1556102.1862014532, Run Time: 0.31
fold: 3, Score: 1545895.8076678624, Run Time: 0.43
fold: 4, Score: 1543055.7028033254, Run Time: 0.59
fold: 5, Score: 1564090.8047879785, Run Time: 0.78
Scores -> Adjusted: 681859.04647050 , mean: 1262159.84712189, std: 580300.80065140


Unnamed: 0_level_0,pred_ridge_50
id,Unnamed: 1_level_1
0.0,4335800.0
1.0,6881150.0
2.0,6546480.0
3.0,6984750.0
4.0,5121880.0


Mode
=== Target Value Counts ===
Model Run Time: 2.98


Unnamed: 0,Model,Score,StdDev,RunTime
0,xgb1,17301.12819,664.27167,16.97112
1,xgb2,111197.29677,1300.11221,375.79771
2,lgbm1,24209.55739,1081.72343,5.49647
3,lgbm2,897806.46094,5866.3324,4.04268
4,cat1,31322.88941,3065.54667,30.72667


In [30]:
# Preview the first rows of sample_submission, including any per-model
# prediction columns added to it so far.
sample_submission.head(20)

Unnamed: 0,id,price,target_xgb1,target_xgb2,target_lgbm1,target_lgbm2,target_cat1,target_cat2,target_lasso,target_ridge,target_ridge_50
0,22730,4634460.0,4739610.0,4691850.0,4750850.0,4324720.0,4758470.0,4711550.0,4760890.0,4760890.0,4760890.0
1,22731,4634460.0,6196200.0,6153840.0,6190860.0,5563370.0,6145450.0,6180160.0,5279980.0,5279950.0,5279960.0
2,22732,4634460.0,9057930.0,9013880.0,9038840.0,7318470.0,9048330.0,9069420.0,6519690.0,6519630.0,6519670.0
3,22733,4634460.0,1621520.0,1558260.0,1601900.0,2779530.0,1609460.0,1623190.0,1648940.0,1649070.0,1649010.0
4,22734,4634460.0,6764480.0,6599970.0,6745890.0,6037570.0,6687050.0,6704030.0,5350530.0,5350510.0,5350520.0
5,22735,4634460.0,108245.0,283495.0,103186.0,2185090.0,103126.0,88389.9,113674.0,113860.0,113769.0
6,22736,4634460.0,9910640.0,9518530.0,9869150.0,7405540.0,9893830.0,9846140.0,6441010.0,6440940.0,6440980.0
7,22737,4634460.0,5250570.0,5331700.0,5244460.0,5061120.0,5233410.0,5248310.0,4730450.0,4730450.0,4730450.0
8,22738,4634460.0,5554960.0,5522740.0,5566250.0,5136580.0,5581620.0,5534960.0,4835170.0,4835160.0,4835160.0
9,22739,4634460.0,9812260.0,9531920.0,9789900.0,7446870.0,9790730.0,9739780.0,6239970.0,6239910.0,6239940.0


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Blend Models</h1>
</div>

In [31]:
# Empty score table for blended models with typed Model / Score / StdDev columns.
all_blend_scores = pd.DataFrame(
    {
        column: pd.Series(dtype=dtype)
        for column, dtype in (("Model", "str"), ("Score", "float"), ("StdDev", "float"))
    }
)

In [32]:
# Weighted average of the per-model test predictions stored in
# sample_submission. To change the blend, edit only this mapping: the
# normaliser is derived from the weights, so adding or dropping a model can
# never leave a stale divisor behind.
blend_weights = {
    "target_xgb1": 2,  # counted twice in the average
    "target_lgbm1": 1,
    "target_cat1": 1,
    "target_cat2": 1,
}

sample_submission[TARGET] = sum(
    sample_submission[column] * weight for column, weight in blend_weights.items()
) / sum(blend_weights.values())

In [33]:
# Keep only the id and blended target columns, write them out as the
# weighted-average submission file, then show the last rows as a sanity check.
blend_submission = sample_submission[[ID, TARGET]]
blend_submission.to_csv("submission_wt_avg.csv", index=False)
blend_submission.tail(8)

Unnamed: 0,id,price
15146,37876,6829580.0
15147,37877,2173370.0
15148,37878,3076100.0
15149,37879,8620270.0
15150,37880,4533690.0
15151,37881,7338520.0
15152,37882,7750560.0
15153,37883,1776440.0


In [34]:
# Score table for every model run so far; descending order puts the largest
# Score first.
# NOTE(review): if Score is an error metric (the fold loop visible earlier in
# this notebook uses mean_absolute_error), the best model is the last row —
# confirm whether ascending=True was intended.
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
7,ridge,1262180.0,580296.61558,3.08913
8,ridge_50,1262160.0,580300.80065,2.9838
6,lasso,1262140.0,580305.59572,2.82594
3,lgbm2,897806.0,5866.3324,4.04268
1,xgb2,111197.0,1300.11221,375.79771
4,cat1,31322.9,3065.54667,30.72667
5,cat2,30150.3,813.43228,247.85097
2,lgbm1,24209.6,1081.72343,5.49647
0,xgb1,17301.1,664.27167,16.97112


<div style="background-color:rgba(177, 156, 217, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Level 1 Stack Models</h1>
</div>

In [35]:
# Models whose saved predictions feed the level-1 stack. The list order is the
# dict order, which in turn is the column order of the loaded frames.
stack_model_names = ["cat1", "cat2", "lgbm1", "lgbm2", "xgb1"]

# Column name -> CSV file holding that model's out-of-fold predictions.
train_oof_dict = {
    f"train_pred_{model_name}": f"train_pred_{model_name}.csv"
    for model_name in stack_model_names
}

# Column name -> CSV file holding that model's test-set predictions.
test_pred_dict = {
    f"submission_{model_name}": f"submission_{model_name}.csv"
    for model_name in stack_model_names
}

In [36]:
def blend_results(train_oof_dict, test_pred_dict):
    """Load saved per-model predictions into two side-by-side frames.

    Every file is read from ``../working/`` and its second column becomes one
    column of the result, named after the dict key. Progress and previews are
    printed along the way.

    Args:
        train_oof_dict: Column name -> out-of-fold prediction CSV file name.
        test_pred_dict: Column name -> test prediction CSV file name.

    Returns:
        ``(oof_df, test_preds_df)``, one column per dict entry in each.
    """

    def _gather(file_map, banner):
        # Append one column per file, in dict order.
        frame = pd.DataFrame()
        for label, csv_name in file_map.items():
            print(f"{banner}{label}, {csv_name}")
            loaded = pd.read_csv("../working/" + csv_name)
            print(loaded.head())
            # Column 0 is the id; column 1 carries the predictions.
            column = pd.Series(loaded.iloc[:, 1], name=label)
            frame = pd.concat([frame, column], axis=1)
        return frame

    oof_df = _gather(train_oof_dict, "Processing ")
    test_preds_df = _gather(test_pred_dict, "")

    for title, frame in (("=== oof ===", oof_df), ("=== test_preds ===", test_preds_df)):
        print(title)
        print(frame.head())

    return oof_df, test_preds_df
    
# (oof_df, preds_df) = blend_results(train_oof_dict, test_pred_dict)    

In [37]:
def load_oof_results(train_oof_dict, test_pred_dict, working_dir="../working/"):
    """Load saved per-model predictions into two side-by-side frames.

    The second column of every CSV becomes one column of the result, named
    after the dict key. Progress and previews are printed along the way.

    Args:
        train_oof_dict: Column name -> out-of-fold prediction CSV file name.
        test_pred_dict: Column name -> test prediction CSV file name.
        working_dir: Directory holding the CSV files.

    Returns:
        ``(oof_df, test_preds_df)``, one column per dict entry in each. A
        frame is empty when its dict is empty.
    """
    base_dir = Path(working_dir)

    def _read_prediction_columns(file_map, banner):
        # Collect the columns first and concatenate once, rather than
        # re-copying the growing frame for every file.
        columns = []
        for name, fname in file_map.items():
            print(f"{banner}{name}, {fname}")
            df = pd.read_csv(base_dir / fname)
            print(df.head())
            # Column 0 is the id; column 1 carries the predictions.
            columns.append(pd.Series(df.iloc[:, 1], name=name))
        if not columns:
            return pd.DataFrame()
        return pd.concat(columns, axis=1)

    oof_df = _read_prediction_columns(train_oof_dict, "Processing ")
    test_preds_df = _read_prediction_columns(test_pred_dict, "")

    print("=== oof ===")
    print(oof_df.head())
    print("=== test_preds ===")
    print(test_preds_df.head())
    return oof_df, test_preds_df
    
# Load the saved OOF and test predictions listed in the two dicts; each
# result has one column per dict entry.
(oof_df, preds_df) = load_oof_results(train_oof_dict, test_pred_dict) 

Processing train_pred_cat1, train_pred_cat1.csv
    id    pred_cat1
0  0.0  3.43265e+06
1  1.0  9.51099e+06
2  2.0  9.28103e+06
3  3.0  9.68102e+06
4  4.0  6.12538e+06
Processing train_pred_cat2, train_pred_cat2.csv
    id    pred_cat2
0  0.0  3.46937e+06
1  1.0  9.50482e+06
2  2.0  9.26839e+06
3  3.0  9.68880e+06
4  4.0  6.16838e+06
Processing train_pred_lgbm1, train_pred_lgbm1.csv
    id   pred_lgbm1
0  0.0  3.45508e+06
1  1.0  9.52913e+06
2  2.0  9.29059e+06
3  3.0  9.74598e+06
4  4.0  6.17961e+06
Processing train_pred_lgbm2, train_pred_lgbm2.csv
    id   pred_lgbm2
0  0.0  3.81541e+06
1  1.0  7.60261e+06
2  2.0  7.71102e+06
3  3.0  7.88060e+06
4  4.0  5.62759e+06
Processing train_pred_xgb1, train_pred_xgb1.csv
    id  pred_xgb1
0  0.0  3429240.0
1  1.0  9518231.0
2  2.0  9268216.0
3  3.0  9764527.0
4  4.0  6179530.0
submission_cat1, submission_cat1.csv
      id        price
0  22730  4.75847e+06
1  22731  6.14545e+06
2  22732  9.04833e+06
3  22733  1.60946e+06
4  22734  6.68705e+06

In [38]:
# Preview the loaded out-of-fold prediction columns.
oof_df.head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,3432650.0,3469370.0,3455080.0,3815410.0,3429240.0
1,9510990.0,9504820.0,9529130.0,7602610.0,9518231.0
2,9281030.0,9268390.0,9290590.0,7711020.0,9268216.0
3,9681020.0,9688800.0,9745980.0,7880600.0,9764527.0
4,6125380.0,6168380.0,6179610.0,5627590.0,6179530.0


In [39]:
# Preview the loaded test prediction columns.
preds_df.head()

Unnamed: 0,submission_cat1,submission_cat2,submission_lgbm1,submission_lgbm2,submission_xgb1
0,4758470.0,4711550.0,4750850.0,4324720.0,4739607.5
1,6145450.0,6180160.0,6190860.0,5563370.0,6196195.5
2,9048330.0,9069420.0,9038840.0,7318470.0,9057930.0
3,1609460.0,1623190.0,1601900.0,2779530.0,1621524.5
4,6687050.0,6704030.0,6745890.0,6037570.0,6764482.0


In [40]:
# Sanity check: confirm the loaded test predictions are a DataFrame.
type(preds_df)

pandas.core.frame.DataFrame

In [41]:
def run_lr(useful_features:List[str], TARGET:str, train_df:pd.DataFrame, test_df:pd.DataFrame) -> Tuple[List[float], List[np.ndarray]]:
    """Fit a K-fold linear-regression stacker and score each fold by RMSE.

    Args:
        useful_features: Input column names. They must exist in both
            ``train_df`` and ``test_df``.
        TARGET: Name of the target column in ``train_df``.
        train_df: Training frame holding the feature and target columns.
        test_df: Frame to predict on with every fold's model.

    Returns:
        ``(scores, final_predictions)``: the per-fold validation RMSE and the
        per-fold test prediction arrays.
    """
    final_predictions = []
    scores = []

    kfold = model_selection.KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.seed)

    # The test features are the same for every fold, so select them once.
    xtest = test_df[useful_features]

    for fold, (train_idx, valid_idx) in enumerate(kfold.split(train_df)):
        fold_train = train_df.iloc[train_idx]
        fold_valid = train_df.iloc[valid_idx]

        model = linear_model.LinearRegression()
        model.fit(fold_train[useful_features], fold_train[TARGET])

        # LinearRegression is a regressor: it exposes predict(), not
        # predict_proba().
        preds_valid = model.predict(fold_valid[useful_features])
        test_preds = model.predict(xtest)

        final_predictions.append(test_preds)
        # RMSE computed by hand; the squared=False keyword is not accepted by
        # recent scikit-learn releases.
        score = np.sqrt(metrics.mean_squared_error(fold_valid[TARGET], preds_valid))
        print(f"Fold={fold}, Score={score}")
        scores.append(score)
    return scores, final_predictions


In [42]:
# OOF prediction columns used as inputs to the level-1 stacker.
useful_features = [
    "train_pred_cat1",
    "train_pred_cat2",
    "train_pred_lgbm1",
    "train_pred_lgbm2",
    "train_pred_xgb1",
]

In [43]:
# Confirm every stacker input column is present in the OOF frame.
oof_df[useful_features].head()

Unnamed: 0,train_pred_cat1,train_pred_cat2,train_pred_lgbm1,train_pred_lgbm2,train_pred_xgb1
0,3432650.0,3469370.0,3455080.0,3815410.0,3429240.0
1,9510990.0,9504820.0,9529130.0,7602610.0,9518231.0
2,9281030.0,9268390.0,9290590.0,7711020.0,9268216.0
3,9681020.0,9688800.0,9745980.0,7880600.0,9764527.0
4,6125380.0,6168380.0,6179610.0,5627590.0,6179530.0


In [44]:
# preds_df[useful_features].head()

In [45]:
# fold_scores, final_predictions = run_lr(useful_features, TARGET, oof_df, preds_df)
# test_preds = np.mean(np.column_stack(final_predictions), axis=1)
# cv_score, std_dev = show_fold_scores(fold_scores)
# create_submission("level1_lr", TARGET, test_preds)

In [46]:
# Display settings for the remaining tables: wider text columns, up to 999
# rows, and floats rendered with two decimals.
pd.set_option("display.max_colwidth", 100)
pd.options.display.max_rows = 999
pd.options.display.precision = 5
pd.set_option("display.float_format", "{:.2f}".format)

# Echo the column width so the cell output confirms the setting took effect.
pd.get_option("display.max_colwidth")



100

In [47]:
# Final score table, shown with the display options set in the previous
# cell; descending order puts the largest Score first.
# NOTE(review): if Score is an error metric (the fold loop visible earlier in
# this notebook uses mean_absolute_error), the best model is the last row —
# confirm whether ascending=True was intended.
all_cv_scores.sort_values(by=["Score"], ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
7,ridge,1262179.89,580296.62,3.09
8,ridge_50,1262159.85,580300.8,2.98
6,lasso,1262139.97,580305.6,2.83
3,lgbm2,897806.46,5866.33,4.04
1,xgb2,111197.3,1300.11,375.8
4,cat1,31322.89,3065.55,30.73
5,cat2,30150.28,813.43,247.85
2,lgbm1,24209.56,1081.72,5.5
0,xgb1,17301.13,664.27,16.97
