In [1]:
import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve, KFold, RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.compose import ColumnTransformer

import category_encoders as ce

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest, ExtraTreesRegressor, GradientBoostingRegressor as GBR

from scipy import stats

from itertools import combinations

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
import optuna
from optuna.samplers import TPESampler

import category_encoders as ce
import shap
!pip install -q BorutaShap
from BorutaShap import BorutaShap

seed = 123
np.random.seed(seed)
clear_output()

In [2]:
# Path to the housing CSV, relative to the notebook's working directory.
path = r'../input/california-housing-prices/housing.csv'
# Name of the regression target column.
TARGET_NAME = 'median_house_value'

In [3]:
# Load the raw dataset and preview five random rows.
df = pd.read_csv(path)
df.sample(5)

In [4]:
def reduce_memory(df, verbose=0):
    """Downcast the columns of ``df`` to smaller dtypes and return it.

    The frame is modified in place (columns are reassigned) and also returned.

    * signed integer columns -> smallest of int8/int16/int32/int64 that holds
      the column's min and max (bounds are inclusive);
    * float columns -> float32 when min and max fit, otherwise float64
      (float32 keeps roughly 7 significant digits, so this is lossy);
    * object / category columns -> ``category``;
    * unsigned integer, bool and any other dtype -> left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    verbose : int, default 0
        Any non-zero value prints the memory usage before and after.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if verbose != 0:
        start_mem = df.memory_usage().sum() / 1024 ** 2
        print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        type_name = str(col_type)

        if col_type == object or type_name == 'category':
            df[col] = df[col].astype('category')
            continue

        if type_name[:3] == 'int':
            c_min, c_max = df[col].min(), df[col].max()
            # Inclusive bounds: a column whose max is exactly 127 still fits int8.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif type_name[:5] == 'float':
            c_min, c_max = df[col].min(), df[col].max()
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN or infinite columns (comparisons are False).
                df[col] = df[col].astype(np.float64)
        # Everything else (uint*, bool, datetimes, ...) is deliberately kept:
        # casting bool to float32 would enlarge the column and change its meaning.

    if verbose != 0:
        end_mem = df.memory_usage().sum() / 1024 ** 2
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

def statistic_anova_output(factor, target, alpha=0.05, sample_size=0, method='pearson'):
    """Print whether the categorical ``factor`` has a statistical effect on ``target``.

    Reads the notebook-level frame ``df`` and the global ``seed``. For every
    category of ``factor`` a sample of ``target`` values is drawn; if every
    sample passes the Shapiro-Wilk normality test a one-way ANOVA is used,
    otherwise the Kruskal-Wallis test.

    Parameters
    ----------
    factor : str
        Name of the categorical column.
    target : str
        Name of the numeric column.
    alpha : float, default 0.05
        Significance level for both the normality test and the final test.
    sample_size : int, default 0
        Rows sampled per category; 0 means 5% of the frame length.
    method : str
        Unused; kept so existing calls keep working.
    """
    data = df[[factor, target]].copy()
    if sample_size == 0:
        sample_size = int(0.05 * len(data))

    subsets = []
    for category in data[factor].unique().tolist():
        values = data.loc[data[factor] == category, target]
        try:
            subset = values.sample(sample_size, random_state=seed)
        except ValueError:
            # The category has fewer rows than sample_size: use all of them.
            subset = values
        subsets.append(subset)

    # we won't check the other one-way ANOVA assumptions
    is_oneway = bool(np.all([stats.shapiro(subset.values)[1] >= alpha for subset in subsets]))
    if is_oneway:
        _, pvalue = stats.f_oneway(*subsets)
    else:
        _, pvalue = stats.kruskal(*subsets)

    # Report the factor that was actually tested (not a leaked loop variable).
    if pvalue < alpha:
        print(f'Factor "{factor}" has statistical influence on target.')
    else:
        print(f'Factor "{factor}" does not affect target.')

def cross_validation(clf, X, y, scoring=None):
    """Run 10-fold cross-validation on all cores, print the scores and return them.

    The two printed messages are in Russian: the first lists the per-fold
    scores, the second shows their mean +/- standard deviation.
    """
    fold_scores = cross_val_score(clf, X, y, scoring=scoring, cv=10, n_jobs=-1)
    mean_score = np.mean(fold_scores)
    score_spread = np.std(fold_scores)
    print(f'Меры правильности перекрекстной оценки: {fold_scores}')
    print(f'Точность перекретсной оценки: {mean_score:.3f} +/- {score_spread:.3f}')
    return fold_scores

def evaluate_preds(true_values, pred_values, plot=True):
    """Print R2, MAE and RMSE of the predictions and optionally plot them.

    Parameters
    ----------
    true_values : array-like
        Ground-truth target values.
    pred_values : array-like
        Predicted target values.
    plot : bool, default True
        When true, draw a predicted-vs-true scatter plot with a red dashed
        diagonal from 0 to 500000.
    """
    r2 = r2_score(true_values, pred_values)
    mae = mean_absolute_error(true_values, pred_values)
    # Take the square root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated/removed in recent scikit-learn releases.
    rmse = np.sqrt(mean_squared_error(true_values, pred_values))
    print("R2:\t" + str(round(r2, 3)) + "\n" +
          "MAE:\t" + str(round(mae, 3)) + "\n" +
          "RMSE:\t" + str(round(rmse, 3)))
    if plot:
        plt.figure(figsize=(8,8))
        sns.scatterplot(x=pred_values, y=true_values, alpha=0.6)
        # Reference line of a perfect prediction.
        plt.plot([0, 500000], [0, 500000], linestyle='--', color='red', lw=3)
        plt.xlabel('Predicted values')
        plt.ylabel('True values')
        plt.title('True vs Predicted values')
        plt.show()

In [5]:
# Summary statistics of the numeric columns.
df.describe()

In [6]:
# Summary statistics of the object (string) columns.
df.describe(include=['object'])

In [7]:
# Column dtypes and non-null counts.
df.info()

In [8]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [9]:
numerical_cols = df.select_dtypes(include=[np.float64]).columns.tolist()

# Three plots per row; the number of rows follows the number of columns.
n_rows = max(1, -(-len(numerical_cols) // 3))

plt.figure(figsize=(20, 18))
for idx, column in enumerate(numerical_cols):
    plt.subplot(n_rows, 3, idx + 1)
    # Shapiro-Wilk on a seeded sample without NaNs: missing values would
    # invalidate the p-value and an unseeded sample makes the label unstable.
    values = df[column].dropna()
    sample = values.sample(min(200, len(values)), random_state=seed)
    dist = 'Normal Distribution' if stats.shapiro(sample)[1] > 0.05 else 'Not normal distribution'
    plt.title(f'{column}: {dist}')
    sns.histplot(data=df, x=column, bins=50, kde=True)
plt.subplots_adjust(hspace=0.4, wspace=0.4)
plt.show()

# the data is skewed
# housing_median_age and median_house_value (target) have possible outliers
# total_rooms, population, total_bedrooms, households?

In [10]:
import folium
from folium import Marker
from folium.plugins import MarkerCluster

# Centre the map on the median coordinates. Looking up a row whose longitude
# equals the median fails with IndexError whenever no such row exists.
coords = [float(df['latitude'].median()), float(df['longitude'].median())]
ca_map = folium.Map(location=coords, tiles='cartodbpositron', zoom_start=10)

# One clustered marker per row; zip over the two columns avoids the slow iterrows().
mc = MarkerCluster()
for lat, lon in zip(df['latitude'], df['longitude']):
    mc.add_child(Marker([lat, lon]))
ca_map.add_child(mc)
ca_map

In [11]:
# Rows whose target value is at least 500000.
df[df[TARGET_NAME] >= 500000]

In [12]:
# Scatter plot of every numeric column against the target, on a 3x3 grid.
fig = plt.figure(figsize=(20, 18))
for idx, column in enumerate(numerical_cols):
    ax = fig.add_subplot(3, 3, idx + 1)
    ax.set_title(column)
    sns.scatterplot(data=df, x=column, y=TARGET_NAME, ax=ax)
fig.subplots_adjust(hspace=0.4, wspace=0.4)
plt.show()
# there are some possible outliers

In [13]:
# Box plot of every numeric column, on a 3x3 grid.
fig = plt.figure(figsize=(16, 18))
for idx, column in enumerate(numerical_cols, 1):
    ax = fig.add_subplot(3, 3, idx)
    sns.boxplot(y=df[column], data=df, ax=ax)
    ax.set_title(column)
fig.subplots_adjust(hspace=0.5, wspace=0.5)
fig.tight_layout()
plt.show()

In [14]:
plt.figure(figsize = (14,12))
# Correlate numeric columns only: the frame also holds the string column
# 'ocean_proximity', which makes DataFrame.corr() raise on pandas >= 2.0.
corr_matrix = df.select_dtypes(include=[np.number]).corr()
corr_matrix = np.round(corr_matrix, 2)
# Hide weak correlations so the strong ones stand out.
corr_matrix[np.abs(corr_matrix) < 0.3] = 0
sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='coolwarm')
plt.title('Correlation matrix')
plt.show()
# lots of significant linear correlations, some of these features should be dropped
# for now we cannot decide which ones, as there are very few features

In [15]:
# Test whether the categorical column 'ocean_proximity' affects the target.
statistic_anova_output('ocean_proximity', TARGET_NAME)

#### Some basic EDA in one cell: here we compared different approaches to outliers, NaNs, new features and non-linear transformations, using base models (LinearRegression, RandomForest (Extra), LGBMRegressor)

In [240]:
df = pd.read_csv(path)
df = reduce_memory(df)

#df.loc[df['ocean_proximity'] == 'ISLAND', 'ocean_proximity'] = 'NEAR OCEAN'
# it is obvious that price == 500001 is an outlier, we cannot replace it efficiently as we have to predict it
# Filter and drop NaNs in one expression, so no in-place operation runs on a slice of the original frame.
df = df[df[TARGET_NAME] < 500001].dropna()
# population > 25000, total_bedrooms > 5500, total_rooms > 35000, households > 5500: possible outliers
df['lonlat'] = df['longitude'] + df['latitude']
df['living_rooms'] = df['total_rooms'] - df['total_bedrooms']

df['population_per_room'] = df['population'] / df['total_rooms'] # not for LR
df['is_city'] = 0 # not for LR
df.loc[((df['longitude'].between(-119, -117)) | (df['longitude'].between(-123, -121))), 'is_city'] = 1 # not for LR
df['is_city'] = df['is_city'].astype(np.uint8)

X, y = df.drop(TARGET_NAME, axis=1), df[TARGET_NAME]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=seed)

# Target encoding: median price per (ocean_proximity, housing_median_age) group,
# computed on the training part only and then joined onto both parts.
# NOTE: merge() returns frames with a fresh RangeIndex, so X_train / X_test no
# longer share index labels with y_train / y_test (row order is kept).
train_with_target = pd.concat([X_train, y_train], axis=1)
group_medians = train_with_target.groupby(['ocean_proximity', 'housing_median_age'])[TARGET_NAME].median().reset_index()
group_medians = group_medians.rename(columns={TARGET_NAME: 'ocean_age_price'})
X_train = X_train.merge(group_medians, on=['ocean_proximity', 'housing_median_age'], how='left')
ocean_age_price_median = X_train['ocean_age_price'].median()
del train_with_target
X_test = X_test.merge(group_medians, on=['ocean_proximity', 'housing_median_age'], how='left')
# Assign the result back: fillna(inplace=True) on a selected column is chained
# assignment and leaves the frame unchanged under pandas Copy-on-Write.
X_test['ocean_age_price'] = X_test['ocean_age_price'].fillna(ocean_age_price_median)
del group_medians

X_train = pd.get_dummies(X_train, prefix=['Location'])
X_test = pd.get_dummies(X_test, prefix=['Location'])

#X_train.drop(X_train[outliers_train == -1].index, axis=0)
#X_test.drop(X_test[outliers_test == -1].index, axis=0)

# IQR scores: not needed for gradient boosting
# Bounds come from the training part and are applied to both parts.
for column in ['population', 'total_bedrooms', 'total_rooms', 'households']:
    q1 = np.quantile(X_train[column], 0.25)
    q3 = np.quantile(X_train[column], 0.75)
    iq_low = q1 - 1.5 * (q3 - q1)
    iq_high = q3 + 1.5 * (q3 - q1)
    X_train.loc[X_train[column] > iq_high, column] = iq_high
    X_train.loc[X_train[column] < iq_low, column] = iq_low

    X_test.loc[X_test[column] > iq_high, column] = iq_high
    X_test.loc[X_test[column] < iq_low, column] = iq_low

numerical_cols = X.select_dtypes(include=[np.float32]).columns.tolist()

# Linear-regression pipeline kept from the comparison; it is not fitted in this cell.
trans = ColumnTransformer(transformers=[('scaler', MinMaxScaler(), numerical_cols)],
                       remainder='passthrough')
pipe = Pipeline(steps=[('scaling', trans),
                       #('impute', IterativeImputer(random_state=seed)),  # just dropna better
                       ('poly', PolynomialFeatures(degree=2, interaction_only=False)),
                       ('reg', LinearRegression())])

# totally base estimator
forest = LGBMRegressor(verbose=-1)
forest.fit(X_train, y_train)
train_pred = forest.predict(X_train)
test_pred = forest.predict(X_test)

evaluate_preds(y_train, train_pred, plot=False)
evaluate_preds(y_test, test_pred, plot=False)

In [86]:
# Fit an IsolationForest on the training features and show the training rows it labels -1.
iso = IsolationForest(n_estimators=200, contamination=0.001, n_jobs=-1, random_state=seed)
outliers_train = iso.fit_predict(X_train)
outliers_test = iso.predict(X_test)
X_train[outliers_train == -1]
# these rows do not really look like outliers: big blocks naturally have many bedrooms and a large population

### Feature selection

In [176]:
# BorutaShap feature selection with an LGBMRegressor and SHAP importances, 50 trials on the training set.
selector = BorutaShap(model=LGBMRegressor(verbose=-1), importance_measure='shap', classification=False)
selector.fit(X_train, y_train, n_trials=50, sample=False, verbose=False)

In [177]:
# Features the selector proposes to drop.
selector.features_to_remove
# so ocean_proximity, total_rooms and living_rooms are supposed to be weak features according to the SHAP values,
# but they nevertheless have some impact on the target variable

#### Define the feature preprocessing and engineering class and the feature selection class

In [5]:
class BestSet(BaseEstimator, TransformerMixin):
    """Greedy backward feature elimination scored on a hold-out split.

    Starting from all columns, ``fit`` repeatedly drops the single column whose
    removal gives the highest hold-out score, until ``k_features`` columns are
    left. ``transform`` keeps the columns of the best-scoring subset seen on
    the way (which may contain more than ``k_features`` columns).

    Parameters
    ----------
    estimator : estimator object
        Model used for scoring; a clone is fitted, the passed object is not.
    k_features : int, default 12
        Number of columns at which the elimination stops.
    scoring : callable, default r2_score
        ``scoring(y_true, y_pred)``; higher must mean better.
    test_size : float or int, default 0.2
        Hold-out size passed to ``train_test_split``.

    Attributes
    ----------
    estimator_ : fitted clone of ``estimator`` (fitted on the last evaluated subset).
    indices_ : tuple with the column indices of the last (smallest) subset.
    subsets_ : list of the evaluated best subsets, largest first.
    scores_ : hold-out score of every entry of ``subsets_``.
    """

    def __init__(self, estimator, k_features=12, scoring=r2_score, test_size=0.2):
        self.scoring = scoring
        self.k_features = k_features
        self.test_size = test_size
        # Keep the parameter exactly as passed: sklearn's clone()/get_params()
        # reject estimators whose __init__ replaces a constructor argument.
        self.estimator = estimator
        self.fit_params = {}
        # NOTE(review): only LGBMClassifier gets the extra fit kwarg — confirm
        # whether LGBMRegressor was meant as well.
        if estimator.__class__.__name__ == 'LGBMClassifier':
            self.fit_params.update({'verbose': False})

    def fit(self, X, y):
        """Run the backward elimination on a seeded hold-out split of ``X``/``y``."""
        # Accept DataFrame/Series as well as plain arrays or lists.
        X, y = np.asarray(X), np.asarray(y)
        self.estimator_ = clone(self.estimator)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=self.test_size,
                                                            shuffle=True,
                                                            random_state=seed)
        dim = X_train.shape[1]
        self.indices_ = tuple(range(dim))
        self.subsets_ = [self.indices_]
        score = self._calc_score(X_train, y_train, X_test, y_test, self.indices_)
        self.scores_ = [score]

        while dim > self.k_features:
            scores, subsets = [], []
            # Every subset obtained by removing exactly one remaining column.
            for p in combinations(self.indices_, r=dim-1):
                score = self._calc_score(X_train, y_train, X_test, y_test, p)
                scores.append(score)
                subsets.append(p)
            best = np.argmax(scores)
            self.indices_ = subsets[best]
            self.subsets_.append(self.indices_)
            dim -= 1
            self.scores_.append(scores[best])
        return self

    def transform(self, X):
        """Return the columns of the best-scoring subset as a NumPy array."""
        X = np.asarray(X)
        best_indices = self.subsets_[np.argmax(self.scores_)]
        return X[:, list(best_indices)]

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit the cloned estimator on the given columns and score it on the hold-out part."""
        columns = list(indices)
        self.estimator_.fit(X_train[:, columns], y_train, **self.fit_params)
        y_pred = self.estimator_.predict(X_test[:, columns])
        score = self.scoring(y_test, y_pred)
        return score
    
class FeatureCompose(BaseEstimator, TransformerMixin):
    """Feature engineering for the housing data.

    ``fit`` learns the median target per (ocean_proximity, housing_median_age)
    group and the set of ocean_proximity categories. ``transform`` downcasts
    dtypes, adds the engineered columns, joins the learned group medians and,
    when ``catb`` is false, one-hot encodes ocean_proximity.

    Parameters
    ----------
    catb : bool, default True
        True keeps ocean_proximity as a single column (for CatBoost); False
        replaces it with ``Location_*`` dummy columns.
    """

    def __init__(self, catb=True):
        self.df_grouped = None
        self.ocean_age_price_median = None
        self.ocean_categories = None
        self.catb = catb

    def fit(self, X, y=None):
        """Learn the group medians; ``y`` must be a Series named ``TARGET_NAME``."""
        df = pd.concat([X, y], axis=1)
        grouped = df.groupby(['ocean_proximity', 'housing_median_age'])[TARGET_NAME].median().reset_index()
        self.df_grouped = grouped.rename(columns={TARGET_NAME: 'ocean_age_price'})
        # Fallback for groups that were not seen during fit.
        self.ocean_age_price_median = self.df_grouped['ocean_age_price'].median()
        # Remember the categories so every transform() call yields the same dummy columns.
        self.ocean_categories = sorted(X['ocean_proximity'].dropna().unique().tolist())
        return self

    def transform(self, X):
        """Return a new frame with the engineered features; ``X`` is not modified."""
        X_ = reduce_memory(X.copy())

        X_['lonlat'] = X_['longitude'] + X_['latitude']
        X_['living_rooms'] = X_['total_rooms'] - X_['total_bedrooms']

        X_['population_per_room'] = X_['population'] / X_['total_rooms']
        # Flag rows inside one of the two longitude bands.
        in_band = X_['longitude'].between(-119, -117) | X_['longitude'].between(-123, -121)
        X_['is_city'] = in_band.astype(np.uint8)

        X_ = X_.merge(self.df_grouped, on=['ocean_proximity', 'housing_median_age'], how='left')
        # A left merge on unique group keys keeps row count and order but resets
        # the index; restore it so the output stays aligned with X (and with y).
        X_.index = X.index
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which does nothing under pandas Copy-on-Write.
        X_['ocean_age_price'] = X_['ocean_age_price'].fillna(self.ocean_age_price_median)

        # IQR adjusting skipped as we will use gradient boosting models, no significant improvement

        if not self.catb:
            # Fixed categories: a category missing from this batch still gets its
            # (all-zero) column, so train/valid/test share the same layout.
            X_['ocean_proximity'] = pd.Categorical(X_['ocean_proximity'], categories=self.ocean_categories)
            X_ = pd.get_dummies(X_, prefix=['Location'])

        return X_

In [48]:
# Feature-engineering pipeline with one-hot encoding; the selector step stays disabled.
base_pipe = Pipeline(steps=[
    ('eda_engine', FeatureCompose(catb=False)),
    #('selector', BestSet(estimator=LGBMRegressor(verbose=-1, max_depth=5), k_features=10))
])

# Reload the raw data, keep rows below the 500001 cap and drop rows with NaNs.
df = pd.read_csv(path)
df = df.loc[df[TARGET_NAME] < 500001].dropna()

# 80/20 train/test split, then 1500 rows of the training part become the validation set.
# NOTE(review): the first split uses seed+1 while the other cells use seed — confirm this is intended.
y = df[TARGET_NAME]
X = df.drop(columns=[TARGET_NAME])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seed+1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=1500, shuffle=True, random_state=seed)
print(f'Train size: {X_train.shape[0]}, Validation size: {X_valid.shape[0]}, Test size: {X_test.shape[0]}')

# Fit on the training part only, then apply to the validation and test parts.
X_train = base_pipe.fit_transform(X_train, y_train)
X_valid, X_test = base_pipe.transform(X_valid), base_pipe.transform(X_test)

In [60]:
# sklearn gradient boosting model
gbr = GBR(
    n_estimators=400,
    learning_rate=0.1,
    subsample=1.,
    max_depth=9,
    max_features=None,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=seed,
)

# The model is fitted on the training and validation parts stacked together.
X_train_ = pd.concat([X_train, X_valid])
y_train_ = pd.concat([y_train, y_valid])
gbr.fit(X_train_, y_train_)

train_pred, test_pred = gbr.predict(X_train_), gbr.predict(X_test)

evaluate_preds(y_train_, train_pred, plot=False)
evaluate_preds(y_test, test_pred, plot=False)

#### 11/18 columns selected by LGBM BestSet

'longitude', 'latitude', 'housing_median_age', 'population', 'households', 'median_income', 'lonlat', 'living_rooms',
'population_per_room', 'is_city', 'Location_NEAR BAY'

#### 12/14 columns selected by CatBoost SHAP selector

In [61]:
# Columns kept for the CatBoost models.
shap_cols = [
    'longitude', 'latitude', 'housing_median_age', 'total_bedrooms',
    'population', 'households', 'median_income', 'ocean_proximity',
    'lonlat', 'population_per_room', 'is_city', 'ocean_age_price',
]

# Reload the raw data, keep rows below the 500001 cap and drop rows with NaNs.
df = pd.read_csv(path)
df = df.loc[df[TARGET_NAME] < 500001].dropna()

# 80/20 train/test split, then 1500 rows of the training part become the validation set.
y = df[TARGET_NAME]
X = df.drop(columns=[TARGET_NAME])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=1500, shuffle=True, random_state=seed)
print(f'Train size: {X_train.shape[0]}, Validation size: {X_valid.shape[0]}, Test size: {X_test.shape[0]}')

In [62]:
# Fit the feature engineering on the training part only, apply it to all three parts
# and keep just the columns listed in shap_cols.
compose = FeatureCompose()
X_train = compose.fit_transform(X_train, y_train)[shap_cols]
X_valid = compose.transform(X_valid)[shap_cols]
X_test = compose.transform(X_test)[shap_cols]

In [63]:
# Columns of category/object dtype are declared to CatBoost as categorical features.
cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
train = Pool(X_train, y_train, cat_features=cat_cols)
valid = Pool(X_valid, y_valid, cat_features=cat_cols)

In [104]:
def select_features(algorithm: EFeaturesSelectionAlgorithm, steps: int = 1,
                    num_features_to_select: int = 12):
    """Run CatBoost's built-in feature selection on the notebook-level pools.

    Uses the global ``train`` pool for fitting and ``valid`` as the evaluation
    set; every column of ``train`` is a candidate.

    Parameters
    ----------
    algorithm : EFeaturesSelectionAlgorithm
        Elimination algorithm passed to ``CatBoostRegressor.select_features``.
    steps : int, default 1
        Number of elimination steps.
    num_features_to_select : int, default 12
        How many features to keep (previously hard-coded to 12).

    Returns
    -------
    The summary returned by CatBoost; the selected names are also printed.
    """
    print('Algorithm:', algorithm)
    model = CatBoostRegressor(iterations=200, random_seed=0)
    summary = model.select_features(
        train,
        eval_set=valid,
        features_for_select=list(range(train.num_col())),
        num_features_to_select=num_features_to_select,
        steps=steps,
        algorithm=algorithm,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=False,
        logging_level='Silent',
        plot=False
    )
    print('Selected features:', summary['selected_features_names'])
    return summary

feature_set = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=2)

#### Features selected by CatShap
'longitude', 'latitude', 'housing_median_age', 'total_bedrooms', 'population', 'households', 'median_income', 'ocean_proximity', 'lonlat', 'population_per_room', 'is_city', 'ocean_age_price'

In [273]:
# Baseline: CatBoost out of the box (only the logging is silenced).
cat = CatBoostRegressor(verbose=0)

cat.fit(train, eval_set=valid)
train_pred = cat.predict(X_train)
test_pred = cat.predict(X_test)

# Metrics on the training part, then on the test part.
evaluate_preds(y_train, train_pred, plot=False)
evaluate_preds(y_test, test_pred, plot=False)

In [274]:
# Fixed CatBoost settings; `objective` merges them with the hyperparameters sampled in each Optuna trial.
params_model = {
                'eval_metric': 'R2', 
                'iterations': 1000,
                'silent': True,
                'one_hot_max_size': 10,
                'early_stopping_rounds': 5,
                'grow_policy': 'SymmetricTree',
                'allow_writing_files': False,
                'use_best_model': True,
                'random_seed': seed,
    }

In [117]:
def objective(trial):
    """Optuna objective: validation R2 of a CatBoost model with sampled hyperparameters.

    Reads the notebook-level ``params_model`` (fixed settings), the ``train`` /
    ``valid`` pools and ``X_valid`` / ``y_valid``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        R2 score on the validation part (the study maximizes it).
    """
    # The order of the suggest_* calls is kept as it was.
    param_trials = {
                    'depth': trial.suggest_int('depth', 3, 9),
                    'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
                    'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1., 5.),
                    'bagging_temperature': trial.suggest_float('bagging_temperature', 1., 3.),
                    'subsample': trial.suggest_float('subsample', 0.5, 1.),
                    'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.),
                    'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
                    }
    # Fixed settings first, sampled values last: if params_model still carries
    # tuned hyperparameters from an earlier cell, they must not silently
    # overwrite what the trial sampled (every trial would train the same model).
    opt_model = CatBoostRegressor(**{**params_model, **param_trials})
    opt_model.fit(train, eval_set=valid)

    y_pred = opt_model.predict(X_valid)
    score = r2_score(y_valid, y_pred)

    return score

In [278]:
# Hyperparameter search: seeded TPE sampler, maximizing the value returned by `objective`.
# Stops after 250 trials or 6000 seconds, whichever comes first.
optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(sampler=TPESampler(seed=seed), direction="maximize")
study.optimize(objective, n_trials=250, timeout=6000)

# Report the best trial and its parameters.
print(f'Number of completed trials: {len(study.trials)}')
print('Best trial')
trial = study.best_trial
print(f'Best score: {trial.value}')
print('Best params')
for key, value in trial.params.items():
    print(f'{key}: {value}')

In [77]:
# Fixed settings plus tuned hyperparameters (presumably copied from the best trial of the study above).
params_model = {
                'eval_metric': 'R2', 
                'iterations': 1000,
                'silent': True,
                'one_hot_max_size': 10,
                'early_stopping_rounds': 10, # 5
                'grow_policy': 'SymmetricTree',
                'allow_writing_files': False,
                'use_best_model': True,
                'random_seed': seed,
                'depth': 9,
                'learning_rate': 0.08169420038180615,
                'l2_leaf_reg': 4.881490982982831,
                'bagging_temperature': 2.4496228923749714,
                'subsample': 0.5410733614482673,
                'colsample_bylevel': 0.8551364222057061,
                'boosting_type': 'Ordered',
    }

In [78]:
# Train CatBoost with the settings from params_model, using the validation pool as eval set.
cat = CatBoostRegressor(**params_model)

cat.fit(train, eval_set=valid)
train_pred = cat.predict(X_train)
test_pred = cat.predict(X_test)

# Metrics and plots on the training part, then on the test part.
evaluate_preds(y_train, train_pred, plot=True)
evaluate_preds(y_test, test_pred, plot=True)

In [87]:
# 10-fold training: one CatBoost model per fold, each evaluated on the same hold-out test part.
kf = KFold(n_splits=10, shuffle=True, random_state=seed)

# Reload the raw data, keep rows below the 500001 cap and drop rows with NaNs.
df = pd.read_csv(path)
df = df[df[TARGET_NAME] < 500001]
df.dropna(inplace=True)
    
X, y = df.drop(TARGET_NAME, axis=1), df[TARGET_NAME]
X_fold, X_test, y_fold, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seed)

cats = []           # fitted model of every fold
train_indices = []  # (train_index, valid_index) positions of every fold

for train_index, valid_index in kf.split(X_fold, y_fold):
    X_train, X_valid = X_fold.iloc[train_index, :], X_fold.iloc[valid_index, :]
    y_train, y_valid = y_fold.iloc[train_index], y_fold.iloc[valid_index]
    train_indices.append((train_index, valid_index))
    
    # Feature engineering is refitted on the training rows of each fold.
    compose = FeatureCompose()
    X_train = compose.fit_transform(X_train, y_train)[shap_cols]
    X_valid = compose.transform(X_valid)[shap_cols]
    X_test_ = compose.transform(X_test)[shap_cols]

    cat = CatBoostRegressor(**params_model)
    cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
    train = Pool(X_train, y_train, cat_features=cat_cols)
    valid = Pool(X_valid, y_valid, cat_features=cat_cols)
    cat.fit(train, eval_set=valid)
    
    train_preds = cat.predict(X_train)
    test_preds = cat.predict(X_test_)
    
    # Fold metrics: training rows first, then the hold-out test part.
    evaluate_preds(y_train, train_preds, plot=False)
    print()
    evaluate_preds(y_test, test_preds, plot=False)
    print('-'*30)
    cats.append(cat)

In [88]:
# Collect the predictions of every fold model on the whole fold data and on the test part.
# The features are rebuilt with a FeatureCompose fitted on all of X_fold.
train_preds, test_preds = [], []
compose = FeatureCompose()
X_fold_ = compose.fit_transform(X_fold, y_fold)[shap_cols]
X_test_ = compose.transform(X_test)[shap_cols]

for cat in cats:
    train_preds.append(cat.predict(X_fold_))
    test_preds.append(cat.predict(X_test_))

In [93]:
# Ensemble prediction: element-wise mean over the fold models.
mean_train_preds = np.mean(np.array(train_preds), axis=0)
mean_test_preds = np.mean(np.array(test_preds), axis=0)

evaluate_preds(y_fold, mean_train_preds, plot=False)
print()
evaluate_preds(y_test, mean_test_preds, plot=False)

#### SHAP values

In [94]:
# SHAP values of one fold model (third from the end of `cats`) on the whole fold data.
# `cat_cols` is the value left over from the last iteration of the K-fold loop.
shap_pool = Pool(X_fold_, y_fold, cat_features=cat_cols)
shap_values = cats[-3].get_feature_importance(shap_pool, type='ShapValues')

# The last column is used as the expected (base) value, the remaining columns as per-feature SHAP values.
expected_value = shap_values[0,-1]
shap_values = shap_values[:,:-1]

In [95]:
# Force plot explaining the prediction for the first row of X_fold_.
shap.initjs()
shap.force_plot(expected_value, shap_values[0,:], X_fold_.iloc[0,:])

In [96]:
# Summary plot of the SHAP values over all rows of X_fold_.
shap.initjs()
shap.summary_plot(shap_values, X_fold_)

### Experimentally divide the dataset into two locations and train two models separately

In [111]:
# Reload the raw data, keep rows below the 500001 cap and drop rows with NaNs.
df = pd.read_csv(path)
df = df.loc[df[TARGET_NAME] < 500001].dropna()

In [112]:
# Split the data into two regions at longitude -120.
# NOTE(review): the names look swapped — San Francisco lies west of -120 (about -122) and
# Los Angeles east of it (about -118), so `df_sf` presumably holds the LA area; confirm.
df_sf = df[df['longitude'] > -120]
df_la = df[df['longitude'] <= -120]

In [113]:
# First region (df_sf): 80/20 train/test split, then 850 training rows become the validation set.
X, y = df_sf.drop(TARGET_NAME, axis=1), df_sf[TARGET_NAME]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=850, shuffle=True, random_state=seed)
print(f'Train size: {X_train.shape[0]}, Validation size: {X_valid.shape[0]}, Test size: {X_test.shape[0]}')

In [114]:
# Columns used for the first region's model.
shap_cols = ['longitude', 'latitude', 'housing_median_age', 'population', 'households', 'median_income', 'ocean_proximity', 
             'lonlat', 'living_rooms', 'population_per_room', 'is_city', 'ocean_age_price']

# Feature engineering fitted on the training part only, then applied to all three parts.
compose = FeatureCompose()
X_train = compose.fit_transform(X_train, y_train)[shap_cols]
X_valid = compose.transform(X_valid)[shap_cols]
X_test = compose.transform(X_test)[shap_cols]

# Columns of category/object dtype are declared to CatBoost as categorical features.
cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
train = Pool(X_train, y_train, cat_features=cat_cols)
valid = Pool(X_valid, y_valid, cat_features=cat_cols)

In [108]:
# Recursive SHAP-based feature selection on the current train/valid pools.
feature_set = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=2)

In [140]:
# Reset params_model to the fixed settings only (no tuned hyperparameters) before the next search.
params_model = {
                'eval_metric': 'R2', 
                'iterations': 1000,
                'silent': True,
                'one_hot_max_size': 10,
                'early_stopping_rounds': 10,
                'grow_policy': 'SymmetricTree',
                'allow_writing_files': False,
                'use_best_model': True,
                'random_seed': seed,
    }

In [118]:
# Hyperparameter search for the first region: seeded TPE sampler, maximizing `objective`.
# Stops after 250 trials or 6000 seconds, whichever comes first.
optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(sampler=TPESampler(seed=seed), direction="maximize")
study.optimize(objective, n_trials=250, timeout=6000)

# Report the best trial and its parameters.
print(f'Number of completed trials: {len(study.trials)}')
print('Best trial')
trial = study.best_trial
print(f'Best score: {trial.value}')
print('Best params')
for key, value in trial.params.items():
    print(f'{key}: {value}')

In [130]:
# Fixed settings plus tuned hyperparameters for the first region
# (presumably copied from the best trial of the study above).
params_model = {
                'eval_metric': 'R2', 
                'iterations': 1000,
                'silent': True,
                'one_hot_max_size': 10,
                'early_stopping_rounds': 10, 
                'grow_policy': 'SymmetricTree', #'Depthwise',
                'allow_writing_files': False,
                'use_best_model': True,
                'random_seed': seed,
                'depth': 7,
                'learning_rate': 0.0422300357023412,
                'l2_leaf_reg': 4.952369753337499,
                'bagging_temperature': 2.120321906094676,
                'subsample': 0.7202089061245398,
                'colsample_bylevel': 0.8927465558512657,
                'boosting_type': 'Plain',
    }

In [131]:
# Train the first region's model with the settings from params_model.
cat = CatBoostRegressor(**params_model)

cat.fit(train, eval_set=valid)
train_pred = cat.predict(X_train)
test_pred = cat.predict(X_test)

# Metrics on the training part, then on the test part.
evaluate_preds(y_train, train_pred, plot=False)
evaluate_preds(y_test, test_pred, plot=False)

In [138]:
# Second region (df_la): 80/20 train/test split, then 600 training rows become the validation set.
X, y = df_la.drop(TARGET_NAME, axis=1), df_la[TARGET_NAME]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=seed)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=600, shuffle=True, random_state=seed)
print(f'Train size: {X_train.shape[0]}, Validation size: {X_valid.shape[0]}, Test size: {X_test.shape[0]}')

In [139]:
# Columns used for the second region's model.
shap_cols = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'households', 'median_income', 'ocean_proximity', 
             'lonlat', 'living_rooms', 'population_per_room', 'is_city', 'ocean_age_price']

# Feature engineering fitted on the training part only, then applied to all three parts.
compose = FeatureCompose()
X_train = compose.fit_transform(X_train, y_train)[shap_cols]
X_valid = compose.transform(X_valid)[shap_cols]
X_test = compose.transform(X_test)[shap_cols]

# Columns of category/object dtype are declared to CatBoost as categorical features.
cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
train = Pool(X_train, y_train, cat_features=cat_cols)
valid = Pool(X_valid, y_valid, cat_features=cat_cols)

In [136]:
# Recursive SHAP-based feature selection on the current train/valid pools.
feature_set = select_features(algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues, steps=2)

In [141]:
# Hyperparameter search for the second region: seeded TPE sampler, maximizing `objective`.
# Stops after 250 trials or 6000 seconds, whichever comes first.
# NOTE(review): in file order, params_model still holds the tuned values of the first region
# at this point; confirm they do not constrain this search (reset params_model beforehand).
optuna.logging.set_verbosity(optuna.logging.FATAL)
study = optuna.create_study(sampler=TPESampler(seed=seed), direction="maximize")
study.optimize(objective, n_trials=250, timeout=6000)

# Report the best trial and its parameters.
print(f'Number of completed trials: {len(study.trials)}')
print('Best trial')
trial = study.best_trial
print(f'Best score: {trial.value}')
print('Best params')
for key, value in trial.params.items():
    print(f'{key}: {value}')

In [142]:
# Fixed settings plus tuned hyperparameters for the second region
# (presumably copied from the best trial of the study above).
params_model = {
                'eval_metric': 'R2', 
                'iterations': 1000,
                'silent': True,
                'one_hot_max_size': 10,
                'early_stopping_rounds': 10, 
                'grow_policy': 'SymmetricTree', #'Depthwise',
                'allow_writing_files': False,
                'use_best_model': True,
                'random_seed': seed,
                'depth': 9,
                'learning_rate': 0.24344461968153278,
                'l2_leaf_reg': 3.3123400272972665,
                'bagging_temperature': 2.7482685295299825,
                'subsample': 0.900418911712863,
                'colsample_bylevel': 0.6939118398841142,
                'boosting_type': 'Ordered',
    }

In [143]:
# Train the second region's model with the settings from params_model.
cat2 = CatBoostRegressor(**params_model)

cat2.fit(train, eval_set=valid)
train_pred2 = cat2.predict(X_train)
test_pred2 = cat2.predict(X_test)

# Metrics and plots on the training part, then on the test part.
evaluate_preds(y_train, train_pred2, plot=True)
evaluate_preds(y_test, test_pred2, plot=True)