# Import

In [1]:
import os
import pickle
import random
from os import path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostRegressor, Pool
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.model_selection import KFold

# Single seed for the notebook: it initialises Python's `random` module and
# NumPy's legacy global generator here, and is reused below as the
# `random_state` / `random_seed` of the estimators.
SEED = 20033002
random.seed(SEED)
np.random.seed(SEED)

# Data preprocessing

In [2]:
# Folder with the input files and the name of the column to predict.
AUX_DATA_ROOT = "./data"
target = 'Цена'

# Training table, indexed by its `id` column.
train = pd.read_csv(path.join(AUX_DATA_ROOT, 'train.csv'), index_col='id')

# Target series and number of training rows, both taken before any imputation.
Y = train[target]
train_idx = len(train)

train.head()

Unnamed: 0_level_0,Тип_жилья,Широта,Долгота,Город,Индекс,Площадь,Этаж,Размер_участка,Расход_тепла,Ктгр_энергоэффективности,...,Нлч_парковки,Нлч_почтового_ящика,Нлч_балкона,Нлч_террасы,Нлч_подвала,Нлч_гаража,Нлч_кондиционера,Последний_этаж,Верхний_этаж,Цена
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35831305,квартира,48.875375,2.48382,rosny-sous-bois,4575,64.0,2.0,,150.0,C,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,295000.0
35854039,квартира,45.522327,4.869322,vienne,1793,75.0,,,220.0,D,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,194000.0
36058437,квартира,43.672828,1.281469,mondonville,1425,42.0,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,179000.0
35989147,дом,44.158752,1.532081,caussade,4077,102.0,,2000.0,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,205000.0
36029269,квартира,47.655212,-2.754196,vannes,2498,84.0,3.0,,,,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,508000.0


## Filling NaN

In [3]:
def fill_na(df):
    """Fill every missing value in ``df`` and return it.

    All statistics are read from the module-level ``train`` frame, grouped by
    the housing-type column ``'Тип_жилья'``:

    1. Housing types with no recorded ``'Размер_участка'`` in ``train`` get
       that column set to 0 on all of their rows; types with no recorded
       ``'Расход_тепла'`` get both ``'Расход_тепла'`` and
       ``'Кво_вредных_выбросов'`` set to 0.
    2. Housing types with no recorded ``'Ктгр_энергоэффективности'`` get both
       category columns set to the literal ``'unkown'``; the same rule is
       applied to ``'Направление'``.
    3. ``'Кво_фото'`` defaults to 0, and rows of type ``'комната'`` without a
       ``'Кво_спален'`` value get 1.
    4. Remaining gaps in a fixed list of numeric columns take the per-type
       median from ``train``.
    5. Anything still missing is imputed with the ``train`` median (numeric
       columns) or most frequent value (object columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values filled.

    NOTE(review): when this is called as ``fill_na(train)``, ``train`` itself
    loses its gaps, so a later call on another frame no longer finds any
    "never filled" housing type in steps 1-2 and falls through to the generic
    imputers instead. Confirm the intended call order for non-train frames.
    """
    type_col = 'Тип_жилья'

    def types_never_filled(col):
        # Housing types whose rows in `train` hold no non-missing `col` value.
        filled = train.groupby(by=type_col)[col].count()
        return filled.index[filled.to_numpy() == 0]

    def rows_of(types):
        # Boolean mask of the `df` rows that belong to one of `types`.
        return df[type_col].isin(types)

    # custom filling
    df.loc[rows_of(types_never_filled('Размер_участка')), 'Размер_участка'] = 0
    df.loc[rows_of(types_never_filled('Расход_тепла')),
           ['Расход_тепла', 'Кво_вредных_выбросов']] = 0

    # The spelling 'unkown' is a data value that ends up in the categorical
    # features, so it is deliberately left exactly as it was.
    df.loc[rows_of(types_never_filled('Ктгр_энергоэффективности')),
           ['Ктгр_энергоэффективности', 'Ктгр_вредных_выбросов']] = 'unkown'
    df.loc[rows_of(types_never_filled('Направление')), 'Направление'] = 'unkown'

    df['Кво_фото'] = df['Кво_фото'].fillna(0)
    df.loc[(df[type_col] == 'комната') & df['Кво_спален'].isna(), 'Кво_спален'] = 1

    # Per-housing-type medians. Computed after the steps above, so when `df`
    # is `train` the zero fills are already part of the statistics.
    nan_num_cols = ['Площадь', 'Кво_комнат', 'Кво_спален', 'Размер_участка', 'Расход_тепла', 'Кво_вредных_выбросов']
    med_groups = train.groupby(type_col)[nan_num_cols].median()

    for col in nan_num_cols:
        # Vectorised lookup of each row's group median; rows whose type has no
        # median stay NaN and are handled by the generic imputer below.
        df[col] = df[col].fillna(df[type_col].map(med_groups[col]))

    # simple filling
    si_n = SimpleImputer(strategy='median')
    si_c = SimpleImputer(strategy='most_frequent')

    num_cols = df.select_dtypes(include='number').columns
    si_n.fit(train[num_cols])
    df[num_cols] = si_n.transform(df[num_cols])

    cat_cols = df.select_dtypes(include='O').columns
    si_c.fit(train[cat_cols])
    df[cat_cols] = si_c.transform(df[cat_cols])

    return df

# Impute the training frame, then build the feature table without the target.
train = fill_na(train)
df = train.drop(columns=[target]).copy()

# Sanity check: number of missing cells left in the features.
df.isna().sum().sum()

np.int64(0)

## EDA

### Logarithmic and count features

In [4]:
def log_nan(row):
    """Return the natural logarithm of ``row`` with values below 1 raised to 1.

    Values smaller than 1 (including zeros and negatives) are treated as 1, so
    their logarithm is 0 and the result is never negative or ``-inf``. NaN
    entries stay NaN.

    Parameters
    ----------
    row : pandas.Series, pandas.DataFrame, numpy.ndarray or scalar
        Values to transform. The argument is not modified.

    Returns
    -------
    Same kind of object as ``row``
        Element-wise ``log(max(value, 1))``.
    """
    # np.maximum builds a new object, so the caller's data is left untouched
    # (the previous in-place `row[row < 1] = 1` overwrote it).
    return np.log(np.maximum(row, 1))

# Source columns and the names of their log-transformed counterparts.
cols_to_log = [
    'Площадь',
    'Размер_участка',
    'Расход_тепла',
    'Кво_вредных_выбросов',
]
log_cols = [f'Лог_{name}' for name in cols_to_log]

# Extended feature table: a copy of `df` plus one log column per source column.
df_ext = df.copy()
df_ext[log_cols] = (
    df[cols_to_log]
    .apply(log_nan)
    .rename(columns=dict(zip(cols_to_log, log_cols)))
)

In [5]:
# Amenity indicator columns; 'Плюшки' is their row-wise sum.
bool_cols = [
    'Нлч_парковки',
    'Нлч_почтового_ящика',
    'Нлч_балкона',
    'Нлч_террасы',
    'Нлч_подвала',
    'Нлч_гаража',
    'Нлч_кондиционера',
]
df_ext['Плюшки'] = df_ext[bool_cols].sum(axis='columns')

df_ext.head()

Unnamed: 0_level_0,Тип_жилья,Широта,Долгота,Город,Индекс,Площадь,Этаж,Размер_участка,Расход_тепла,Ктгр_энергоэффективности,...,Нлч_подвала,Нлч_гаража,Нлч_кондиционера,Последний_этаж,Верхний_этаж,Лог_Площадь,Лог_Размер_участка,Лог_Расход_тепла,Лог_Кво_вредных_выбросов,Плюшки
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35831305,квартира,48.875375,2.48382,rosny-sous-bois,4575.0,64.0,2.0,94.5,150.0,C,...,1.0,0.0,0.0,0.0,0.0,4.158883,4.5486,5.010635,2.484907,2.0
35854039,квартира,45.522327,4.869322,vienne,1793.0,75.0,2.0,94.5,220.0,D,...,0.0,0.0,0.0,0.0,0.0,4.317488,4.5486,5.393628,3.931826,0.0
36058437,квартира,43.672828,1.281469,mondonville,1425.0,42.0,2.0,94.5,173.0,D,...,0.0,0.0,0.0,0.0,0.0,3.73767,4.5486,5.153292,2.639057,1.0
35989147,дом,44.158752,1.532081,caussade,4077.0,102.0,2.0,2000.0,188.0,D,...,0.0,0.0,0.0,0.0,0.0,4.624973,7.600902,5.236442,2.890372,1.0
36029269,квартира,47.655212,-2.754196,vannes,2498.0,84.0,3.0,94.5,173.0,D,...,1.0,0.0,0.0,0.0,0.0,4.430817,4.5486,5.153292,2.639057,3.0


### Cluster features

In [6]:
# Cluster the listings by their coordinates; the model is fitted on `train`.
geo_cols = ['Широта', 'Долгота']
kmeans = KMeans(n_clusters=15, n_init=10, random_state=SEED)
kmeans.fit(train[geo_cols])

# Cluster label, stored with object dtype so it is later picked up together
# with the other object columns.
cluster_ids = kmeans.predict(df_ext[geo_cols])
df_ext['Кластер'] = pd.Series(cluster_ids, index=df_ext.index).astype("object")

# Distance from every row to each cluster centre, one 'Центр_<i>' column per centre.
center_dist = kmeans.transform(df_ext[geo_cols])
center_names = [f"Центр_{i}" for i in range(center_dist.shape[1])]
df_cd = pd.DataFrame(center_dist, index=df_ext.index, columns=center_names)

df_ext = df_ext.join(df_cd)
df_ext.head()

Unnamed: 0_level_0,Тип_жилья,Широта,Долгота,Город,Индекс,Площадь,Этаж,Размер_участка,Расход_тепла,Ктгр_энергоэффективности,...,Центр_5,Центр_6,Центр_7,Центр_8,Центр_9,Центр_10,Центр_11,Центр_12,Центр_13,Центр_14
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35831305,квартира,48.875375,2.48382,rosny-sous-bois,4575.0,64.0,2.0,94.5,150.0,C,...,5.968955,5.326387,4.204772,5.591455,7.224722,5.289429,3.998422,1.311812,2.362927,2.633328
35854039,квартира,45.522327,4.869322,vienne,1793.0,75.0,2.0,94.5,220.0,D,...,1.940456,8.075758,0.728199,5.616398,3.144573,3.782871,4.200185,4.804905,2.194875,4.979323
36058437,квартира,43.672828,1.281469,mondonville,1425.0,42.0,2.0,94.5,173.0,D,...,4.078129,6.008742,4.761215,1.945976,5.886928,0.292558,1.745806,6.603585,3.322162,3.901142
35989147,дом,44.158752,1.532081,caussade,4077.0,102.0,2.0,2000.0,188.0,D,...,3.861773,5.843233,4.321843,2.122949,5.683645,0.495724,1.435327,6.067547,2.776054,3.5156
36029269,квартира,47.655212,-2.754196,vannes,2498.0,84.0,3.0,94.5,173.0,D,...,9.050716,0.442123,8.41205,4.072727,10.80172,5.886933,4.1542,6.352098,5.79237,3.036566


In [7]:
# Persist the fitted KMeans model. The output folder is created first so the
# cell also works on a fresh checkout where it does not exist yet.
os.makedirs("weights", exist_ok=True)
with open("weights/kmeans.pkl", "wb") as f:
    pickle.dump(kmeans, f)

### PCA

In [8]:
def make_mi_scores(X, y):
    """Score every column of ``X`` by its mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column names label the scores.
    y : array-like
        Regression target with one value per row of ``X``.

    Returns
    -------
    pandas.Series
        Series named ``"MI Scores"``, indexed by column name and sorted from
        the highest score to the lowest.
    """
    scores = pd.Series(
        mutual_info_regression(X, y, random_state=SEED),
        index=X.columns,
        name="MI Scores",
    )
    return scores.sort_values(ascending=False)

In [9]:
# Numeric features (raw and log-transformed) that go into the first PCA.
pca_cols = [
    'Площадь', 'Размер_участка', 'Расход_тепла', 'Кво_вредных_выбросов',
    'Лог_Площадь', 'Лог_Размер_участка', 'Лог_Расход_тепла', 'Лог_Кво_вредных_выбросов',
    'Кво_комнат', 'Кво_спален', 'Кво_ванных',
]

# Standardise the training rows with their own mean / std before fitting.
train_pca = df_ext.iloc[:train_idx][pca_cols].copy()
train_pca_scaled = train_pca.sub(train_pca.mean(axis=0), axis='columns').div(
    train_pca.std(axis=0), axis='columns'
)

pca_1 = PCA(random_state=SEED)
X_pca = pca_1.fit_transform(train_pca_scaled)

# Wrap the components in a frame so the MI scores come back labelled.
component_names = [f"PC_{i}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

# Rank the components by mutual information with the target.
pca_mi = make_mi_scores(X_pca, train[target])
pca_mi

PC_0     0.191245
PC_1     0.169319
PC_4     0.156414
PC_8     0.144177
PC_9     0.138456
PC_5     0.135458
PC_3     0.135070
PC_2     0.126248
PC_6     0.122356
PC_10    0.103659
PC_7     0.099129
Name: MI Scores, dtype: float64

In [10]:
# Names of the five highest-scoring components from the ranking above.
# NOTE(review): `pca_mi` is reassigned by the centre-distance PCA further
# down, so this cell has to run before that one.
top_pca = pca_mi.index[:5].tolist()

# Scale all rows with the statistics of the training rows, then project.
df_pca = df_ext[pca_cols].copy()
train_rows = df_pca.iloc[:train_idx]
df_pca_scaled = (df_pca - train_rows.mean(axis=0)) / train_rows.std(axis=0)
df_pca = pca_1.transform(df_pca_scaled)

# Component names end in their column position ('PC_4' -> 4).
top_positions = [int(name.split('_')[-1]) for name in top_pca]
df_ext[top_pca] = df_pca[:, top_positions]

In [11]:
# Display the names of the selected principal components.
top_pca

['PC_0', 'PC_1', 'PC_4', 'PC_8', 'PC_9']

In [12]:
# The fifteen centre-distance columns 'Центр_0' ... 'Центр_14'.
pca_cols_c = [f'Центр_{i}' for i in range(15)]

# Second PCA, fitted on the unscaled centre distances of the training rows.
train_pca = df_ext.iloc[:train_idx][pca_cols_c].copy()
pca_2 = PCA(random_state=SEED)
X_pca = pca_2.fit_transform(train_pca)

# Label the components so the MI ranking is readable.
component_names = [f"PCц_{i}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

# Rank these components by mutual information with the target.
pca_mi = make_mi_scores(X_pca, train[target])
pca_mi

PCц_0     0.234372
PCц_5     0.213921
PCц_1     0.199394
PCц_4     0.187374
PCц_9     0.169992
PCц_3     0.169297
PCц_2     0.169057
PCц_11    0.167679
PCц_10    0.167003
PCц_14    0.166981
PCц_6     0.154049
PCц_13    0.153805
PCц_7     0.149320
PCц_12    0.148683
PCц_8     0.135141
Name: MI Scores, dtype: float64

In [13]:
# Ten highest-scoring centre-distance components.
top_pca_c = pca_mi.index[:10].tolist()

# Project every row and keep only the selected component columns; the number
# after the last underscore in a component name is its column position.
df_pca = pca_2.transform(df_ext[pca_cols_c])
top_positions_c = [int(name.split("_")[-1]) for name in top_pca_c]
df_ext[top_pca_c] = df_pca[:, top_positions_c]

In [14]:
# Display the names of the selected centre-distance components.
top_pca_c

['PCц_0',
 'PCц_5',
 'PCц_1',
 'PCц_4',
 'PCц_9',
 'PCц_3',
 'PCц_2',
 'PCц_11',
 'PCц_10',
 'PCц_14']

In [15]:
# Persist both fitted PCA models. The output folder is created first so the
# cell also works when it does not exist yet.
os.makedirs("weights", exist_ok=True)

with open("weights/pca_1.pkl", "wb") as f:
    pickle.dump(pca_1, f)

with open("weights/pca_2.pkl", "wb") as f:
    pickle.dump(pca_2, f)

## Outlier processing

In [16]:
# Isolation forest over the non-object columns of the training rows.
outls_clf = IsolationForest(
    n_estimators=500,
    max_features=0.5,
    contamination=0.01,
    random_state=SEED,
)
cat_cols = df_ext.select_dtypes(include='O').columns
forest_labels = outls_clf.fit_predict(df_ext.drop(columns=cat_cols)[:train_idx])

# Despite its name, `anomal_ind` is True for the rows that are KEPT
# (label 1); the remaining rows are removed as outliers.
anomal_ind = forest_labels == 1

# Rebuild the table from the kept training rows plus whatever followed them.
# NOTE(review): this cell rebinds `df_ext` and `train_idx`, so running it a
# second time filters the already-filtered rows again.
train_idx_old = train_idx
train_clean = df_ext.iloc[:train_idx_old].loc[anomal_ind]
df_ext = pd.concat((train_clean, df_ext[train_idx_old:]))

train_idx = len(train_clean)
print(f'Total outliers in the data: {train_idx_old - train_idx}')

Total outliers in the data: 262


## Deleting bad features

In [17]:
# These columns had low feature-importance values in CatBoost.
drop_cols = [
    'Последний_этаж', 'Верхний_этаж', 'Этаж',
    'Центр_2', 'Центр_3', 'Центр_4', 'Центр_6', 'Центр_7'
    ]
# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell does not raise a KeyError.
df_ext = df_ext.drop(columns=drop_cols, errors='ignore')

# Models and training

In [18]:
class MyMAPE:
    """Custom CatBoost evaluation metric: MAPE for a model that predicts logs.

    Targets and predictions are exponentiated before the error is computed,
    so the reported value is the mean absolute percentage error on the
    original (un-logged) scale.
    """

    @staticmethod
    def mape(y_t, y_p):
        """Return the mean absolute percentage error of ``y_p`` against ``y_t``."""
        return mape(y_t, y_p)

    def is_max_optimal(self):
        """Return ``False``: a smaller error is better for this metric."""
        return False

    def get_final_error(self, error, weight):
        """Return ``error`` unchanged; ``weight`` is not used."""
        return error

    def evaluate(self, y_p, y_t, weight):
        """Compute the metric for one evaluation call.

        Parameters
        ----------
        y_p : sequence
            Prediction containers; only the first one, ``y_p[0]``, is read.
        y_t : array-like
            Target values on the log scale.
        weight : any
            Ignored.

        Returns
        -------
        tuple
            ``(score, 0)`` where ``score`` is the MAPE on the original scale.
        """
        predicted = np.exp(y_p[0])
        actual = np.exp(y_t)
        return self.mape(actual, predicted), 0

In [19]:
# CatBoost settings shared by every fold model.
# NOTE(review): the fractional values look like the result of a tuning run;
# confirm their provenance.
best_params = dict(
    random_seed=SEED,
    iterations=10000,
    eval_metric=MyMAPE(),  # custom metric: MAPE after exponentiating the log predictions
    l2_leaf_reg=2.81255,
    learning_rate=0.0721322,
    loss_function='RMSE',
    leaf_estimation_method='Newton',
    bootstrap_type='MVS',
    boosting_type='Plain',
    grow_policy='Depthwise',
    rsm=0.4535,
    depth=10,
    verbose=2500,  # print a progress line every 2500 iterations
    task_type='CPU',
)

## Bagging (KFold)

In [None]:
# Train one CatBoost model per fold and combine them into a weighted ensemble.
kf = KFold(n_splits=6, shuffle=True, random_state=SEED)

# Log-transformed target, restricted to the rows kept by the outlier filter,
# and the matching feature rows.
y = np.log(Y)[anomal_ind]
X = df_ext[:train_idx]
assert len(X) == len(y), 'features and target are out of sync; re-run the notebook from the top'

cat_cols = list(df_ext.select_dtypes(include='O').columns)

models = []
val_scores = []  # best validation MAPE of each fold model
banner_pad = '-' * 20

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    train_pool = Pool(data=x_train, label=y_train, cat_features=cat_cols)
    val_pool = Pool(data=x_test, label=y_test, cat_features=cat_cols)

    # The held-out fold drives early stopping and best-iteration selection.
    model = CatBoostRegressor(**best_params)
    model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=300, use_best_model=True)

    models.append(model)
    val_scores.append(model.best_score_['validation']['MyMAPE'])

    print('^' * 3 + banner_pad + str(i + 1) + banner_pad + '^' * 3)

models = np.array(models)

mean_val_score = np.mean(val_scores)
print(f'Mean validation score = {mean_val_score:.6f}')
print('Validation MAPE score per model:', val_scores)

# Ensemble weights: inverse validation error, normalised to sum to 1, so
# models with a lower MAPE get a larger vote.
inverse_scores = np.array([1 / score for score in val_scores])
weights = inverse_scores / np.sum(inverse_scores)
print('Weights of the models in the ensemble:', weights)

# The model array is stored through pickle inside the .npy file; reading it
# back needs `allow_pickle=True`, so only load files from a trusted source.
os.makedirs('weights', exist_ok=True)
np.save('weights/bagging_models.npy', models, allow_pickle=True)
np.save('weights/vote_weights.npy', weights)
print('All weights were saved in \'./weights/\' directory')

  _check_train_params(params)


0:	learn: 0.7708368	test: 0.7698570	best: 0.7698570 (0)	total: 288ms	remaining: 47m 59s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.2693379357
bestIteration = 2098

Shrink model to first 2099 iterations.
^^^--------------------1--------------------^^^


  _check_train_params(params)


0:	learn: 0.7679929	test: 0.7889150	best: 0.7889150 (0)	total: 113ms	remaining: 18m 50s
2500:	learn: 0.0256431	test: 0.2535624	best: 0.2535591 (2498)	total: 4m 35s	remaining: 13m 45s
5000:	learn: 0.0084947	test: 0.2529810	best: 0.2529808 (4998)	total: 8m 56s	remaining: 8m 55s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.2529561076
bestIteration = 5793

Shrink model to first 5794 iterations.
^^^--------------------2--------------------^^^


  _check_train_params(params)


0:	learn: 0.7677351	test: 0.7801889	best: 0.7801889 (0)	total: 126ms	remaining: 20m 58s
2500:	learn: 0.0259558	test: 0.2686229	best: 0.2686229 (2500)	total: 5m 1s	remaining: 15m 5s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.26823966
bestIteration = 3680

Shrink model to first 3681 iterations.
^^^--------------------3--------------------^^^


  _check_train_params(params)


0:	learn: 0.7677263	test: 0.7698334	best: 0.7698334 (0)	total: 137ms	remaining: 22m 50s
2500:	learn: 0.0273658	test: 0.2656623	best: 0.2656214 (2309)	total: 5m 59s	remaining: 17m 56s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.2656213903
bestIteration = 2309

Shrink model to first 2310 iterations.
^^^--------------------4--------------------^^^


  _check_train_params(params)


0:	learn: 0.7726551	test: 0.7471073	best: 0.7471073 (0)	total: 153ms	remaining: 25m 29s
2500:	learn: 0.0259852	test: 0.2550353	best: 0.2550121 (2489)	total: 6m 13s	remaining: 18m 40s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.2547606257
bestIteration = 3645

Shrink model to first 3646 iterations.
^^^--------------------5--------------------^^^


  _check_train_params(params)


0:	learn: 0.7677179	test: 0.7657085	best: 0.7657085 (0)	total: 196ms	remaining: 32m 43s
2500:	learn: 0.0268404	test: 0.2610480	best: 0.2610479 (2494)	total: 7m 1s	remaining: 21m 2s
5000:	learn: 0.0085936	test: 0.2603218	best: 0.2603203 (4984)	total: 13m 53s	remaining: 13m 53s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.2602921959
bestIteration = 5071

Shrink model to first 5072 iterations.
^^^--------------------6--------------------^^^
Mean validation score = 0.261868
Validation MAPE score per model: [0.2693379356742161, 0.25295610760919995, 0.2682396599559407, 0.26562139030213233, 0.25476062572673136, 0.2602921959042935]
Weights of the models in the ensemble: [0.16194805 0.17243605 0.16261113 0.16421401 0.17121466 0.16757611]
All weigths were saved in './weigts/' directory
