# Import

In [95]:
import pandas as pd
import numpy as np
from catboost import Pool
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
import random
from os import path

# Single seed shared by every stochastic step in this notebook.
seed = 20033002
random.seed(seed)
# Seed NumPy's global generator as well, so code that draws from np.random
# without an explicit random_state is reproducible too.
np.random.seed(seed)

# Data preprocessing

In [96]:
# Locations of the input CSV files and of the serialized model artifacts.
AUX_DATA_ROOT = "./data"
AUX_WEIGHT_ROOT = './weights'

# Both frames are indexed by their 'id' column.
# To score the private split instead, read 'private_test.csv' in place of 'public_test.csv'.
train, test = (
    pd.read_csv(path.join(AUX_DATA_ROOT, csv_name), index_col='id')
    for csv_name in ('train.csv', 'public_test.csv')
)

target = 'Цена'
Y = train[target]
# Number of training rows; later cells use it to split the stacked train+test frame.
train_idx = len(train)

train.head()

Unnamed: 0_level_0,Тип_жилья,Широта,Долгота,Город,Индекс,Площадь,Этаж,Размер_участка,Расход_тепла,Ктгр_энергоэффективности,...,Нлч_парковки,Нлч_почтового_ящика,Нлч_балкона,Нлч_террасы,Нлч_подвала,Нлч_гаража,Нлч_кондиционера,Последний_этаж,Верхний_этаж,Цена
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
35831305,квартира,48.875375,2.48382,rosny-sous-bois,4575,64.0,2.0,,150.0,C,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,295000.0
35854039,квартира,45.522327,4.869322,vienne,1793,75.0,,,220.0,D,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,194000.0
36058437,квартира,43.672828,1.281469,mondonville,1425,42.0,,,,,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,179000.0
35989147,дом,44.158752,1.532081,caussade,4077,102.0,,2000.0,,,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,205000.0
36029269,квартира,47.655212,-2.754196,vannes,2498,84.0,3.0,,,,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,508000.0


## Filling NaN

In [97]:
# Distinct housing types present in the training set.
train['Тип_жилья'].unique()

array(['квартира', 'дом', 'дуплекс', 'вилла', 'особняк', 'земля',
       'разное', 'земля под застройку', 'лофт', 'ферма',
       'участок с землей', 'пожизненная рента', 'паркинг', 'шале',
       'мельница', 'усадьба', 'дом на воде', 'комната', 'ночлежка',
       'мастерская', 'отель-усадьба'], dtype=object)

In [98]:
# Distinct housing types present in the loaded test set.
test['Тип_жилья'].unique()

array(['квартира', 'дом', 'земля', 'земля под застройку',
       'участок с землей', 'разное', 'мельница', 'пожизненная рента',
       'вилла', 'дуплекс', 'ферма', 'усадьба', 'отель-усадьба', 'лофт',
       'дом на воде', 'шале', 'паркинг', 'особняк'], dtype=object)

In [99]:
# Housing types that occur in test but not in train; an empty set means every
# test type was also seen in training.
set(test['Тип_жилья'].unique()) - set(train['Тип_жилья'].unique())

set()

In [100]:
# Look for rows of type 'отель' in the loaded test file; the recorded output has no rows.
test.loc[test['Тип_жилья'] == 'отель']

Unnamed: 0_level_0,Тип_жилья,Широта,Долгота,Город,Индекс,Площадь,Этаж,Размер_участка,Расход_тепла,Ктгр_энергоэффективности,...,Кво_фото,Нлч_парковки,Нлч_почтового_ящика,Нлч_балкона,Нлч_террасы,Нлч_подвала,Нлч_гаража,Нлч_кондиционера,Последний_этаж,Верхний_этаж
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [101]:
def fill_na(df):    
    """Impute missing values in ``df`` and return it.

    All statistics are computed from the module-level ``train`` frame as it
    exists at call time, not from ``df`` itself. ``df`` is modified in place
    (``.loc`` assignments and ``fillna(..., inplace=True)``) and the same
    object is returned.

    Order of operations:
      1. Type-specific constants. For housing types whose per-type mean in
         ``train`` is NaN, 'Размер_участка' is set to 0 for every row of that
         type; the same test on 'Расход_тепла' zeroes both 'Расход_тепла' and
         'Кво_вредных_выбросов'. For types whose per-type mode comes back as
         an empty array, the energy/emission categories and 'Направление' are
         set to the literal 'unkown'.
      2. Missing 'Кво_фото' becomes 0; 'комната' rows with a missing
         'Кво_спален' get 1.
      3. Remaining NaNs in the listed numeric columns take the per-type
         median from ``train``.
      4. Anything still missing is filled by ``SimpleImputer`` fitted on
         ``train``: median for numeric columns, most frequent value for
         object columns.
    """
    # --- type-specific filling ---
    # `means` is a boolean Series: True for types whose group mean is NaN.
    means = train.groupby(by='Тип_жилья')['Размер_участка'].mean().isna()
    zero_types = means.index[means.values]
    # Overwrites the column for all rows of those types, not only the NaN ones.
    df.loc[df['Тип_жилья'].isin(zero_types), 'Размер_участка'] = 0
    
    means = train.groupby(by='Тип_жилья')['Расход_тепла'].mean().isna()
    zero_types = means.index[means.values]
    df.loc[df['Тип_жилья'].isin(zero_types), ['Расход_тепла', 'Кво_вредных_выбросов']] = 0

    # Types whose aggregated mode is an empty ndarray get a placeholder category.
    # NOTE(review): the placeholder is spelled 'unkown'; presumably the saved models
    # were trained with this exact spelling - confirm before correcting it.
    mode = train.groupby(by='Тип_жилья')['Ктгр_энергоэффективности'].agg(lambda x: x.mode())
    empty_indices = [i for i, x in enumerate(mode.values) if isinstance(x, np.ndarray) and len(x) == 0]
    unk_types = mode.index[empty_indices]
    df.loc[df['Тип_жилья'].isin(unk_types), ['Ктгр_энергоэффективности', 'Ктгр_вредных_выбросов']] = 'unkown'
    
    mode = train.groupby(by='Тип_жилья')['Направление'].agg(lambda x: x.mode())
    empty_indices = [i for i, x in enumerate(mode.values) if isinstance(x, np.ndarray) and len(x) == 0]
    unk_types = mode.index[empty_indices]
    df.loc[df['Тип_жилья'].isin(unk_types), 'Направление'] = 'unkown'
    
    df.fillna({'Кво_фото': 0}, inplace=True)
    df.loc[(df['Тип_жилья'] == 'комната') & df['Кво_спален'].isna(), 'Кво_спален'] = 1
    
    # Per-type medians from train for the numeric columns that may still contain NaN.
    nan_num_cols = ['Площадь', 'Кво_комнат', 'Кво_спален', 'Размер_участка','Расход_тепла', 'Кво_вредных_выбросов']
    med_groups = train.groupby('Тип_жилья')[nan_num_cols].median()
    # Add an 'отель' row that reuses the 'отель-усадьба' medians.
    med_groups.loc['отель'] = med_groups.loc['отель-усадьба'] # private test special case

    # Row-wise lookup: a NaN is replaced by the median of the row's housing type,
    # any other value is kept.
    for col in nan_num_cols:
        df[col] = df.apply(
            lambda row: med_groups[col][row['Тип_жилья']] if pd.isna(row[col]) else row[col],
            axis=1)
    
    # --- generic filling for whatever is still missing ---
    si_n = SimpleImputer(strategy='median')
    si_c = SimpleImputer(strategy='most_frequent')
    
    # Column lists come from df, so train must contain every one of them.
    num_cols = df.select_dtypes(include='number').columns
    si_n.fit(train[num_cols])
    df[num_cols] = si_n.transform(df[num_cols])
    
    cat_cols = df.select_dtypes(include='O').columns
    si_c.fit(train[cat_cols])
    df[cat_cols] = si_c.transform(df[cat_cols])
    
    return df

# Order matters: fill_na reads the global `train`, so test is imputed while
# `train` is still the unfilled frame; only afterwards is train itself filled.
test = fill_na(test)
train = fill_na(train)

# Stack train features (target removed) on top of test, then count the cells
# that are still NaN.
df = pd.concat((train.drop(columns=target), test))
df.isna().sum().sum()

0

## EDA

### Logarithmic and count features

In [102]:
def log_nan(row):
    """Return the natural log of ``row`` after raising values below 1 to 1.

    Values below 1 (including zeros and negatives) therefore map to
    ``log(1) == 0``. NaNs are not clamped and stay NaN in the result.

    The input is not modified: clamping is done on a new object via
    ``np.maximum`` rather than by assigning into ``row``.

    Parameters
    ----------
    row : pandas Series/DataFrame or array-like of numbers.

    Returns
    -------
    Same container kind as ``np.log`` yields for ``row``, holding the clamped logs.
    """
    # np.maximum propagates NaN, matching a mask-based clamp that skips NaN entries.
    return np.log(np.maximum(row, 1))

# Feature engineering happens on a copy; `df` itself stays as imputed.
df_ext = df.copy()

# Log-transformed versions of four numeric columns, stored as 'Лог_<column>'.
cols_to_log = ['Площадь', 'Размер_участка', 'Расход_тепла', 'Кво_вредных_выбросов']
log_cols = [f'Лог_{name}' for name in cols_to_log]
df_ext[log_cols] = df[cols_to_log].apply(log_nan)

# 'Плюшки': per-row sum over the columns listed in bool_cols.
bool_cols = [
    'Нлч_парковки', 'Нлч_почтового_ящика', 'Нлч_балкона', 'Нлч_террасы',
    'Нлч_подвала', 'Нлч_гаража', 'Нлч_кондиционера',
]
df_ext['Плюшки'] = df_ext[bool_cols].sum(axis='columns')

### Cluster features

In [103]:
# Cluster rows by coordinates; the centroids are fitted on the training frame only.
geo_cols = ['Широта', 'Долгота']
kmeans = KMeans(n_clusters=15, n_init=10, random_state=seed)
kmeans.fit(train[geo_cols])

# Cluster label per row, stored with object dtype.
df_ext['Кластер'] = kmeans.predict(df_ext[geo_cols])
df_ext['Кластер'] = df_ext['Кластер'].astype("object")

# One 'Центр_<i>' column per cluster, holding the KMeans.transform output for each row.
center_dist = kmeans.transform(df_ext[geo_cols])
dist_names = [f"Центр_{i}" for i in range(center_dist.shape[1])]
df_cd = pd.DataFrame(center_dist, index=df_ext.index, columns=dist_names)
df_ext = df_ext.join(df_cd)

### PCA

In [104]:
def make_mi_scores(X, y):
    """Rank the columns of ``X`` by mutual information with ``y``.

    Scores come from ``mutual_info_regression`` with the module-level ``seed``
    as ``random_state``. The result is a Series named "MI Scores", indexed by
    ``X.columns`` and sorted from highest to lowest score.
    """
    raw_scores = mutual_info_regression(X, y, random_state=seed)
    ranked = pd.Series(raw_scores, index=X.columns, name="MI Scores")
    return ranked.sort_values(ascending=False)

# --- PCA #1: size / energy columns (raw and log versions) plus room counts ---
pca_cols = ['Площадь', 'Размер_участка', 'Расход_тепла', 'Кво_вредных_выбросов',
            'Лог_Площадь', 'Лог_Размер_участка', 'Лог_Расход_тепла', 'Лог_Кво_вредных_выбросов', 
            'Кво_комнат', 'Кво_спален', 'Кво_ванных',]

# The first `train_idx` rows of df_ext are the training rows (train was stacked first).
# The PCA is fitted on them after standardising with their own mean/std.
train_pca = df_ext.iloc[:train_idx][pca_cols].copy()
train_pca_scaled = (train_pca - train_pca.mean(axis=0)) / train_pca.std(axis=0)
pca = PCA(random_state=seed)

X_pca = pca.fit_transform(train_pca_scaled)
component_names = [f"PC{i}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)
# Rank components by mutual information with the target.
pca_mi = make_mi_scores(X_pca, train[target])

# Keep the 5 highest-scoring components.
top_pca = list(pca_mi.index[:5])
df_pca = df_ext[pca_cols].copy()
# NOTE(review): the full frame is standardised with its own (train + test) mean/std,
# not with the train statistics used when fitting the PCA above. Presumably the saved
# models were trained on features built exactly this way - confirm before changing.
df_pca_scaled = (df_pca - df_pca.mean(axis=0)) / df_pca.std(axis=0)
df_pca = pca.transform(df_pca_scaled)

# Component names are 'PC<k>'; dropping the 2-character prefix recovers column index k.
df_ext[top_pca] = df_pca[:, [int(pca_t[2:]) for pca_t in top_pca]]

# --- PCA #2: the 15 cluster-distance columns, used without scaling ---
pca_cols = ['Центр_0', 'Центр_1','Центр_2', 'Центр_3', 
            'Центр_4', 'Центр_5', 'Центр_6', 'Центр_7',
            'Центр_8', 'Центр_9', 'Центр_10', 'Центр_11', 
            'Центр_12', 'Центр_13', 'Центр_14',]


train_pca = df_ext.iloc[:train_idx][pca_cols].copy()
pca = PCA(random_state=seed)

X_pca = pca.fit_transform(train_pca)

component_names = [f"PC_ц{i}" for i in range(X_pca.shape[1])]
X_pca = pd.DataFrame(X_pca, columns=component_names)

pca_mi = make_mi_scores(X_pca, train[target])

# Keep the 10 highest-scoring components.
top_pca_c = list(pca_mi.index[:10])
df_pca = pca.transform(df_ext[pca_cols])

# Names are 'PC_ц<k>'; dropping the 4-character prefix recovers column index k.
df_ext[top_pca_c] = df_pca[:, [int(pca_t[4:]) for pca_t in top_pca_c]]

## Deleting bad features

In [105]:
# Columns excluded from the model input: the floor-related columns and five of
# the cluster-distance columns.
drop_cols = ['Последний_этаж', 'Верхний_этаж', 'Этаж']
drop_cols += [f'Центр_{i}' for i in (2, 3, 4, 6, 7)]
df_ext = df_ext.drop(columns=drop_cols)

# Prediction

In [106]:
class MyMAPE:
    """MAPE metric object for CatBoost models that predict log-targets.

    ``evaluate`` exponentiates both the targets and the predictions before
    scoring, so the reported value is the MAPE on the original scale.
    """

    def is_max_optimal(self):
        # MAPE is an error measure: smaller values are better, hence False.
        return False

    def get_final_error(self, error, weight):
        # `evaluate` already returns the finished score; `weight` is ignored.
        return error

    def evaluate(self, y_p, y_t, weight):
        """Score predictions against targets, both given on the log scale.

        Only ``y_p[0]`` is used (presumably the single prediction vector of a
        one-dimensional model - confirm against the CatBoost metric protocol).
        Returns ``(score, 0)``; ``weight`` is not used.
        """
        actual = np.exp(y_t)
        predicted = np.exp(y_p[0])
        return self.mape(actual, predicted), 0

    @staticmethod
    def mape(y_t, y_p):
        # Delegates to the module-level `mape` imported at the top of the notebook.
        return mape(y_t, y_p)

In [107]:
# Load the saved ensemble and its voting weights from AUX_WEIGHT_ROOT.
# NOTE(security): allow_pickle=True unpickles arbitrary Python objects, which can
# execute code on load - only use 'bagging_models.npy' from a trusted source.
models = np.load(path.join(AUX_WEIGHT_ROOT, 'bagging_models.npy'), allow_pickle=True)
weights = np.load(path.join(AUX_WEIGHT_ROOT, 'vote_weights.npy'))

In [108]:
def get_pred(models, x, strat='', weights=None):
    """Combine predictions of several log-target models on the original scale.

    Every model's ``predict(x)`` output is passed through ``np.exp`` before
    being combined.

    Parameters
    ----------
    models : sequence of objects exposing ``predict(x)``.
    x : input accepted by ``predict``; must expose ``shape[0]`` (row count).
    strat : str
        ''          - plain mean over all models (default);
        'weighted'  - sum of ``w * exp(pred)`` over (model, weight) pairs;
                      weights are used as given, without renormalising;
        'top'       - prediction of the single model with the largest weight;
        other value - un-normalised sum over all models.
    weights : sequence of numbers, optional
        One weight per model. Required for 'weighted' and 'top', ignored otherwise.

    Returns
    -------
    numpy.ndarray with one prediction per row of ``x``.

    Raises
    ------
    ValueError
        If ``strat`` is 'weighted' or 'top' and ``weights`` is missing or its
        length differs from ``len(models)``.
    """
    if strat in ('top', 'weighted'):
        if weights is None:
            raise ValueError(f"strat={strat!r} requires `weights`")
        if len(weights) != len(models):
            raise ValueError("`weights` must contain exactly one entry per model")

    if strat == 'top':
        return np.exp(models[np.argmax(weights)].predict(x))

    preds = np.zeros(x.shape[0])
    if strat == 'weighted':
        for m, w in zip(models, weights):
            preds += w * np.exp(m.predict(x))
    else:
        # Weights play no role here, so iterate over the models alone; this keeps
        # the default call (weights=None) working.
        for m in models:
            preds += np.exp(m.predict(x))
        if strat == '':
            preds /= len(models)
    return preds

In [109]:
# Rows after the first `train_idx` positions of the stacked frame are the test rows.
X_test = df_ext.iloc[train_idx:]

# Every object-dtype column is passed to the Pool as a categorical feature.
cat_cols = df_ext.select_dtypes(include='O').columns.tolist()
test_pool = Pool(X_test, cat_features=cat_cols)

y_pred = get_pred(models, test_pool, strat='weighted', weights=weights)

submission = pd.DataFrame({target: y_pred}, index=X_test.index)
# NOTE(review): the file name says 'private' while the data-loading cell reads
# 'public_test.csv' - confirm which split this run is meant for.
sub_name = f'sub_private_{seed}.csv'
# submission.to_csv(sub_name)
sub_name

'sub_private_20033002.csv'