In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv("../data/train_simple.gzip", compression="gzip")
dataset.shape

(279792, 23)

In [3]:
cat_cols = ["region", "osm_city_nearest_name", "realty_type"]

for col in cat_cols:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])

In [4]:
dataset["floor"] = dataset["floor"].fillna(1)
dataset["city_population"] = dataset["city_population"].fillna(0)

In [5]:
dataset["price_type"] = dataset["price_type"].astype(bool)

In [6]:
dataset.isna().sum().sum()

0

# Split data

In [7]:
target = "per_square_meter_price"
treatment = "price_type"

In [8]:
feature_columns = [
    'city_population',                  # население города
    'total_square',                     # Общая площадь
    'osm_catering_points_in_0.005',     # Кол-во заведений общепита в радиусе 500 метров, OpenStreetMap
    'region',                           # Регион
    'floor',                            # Этаж
    'realty_type',                      # Тип недвижимости (закодирован)
    'osm_crossing_closest_dist',        # Расстояние до ближайшего пешеходного перехода
    'osm_city_nearest_name',            # Название ближайшего города по OpenStreetMap
    'osm_subway_closest_dist',          # Расстояние до ближайщей станции метро, OpenStreetMap
    'reform_mean_floor_count_1000',     # Средняя этажность домов в радиусе 1 км по истоинчку РеформаЖКХ
    'osm_transport_stop_closest_dist',  # Расстояние до ближайщей остановки общественного транспорта, OpenStreetMap
    'osm_amenity_points_in_0.001',      # Кол-во объектов связанных с удобством в радиусе 100 метров, OpenStreetMap
    'osm_city_nearest_population',      # Население ближайшего города по OpenStreetMap
    'lng',                              # Долгота
    'osm_city_closest_dist',            # Расстояние до центра ближайшего города, OpenStreetMap
    'osm_crossing_points_in_0.005',     # Кол-во пешеходных переходов в радиусе 500 метров, OpenStreetMap
    'floor_type',                       # тип этажа
    'lat',                              # Широта
    'reform_mean_year_building_500',    # Среднее значение года постройки домов в радиусе 500 метров по истоинчку РеформаЖКХ
    'osm_culture_points_in_0.005',      # Кол-во объектов культуры в радиусе 500 метров, OpenStreetMap
    'reform_house_population_500',      # Коэффициент количества проживающих людей в радиусе 500 метров по источнику РеформаЖКХ
]

In [9]:
X_train = dataset.loc[dataset[treatment] == 0, feature_columns]
X_test = dataset.loc[dataset[treatment] == 1, feature_columns]

y_train = dataset.loc[dataset[treatment] == 0, target]
y_test = dataset.loc[dataset[treatment] == 1, target]

X_train.shape, X_test.shape

((275299, 21), (4493, 21))

# Elastic log target

In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error

import optuna
from optuna.samplers import TPESampler
from optuna.study import Study

In [34]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import make_scorer

In [24]:
def log_rmse(y_true, y_pred):
    y_true = np.exp(y_true)
    y_pred = np.exp(y_pred)
    return np.sqrt(mean_squared_error(y_true, y_pred))

lrmse_scorer = make_scorer(log_rmse, greater_is_better=False)

In [60]:
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

In [61]:
def objective(trial) -> float:
    params = {
        "alpha": trial.suggest_float("alpha", 0.0, 1.0),
        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
    }
    
    n_features = trial.suggest_int("n_features", 10, len(feature_columns))
    
    model = make_pipeline(
        StandardScaler(),
        ElasticNet(),
    )

    cv_scores = cross_val_score(
        model, 
        X_train.iloc[:, :n_features].values, 
        y_train_log, 
        scoring=lrmse_scorer, 
        cv=5
    )
    
    mean_scores = np.mean(cv_scores)
    return mean_scores if not np.isnan(mean_scores) else -10

In [62]:
sampler = TPESampler(seed=42)
direction = "maximize"
study = optuna.create_study(sampler=sampler, direction=direction)
study.optimize(
    objective,
    n_trials=100,
    timeout=3600,
)

[32m[I 2021-11-26 01:14:46,877][0m A new study created in memory with name: no-name-30316e20-1b85-4dac-b3b8-1614dbd49473[0m
[32m[I 2021-11-26 01:14:47,884][0m Trial 0 finished with value: -175796.6609351745 and parameters: {'alpha': 0.3745401188473625, 'l1_ratio': 0.9507143064099162, 'n_features': 18}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:14:48,617][0m Trial 1 finished with value: -176140.92682987527 and parameters: {'alpha': 0.5986584841970366, 'l1_ratio': 0.15601864044243652, 'n_features': 11}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:14:49,591][0m Trial 2 finished with value: -175796.6609351745 and parameters: {'alpha': 0.05808361216819946, 'l1_ratio': 0.8661761457749352, 'n_features': 17}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:14:50,735][0m Trial 3 finished with value: -175796.6609351745 and parameters: {'alpha': 0.7080725777960455, 'l1_ratio': 0.020584494295802447, 'n_

[32m[I 2021-11-26 01:15:20,748][0m Trial 35 finished with value: -175796.6609351745 and parameters: {'alpha': 0.9026571487153404, 'l1_ratio': 0.8029127610900822, 'n_features': 17}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:21,693][0m Trial 36 finished with value: -175796.6609351745 and parameters: {'alpha': 0.6282020856424965, 'l1_ratio': 0.5235586298415592, 'n_features': 16}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:22,612][0m Trial 37 finished with value: -175796.6609351745 and parameters: {'alpha': 0.6814869961412247, 'l1_ratio': 0.37946056218135366, 'n_features': 15}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:23,658][0m Trial 38 finished with value: -175796.6609351745 and parameters: {'alpha': 0.43185778599967695, 'l1_ratio': 0.663524151188571, 'n_features': 20}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:24,546][0m Trial 39 finished with

[32m[I 2021-11-26 01:15:53,409][0m Trial 70 finished with value: -175796.6609351745 and parameters: {'alpha': 0.5785513489008213, 'l1_ratio': 0.6324740191755416, 'n_features': 14}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:54,468][0m Trial 71 finished with value: -175796.6609351745 and parameters: {'alpha': 0.22930004024482215, 'l1_ratio': 0.9474541562793227, 'n_features': 20}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:55,574][0m Trial 72 finished with value: -175796.6609351745 and parameters: {'alpha': 0.10186420320784642, 'l1_ratio': 0.9078611605968363, 'n_features': 21}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:56,612][0m Trial 73 finished with value: -175796.6609351745 and parameters: {'alpha': 0.5017541027587771, 'l1_ratio': 0.8629729297452161, 'n_features': 20}. Best is trial 0 with value: -175796.6609351745.[0m
[32m[I 2021-11-26 01:15:57,567][0m Trial 74 finished wit

In [21]:
study.best_params

{'alpha': 0.885695762228656, 'l1_ratio': 0.4081196617917175, 'n_features': 20}

In [22]:
%%time

model = make_pipeline(
    StandardScaler(),
    ElasticNet(alpha=0.885695762228656, l1_ratio=0.4081196617917175),
)

cv_scores = cross_val_score(
    model, 
    X_train.iloc[:, :20], 
    y_train, 
    scoring="neg_root_mean_squared_error", 
    cv=5
)

cv_scores.mean()

Wall time: 1.37 s


-129842.59349359092

In [64]:
%%time

model = make_pipeline(
        StandardScaler(),
        ElasticNet(alpha=0.885695762228656, l1_ratio=0.4081196617917175),
    )

model.fit(X_train.iloc[:, :20], y_train_log)
y_pred = model.predict(X_test.iloc[:, :20])
log_rmse(y_test_log, y_pred)

Wall time: 228 ms


87935.9429470016

### MLPRegressor Log target

In [44]:
def objective(trial) -> float:
    n_features = trial.suggest_int("n_features", 10, len(feature_columns))
    n_hidden_layer = trial.suggest_int("n_hidden_layer", 1, 3)
    hidden_layer_sizes = trial.suggest_int("hidden_layer_sizes", 10, 100)
    
    model = make_pipeline(
        StandardScaler(),
        MLPRegressor(hidden_layer_sizes=(hidden_layer_sizes,) * n_hidden_layer,
                     max_iter=10,
                     learning_rate='adaptive',
                     solver='adam'),
    )

    cv_scores = cross_val_score(
        model, 
        X_train.iloc[:, :n_features].values, 
        y_train_log, 
        scoring=lrmse_scorer, 
        cv=5
    )
    
    mean_scores = np.mean(cv_scores)
    return mean_scores if not np.isnan(mean_scores) else -10

In [None]:
sampler = TPESampler(seed=42)
direction = "maximize"
study = optuna.create_study(sampler=sampler, direction=direction)
study.optimize(
    objective,
    n_trials=100,
    timeout=3600,
)

In [66]:
study.best_params

{'n_features': 21, 'n_hidden_layer': 3, 'hidden_layer_sizes': 100}

In [67]:
%%time

model = make_pipeline(
        StandardScaler(),
        MLPRegressor(hidden_layer_sizes=(100,)*3,
                     max_iter=20,
                     learning_rate='adaptive',
                     solver='adam',
                     verbose=True,
                    ),
    )

model.fit(X_train.iloc[:, :], y_train_log)
y_pred = model.predict(X_test.iloc[:, :])
log_rmse(y_test_log, y_pred)

Iteration 1, loss = 1.17391022
Iteration 2, loss = 0.34136710
Iteration 3, loss = 0.33258506
Iteration 4, loss = 0.32425938
Iteration 5, loss = 0.31859063
Iteration 6, loss = 0.31698680
Iteration 7, loss = 0.31247005
Iteration 8, loss = 0.30995690
Iteration 9, loss = 0.30808771
Iteration 10, loss = 0.30629250
Iteration 11, loss = 0.30491626
Iteration 12, loss = 0.30363126
Iteration 13, loss = 0.30049969
Iteration 14, loss = 0.30060327
Iteration 15, loss = 0.29838067
Iteration 16, loss = 0.29722722
Iteration 17, loss = 0.29569685
Iteration 18, loss = 0.29465473
Iteration 19, loss = 0.29277837
Iteration 20, loss = 0.29208851
Wall time: 1min 3s




81247.72714562679