In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv("../data/train_simple.gzip", compression="gzip")
dataset.shape

(279792, 23)

In [3]:
cat_cols = ["region", "osm_city_nearest_name", "realty_type"]

for col in cat_cols:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])

In [4]:
dataset["floor"] = dataset["floor"].fillna(1)
dataset["city_population"] = dataset["city_population"].fillna(0)

In [5]:
dataset["price_type"] = dataset["price_type"].astype(bool)

In [6]:
dataset.isna().sum().sum()

0

# Split data

In [7]:
target = "per_square_meter_price"
treatment = "price_type"

In [8]:
feature_columns = [
    'city_population',                  # население города
    'total_square',                     # Общая площадь
    'osm_catering_points_in_0.005',     # Кол-во заведений общепита в радиусе 500 метров, OpenStreetMap
    'region',                           # Регион
    'floor',                            # Этаж
    'realty_type',                      # Тип недвижимости (закодирован)
    'osm_crossing_closest_dist',        # Расстояние до ближайшего пешеходного перехода
    'osm_city_nearest_name',            # Название ближайшего города по OpenStreetMap
    'osm_subway_closest_dist',          # Расстояние до ближайщей станции метро, OpenStreetMap
    'reform_mean_floor_count_1000',     # Средняя этажность домов в радиусе 1 км по истоинчку РеформаЖКХ
    'osm_transport_stop_closest_dist',  # Расстояние до ближайщей остановки общественного транспорта, OpenStreetMap
    'osm_amenity_points_in_0.001',      # Кол-во объектов связанных с удобством в радиусе 100 метров, OpenStreetMap
    'osm_city_nearest_population',      # Население ближайшего города по OpenStreetMap
    'lng',                              # Долгота
    'osm_city_closest_dist',            # Расстояние до центра ближайшего города, OpenStreetMap
    'osm_crossing_points_in_0.005',     # Кол-во пешеходных переходов в радиусе 500 метров, OpenStreetMap
    'floor_type',                       # тип этажа
    'lat',                              # Широта
    'reform_mean_year_building_500',    # Среднее значение года постройки домов в радиусе 500 метров по истоинчку РеформаЖКХ
    'osm_culture_points_in_0.005',      # Кол-во объектов культуры в радиусе 500 метров, OpenStreetMap
    'reform_house_population_500',      # Коэффициент количества проживающих людей в радиусе 500 метров по источнику РеформаЖКХ
]

In [9]:
X_train = dataset.loc[dataset[treatment] == 0, feature_columns]
X_test = dataset.loc[dataset[treatment] == 1, feature_columns]

y_train = dataset.loc[dataset[treatment] == 0, target]
y_test = dataset.loc[dataset[treatment] == 1, target]

X_train.shape, X_test.shape

((275299, 21), (4493, 21))

# Model

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet

import optuna
from optuna.samplers import TPESampler
from optuna.study import Study

In [15]:
def objective(trial) -> float:
    params = {
        "alpha": trial.suggest_float("alpha", 0.0, 1.0),
        "l1_ratio": trial.suggest_float("l1_ratio", 0.0, 1.0),
    }
    
    n_features = trial.suggest_int("n_features", 10, len(feature_columns))
    
    model = make_pipeline(
        StandardScaler(),
        ElasticNet(),
    )

    cv_scores = cross_val_score(
        model, 
        X_train.iloc[:, :n_features].values, 
        y_train, 
        scoring="neg_root_mean_squared_error", 
        cv=5
    )
    
    mean_scores = np.mean(cv_scores)
    return mean_scores if not np.isnan(mean_scores) else -200_000

In [20]:
sampler = TPESampler(seed=42)
direction = "maximize"
study = optuna.create_study(sampler=sampler, direction=direction)
study.optimize(
    objective,
    n_trials=100,
    timeout=3600,
)

[32m[I 2021-11-25 02:02:46,333][0m A new study created in memory with name: no-name-6e8c5a76-0973-4de2-bf1c-c42053188ec1[0m
[32m[I 2021-11-25 02:02:47,592][0m Trial 0 finished with value: -129772.29605751717 and parameters: {'alpha': 0.3745401188473625, 'l1_ratio': 0.9507143064099162, 'n_features': 18}. Best is trial 0 with value: -129772.29605751717.[0m
[32m[I 2021-11-25 02:02:48,418][0m Trial 1 finished with value: -134093.64771499374 and parameters: {'alpha': 0.5986584841970366, 'l1_ratio': 0.15601864044243652, 'n_features': 11}. Best is trial 0 with value: -129772.29605751717.[0m
[32m[I 2021-11-25 02:02:49,605][0m Trial 2 finished with value: -129767.47261658136 and parameters: {'alpha': 0.05808361216819946, 'l1_ratio': 0.8661761457749352, 'n_features': 17}. Best is trial 2 with value: -129767.47261658136.[0m
[32m[I 2021-11-25 02:02:51,047][0m Trial 3 finished with value: -129754.7221908299 and parameters: {'alpha': 0.7080725777960455, 'l1_ratio': 0.020584494295802447

[32m[I 2021-11-25 02:03:30,561][0m Trial 34 finished with value: -129754.7221908299 and parameters: {'alpha': 0.6113822886003106, 'l1_ratio': 0.20432051594156575, 'n_features': 21}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:03:31,786][0m Trial 35 finished with value: -130465.64301828932 and parameters: {'alpha': 0.3143132317335919, 'l1_ratio': 0.9372706852885611, 'n_features': 16}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:03:33,174][0m Trial 36 finished with value: -129738.00093359035 and parameters: {'alpha': 0.9077687473760129, 'l1_ratio': 0.7820830001801455, 'n_features': 20}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:03:34,451][0m Trial 37 finished with value: -129772.29605751717 and parameters: {'alpha': 0.21073666831137206, 'l1_ratio': 0.9810773713598697, 'n_features': 18}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:03:35,661][0m Trial 38 f

[32m[I 2021-11-25 02:04:16,657][0m Trial 69 finished with value: -129754.7221908299 and parameters: {'alpha': 0.3611435347697475, 'l1_ratio': 0.6343761476668652, 'n_features': 21}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:04:17,832][0m Trial 70 finished with value: -130465.64301828932 and parameters: {'alpha': 0.4989943356344818, 'l1_ratio': 0.8701826835394787, 'n_features': 16}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:04:19,246][0m Trial 71 finished with value: -129738.00093359035 and parameters: {'alpha': 0.46102858832735083, 'l1_ratio': 0.6655516311321864, 'n_features': 20}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:04:20,593][0m Trial 72 finished with value: -129752.73074231853 and parameters: {'alpha': 0.5927339062980397, 'l1_ratio': 0.6349349891262572, 'n_features': 19}. Best is trial 22 with value: -129738.00093359035.[0m
[32m[I 2021-11-25 02:04:21,997][0m Trial 73 fi

In [21]:
study.best_params

{'alpha': 0.885695762228656, 'l1_ratio': 0.4081196617917175, 'n_features': 20}

In [22]:
%%time

model = make_pipeline(
    StandardScaler(),
    ElasticNet(alpha=0.885695762228656, l1_ratio=0.4081196617917175),
)

cv_scores = cross_val_score(
    model, 
    X_train.iloc[:, :20], 
    y_train, 
    scoring="neg_root_mean_squared_error", 
    cv=5
)

cv_scores.mean()

Wall time: 1.37 s


-129842.59349359092