In [1]:
import numpy as np
import pandas as pd


from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv("../data/train_simple.gzip", compression="gzip")
dataset.shape

(279792, 23)

In [3]:
cat_cols = ["region", "osm_city_nearest_name", "realty_type"]

for col in cat_cols:
    le = LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])

In [4]:
dataset["floor"] = dataset["floor"].fillna(1)
dataset["city_population"] = dataset["city_population"].fillna(0)

In [5]:
dataset["price_type"] = dataset["price_type"].astype(bool)

In [6]:
dataset.isna().sum().sum()

0

# Split data

In [7]:
target = "per_square_meter_price"
treatment = "price_type"

In [8]:
feature_columns = [
    'city_population',                  # население города
    'total_square',                     # Общая площадь
    'osm_catering_points_in_0.005',     # Кол-во заведений общепита в радиусе 500 метров, OpenStreetMap
    'region',                           # Регион
    'floor',                            # Этаж
    'realty_type',                      # Тип недвижимости (закодирован)
    'osm_crossing_closest_dist',        # Расстояние до ближайшего пешеходного перехода
    'osm_city_nearest_name',            # Название ближайшего города по OpenStreetMap
    'osm_subway_closest_dist',          # Расстояние до ближайщей станции метро, OpenStreetMap
    'reform_mean_floor_count_1000',     # Средняя этажность домов в радиусе 1 км по истоинчку РеформаЖКХ
    'osm_transport_stop_closest_dist',  # Расстояние до ближайщей остановки общественного транспорта, OpenStreetMap
    'osm_amenity_points_in_0.001',      # Кол-во объектов связанных с удобством в радиусе 100 метров, OpenStreetMap
    'osm_city_nearest_population',      # Население ближайшего города по OpenStreetMap
    'lng',                              # Долгота
    'osm_city_closest_dist',            # Расстояние до центра ближайшего города, OpenStreetMap
    'osm_crossing_points_in_0.005',     # Кол-во пешеходных переходов в радиусе 500 метров, OpenStreetMap
    'floor_type',                       # тип этажа
    'lat',                              # Широта
    'reform_mean_year_building_500',    # Среднее значение года постройки домов в радиусе 500 метров по истоинчку РеформаЖКХ
    'osm_culture_points_in_0.005',      # Кол-во объектов культуры в радиусе 500 метров, OpenStreetMap
    'reform_house_population_500',      # Коэффициент количества проживающих людей в радиусе 500 метров по источнику РеформаЖКХ
]

In [9]:
X_train = dataset.loc[dataset[treatment] == 0, feature_columns]
X_test = dataset.loc[dataset[treatment] == 1, feature_columns]

y_train = dataset.loc[dataset[treatment] == 0, target]
y_test = dataset.loc[dataset[treatment] == 1, target]

X_train.shape, X_test.shape

((275299, 21), (4493, 21))

# Model

In [10]:
import nmslib
from scipy.sparse import csr_matrix

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

import optuna
from optuna.samplers import TPESampler
from optuna.study import Study

In [11]:
class NMSlibTransformer(TransformerMixin, BaseEstimator):
    """Wrapper for using nmslib as sklearn's KNeighborsTransformer"""

    def __init__(self, n_neighbors=5, metric='euclidean', method='sw-graph',
                 n_jobs=1):
        self.n_neighbors = n_neighbors
        self.method = method
        self.metric = metric
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        self.n_samples_fit_ = X.shape[0]

        # see more metric in the manual
        # https://github.com/nmslib/nmslib/tree/master/manual
        space = {
            'euclidean': 'l2',
            'cosine': 'cosinesimil',
            'l1': 'l1',
            'l2': 'l2',
        }[self.metric]

        self.nmslib_ = nmslib.init(method=self.method, space=space)
        self.nmslib_.addDataPointBatch(X)
        self.nmslib_.createIndex()
        return self

    def transform(self, X):
        n_samples_transform = X.shape[0]

        # For compatibility reasons, as each sample is considered as its own
        # neighbor, one extra neighbor will be computed.
        n_neighbors = self.n_neighbors + 1

        results = self.nmslib_.knnQueryBatch(X, k=n_neighbors,
                                             num_threads=self.n_jobs)
        indices, distances = zip(*results)
        indices, distances = np.vstack(indices), np.vstack(distances)

        indptr = np.arange(0, n_samples_transform * n_neighbors + 1,
                           n_neighbors)
        kneighbors_graph = csr_matrix((distances.ravel(), indices.ravel(),
                                       indptr), shape=(n_samples_transform,
                                                       self.n_samples_fit_))

        return kneighbors_graph

In [12]:
def objective(trial) -> float:
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 3, 25),
        "metric": trial.suggest_categorical("metric", ['euclidean', 'cosine', 'l1', 'l2']),
        "method": trial.suggest_categorical("method", ['hnsw', 'sw-graph']),
    }
    
    n_neighbors = trial.suggest_int("n_neighbors_knn", 1, params["n_neighbors"])
    n_features = trial.suggest_int("n_features", 10, len(feature_columns))

    
    model = make_pipeline(
        StandardScaler(),
        NMSlibTransformer(**params),
        KNeighborsRegressor(n_neighbors=n_neighbors, metric="precomputed"),
    )

    cv_scores = cross_val_score(
        model, 
        X_train.iloc[:, :n_features].values, 
        y_train, 
        scoring="neg_root_mean_squared_error", 
        cv=5
    )
    
    mean_scores = np.mean(cv_scores)
    return mean_scores if not np.isnan(mean_scores) else -200_000

In [13]:
sampler = TPESampler(seed=12)
direction = "maximize"
study = optuna.create_study(sampler=sampler, direction=direction)
study.enqueue_trial({
    'n_neighbors': 11,
    'metric': 'l1',
    'method': 'hnsw',
    'n_neighbors_knn': 5,
    'n_features': 21
})
study.optimize(
    objective,
    show_progress_bar=True,
    n_trials=50,
    timeout=3600,
)

[32m[I 2021-10-21 17:22:58,135][0m A new study created in memory with name: no-name-1078329f-d827-4543-8ac6-e35fb6ab81e4[0m
  study.enqueue_trial({
  create_trial(state=TrialState.WAITING, system_attrs={"fixed_params": params})
  self.add_trial(
  self._init_valid()


  0%|          | 0/50 [00:00<?, ?it/s]

[32m[I 2021-10-21 17:23:55,146][0m Trial 0 finished with value: -96484.02572840759 and parameters: {'n_neighbors': 11, 'metric': 'l1', 'method': 'hnsw', 'n_neighbors_knn': 5, 'n_features': 21}. Best is trial 0 with value: -96484.02572840759.[0m
[32m[I 2021-10-21 17:24:40,247][0m Trial 1 finished with value: -105634.7121642932 and parameters: {'n_neighbors': 6, 'metric': 'euclidean', 'method': 'hnsw', 'n_neighbors_knn': 1, 'n_features': 21}. Best is trial 0 with value: -96484.02572840759.[0m
[32m[I 2021-10-21 17:25:27,817][0m Trial 2 finished with value: -96077.04104295363 and parameters: {'n_neighbors': 6, 'metric': 'l1', 'method': 'sw-graph', 'n_neighbors_knn': 4, 'n_features': 15}. Best is trial 2 with value: -96077.04104295363.[0m
[32m[I 2021-10-21 17:26:27,407][0m Trial 3 finished with value: -103493.93995567206 and parameters: {'n_neighbors': 20, 'metric': 'cosine', 'method': 'sw-graph', 'n_neighbors_knn': 14, 'n_features': 15}. Best is trial 2 with value: -96077.041042

2 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ningeen/anaconda3/envs/interp_ai/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ningeen/anaconda3/envs/interp_ai/lib/python3.8/site-packages/sklearn/pipeline.py", line 390, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/ningeen/anaconda3/envs/interp_ai/lib/python3.8/site-packages/sklearn/pipeline.py", line 348, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/ningeen/anaconda3/envs/interp_ai/lib/python3.8/site-p

[32m[I 2021-10-21 17:28:10,971][0m Trial 5 finished with value: -200000.0 and parameters: {'n_neighbors': 20, 'metric': 'euclidean', 'method': 'hnsw', 'n_neighbors_knn': 20, 'n_features': 15}. Best is trial 2 with value: -96077.04104295363.[0m
[32m[I 2021-10-21 17:28:57,766][0m Trial 6 finished with value: -97988.31777967178 and parameters: {'n_neighbors': 12, 'metric': 'l1', 'method': 'sw-graph', 'n_neighbors_knn': 8, 'n_features': 10}. Best is trial 2 with value: -96077.04104295363.[0m
[32m[I 2021-10-21 17:29:42,551][0m Trial 7 finished with value: -97858.87602390665 and parameters: {'n_neighbors': 9, 'metric': 'l1', 'method': 'sw-graph', 'n_neighbors_knn': 8, 'n_features': 11}. Best is trial 2 with value: -96077.04104295363.[0m
[32m[I 2021-10-21 17:30:21,902][0m Trial 8 finished with value: -98407.29337290392 and parameters: {'n_neighbors': 9, 'metric': 'euclidean', 'method': 'hnsw', 'n_neighbors_knn': 6, 'n_features': 15}. Best is trial 2 with value: -96077.04104295363.

[32m[I 2021-10-21 17:54:30,202][0m Trial 38 finished with value: -95292.70296231464 and parameters: {'n_neighbors': 7, 'metric': 'l1', 'method': 'sw-graph', 'n_neighbors_knn': 3, 'n_features': 15}. Best is trial 31 with value: -94360.89565843194.[0m
[32m[I 2021-10-21 17:55:20,963][0m Trial 39 finished with value: -101636.7523313956 and parameters: {'n_neighbors': 10, 'metric': 'euclidean', 'method': 'sw-graph', 'n_neighbors_knn': 2, 'n_features': 17}. Best is trial 31 with value: -94360.89565843194.[0m
[32m[I 2021-10-21 17:56:22,194][0m Trial 40 finished with value: -102740.36621060799 and parameters: {'n_neighbors': 19, 'metric': 'l1', 'method': 'sw-graph', 'n_neighbors_knn': 8, 'n_features': 19}. Best is trial 31 with value: -94360.89565843194.[0m
[32m[I 2021-10-21 17:57:13,439][0m Trial 41 finished with value: -95161.37259428394 and parameters: {'n_neighbors': 11, 'metric': 'l1', 'method': 'sw-graph', 'n_neighbors_knn': 2, 'n_features': 16}. Best is trial 31 with value: -

In [15]:
study.best_params

{'n_neighbors': 10,
 'metric': 'l1',
 'method': 'sw-graph',
 'n_neighbors_knn': 3,
 'n_features': 16}

In [16]:
%%time

model = make_pipeline(
    StandardScaler(),
    NMSlibTransformer(n_neighbors=10, method='sw-graph', metric="l1"),
    KNeighborsRegressor(n_neighbors=3, metric="precomputed"),
)

cv_scores = cross_val_score(
    model, 
    X_train.iloc[:, :16], 
    y_train, 
    scoring="neg_root_mean_squared_error", 
    cv=5
)

cv_scores.mean()

CPU times: user 1min 32s, sys: 631 ms, total: 1min 32s
Wall time: 51.3 s


-94417.13966568315

In [17]:
%%time

model = make_pipeline(
    StandardScaler(),
    NMSlibTransformer(n_neighbors=10, method='hnsw', metric="l1"),
    KNeighborsRegressor(n_neighbors=3, metric="precomputed"),
)

cv_scores = cross_val_score(
    model, 
    X_train.iloc[:, :16], 
    y_train, 
    scoring="neg_root_mean_squared_error", 
    cv=5
)

cv_scores.mean()

CPU times: user 4min 48s, sys: 2.24 s, total: 4min 50s
Wall time: 48 s


-93580.13367321526

In [20]:
%%time

model = make_pipeline(
    StandardScaler(),
    NMSlibTransformer(n_neighbors=10, method='hnsw', metric="l1"),
    KNeighborsRegressor(n_neighbors=3, metric="precomputed"),
)
model.fit(
    X_train.iloc[:, :16], 
    y_train,
)
y_pred = model.predict(X_test.iloc[:, :16])
np.sqrt(mean_squared_error(y_test, y_pred))

CPU times: user 1min 12s, sys: 528 ms, total: 1min 13s
Wall time: 10.7 s


88288.55653283234