In [1]:
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
rmse = lambda y_1, y_2: mean_squared_error(y_1, y_2) ** 0.5

In [3]:
def initial_preparations(line):
    y, m, d = line['date'].split('-')
    line['year'] = int(y)
    line['month'] = int(m)
    line['day'] = int(d)
    
    return line

In [4]:
x = pd.read_csv('./data/train_data.csv').apply(initial_preparations, axis=1).drop(['index', 'id', 'date'], axis=1)
x = pd.get_dummies(x, columns=[]).values[:, 1:]
y = pd.read_csv('./data/train_target.csv').values[:,1]

validation_data = pd.read_csv('./data/test_data.csv').apply(initial_preparations, axis=1).drop(['id', 'date'], axis=1)
idx_submittion = validation_data['index'].values.astype(int)
validation_data = pd.get_dummies(validation_data.drop(['index'], axis=1), columns=[])

x_submittion = validation_data.values[:, 1:]

В качествуе предобработки выполняется преобразование даты в отдельные категориальные признаки --- год и месяц объявления.

In [5]:
train_size = int(x.shape[0] * 0.66)
x_train, x_test, y_train, y_test = x[:train_size], x[train_size:], y[:train_size], y[train_size:]

Выполняется разбиение на обучающую и тестовую выборку для проведения локального контроля

In [6]:
preprocess = lambda x : x

In [7]:
x_pr, x_train_pr, x_test_pr, x_submittion_pr = preprocess(x), preprocess(x_train), preprocess(x_test), preprocess(x_submittion)

Различные преобразования признаков не показали увеличения качества на отложенной выборке, поэтому никаких изменений с данными не производится.

In [8]:
model = GradientBoostingRegressor(
    n_estimators=10000, learning_rate=0.008, max_depth=5, max_features='log2', min_samples_leaf=16, loss='huber', subsample=0.8
)

In [9]:
model.fit(x_train_pr, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.008, loss='huber', max_depth=5,
             max_features='log2', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=16, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10000,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=0.8, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

In [10]:
print("Train: {0:.4f}".format(rmse(model.predict(x_train_pr), y_train) ))
print("Test: {0:.4f}".format(rmse(model.predict(x_test_pr), y_test)))

Train: 80.2182
Test: 116.2049


Выполняется обучение и локальное тестирование модели.

In [11]:
model.fit(x_pr, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.008, loss='huber', max_depth=5,
             max_features='log2', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=16, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10000,
             n_iter_no_change=None, presort='auto', random_state=None,
             subsample=0.8, tol=0.0001, validation_fraction=0.1, verbose=0,
             warm_start=False)

Обучение финальной модели и создание submittion для тестирующей системы.

In [12]:
print("All data: {0:.4f}".format(rmse(model.predict(x_pr), y)))

All data: 79.5060


In [13]:
y_submittion = model.predict(x_submittion_pr)

In [14]:
with open('submission.csv', 'w') as file:
    file.write('index,price\n')
    for idx in range(y_submittion.shape[0]):
        file.write("{0:d},{1:f}\n".format(idx_submittion[idx], y_submittion[idx]))