In [1]:
import numpy as np
import pandas as pd

### Чтение данных

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
dtypes: float64(8), int64(9), object(3)
memory usage: 1.5+ MB


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


### Предобработка данных

In [5]:
def prepare_data(data, train_list=None):
    data = data.drop('Healthcare_1', axis=1)
    
    # Корректировка года постройки
    data.loc[data['HouseYear'] == 4968, 'HouseYear'] = 1968
    data.loc[data['HouseYear'] == 20052011, 'HouseYear'] = 2008
    
    # Корректировка кол-ва комнат
    data.loc[data['Rooms'] == 10, 'Rooms'] = 2
    data.loc[data['Rooms'] == 19, 'Rooms'] = 1
    
    # Корректировка номера этажа
    data.loc[data['HouseFloor'] == 0, 'HouseFloor'] = np.round(data.loc[data['HouseFloor'] != 0, 'HouseFloor'].median())
    data.loc[data['Floor'] > data['HouseFloor'], 'Floor'] = data['HouseFloor']
    
    # Вычисление средней доли площади кухни и жилой площади в общей площади, средней площади комнаты, средней нежилой площади
    data['KitchenSqFrac'] = data['KitchenSquare'] / data['Square']
    data['LifeSqFrac'] = data['LifeSquare'] / data['Square']
    data['RoomSquare'] = data['LifeSquare'] / data['Rooms']
    data['EmptySquare'] = data['Square'] - data['LifeSquare']
    
    if train_list is None: # для трейна
        kitchen_square_frac_median = data['KitchenSqFrac'].median()
        life_square_frac_median = data['LifeSqFrac'].median()
        room_square_median = data['RoomSquare'].median()
        empty_square_median = data['EmptySquare'].median()
    
    else: # для теста
        kitchen_square_frac_median = train_list[0]
        life_square_frac_median = train_list[1]
        room_square_median = train_list[2]
        empty_square_median = train_list[3]    
    
    # Корректировка площадей квартиры и количества комнат
    data.loc[data['Rooms'] == 0, 'Rooms'] = np.round(data['Square'] * life_square_frac_median / room_square_median)
    data.loc[data['Rooms'] == 0, 'Rooms'] = 1
    data.loc[data['Square'] > (data['Rooms'] * room_square_median + empty_square_median), 'Square'] = data['Rooms'] * room_square_median + empty_square_median
    data.loc[data['Square'] < room_square_median, 'Square'] = data['Rooms'] * room_square_median + empty_square_median

    data['LifeSquare'] = data['LifeSquare'].fillna(data['Square'] * life_square_frac_median)

    data['LifeSqFrac'] = data['LifeSquare'] / data['Square']
    data['KitchenSqFrac'] = data['KitchenSquare'] / data['Square']
    data['RoomSquare'] = data['LifeSquare'] / data['Rooms']
    data['EmptySquare'] = data['Square'] - data['LifeSquare']

    data.loc[data['LifeSqFrac'] < 0.45, 'LifeSquare'] = data['Square'] * life_square_frac_median
    data.loc[data['LifeSqFrac'] > 0.8, 'LifeSquare'] = data['Square'] * life_square_frac_median
    data.loc[data['KitchenSqFrac'] < 0.1, 'KitchenSquare'] = data['Square'] * kitchen_square_frac_median
    data.loc[data['KitchenSqFrac'] > 0.2, 'KitchenSquare'] = data['Square'] * kitchen_square_frac_median
    data.loc[data['KitchenSquare'] < 5, 'KitchenSquare'] = 5
    
    # Вычисленной средней стоимости квадратного метра по районам
    if train_list is None: # для трейна
        data['SqMeterPrice'] = data['Price'] / data['Square']
        stat1 = data.groupby(['DistrictId', 'Rooms'], as_index=False)[['SqMeterPrice']].mean().rename(columns={'SqMeterPrice':'MeanSqMeterPrice'})
        stat2 = data.groupby(['Rooms'], as_index=False)[['SqMeterPrice']].mean().rename(columns={'SqMeterPrice':'MeanSqMeterPrice2'})
        mean_price3 = data['SqMeterPrice'].mean()
        
    else: # для теста
        stat1 = train_list[4] 
        stat2 = train_list[5] 
        mean_price3 = train_list[6] 
        
    data = pd.merge(data, stat1, on=['DistrictId', 'Rooms'], how='left')
    data = pd.merge(data, stat2, on='Rooms', how='left')
    
    data['MeanSqMeterPrice2'] = data['MeanSqMeterPrice2'].fillna(mean_price3)
    data['MeanSqMeterPrice'] = data['MeanSqMeterPrice'].fillna(data['MeanSqMeterPrice2'])
    
    # Обработка категориальных признаков
    data = pd.get_dummies(data)
    
    return data, [kitchen_square_frac_median, life_square_frac_median, room_square_median, empty_square_median, stat1, stat2, mean_price3]

In [6]:
train, train_list = prepare_data(train)

In [7]:
test, _ = prepare_data(test, train_list)

### Моделирование

In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import r2_score

In [9]:
feats = ['Rooms', 'Square', 'LifeSquare', 'KitchenSquare', 'Floor', 'HouseFloor', 'HouseYear',
        'MeanSqMeterPrice', 'Ecology_1', 'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A', 'Ecology_3_B', 
        'Social_1', 'Social_2', 'Social_3', 'Helthcare_2', 'Shops_1', 'Shops_2_A', 'Shops_2_B']

In [10]:
x_train, x_val, y_train, y_val = train_test_split(train[feats], train['Price'], test_size=0.30, random_state=42)

In [11]:
scaler = RobustScaler()
x_train[feats] = scaler.fit_transform(x_train[feats])
x_val[feats] = scaler.transform(x_val[feats])

In [12]:
params = [{'n_estimators': [10, 50, 100, 500],
'max_depth': [1, 5, 10],
'learning_rate': [0.01, 0.1, 1.0]}]

In [13]:
clf = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100, max_depth=5, random_state=42)

In [14]:
clf.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [15]:
y_pred = clf.predict(x_train)
r2_score(y_train, y_pred)

0.8582900155046752

In [16]:
y_pred = clf.predict(x_val)
r2_score(y_val, y_pred)

0.7314518753386834

In [17]:
idx_array = np.argsort(clf.feature_importances_)[::-1]
print("Наиболее значимые признаки:", feats[idx_array[0]], feats[idx_array[1]], feats[idx_array[2]])
idx_array = np.argsort(clf.feature_importances_)
print("Наименее значимые признаки:", feats[idx_array[0]], feats[idx_array[1]], feats[idx_array[2]])

Наиболее значимые признаки: MeanSqMeterPrice LifeSquare HouseYear
Наименее значимые признаки: Ecology_2_A Shops_2_A Ecology_2_B


### Предсказание

In [18]:
test[feats] = scaler.transform(test[feats])

In [19]:
test['Price'] = clf.predict(test[feats])

In [20]:
test['Price'].describe()

count      5000.000000
mean     214938.655302
std       79507.188804
min       63964.566322
25%      162593.048529
50%      196267.452180
75%      246189.951058
max      568782.921963
Name: Price, dtype: float64

In [21]:
test.loc[:, ['Id', 'Price']].to_csv('VMaksimenko_predictions.csv', index=False)