In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.metrics import r2_score as r2

warnings.filterwarnings('ignore')
%matplotlib inline
pd.options.display.max_columns = 100

# Подготовка данных

In [2]:
def fill_Floor(df):
    """Repair floor and construction-year columns and add a relative-floor feature.

    The frame is modified in place and the same object is returned.

    Steps, in order:
      * ``Floor`` equal to 0 becomes 1.
      * ``HouseFloor`` equal to 0 is replaced by the row's own ``Floor``.
      * ``Floor_rel`` is created as ``Floor / HouseFloor``.
      * ``HouseYear`` above 2020, then below 1900, is replaced by the column
        median (the median is recomputed before each replacement).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Floor``, ``HouseFloor`` and ``HouseYear``.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in, after modification.
    """
    zero_floor = df['Floor'] == 0
    df.loc[zero_floor, 'Floor'] = 1

    zero_height = df['HouseFloor'] == 0
    df.loc[zero_height, 'HouseFloor'] = df['Floor']

    df['Floor_rel'] = df['Floor'].div(df['HouseFloor'])

    year_too_late = df['HouseYear'] > 2020
    df.loc[year_too_late, 'HouseYear'] = df['HouseYear'].median()

    # This median is taken after the late years above were already replaced.
    year_too_early = df['HouseYear'] < 1900
    df.loc[year_too_early, 'HouseYear'] = df['HouseYear'].median()
    return df

def clean_rooms(df, source_df):
    """Replace room counts of 6 or more with the median room count of a reference frame.

    ``df`` is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; must contain ``Rooms``.
    source_df : pandas.DataFrame
        Frame whose ``Rooms`` median is used as the replacement value.

    Returns
    -------
    pandas.DataFrame
        The frame that was passed in as ``df``.
    """
    typical_rooms = source_df['Rooms'].median()
    too_many = df['Rooms'] >= 6
    df.loc[too_many, 'Rooms'] = typical_rooms
    return df

def clean_square(df, source_df):
    """Replace implausible ``Square`` values with the median area for the same room count.

    A reference area ``Square_ref`` (median ``Square`` per ``Rooms`` value in
    ``source_df``) is left-joined onto ``df``. Rows whose room count has no
    reference use their own ``Square`` as reference. A ``Square`` smaller than
    half, or larger than twice, its reference is replaced by the reference.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean; must contain ``Rooms`` and ``Square``.
    source_df : pandas.DataFrame
        Frame the per-room medians are computed from.

    Returns
    -------
    pandas.DataFrame
        A new frame (result of the merge) with the extra ``Square_ref`` column.
    """
    reference = (
        source_df.groupby(['Rooms'], as_index=False)['Square']
        .median()
        .rename(columns={'Square': 'Square_ref'})
    )
    df = df.merge(reference, on='Rooms', how='left')

    # Room counts unseen in source_df: fall back to the row's own area.
    no_reference = df['Square_ref'].isnull()
    df.loc[no_reference, 'Square_ref'] = df['Square']

    too_small = df['Square'] < 0.5 * df['Square_ref']
    df.loc[too_small, 'Square'] = df['Square_ref']

    too_large = df['Square'] > 2 * df['Square_ref']
    df.loc[too_large, 'Square'] = df['Square_ref']
    return df

def dist_price(df, source_df):
    """Add mean-price features derived from a reference frame.

    Two columns are left-joined onto ``df``:

    * ``price_by_district`` - mean ``Price`` in ``source_df`` for the same
      (``DistrictId``, ``Rooms``) pair;
    * ``price_by_rooms`` - mean ``Price`` in ``source_df`` for the same
      ``Rooms`` value.

    Missing values are then filled:

    * ``price_by_rooms`` gets the mean price of 4-room flats in ``source_df``;
      if ``source_df`` has no 4-room flats, the overall mean ``Price`` of
      ``source_df`` is used instead;
    * ``price_by_district`` gets the row's ``price_by_rooms``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; must contain ``DistrictId`` and ``Rooms``.
    source_df : pandas.DataFrame
        Frame with ``DistrictId``, ``Rooms`` and ``Price`` used to compute the means.

    Returns
    -------
    pandas.DataFrame
        A new frame (result of the merges) with the two extra columns.
    """
    room_dist = source_df.groupby(['DistrictId', 'Rooms'], as_index=False)['Price'].mean()
    room_dist = room_dist.rename(columns={'Price': 'price_by_district'})
    room_mean = source_df.groupby(['Rooms'], as_index=False)['Price'].mean()
    room_mean = room_mean.rename(columns={'Price': 'price_by_rooms'})

    df = pd.merge(df, room_dist, on=['DistrictId', 'Rooms'], how='left')
    df = pd.merge(df, room_mean, on='Rooms', how='left')

    missing_rooms = df['price_by_rooms'].isnull()
    if missing_rooms.any():
        # The fallback is looked up only when something has to be filled, and
        # it no longer assumes that 4-room flats exist in source_df.
        four_room = room_mean.loc[room_mean['Rooms'] == 4, 'price_by_rooms']
        if len(four_room) > 0:
            fallback = four_room.values[0]
        else:
            fallback = source_df['Price'].mean()
        df.loc[missing_rooms, 'price_by_rooms'] = fallback

    df.loc[df['price_by_district'].isnull(), 'price_by_district'] = df['price_by_rooms']
    return df

def other_features(df):
    """Encode ``Ecology_2`` as numbers and one-hot encode the remaining text columns.

    ``Ecology_2`` values ``'A'`` and ``'B'`` become 1 and 2. When every
    non-missing value is one of these two labels, the column is replaced by a
    numeric column, so ``pd.get_dummies`` leaves it alone and the name
    ``Ecology_2`` stays available. Previously the column kept its object dtype
    after the replacement and could be one-hot encoded away.

    If other labels are present, the two known labels are still replaced in
    place and the column is left for ``pd.get_dummies`` to handle.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Ecology_2`` column; it is modified in place
        before the one-hot encoding.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pd.get_dummies``.
    """
    ecology_codes = {'A': 1, 'B': 2}
    encoded = df['Ecology_2'].map(ecology_codes)

    # Values that were present but are not a known label.
    unexpected = encoded.isnull() & df['Ecology_2'].notnull()
    if unexpected.any():
        for label, code in ecology_codes.items():
            df.loc[df['Ecology_2'] == label, 'Ecology_2'] = code
    else:
        df['Ecology_2'] = encoded

    df = pd.get_dummies(df)
    return df

In [3]:
def prepare_data(df, source_df):
    """Run the full feature-preparation pipeline on ``df``.

    The steps run in this order: ``fill_Floor``, ``clean_rooms``,
    ``clean_square``, ``dist_price``, ``other_features``. ``source_df`` is the
    reference frame handed to the steps that take one.

    The first two steps assign into the frame they receive, so the frame
    passed as ``df`` is modified; when the same object is passed for both
    arguments, the later steps see those modifications in ``source_df`` too.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prepare.
    source_df : pandas.DataFrame
        Reference frame for medians and mean prices.

    Returns
    -------
    pandas.DataFrame
        The prepared frame.
    """
    return (
        df.pipe(fill_Floor)
        .pipe(clean_rooms, source_df)
        .pipe(clean_square, source_df)
        .pipe(dist_price, source_df)
        .pipe(other_features)
    )


In [4]:
# Read the labelled training set and prepare it, using the same frame as the
# reference for medians and mean prices.
data = pd.read_csv('train.csv')
data = prepare_data(df=data, source_df=data)

In [5]:
# Hold out 30% of the prepared rows for validation (fixed seed).
# NOTE(review): `data` was prepared before this split, so price_by_district
# and price_by_rooms were averaged over `Price` of rows that end up in `valid`
# as well; the validation score is therefore likely optimistic.
train, valid = train_test_split(
    data,
    test_size=0.3,
    random_state=28,
)

# Модель и ее валидация

Выбираем целевую переменную и признаки.
Логарифм цены в качестве целевой переменной плохо себя показал.
Признаки с объектными (строковыми) значениями можно не учитывать — на точность они практически не влияют.
Вместо этажа используем относительный этаж, вместо района - среднюю цену по району.

In [6]:
# Column to predict, kept as a one-element list of column names.
target = ['Price']

# Columns fed to the model.
features = [
    'Floor_rel',
    'Ecology_1',
    'Ecology_2',
    'HouseYear',
    'Social_1',
    'Social_2',
    'Social_3',
    'Shops_1',
    'price_by_district',
    'Square',
    'HouseFloor',
    'price_by_rooms',
]

Линейная регрессия показала себя хуже. Гиперпараметры случайного леса подбирал через GridSearch.

In [7]:
# Random forest regressor with a fixed seed.
model = RF(n_estimators=150, max_depth=8, max_features=8, random_state=1)
# Pass the target as a 1-D array: a one-column DataFrame makes scikit-learn
# emit a column-vector DataConversionWarning.
model.fit(train[features], train[target].values.ravel())
# Predictions on the data the model was fitted on and on the held-out part.
pred_train = model.predict(train[features])
pred_valid = model.predict(valid[features])

In [8]:
# R^2 on the rows the model was fitted on.
r2(y_true=train['Price'], y_pred=pred_train)

0.8347854178175511

In [9]:
# R^2 on the held-out validation rows.
r2(y_true=valid['Price'], y_pred=pred_valid)

0.7449915963954594

In [10]:
# Final fit before predicting the test set. Refitting on `train` would only
# reproduce the model from the validation step, so all labelled rows are used
# here; the test features are likewise built from the whole of `data`.
model.fit(data[features], data[target].values.ravel())

# Обработка тестовых данных

In [11]:
# Read the unlabelled test set.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [12]:
# Build test features, taking medians and mean prices from the prepared
# training frame.
# NOTE(review): this cell is not re-runnable on an already prepared `test`
# (the merges would meet existing Square_ref / price_* columns); re-read
# test.csv before running it again.
test = prepare_data(df=test, source_df=data)

In [13]:
# Store the model's predictions as the test set's Price column.
test['Price'] = model.predict(test[features])

In [14]:
# Write the Id / Price pairs to the submission file, without the index.
submission = test[['Id', 'Price']]
submission.to_csv('ANebavskiy_predictions.csv', index=False)