#### Проект

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('input/train.csv')

In [3]:
data.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2,Price
0,14038,35,2.0,47.981561,29.442751,6.0,7,9.0,1969,0.08904,B,B,33,7976,5,,0,11,B,184966.93073
1,15053,41,3.0,65.68364,40.049543,8.0,7,9.0,1978,7e-05,B,B,46,10309,1,240.0,1,16,B,300009.450063
2,4765,53,2.0,44.947953,29.197612,0.0,8,12.0,1968,0.049637,B,B,34,7759,0,229.0,1,3,B,220925.908524
3,5809,58,2.0,53.352981,52.731512,9.0,8,17.0,1977,0.437885,B,B,23,5735,3,1084.0,0,5,B,175616.227217
4,10783,99,1.0,39.649192,23.776169,7.0,11,12.0,1976,0.012339,B,B,35,5776,1,2078.0,2,4,B,150226.531644


In [4]:
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price'],
      dtype='object')

In [5]:
data.shape

(10000, 20)

In [6]:
data.loc[data['Ecology_2'] == 'A', 'Ecology' ] = 1
data.loc[data['Ecology_2'] != 'A', 'Ecology' ] = 0
data.loc[data['Ecology_3'] == 'A', 'Ecology' ] = (data['Ecology'] + 1)
data.loc[data['Ecology_1'] > 0, 'Ecology' ] = (data['Ecology'] + data['Ecology_1'])
data['Ecology'] = data['Ecology'].round(3)

data.loc[data['Shops_2'] == 'A', 'Shops' ] = (data['Shops_1'] + 1)
data.loc[data['Shops_2'] != 'A', 'Shops' ] = (data['Shops_1'])
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Ecology_2',
       'Ecology_3', 'Social_1', 'Social_2', 'Social_3', 'Healthcare_1',
       'Helthcare_2', 'Shops_1', 'Shops_2', 'Price', 'Ecology', 'Shops'],
      dtype='object')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
Id               10000 non-null int64
DistrictId       10000 non-null int64
Rooms            10000 non-null float64
Square           10000 non-null float64
LifeSquare       7887 non-null float64
KitchenSquare    10000 non-null float64
Floor            10000 non-null int64
HouseFloor       10000 non-null float64
HouseYear        10000 non-null int64
Ecology_1        10000 non-null float64
Ecology_2        10000 non-null object
Ecology_3        10000 non-null object
Social_1         10000 non-null int64
Social_2         10000 non-null int64
Social_3         10000 non-null int64
Healthcare_1     5202 non-null float64
Helthcare_2      10000 non-null int64
Shops_1          10000 non-null int64
Shops_2          10000 non-null object
Price            10000 non-null float64
Ecology          10000 non-null float64
Shops            10000 non-null float64
dtypes: float64(10), int64(9), obje

In [8]:
data = data.loc[data['Rooms'] < 10, :]
data = data.loc[data['Price'].between(30000, 600000), :]

In [9]:
data.shape

(9977, 22)

In [10]:
square_mean_1 = data.loc[data['Rooms']<=1, 'Square'].mean()

In [11]:
square_mean_3 = data.loc[data['Rooms']==3, 'Square'].mean()

In [12]:
data.loc[(data['Square'] < 15) & (data['LifeSquare'] < 15) & (data['Rooms']<=1), 'Square'] = square_mean_1

data.loc[(data['Square'] < 15) & (data['LifeSquare'] < 15) & (data['Rooms']==3), 'Square'] = square_mean_3

data.loc[(data['Square'] > 15) & (data['LifeSquare'] < 15), 'LifeSquare'] = data['Square']

data.loc[data['Square'] < data['LifeSquare'], 'LifeSquare'] = data['Square']

data['LifeSquare'] = data['LifeSquare'].fillna(data['Square'])

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9977 entries, 0 to 9999
Data columns (total 22 columns):
Id               9977 non-null int64
DistrictId       9977 non-null int64
Rooms            9977 non-null float64
Square           9977 non-null float64
LifeSquare       9977 non-null float64
KitchenSquare    9977 non-null float64
Floor            9977 non-null int64
HouseFloor       9977 non-null float64
HouseYear        9977 non-null int64
Ecology_1        9977 non-null float64
Ecology_2        9977 non-null object
Ecology_3        9977 non-null object
Social_1         9977 non-null int64
Social_2         9977 non-null int64
Social_3         9977 non-null int64
Healthcare_1     5185 non-null float64
Helthcare_2      9977 non-null int64
Shops_1          9977 non-null int64
Shops_2          9977 non-null object
Price            9977 non-null float64
Ecology          9977 non-null float64
Shops            9977 non-null float64
dtypes: float64(10), int64(9), object(3)
memory usage: 2

In [13]:
data = data.drop('Healthcare_1', axis=1)
data = data.drop('Ecology_1', axis=1)
data = data.drop('Ecology_2', axis=1)
data = data.drop('Ecology_3', axis=1)
data = data.drop('Shops_1', axis=1)
data = data.drop('Shops_2', axis=1)
data = pd.get_dummies(data)

In [14]:
data.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Social_1', 'Social_2', 'Social_3',
       'Helthcare_2', 'Price', 'Ecology', 'Shops'],
      dtype='object')

In [15]:
train, valid = train_test_split(data, test_size=0.2, random_state=42)

In [16]:
mix_stat = train.groupby(['Ecology', 'Shops', 'Helthcare_2'])[['Price']].mean().reset_index().rename(columns={'Price': 'mean_price'})

In [17]:
mix_stat.shape

(135, 4)

In [18]:
train = pd.merge(train, mix_stat, on=['Ecology', 'Shops', 'Helthcare_2'], how='left')

In [19]:
train['mean_price'].isnull().sum()

0

In [20]:
valid = pd.merge(valid, mix_stat, on=['Ecology', 'Shops', 'Helthcare_2'], how='left')

In [21]:
valid['mean_price'].isnull().sum()

0

In [22]:
train.shape

(7981, 17)

In [23]:
valid.shape

(1996, 17)

In [24]:
train.columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Social_1', 'Social_2', 'Social_3',
       'Helthcare_2', 'Price', 'Ecology', 'Shops', 'mean_price'],
      dtype='object')

In [25]:
train.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Social_1,Social_2,Social_3,Helthcare_2,Price,Ecology,Shops,mean_price
0,2463,47,2.0,63.667916,35.294284,12.0,16,14.0,2007,8,2423,1,0,252679.385235,0.197,1.0,188618.571968
1,14008,26,1.0,49.992258,20.277695,11.0,6,30.0,2010,36,6714,2,0,239534.792071,0.0,2.0,245614.184734
2,5044,138,2.0,45.497826,28.410568,5.0,4,5.0,1963,36,6714,2,0,200379.489484,0.0,2.0,245614.184734
3,14767,58,2.0,48.739764,31.762801,6.0,3,12.0,1986,15,2787,2,0,185219.528845,0.061,7.0,280430.217657
4,3917,45,2.0,73.480915,73.480915,1.0,13,1.0,1977,23,5212,6,3,376697.121584,0.196,2.0,306792.783839


In [26]:
fts = ['Square', 'Rooms', 'LifeSquare', 'HouseYear', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops', 'Ecology', 'KitchenSquare', 'Floor', 'mean_price']

In [27]:
from sklearn.ensemble import RandomForestRegressor as RF

rf = RF(n_estimators=20, max_depth=10, min_samples_leaf=2, random_state=42)

In [28]:
rf.fit(train.loc[:, fts], train['Price'])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [29]:
pred = rf.predict(train.loc[:, fts])

In [30]:
pred

array([223058.29659098, 240019.35508536, 195195.82600332, ...,
       247348.93123663, 182429.51328935, 157595.69228561])

In [31]:
r2(train['Price'], pred)

0.8619408131397309

In [32]:
pred_valid = rf.predict(valid.loc[:, fts])

In [33]:
pred_valid

array([222951.75053682, 154430.48968687, 226265.71535934, ...,
       301196.67227981, 190248.87035931, 230222.06590598])

In [34]:
r2(valid['Price'], pred_valid)

0.7260915170049824

#### Предсказание на тесте

In [35]:
test = pd.read_csv('input/test.csv')

In [36]:
test.shape

(5000, 19)

In [37]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Ecology_1,Ecology_2,Ecology_3,Social_1,Social_2,Social_3,Healthcare_1,Helthcare_2,Shops_1,Shops_2
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,0.310199,B,B,11,2748,1,,0,0,B
1,15856,74,2.0,69.263183,,1.0,6,1.0,1977,0.075779,B,B,6,1437,3,,0,2,B
2,5480,190,1.0,13.597819,15.948246,12.0,2,5.0,1909,0.0,B,B,30,7538,87,4702.0,5,5,B
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,0.101872,B,B,23,4583,3,,3,3,B
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,0.072158,B,B,2,629,1,,0,0,A


In [38]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 19 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       3959 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Ecology_1        5000 non-null float64
Ecology_2        5000 non-null object
Ecology_3        5000 non-null object
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Healthcare_1     2623 non-null float64
Helthcare_2      5000 non-null int64
Shops_1          5000 non-null int64
Shops_2          5000 non-null object
dtypes: float64(7), int64(9), object(3)
memory usage: 742.3+ KB


In [39]:
test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']<=1), 'Square'] = square_mean_1

test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']==3), 'Square'] = square_mean_3

test.loc[(test['Square'] > 15) & (test['LifeSquare'] < 15), 'LifeSquare'] = test['Square']

test.loc[test['Square'] < test['LifeSquare'], 'LifeSquare'] = test['Square']

test.loc[test['Ecology_2'] == 'A', 'Ecology' ] = 1
test.loc[test['Ecology_2'] != 'A', 'Ecology' ] = 0
test.loc[test['Ecology_3'] == 'A', 'Ecology' ] = (test['Ecology'] + 1)
test.loc[test['Ecology_1'] > 0, 'Ecology' ] = (test['Ecology'] + test['Ecology_1'])
test['Ecology'] = test['Ecology'].round(3)

test.loc[test['Shops_2'] == 'A', 'Shops' ] = (test['Shops_1'] + 1)
test.loc[test['Shops_2'] != 'A', 'Shops' ] = (test['Shops_1'])

test['LifeSquare'] = test['LifeSquare'].fillna(data['Square'])
test = test.drop('Healthcare_1', axis=1)
test = test.drop('Ecology_1', axis=1)
test = test.drop('Ecology_2', axis=1)
test = test.drop('Ecology_3', axis=1)
test = test.drop('Shops_1', axis=1)
test = test.drop('Shops_2', axis=1)

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
Id               5000 non-null int64
DistrictId       5000 non-null int64
Rooms            5000 non-null float64
Square           5000 non-null float64
LifeSquare       4999 non-null float64
KitchenSquare    5000 non-null float64
Floor            5000 non-null int64
HouseFloor       5000 non-null float64
HouseYear        5000 non-null int64
Social_1         5000 non-null int64
Social_2         5000 non-null int64
Social_3         5000 non-null int64
Helthcare_2      5000 non-null int64
Ecology          5000 non-null float64
Shops            5000 non-null float64
dtypes: float64(7), int64(8)
memory usage: 586.0 KB


In [40]:
test['LifeSquare'] = test['LifeSquare'].fillna(test['LifeSquare'].mean())

In [41]:
test = pd.get_dummies(test)

In [42]:
test.loc[:, fts].head()

Unnamed: 0,Square,Rooms,LifeSquare,HouseYear,Social_1,Social_2,Social_3,Helthcare_2,Shops,Ecology,KitchenSquare,Floor
0,49.882643,2.0,33.432782,1972,11,2748,1,0,0.0,0.31,6.0,6
1,69.263183,2.0,65.68364,1977,6,1437,3,0,2.0,0.076,1.0,6
2,13.597819,1.0,13.597819,1909,30,7538,87,5,5.0,0.0,12.0,2
3,73.046609,2.0,51.940842,2007,23,4583,3,3,3.0,0.102,9.0,22
4,47.527111,1.0,43.387569,2017,2,629,1,0,1.0,0.072,1.0,17


In [43]:
pred_test = rf.predict(test.loc[:, fts])

In [44]:
pred_test

array([161635.88024157, 175684.23745322, 163774.38707848, ...,
       318474.0297589 , 201051.40154901, 224853.3103509 ])

In [45]:
pred_test.shape

(5000,)

In [46]:
test['Price'] = pred_test

In [47]:
test.head()

Unnamed: 0,Id,DistrictId,Rooms,Square,LifeSquare,KitchenSquare,Floor,HouseFloor,HouseYear,Social_1,Social_2,Social_3,Helthcare_2,Ecology,Shops,Price
0,725,58,2.0,49.882643,33.432782,6.0,6,14.0,1972,11,2748,1,0,0.31,0.0,161635.880242
1,15856,74,2.0,69.263183,65.68364,1.0,6,1.0,1977,6,1437,3,0,0.076,2.0,175684.237453
2,5480,190,1.0,13.597819,13.597819,12.0,2,5.0,1909,30,7538,87,5,0.0,5.0,163774.387078
3,15664,47,2.0,73.046609,51.940842,9.0,22,22.0,2007,23,4583,3,3,0.102,3.0,335740.737203
4,14275,27,1.0,47.527111,43.387569,1.0,17,17.0,2017,2,629,1,0,0.072,1.0,127928.581763


In [48]:
test.loc[:, ['Id', 'Price']].to_csv('output/YOgay_predictions.csv', index=None)