In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score as r2
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_rows=100
pd.options.display.max_columns = 200

In [26]:
data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [27]:
test.shape, data.shape

((5000, 19), (10000, 20))

In [28]:
data.loc[data['HouseYear']==20052011, 'HouseYear'] = 2011
data = data.loc[data['HouseYear']<=2020]
data['PriceByMeter'] = data['Price']/data['Square']
data['Old'] = (data['HouseYear'] < 1950).astype('int64')
data['Square'] = (data['Square']/10).astype('int64')*10
data['IsFirstFloor'] = (data['Floor'] == 1).astype('int64')
data['IsBig'] = (data['Square'] >= 100).astype('int64')
data.loc[data['HouseFloor']<data['Floor'],'HouseFloor'] = data['Floor']
data = data.drop(['Healthcare_1'], axis=1)

square_mean_1 = data.loc[data['Rooms']<=1, 'Square'].mean()
square_mean_3 = data.loc[data['Rooms']==3, 'Square'].mean()
data.loc[(data['Square'] < 15) & (data['LifeSquare'] < 15) & (data['Rooms']<=1), 'Square'] = square_mean_1
data.loc[(data['Square'] < 15) & (data['LifeSquare'] < 15) & (data['Rooms']==3), 'Square'] = square_mean_3
data.loc[(data['Square'] > 15) & (data['LifeSquare'] < 15), 'LifeSquare'] = data['Square']
data.loc[data['Square'] < data['LifeSquare'], 'LifeSquare'] = data['Square']
data['LifeSquare'] = data['LifeSquare'].fillna(data['Square'])

In [29]:
test.loc[test['HouseYear']==20052011, 'HouseYear'] = 2011
test['Old'] = (test['HouseYear'] < 1950).astype('int64')
test['Square'] = (test['Square']/10).astype('int64')*10
test['IsFirstFloor'] = (test['Floor'] == 1).astype('int64')
test['IsBig'] = (test['Square'] >= 100).astype('int64')
test.loc[test['HouseFloor']<test['Floor'],'HouseFloor'] = test['Floor']
test = test.drop(['Healthcare_1'], axis=1)

test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']<=1), 'Square'] = square_mean_1
test.loc[(test['Square'] < 15) & (test['LifeSquare'] < 15) & (test['Rooms']==3), 'Square'] = square_mean_3
test.loc[(test['Square'] > 15) & (test['LifeSquare'] < 15), 'LifeSquare'] = test['Square']
test.loc[test['Square'] < test['LifeSquare'], 'LifeSquare'] = test['Square']
test['LifeSquare'] = test['LifeSquare'].fillna(test['Square'])

In [30]:
data = pd.get_dummies(data,columns=['Ecology_2','Ecology_3','Shops_2'])
test = test.reindex(columns = data.columns, fill_value=0)

In [31]:
test.shape, data.shape

((5000, 26), (9999, 26))

In [32]:
target = data.loc[:,['Price']]

In [33]:
train, valid, y_train, y_valid = train_test_split(data,target, test_size=0.2, random_state=42)
columns = train.columns
columns

Index(['Id', 'DistrictId', 'Rooms', 'Square', 'LifeSquare', 'KitchenSquare',
       'Floor', 'HouseFloor', 'HouseYear', 'Ecology_1', 'Social_1', 'Social_2',
       'Social_3', 'Helthcare_2', 'Shops_1', 'Price', 'PriceByMeter', 'Old',
       'IsFirstFloor', 'IsBig', 'Ecology_2_A', 'Ecology_2_B', 'Ecology_3_A',
       'Ecology_3_B', 'Shops_2_A', 'Shops_2_B'],
      dtype='object')

In [34]:
fts = train.columns
fts = fts.drop(['PriceByMeter','Price','Id','DistrictId'])

In [35]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(learning_rate=0.1,random_state=42)

In [36]:
parameters_grid = {'max_depth' : [8,10] ,
                   'max_features' : [6,3,4],
                   'n_estimators' : [300,310],
                  'loss': ['lad']}
                  

In [37]:
grid = GridSearchCV(model,parameters_grid,n_jobs=8,verbose=2)

In [38]:
%%time
grid.fit(train.loc[:,fts],y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  36 out of  36 | elapsed:   36.1s finished


Wall time: 39.3 s


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...te=42, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=8,
       param_grid={'max_depth': [8, 10], 'max_features': [6, 3, 4], 'n_estimators': [300, 310], 'loss': ['lad']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=2)

In [39]:
grid.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='lad', max_depth=8, max_features=6,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=310, n_iter_no_change=None, presort='auto',
             random_state=42, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [40]:
grid.best_score_

0.7348712026993701

In [41]:
grid.best_params_

{'loss': 'lad', 'max_depth': 8, 'max_features': 6, 'n_estimators': 310}

In [42]:
predicted_train = grid.predict(train.loc[:,fts])
r2(y_train,predicted_train)

0.8697302034755796

In [43]:
predicted = grid.predict(valid.loc[:,fts])
r2(y_valid,predicted)

0.7389927726049811

In [44]:
predicted_test = grid.predict(test.loc[:,fts])
predicted_test

array([155901.40777033, 187683.50067147, 158248.20884621, ...,
       327868.96400003, 197900.62317788, 185372.15306278])

In [45]:
test['Price'] = predicted_test

In [48]:
test.loc[:,['Id','Price']].to_csv('SNevzorov_predictions.csv',index=None)