# Boosting

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error, explained_variance_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
PATH = "../../Datasets/sberbank/"
tr_name = "train.csv"

In [3]:
tr = pd.read_csv(f"{PATH}{tr_name}")

In [4]:
tr.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [5]:
tr.price_doc.describe()

count    3.047100e+04
mean     7.123035e+06
std      4.780111e+06
min      1.000000e+05
25%      4.740002e+06
50%      6.274411e+06
75%      8.300000e+06
max      1.111111e+08
Name: price_doc, dtype: float64

In [6]:
tr.describe(include=['object'])

Unnamed: 0,timestamp,product_type,sub_area,culture_objects_top_25,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,water_1line,big_road1_1line,railroad_1line,ecology
count,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471
unique,1161,2,146,2,2,2,2,2,2,2,2,2,2,2,2,5
top,2014-12-16,Investment,Poselenie Sosenskoe,no,no,no,no,no,no,no,no,no,no,no,no,poor
freq,160,19448,1776,28543,28817,28155,30175,19600,29335,27649,29608,27427,28134,29690,29578,8018


In [7]:
tr['timestamp'] = pd.to_datetime(tr['timestamp'])
tr['hour'] = tr.timestamp.dt.hour
tr['month'] = tr.timestamp.dt.month
tr['year'] = tr.timestamp.dt.year
tr['dayofweek'] = tr.timestamp.dt.dayofweek
tr['day'] = tr.timestamp.dt.day
tr.drop(['timestamp'], axis = 1, inplace = True)

In [8]:
for col in tr.columns[tr.dtypes == 'object']:
    tr[col] = tr[col].map( dict( zip(tr[col].unique(), range(len(tr[col].unique()))) ) )

In [9]:
for col in tr.columns:
    tr[col] = tr[col].fillna(tr[col].median())

In [10]:
tr = tr.dropna(axis=0)

In [11]:
tr.shape

(30471, 296)

## Модель

In [12]:
X, Y = tr.drop('price_doc', axis = 1), tr.price_doc

In [36]:
model = GradientBoostingRegressor(max_depth = 5, n_estimators = 165)

In [37]:
def fit_and_res(model, x, y, show_params = False, fit = True):
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.25, random_state=42)
    if fit:
        model.fit(x_train, y_train)
    mse = mean_squared_error(y_val, model.predict(x_val))
    r2 = explained_variance_score(y_val, model.predict(x_val))
    return (model, r2, mse, np.sqrt(mse)) if show_params else (r2, mse, np.sqrt(mse))

In [38]:
res = []
fit_model, r2, mse, sqrt = fit_and_res(model, X, Y, True)
res.append([r2, mse, sqrt])
res = pd.DataFrame(np.array(res).T, columns = ['Boosting'], index = ['r2', 'mse', 'sqrt'])
res

Unnamed: 0,Boosting
r2,0.7162171
mse,5984122000000.0
sqrt,2446247.0


In [40]:
_, x_val, _, y_val = train_test_split(X, Y, test_size=0.25, random_state=42)

In [42]:
mean_squared_log_error(y_val, model.predict(x_val))

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [45]:
def rmsle(actual, predicted): 
    return np.sqrt(np.nansum(np.square(np.log(predicted + 1) - np.log(actual + 1)))/float(len(predicted)))

In [46]:
rmsle(y_val, model.predict(x_val))

  


0.4668699750541738

In [31]:
param_grid = {
    "max_depth" : [5],
    "n_estimators" : [165],
}

In [32]:
grid = GridSearchCV(model, param_grid, n_jobs=-1, cv=5)

In [33]:
%%time
fitting_grid = fit_and_res(grid, X, Y, show_params = True)

Wall time: 5min 24s


In [34]:
fitting_grid

({'max_depth': 5, 'max_features': 180, 'n_estimators': 165},
 0.7093226886014272,
 6129401060986.726,
 2475762.7230788344)