# Find best model

### Imports

In [98]:
import __main__ as main

from helpers.training import *
from pathlib import Path
from joblib import load
from sklearn.model_selection import train_test_split
from datetime import datetime
from helpers.is_interactive import is_interactive

### Run dependency notebooks

In [99]:
# Execute the upstream data-wrangling and scaling notebooks before continuing,
# but only when is_interactive(main) is truthy.
# NOTE(review): presumably is_interactive() tells a live notebook session apart
# from this notebook being %run by another one — confirm in helpers.is_interactive.
# NOTE(review): `-p` comes after the filename, so IPython should pass it to the
# target notebook as an argument rather than parse it as a %run option —
# confirm what the dependency notebooks do with it.
if is_interactive(main):
    %run 01_0_data_wrangling.ipynb -p
    %run 01_2_data_wrangling_kaggle.ipynb -p
    %run 02_0_scaling.ipynb -p
    %run 02_1_scaling_kaggle.ipynb -p

Running previous notebooks...


### Load DataFrame

In [100]:
# Read the pickled training bundle and pull out the prepared dataset.
source_path = Path('./data/model/02_train_data.pkl')
training_data = load(source_path)
df = training_data['dataset']

# Separate the regression target from the feature columns.
y = df['price_cleaned']
X = df.drop(columns='price_cleaned')

# Reproducible 80/20 train/test split (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

# Preview the first rows of the full dataset.
df.head(n=10)

Unnamed: 0,Living space,Plot area,price_cleaned,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_workers_total_2,gde_workers_total_3,gde_workers_total_4,gde_workers_total_5,gde_workers_total_6,rooms_2,rooms_3,rooms_4,rooms_5,rooms_6
0,0.011759,0.003654,1150000.0,0.285714,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
1,0.018764,0.000633,1420000.0,0.241071,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
2,0.010883,0.003417,720000.0,0.214286,0.182127,0.09593,0.001911,0.799258,0.468164,0.0,...,0.005103043,0.0003645388,2.604104e-05,1.860257e-06,1.328885e-07,0.020408,0.002915,0.000416,5.9e-05,8e-06
3,0.018514,0.001054,1430000.0,0.142857,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
4,0.017013,0.001318,995000.0,0.142857,0.372216,0.279429,0.145835,0.803051,0.470341,0.0,...,8.15219e-06,2.327617e-08,6.64582e-11,1.897517e-13,5.417799e-16,0.020408,0.002915,0.000416,5.9e-05,8e-06
5,0.023017,0.003029,2160000.0,0.142857,0.212473,0.162927,0.034759,0.801165,0.461133,0.0,...,4.101045e-06,8.30504e-09,1.681857e-11,3.405934e-14,6.897369e-17,0.020408,0.002915,0.000416,5.9e-05,8e-06
6,0.014761,0.00057,550000.0,0.241071,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
7,0.01326,0.000307,590000.0,0.25,0.05723,0.0,0.0,0.794885,0.467948,0.187309,...,0.005103043,0.0003645388,2.604104e-05,1.860257e-06,1.328885e-07,0.020408,0.002915,0.000416,5.9e-05,8e-06
8,0.008631,0.000118,547000.0,0.200746,0.308985,0.438584,0.327528,0.801046,0.474388,0.088907,...,0.005103043,0.0003645388,2.604104e-05,1.860257e-06,1.328885e-07,0.020408,0.002915,0.000416,5.9e-05,8e-06
9,0.01301,0.00603,1125000.0,0.142857,0.156497,0.140161,0.15665,0.798179,0.461295,0.0,...,4.101045e-06,8.30504e-09,1.681857e-11,3.405934e-14,6.897369e-17,0.020408,0.002915,0.000416,5.9e-05,8e-06


## Train all Models

In [101]:
# Trainers to evaluate. Each entry is invoked as
# func(X_train, X_test, y_train, y_test) and contributes one result row.
TRAINING_FUNCTIONS = [train_gradient_boosting_v1]

# Candidates that are currently switched off (add them to the list to enable):
#   train_gradient_boosting
#   train_gradient_boosting_robust
#   train_random_forest
#   train_linear_regression
#   train_mlp_regressor

In [102]:
# Run every enabled trainer on the same train/test split and keep what each
# one returns, in the order of TRAINING_FUNCTIONS.
results = [
    train_fn(X_train, X_test, y_train, y_test) for train_fn in TRAINING_FUNCTIONS
]

Training GradientBoostingRegressor with -1 jobs
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 2/5; 1/1] START loss=absolute_error, max_depth=21, max_features=sqrt, min_samples_leaf=13, min_samples_split=1, n_estimators=130, random_state=42
[CV 3/5; 1/1] START loss=absolute_error, max_depth=21, max_features=sqrt, min_samples_leaf=13, min_samples_split=1, n_estimators=130, random_state=42
[CV 4/5; 1/1] START loss=absolute_error, max_depth=21, max_features=sqrt, min_samples_leaf=13, min_samples_split=1, n_estimators=130, random_state=42
[CV 1/5; 1/1] START loss=absolute_error, max_depth=21, max_features=sqrt, min_samples_leaf=13, min_samples_split=1, n_estimators=130, random_state=42
[CV 5/5; 1/1] START loss=absolute_error, max_depth=21, max_features=sqrt, min_samples_leaf=13, min_samples_split=1, n_estimators=130, random_state=42
[CV 2/5; 1/1] END loss=absolute_error, max_depth=21, max_features=sqrt, min_samples_leaf=13, min_samples_split=1, n_estimators=130, random_stat

In [103]:
# Show the collected training results as a table, one row per trainer.
# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it
# arrives through `from helpers.training import *` — confirm, or import pandas
# directly in the imports cell.
pd.DataFrame(results)

Unnamed: 0,num_columns,score,model
0,309,0.271276,"GridSearchCV(cv=5, estimator=GradientBoostingR..."


In [104]:
# Take the first result as the "best" one and unpack its fitted model.
# NOTE(review): this is positional, not a selection by score — it is only the
# best model while TRAINING_FUNCTIONS holds a single trainer. If several are
# enabled, pick the row by its 'score' instead (confirm first whether a lower
# or a higher score is better).
best_model_row = results[0]
best_model = best_model_row['model']

In [105]:
# Hyperparameter combination chosen by the grid search held in best_model.
best_model.best_params_

{'loss': 'absolute_error',
 'max_depth': 21,
 'max_features': 'sqrt',
 'min_samples_leaf': 13,
 'min_samples_split': 1,
 'n_estimators': 130,
 'random_state': 42}

In [106]:
# Cross-validated score of the chosen hyperparameter combination.
# NOTE(review): the value is negative, so presumably a negated error metric is
# used as the scorer — confirm in helpers.training.
best_model.best_score_

-0.26968691078346096

In [107]:
# Read the pickled Kaggle bundle and unpack the pieces used further down:
# the feature frame plus the imputer and scaler objects stored alongside it.
validation_data = load(Path('data/kaggle/02_model_data.pkl'))
validation_df, base_imputer, scaler = (
    validation_data[key] for key in ('dataset', 'base_imputer', 'scaler')
)

# Preview the first rows of the Kaggle features.
validation_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,gde_workers_total_2,gde_workers_total_3,gde_workers_total_4,gde_workers_total_5,gde_workers_total_6,rooms_2,rooms_3,rooms_4,rooms_5,rooms_6
0,0.02677,0.002089,0.142857,0.183264,0.100085,0.063548,0.786382,0.475555,0.014734,0.0,...,0.0001242706,1.385328e-06,1.544318e-08,1.721555e-10,1.919134e-12,0.020408,0.002915,0.000416,5.9e-05,8e-06
1,0.028021,0.002,0.142857,0.29082,0.170527,0.083253,0.786102,0.474934,0.010139,0.0,...,0.0001242706,1.385328e-06,1.544318e-08,1.721555e-10,1.919134e-12,0.020408,0.002915,0.000416,5.9e-05,8e-06
2,0.015637,0.000373,0.142857,0.483981,0.35818,0.125505,0.807571,0.477015,0.0,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06
3,0.016763,0.000587,0.241071,0.165213,0.076652,0.0,0.787002,0.475789,0.020076,0.0,...,0.0001242706,1.385328e-06,1.544318e-08,1.721555e-10,1.919134e-12,0.020408,0.002915,0.000416,5.9e-05,8e-06
4,0.018764,0.000633,0.241071,0.569894,0.286608,0.090908,0.808677,0.477811,0.0,0.0,...,4.489742e-07,3.008376e-10,2.015778e-13,1.350683e-16,9.050323e-20,0.020408,0.002915,0.000416,5.9e-05,8e-06


In [108]:
# Per-column count of missing values in the Kaggle features — a sanity check
# before handing the frame to the model.
validation_df.isna().sum()

Living space      0
Plot area         0
Floor             0
ForestDensityL    0
ForestDensityM    0
                 ..
rooms_2           0
rooms_3           0
rooms_4           0
rooms_5           0
rooms_6           0
Length: 309, dtype: int64

In [109]:
# Build the submission table: the row index of the Kaggle frame becomes the
# 'Id' column and the model output becomes the 'Expected' column.
predictions = pd.DataFrame(
    dict(
        Id=list(validation_df.index),
        Expected=best_model.predict(validation_df),
    )
)

# Preview the first predictions.
predictions.head(n=5)

Unnamed: 0,Id,Expected
0,0,2110181.0
1,1,1776337.0
2,2,1034073.0
3,3,1012864.0
4,4,1242594.0


In [110]:
# Submission experiment: every row whose Id appears in the stored id collection
# gets a fixed prediction of 150_000 instead of the model output.
# NOTE(review): confirm which notebook writes 03_ids_type_nan.pkl and why
# 150_000 was chosen as the replacement value.
ids_type_nan = load(Path('data/model/03_ids_type_nan.pkl'))
type_nan_mask = predictions["Id"].isin(ids_type_nan)
predictions.loc[type_nan_mask, "Expected"] = 150_000

In [111]:
def getClassName(obj):
    """Return the name of the class ``obj`` is an instance of.

    Example: ``getClassName({})`` gives ``'dict'``.
    """
    cls = type(obj)
    return cls.__name__

def getFormattedDate(date=None):
    """Format a datetime as ``YYYYMMDD_HHMM`` for use in file names.

    Args:
        date: The datetime to format. When omitted (``None``), the current
            local time at the moment of the call is used.

    Returns:
        The formatted timestamp string, e.g. ``'20240131_0930'``.
    """
    # Resolve "now" at call time. A ``datetime.now()`` default in the signature
    # is evaluated once, when the function is defined, so later calls would
    # keep returning the time the cell was executed.
    if date is None:
        date = datetime.now()
    return date.strftime('%Y%m%d_%H%M')

In [112]:
# Encode the run configuration in the submission file name: timestamp, class
# names of the estimator, scaler and imputer, the selected hyperparameters and
# the score reported by the training function (3 decimals).
# NOTE(review): the dict repr of best_params_ puts quotes, colons, spaces and
# braces into the file name; that is not a valid file name on Windows and can
# get long for bigger grids — consider a compact, sanitized encoding.
filename = f'{getFormattedDate()}_{getClassName(best_model.estimator)}_{getClassName(scaler)}_{getClassName(base_imputer)}_{best_model.best_params_}_{best_model_row["score"]:.3f}'
path = Path(f'data/kaggle/submissions/{filename}.csv')

# Create the submissions folder on demand so the export also works on a fresh
# checkout where the directory does not exist yet.
path.parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(path, index=False)