# Find best model

### Imports

In [2]:
import __main__ as main

from helpers.training_regressor import *
from helpers.feature_selection import select_features
from helpers.paths import Paths
from joblib import load, dump
from sklearn.model_selection import train_test_split
from datetime import datetime
from helpers.is_interactive import is_interactive

### Run dependency notebooks

In [3]:
# Only when is_interactive(main) is truthy, execute the four upstream
# wrangling/scaling notebooks, in order, via IPython's %run magic.
# NOTE(review): presumably these notebooks produce the artifacts loaded through
# Paths further down, and the guard avoids re-running them when this notebook
# is itself executed from another one — confirm against helpers.is_interactive.
if is_interactive(main):
    %run 01_0_data_wrangling.ipynb -p
    %run 01_1_data_wrangling_kaggle.ipynb -p
    %run 02_0_scaling.ipynb -p
    %run 02_1_scaling_kaggle.ipynb -p

Running previous notebooks...


### Load Dataframe

In [4]:
# Load the persisted (joblib) scaling bundle for the regressor and take its dataset frame.
training_data = load(Paths.REGRESSOR_SCALING_DATA)
df = training_data['dataset']

# 'price_cleaned' is the regression target; every other column is a feature.
y = df['price_cleaned']
X = df.drop(columns='price_cleaned')

df.head(n=10)

Unnamed: 0,Living space,Plot area,price_cleaned,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,...,WorkplaceDensity_2,WorkplaceDensity_3,WorkplaceDensity_4,WorkplaceDensity_5,WorkplaceDensity_6,ForestDensity_2,ForestDensity_3,ForestDensity_4,ForestDensity_5,ForestDensity_6
0,0.0125,0.003654,1150000.0,0.363636,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
1,0.0195,0.000633,1420000.0,0.306818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
2,0.011625,0.003417,720000.0,0.272727,0.799258,0.468164,0.462,0.034049,0.129342,0.366948,...,0.036712,0.007034,0.001348,0.0002582432,4.948057e-05,0.008129,0.000733,6.608701e-05,5.958613e-06,5.372472e-07
3,0.01925,0.001054,1430000.0,0.181818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
4,0.01775,0.001318,995000.0,0.181818,0.803051,0.470341,0.464541,0.054848,0.377475,0.566478,...,0.002871,0.000154,8e-06,4.415364e-07,2.3657e-08,0.068665,0.017993,0.00471484,0.001235474,0.000323743
5,0.02375,0.003029,2160000.0,0.181818,0.801165,0.461133,0.464079,0.118598,0.375855,0.606279,...,0.006233,0.000492,3.9e-05,3.067084e-06,2.421424e-07,0.017961,0.002407,0.0003226117,4.323646e-05,5.794556e-06
6,0.0155,0.00057,550000.0,0.306818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
7,0.014,0.000271,590000.0,0.318182,0.794885,0.467948,0.462,0.027599,0.129342,0.366948,...,0.120739,0.041954,0.014578,0.005065411,0.001760101,0.000314,6e-06,9.866563e-08,1.748668e-09,3.099195e-11
8,0.009375,0.000118,547000.0,0.255495,0.801046,0.474388,0.465696,0.038177,0.129342,0.366948,...,0.010812,0.001124,0.000117,1.215577e-05,1.263976e-06,0.129677,0.046698,0.01681615,0.006055616,0.00218067
9,0.01375,0.00603,1125000.0,0.181818,0.798179,0.461295,0.464079,0.106152,0.375855,0.606279,...,0.005998,0.000465,3.6e-05,2.78604e-06,2.157669e-07,0.022775,0.003437,0.0005186871,7.827657e-05,1.181294e-05


## Find the most relevant features to train the model

In [5]:
# Ask the project's feature selector which columns to keep, then restrict the
# feature frame to exactly those columns.
relevant_features = select_features(X, y)
X_relevant = X.loc[:, relevant_features]
X_relevant.head(n=5)

Unnamed: 0,Living space,Plot area,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,gde_area_nonproductive_percentage,...,WorkplaceDensity_2,WorkplaceDensity_3,WorkplaceDensity_4,WorkplaceDensity_5,WorkplaceDensity_6,ForestDensity_2,ForestDensity_3,ForestDensity_4,ForestDensity_5,ForestDensity_6
0,0.0125,0.003654,0.363636,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856,0.002717,0.000833447
1,0.0195,0.000633,0.306818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856,0.002717,0.000833447
2,0.011625,0.003417,0.272727,0.799258,0.468164,0.462,0.034049,0.129342,0.366948,0.08056,...,0.036712,0.007034,0.001348,0.0002582432,4.948057e-05,0.008129,0.000733,6.6e-05,6e-06,5.372472e-07
3,0.01925,0.001054,0.181818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856,0.002717,0.000833447
4,0.01775,0.001318,0.181818,0.803051,0.470341,0.464541,0.054848,0.377475,0.566478,0.013291,...,0.002871,0.000154,8e-06,4.415364e-07,2.3657e-08,0.068665,0.017993,0.004715,0.001235,0.000323743


In [6]:
# Columns present in X that the feature selection did not keep.
set(X.columns) - set(relevant_features)

{'Living space_6',
 'NoisePollutionRailway_6',
 'Plot area_4',
 'Plot area_5',
 'Plot area_6',
 'RiversAndLakes_6',
 'gde_population_5',
 'gde_population_6',
 'gde_private_apartments_5',
 'gde_private_apartments_6',
 'gde_workers_sector2_5',
 'gde_workers_sector2_6',
 'gde_workers_sector3_5',
 'gde_workers_sector3_6',
 'gde_workers_total_5',
 'gde_workers_total_6',
 'rooms_6'}

## Train test split

In [7]:
# 80% of the rows go to training, the remainder to testing; the fixed seed
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_relevant,
    y,
    train_size=0.8,
    random_state=42,
)

## Train all Models

In [8]:
# Training entry points to evaluate (presumably provided by the
# helpers.training_regressor star import — verify). Each is later called as
# func(X_train, X_test, y_train, y_test); list order fixes the order of results.
TRAINING_FUNCTIONS = [
    train_ridge_regression, train_gradient_boosting,
    train_random_forest, train_mlp_regressor,
]

In [9]:
# Run every training function on the same split and collect what each returns,
# in the order given by TRAINING_FUNCTIONS.
results = []
for train_model in TRAINING_FUNCTIONS:
    results.append(train_model(X_train, X_test, y_train, y_test))

Training Ridge with -1 jobs
Parameters: {'alpha': [0.05], 'tol': [0.1], 'random_state': [42]}
Training GradientBoostingRegressor with -1 jobs
Parameters: {'loss': ['absolute_error'], 'max_depth': [23], 'min_samples_split': [25], 'max_features': [0.5], 'min_samples_leaf': [10], 'n_estimators': [130], 'random_state': [42]}
Training RandomForestRegressor with -1 jobs
Parameters: {'min_samples_split': [9], 'min_samples_leaf': [6], 'max_features': [0.5], 'n_estimators': [130], 'random_state': [42]}


In [None]:
# Tabulate the collected training results, one row per training function,
# so the scores can be compared side by side.
pd.DataFrame(results)

Unnamed: 0,num_columns,score,best_params,model
0,231,0.579712,"{'alpha': 0.05, 'tol': 0.1, 'random_state': 42}","Ridge(alpha=0.05, random_state=42, tol=0.1)"
1,231,0.242424,"{'loss': 'absolute_error', 'max_depth': 23, 'm...",([DecisionTreeRegressor(criterion='friedman_ms...
2,231,0.301259,"{'min_samples_split': 9, 'min_samples_leaf': 6...","(DecisionTreeRegressor(max_features=0.5, min_s..."
3,231,0.472175,"{'hidden_layer_sizes': (15, 15, 15, 15), 'acti...","MLPRegressor(hidden_layer_sizes=(15, 15, 15, 1..."


## Select the best model

In [None]:
# Rank the results by ascending score and take the first entry, i.e. the
# lowest-scoring model.
# NOTE(review): this assumes 'score' is an error metric (lower is better) — confirm
# against the training helpers.
results_by_score = sorted(results, key=lambda row: row['score'])
best_model_row = results_by_score[0]
best_model = best_model_row['model']
best_model

In [None]:
# Show the hyper-parameters recorded for the selected model.
best_model_row['best_params']

{'loss': 'absolute_error',
 'max_depth': 23,
 'min_samples_split': 25,
 'max_features': 0.5,
 'min_samples_leaf': 10,
 'n_estimators': 130,
 'random_state': 42}

## Validation Data
### Load Validation Data

In [None]:
# Load the persisted Kaggle scaling bundle and unpack the three entries used
# below: the dataset frame, the imputer and the scaler.
validation_data = load(Paths.KAGGLE_SCALING_DATA)
validation_df, base_imputer, scaler = (
    validation_data[key] for key in ('dataset', 'base_imputer', 'scaler')
)
validation_df.tail()

Unnamed: 0,Living space,Plot area,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,gde_area_nonproductive_percentage,...,WorkplaceDensity_2,WorkplaceDensity_3,WorkplaceDensity_4,WorkplaceDensity_5,WorkplaceDensity_6,ForestDensity_2,ForestDensity_3,ForestDensity_4,ForestDensity_5,ForestDensity_6
24551,0.0045,0.000103,0.272727,0.700912,0.764213,1.0,0.371798,0.593406,0.310593,0.197378,...,0.003543,0.000211,1.3e-05,7.469957e-07,4.446125e-08,0.005044,0.000358,2.5e-05,1.806941e-06,1.283313e-07
24552,0.005,0.000114,0.318182,0.700719,0.763639,1.0,0.368098,0.593406,0.310593,0.197378,...,0.004826,0.000335,2.3e-05,1.617701e-06,1.123771e-07,0.002933,0.000159,9e-06,4.657e-07,2.521898e-08
24553,0.01075,0.007988,0.181818,0.700719,0.763639,1.0,0.368098,0.593406,0.310593,0.197378,...,0.004826,0.000335,2.3e-05,1.617701e-06,1.123771e-07,0.002933,0.000159,9e-06,4.657e-07,2.521898e-08
24554,0.015875,0.000362,0.272727,0.700719,0.763639,1.0,0.368098,0.593406,0.310593,0.197378,...,0.004826,0.000335,2.3e-05,1.617701e-06,1.123771e-07,0.002933,0.000159,9e-06,4.657e-07,2.521898e-08
24555,0.018909,0.001738,0.181818,0.700719,0.763639,1.0,0.368098,0.593406,0.310593,0.197378,...,0.004826,0.000335,2.3e-05,1.617701e-06,1.123771e-07,0.002933,0.000159,9e-06,4.657e-07,2.521898e-08


### Predict the results with our best model

In [None]:
# Predict on the validation frame using the same feature columns the model was
# trained on, and pair every prediction with its row index as a provisional 'Id'.
validation_ids = list(validation_df.index)
predicted_prices = best_model.predict(validation_df[relevant_features])
predictions = pd.DataFrame({'Id': validation_ids, 'Expected': predicted_prices})
predictions

Unnamed: 0,Id,Expected
0,0,2.005998e+06
1,1,1.757951e+06
2,2,8.109124e+05
3,3,9.605037e+05
4,4,1.235903e+06
...,...,...
24551,24551,2.775904e+05
24552,24552,2.829388e+05
24553,24553,4.954964e+05
24554,24554,7.915455e+05


### Add ID column

In [None]:
# Attach the stored identifiers to the predictions: the provisional 'Id' values
# are looked up in the index of right_ids, keeping every prediction row (left join).
right_ids = load(Paths.KAGGLE_IDS_TO_PREDICT_DATA)
predictions = predictions.join(
    other=right_ids,
    on='Id',
    how='left',
)

In [None]:
# Replace the provisional 'Id' with the joined identifier column
# ('Unnamed: 0.1'), then keep only the two submission columns, Id first.
predictions = (
    predictions
    .drop(columns='Id')
    .rename(columns={'Unnamed: 0.1': 'Id'})
    .loc[:, ['Id', 'Expected']]
)
predictions

Unnamed: 0,Id,Expected
0,0,2.005998e+06
1,1,1.757951e+06
2,2,8.109124e+05
3,3,9.605037e+05
4,4,1.235903e+06
...,...,...
24551,24561,2.775904e+05
24552,24562,2.829388e+05
24553,24563,4.954964e+05
24554,24564,7.915455e+05


### Set Prediction to 150'000 for bad data (data without type)

In [None]:
# Rows flagged as having no type get a fixed fallback prediction instead of the
# model output.
# NOTE(review): the mask matches on the row index, not on the 'Id' column; the two
# differ after the ID join (e.g. row 24551 carries Id 24561) — confirm which
# identifier ids_type_nan actually holds.
ids_type_nan = load(Paths.KAGGLE_IDS_TYPE_NONE_DATA)
is_type_missing = predictions.index.isin(ids_type_nan)
predictions.loc[is_type_missing, "Expected"] = 150_000

## Save Results

In [None]:
def getClassName(obj):
    """Return the unqualified name of the class that ``obj`` is an instance of."""
    cls = type(obj)
    return cls.__name__

def getFormattedDate(date=None):
    """Format a datetime as ``YYYYMMDD_HHMM`` for use in file names.

    Args:
        date: The ``datetime`` to format. When omitted (``None``), the current
            local time at the moment of the call is used.

    Returns:
        The formatted timestamp string, e.g. ``'20240131_0930'``.
    """
    # A ``datetime.now()`` default in the signature would be evaluated only once,
    # when the function is defined, so later calls would reuse that stale
    # timestamp. Resolve "now" at call time instead.
    if date is None:
        date = datetime.now()
    return date.strftime('%Y%m%d_%H%M')

In [None]:
# Build the submission file name from, in order: timestamp, model class, scaler
# class, imputer class, the winning hyper-parameters and the score (3 decimals),
# all joined by underscores.
# NOTE(review): the raw dict repr of best_params (quotes, braces, colons) ends up
# in the file name; some file systems reject such characters — confirm whether
# Paths.KAGGLE_SUBMISSIONS_PATH sanitizes it.
filename_parts = [
    getFormattedDate(),
    getClassName(best_model),
    getClassName(scaler),
    getClassName(base_imputer),
    str(best_model_row["best_params"]),
    f'{best_model_row["score"]:.3f}',
]
filename = '_'.join(filename_parts)

# Write the predictions without the row index.
submission_path = Paths.KAGGLE_SUBMISSIONS_PATH(filename)
predictions.to_csv(submission_path, index=False)

In [None]:
# Persist every trained model (not only the best one); the target path is
# derived from the model's class name.
for result in results:
    trained_model = result['model']
    model_path = Paths.REGRESSOR_MODEL_DATA(getClassName(trained_model))
    dump(trained_model, model_path)