# Model Tests

## General

### Imports

In [1]:
import pandas as pd
from sklearn import model_selection
from pathlib import Path
import random
from IPython.display import clear_output
from helpers.training import *

### Dataset

In [2]:
# Read the modelling table; the path is relative, so it resolves against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer=Path('./data/model/immoscout_robust.csv'))

# Preview the first ten rows as the cell output.
df.head(n=10)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.19858,0.66892,0.106747,0.0,0.71514,0.80289,35.34003,0.0,-0.078321,-0.709981,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.673461,0.676133,0.112097,0.0,-0.744323,-0.303501,30.638062,0.302994,0.208265,-0.025255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.402338,0.672546,0.089462,0.0,-0.68395,0.136669,0.0,0.0,0.844996,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,-0.473383,0.660603,0.106216,10.189522,0.676815,0.18526,0.0,0.0,-0.184291,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.963411,0.67232,0.122047,2.610778,0.881928,0.140491,16.075488,0.185477,-0.015675,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.338853,0.666867,0.089861,0.0,-0.237389,0.00648,0.0,0.0,0.737623,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Helpers

In [3]:
def train_test_split(X, y):
    """Split features and target into a fixed 60 % train / 40 % test partition.

    Thin wrapper around ``model_selection.train_test_split`` that pins
    ``train_size=0.6`` and ``random_state=42`` so every call on the same
    inputs yields the same partition.

    Args:
        X: Feature table.
        y: Target values aligned with ``X``.

    Returns:
        The sklearn result, unpacked by the caller as
        ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"train_size": 0.6, "random_state": 42}
    return model_selection.train_test_split(X, y, **split_options)

In [4]:
def get_random_column_names(columns: list, percentage: float, add_type_columns: bool, rng=None) -> list:
    """Pick a random subset of column names.

    Columns whose name starts with ``"type_"`` are handled as one group: they
    are either all included (``add_type_columns=True``) or all left out. Every
    other column is kept independently with probability ``percentage``, so the
    number of regular columns returned varies from call to call.

    Args:
        columns: Iterable of column names (strings). It is traversed exactly
            once, so one-shot iterables work as well as lists.
        percentage: Inclusion probability for each non-``type_`` column.
            Values ``<= 0`` keep none of them, values ``>= 1`` keep all.
        add_type_columns: Whether to include the ``type_`` columns.
        rng: Optional random source exposing ``random()`` (for example a
            seeded ``random.Random``). When omitted, the global ``random``
            module is used, exactly as before.

    Returns:
        A list with the ``type_`` columns (if requested) followed by the
        sampled regular columns, each group in input order.
    """
    # Resolve the draw function at call time so random.seed() by the caller
    # still takes effect when no explicit rng is supplied.
    draw = random.random if rng is None else rng.random

    type_columns = []
    regular_columns = []
    for name in columns:
        if name.startswith("type_"):
            if add_type_columns:
                type_columns.append(name)
        # One draw per regular column, in input order; type_ columns never
        # consume a draw.
        elif draw() < percentage:
            regular_columns.append(name)
    return type_columns + regular_columns

## Execution

In [5]:
# Constants
# Lower/upper bound of the per-round inclusion probability that is applied to
# every column not prefixed with "type_".
MIN_PERCENTAGE_COLUMNS = 0.4
MAX_PERCENTAGE_COLUMNS = 1
# Number of random column subsets drawn for each TYPE_COLUMN_OPTIONS entry.
NUM_ITERATIONS = 50
# Each round is run once with and once without the "type_" columns.
TYPE_COLUMN_OPTIONS = [True, False]
# Trainers evaluated on every column subset; wider line-ups are kept below
# for reference.
TRAINING_FUNCTIONS = [train_random_forest]
#TRAINING_FUNCTIONS = [train_random_forest, train_ridge, train_mlp_regressor]
#TRAINING_FUNCTIONS = [train_ridge, train_bayesian_regression, train_quantile_regression, train_passive_agressive, train_random_forest, train_linear_regression]

# Total number of training runs the experiment loop will perform.
print("Total number of iterations:", len(TYPE_COLUMN_OPTIONS) * len(TRAINING_FUNCTIONS) * NUM_ITERATIONS)

Total number of iterations: 100


In [6]:
# Data preparation
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [7]:
# Seed the global RNG at the top of the cell so the sampled column subsets,
# and therefore the ranking below, are identical on every run of this cell
# (including Restart & Run All). 42 mirrors the random_state used for the
# train/test split.
random.seed(42)

results = []
for i in range(NUM_ITERATIONS):
    for add_type_columns in TYPE_COLUMN_OPTIONS:
        # Inclusion probability for the regular columns in this round, drawn
        # uniformly between the configured bounds.
        column_percentage = random.uniform(MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS)
        column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)
        # Restrict train and test features to the same sampled columns.
        temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

        for func in TRAINING_FUNCTIONS:
            # Each trainer returns a dict of result fields; tag it with the
            # type-column setting so the two variants can be compared.
            results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})
    # Replace the previous progress line instead of appending a new one.
    clear_output(wait=True)
    print(int(((i+1) / NUM_ITERATIONS)*100), '% Done')

# Best-scoring configurations first.
pd.DataFrame(results).sort_values("score", ascending=False)

100 % Done


Unnamed: 0,columns,num_columns,score,model,add_type_columns
86,"[type_attic-flat, type_attic-room, type_castle...",51,0.707887,"(DecisionTreeRegressor(max_features='sqrt', ra...",True
24,"[type_attic-flat, type_attic-room, type_castle...",50,0.705041,"(DecisionTreeRegressor(max_features='sqrt', ra...",True
82,"[type_attic-flat, type_attic-room, type_castle...",55,0.703426,"(DecisionTreeRegressor(max_features='sqrt', ra...",True
96,"[type_attic-flat, type_attic-room, type_castle...",53,0.703266,"(DecisionTreeRegressor(max_features='sqrt', ra...",True
92,"[type_attic-flat, type_attic-room, type_castle...",52,0.702590,"(DecisionTreeRegressor(max_features='sqrt', ra...",True
...,...,...,...,...,...
8,"[type_attic-flat, type_attic-room, type_castle...",33,0.497194,"(DecisionTreeRegressor(max_features='sqrt', ra...",True
33,"[ForestDensityM, Latitude, Longitude, Populati...",21,0.392691,"(DecisionTreeRegressor(max_features='sqrt', ra...",False
51,"[NoisePollutionRailwayM, NoisePollutionRoadM, ...",17,0.372849,"(DecisionTreeRegressor(max_features='sqrt', ra...",False
47,"[Latitude, Longitude, NoisePollutionRailwayM, ...",21,0.371759,"(DecisionTreeRegressor(max_features='sqrt', ra...",False
