# Model Tests

## General

### Imports

In [127]:
import pandas as pd
from sklearn import linear_model, model_selection, ensemble, tree, neural_network
from pathlib import Path
import random
from IPython.display import clear_output

### Dataset

In [128]:
# Load the modelling dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file name suggests robust-scaled features — confirm against the
# preprocessing notebook.
df = pd.read_csv(Path('./data/model/immoscout_robust.csv'))

# Preview the first ten rows as the cell output.
df.head(10)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.217974,0.66892,0.106747,0.0,0.71514,0.80289,1248.917696,0.0,-0.068507,-0.709981,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.045734,0.676133,0.112097,0.0,-0.744323,-0.303501,938.690859,0.091805,0.205637,-0.025255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.520198,0.672546,0.089462,0.0,-0.68395,0.136669,0.0,0.0,1.045871,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,-0.214762,0.660603,0.106216,103.826366,0.676815,0.18526,0.0,0.0,-0.153521,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.763675,0.67232,0.122047,6.816164,0.881928,0.140491,258.421309,0.034402,-0.014097,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.417501,0.666867,0.089861,0.0,-0.237389,0.00648,0.0,0.0,0.881834,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Helpers

In [129]:
def train_test_split(X, y, train_size=0.6, random_state=42):
    """Split features and target into train and test partitions.

    Thin wrapper around ``sklearn.model_selection.train_test_split`` that pins
    the split ratio and the seed so every experiment in the notebook works on
    the same partition.

    Args:
        X: Feature table.
        y: Target values aligned with ``X``.
        train_size: Forwarded to scikit-learn as ``train_size``. Defaults to
            0.6, the value that used to be hard-coded here.
        random_state: Forwarded to scikit-learn as ``random_state``. Defaults
            to 42, the value that used to be hard-coded here.

    Returns:
        Whatever ``model_selection.train_test_split`` returns for two inputs;
        callers unpack it as ``X_train, X_test, y_train, y_test``.
    """
    return model_selection.train_test_split(
        X,
        y,
        train_size=train_size,
        random_state=random_state,
    )

In [130]:
def get_random_column_names(columns: list, percentage: float, add_type_columns: bool, rng=None) -> list:
    """Pick a random subset of column names.

    Columns whose name starts with ``"type_"`` are handled as a group: all of
    them are returned when ``add_type_columns`` is true, none otherwise. Every
    other column is kept independently when a uniform draw from ``[0, 1)`` is
    smaller than ``percentage``.

    Args:
        columns: Iterable of column-name strings (a list or a ``pandas.Index``
            both work, since it is only iterated).
        percentage: Keep threshold for each non-``type_`` column. ``0`` keeps
            none of them, ``1`` keeps all of them.
        add_type_columns: Whether the ``type_`` columns are included.
        rng: Optional ``random.Random`` instance used for the draws. When
            omitted, the module-level ``random`` generator is used, which is
            the previous behaviour.

    Returns:
        list: The selected ``type_`` columns (input order) followed by the
        sampled regular columns (input order). May be empty.
    """
    draw = random.random if rng is None else rng.random

    type_columns = []
    regular_columns = []
    for name in columns:
        if name.startswith("type_"):
            # No random draw is consumed for type columns, matching the
            # original short-circuit evaluation.
            if add_type_columns:
                type_columns.append(name)
        elif draw() < percentage:
            regular_columns.append(name)

    return [*type_columns, *regular_columns]

## Model Functions

### Linear Models

In [131]:
# Linear Regression
def train_linear_regression(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit ``linear_model.LinearRegression`` and score it on the test data.

    Returns:
        dict with the keys ``"columns"`` (feature names of ``X_train``),
        ``"num_columns"`` (their count), ``"score"`` (the estimator's
        ``score(X_test, y_test)``) and ``"model"`` (the fitted estimator).
    """
    feature_names = list(X_train.columns)

    regressor = linear_model.LinearRegression()
    regressor.fit(X_train, y_train)
    test_score = regressor.score(X_test, y_test)

    return dict(
        columns=feature_names,
        num_columns=len(feature_names),
        score=test_score,
        model=regressor,
    )

In [132]:
# Ridge
def train_ridge(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit ``linear_model.Ridge`` (default settings) and score it on the test data.

    Returns:
        dict with the keys ``"columns"``, ``"num_columns"``, ``"score"``
        (the estimator's ``score(X_test, y_test)``) and ``"model"``.
    """
    regressor = linear_model.Ridge()
    # The third positional parameter of ``Ridge.fit`` is ``sample_weight``, so
    # the value 100 is a uniform per-sample weight.
    # NOTE(review): this may have been meant as ``alpha`` or an iteration
    # count — confirm the intent.
    regressor.fit(X_train, y_train, sample_weight=100)

    feature_names = list(X_train.columns)
    summary = {"columns": feature_names, "num_columns": len(feature_names)}
    summary["score"] = regressor.score(X_test, y_test)
    summary["model"] = regressor
    return summary

In [133]:
# Bayesian Regression
def train_bayesian_regression(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    model = linear_model.BayesianRidge()
    model.fit(X_train, y_train)

    return {
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": model.score(X_test, y_test),
        "model": model
    }

In [134]:
# Passive Aggressive Regressor
def train_passive_agressive(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit ``linear_model.PassiveAggressiveRegressor`` and score it on the test data.

    Returns:
        dict with the keys ``"columns"``, ``"num_columns"``, ``"score"``
        (the estimator's ``score(X_test, y_test)``) and ``"model"``.
    """
    estimator = linear_model.PassiveAggressiveRegressor()
    estimator.fit(X_train, y_train)

    used_columns = list(X_train.columns)
    held_out_score = estimator.score(X_test, y_test)

    return dict(
        columns=used_columns,
        num_columns=len(used_columns),
        score=held_out_score,
        model=estimator,
    )

In [135]:
# Quantile Regression
def train_quantile_regression(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a median quantile regression and score it on the test data.

    The previous body instantiated ``BayesianRidge`` here, which made this
    function a near-duplicate of ``train_bayesian_regression``. It now uses
    ``linear_model.QuantileRegressor`` as the name promises. The stray
    positional ``100`` that was passed to ``fit`` (scikit-learn interprets a
    third positional argument as ``sample_weight``) is dropped.

    Args:
        X_train: Training features.
        X_test: Test features with the same columns as ``X_train``.
        y_train: Training target.
        y_test: Test target.

    Returns:
        dict with the keys ``"columns"`` (feature names of ``X_train``),
        ``"num_columns"`` (their count), ``"score"`` (the estimator's
        ``score(X_test, y_test)``) and ``"model"`` (the fitted estimator).
    """
    # quantile=0.5 fits the conditional median. The solver is named explicitly
    # because the library default has changed between scikit-learn releases.
    # NOTE(review): the regularisation strength is left at the library
    # default — tune ``alpha`` if the scores look degenerate.
    model = linear_model.QuantileRegressor(quantile=0.5, solver="highs")
    model.fit(X_train, y_train)

    return {
        "columns": list(X_train.columns),
        "num_columns": len(X_train.columns),
        "score": model.score(X_test, y_test),
        "model": model
    }

### Ensemble

In [136]:
# Random Forest
def train_random_forest(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit ``ensemble.RandomForestRegressor`` (default settings) and score it on the test data.

    No ``random_state`` is passed to the estimator, so repeated calls on the
    same data can yield different scores.

    Returns:
        dict with the keys ``"columns"``, ``"num_columns"``, ``"score"``
        (the estimator's ``score(X_test, y_test)``) and ``"model"``.
    """
    forest = ensemble.RandomForestRegressor()
    forest.fit(X_train, y_train)

    feature_names = list(X_train.columns)
    outcome = {
        "columns": feature_names,
        "num_columns": len(feature_names),
    }
    outcome["score"] = forest.score(X_test, y_test)
    outcome["model"] = forest
    return outcome

### Neural Network

In [137]:
# MLP Regressor
def train_mlp_regressor(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit ``neural_network.MLPRegressor`` with ``max_iter=15000`` and score it on the test data.

    Returns:
        dict with the keys ``"columns"``, ``"num_columns"``, ``"score"``
        (the estimator's ``score(X_test, y_test)``) and ``"model"``.
    """
    network = neural_network.MLPRegressor(max_iter=15000)
    network.fit(X_train, y_train)

    input_columns = list(X_train.columns)
    test_score = network.score(X_test, y_test)

    return dict(
        columns=input_columns,
        num_columns=len(input_columns),
        score=test_score,
        model=network,
    )

## Execution function

In [138]:
# Constants
# Number of repetitions of the experiment loop; each repetition runs every
# training function once per entry in TYPE_COLUMN_OPTIONS.
NUM_ITERATIONS = 50
# Lower and upper bound of the keep threshold that is drawn for each run and
# passed to get_random_column_names as `percentage`.
MIN_PERCENTAGE_COLUMNS = 0.4
MAX_PERCENTAGE_COLUMNS = 1
# Training functions evaluated in every run. The commented-out assignments
# below are alternative selections kept for switching experiments.
TRAINING_FUNCTIONS = [train_random_forest]
#TRAINING_FUNCTIONS = [train_random_forest, train_ridge, train_mlp_regressor]
#TRAINING_FUNCTIONS = [train_ridge, train_bayesian_regression, train_quantile_regression, train_passive_agressive, train_random_forest, train_linear_regression]
# Values tried for the `add_type_columns` flag (include / exclude the "type_" columns).
TYPE_COLUMN_OPTIONS = [True, False]

# Total number of model fits the experiment loop will perform.
print("Total number of iterations:", NUM_ITERATIONS * len(TRAINING_FUNCTIONS) * len(TYPE_COLUMN_OPTIONS))

Total number of iterations: 100


In [139]:
# Data preparation: separate the "price" target from the features, then split
# once so that every run trains and evaluates on the same rows.
y = df["price"]
X = df.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [140]:
# Seed the module-level generator so that re-running this cell draws the same
# column subsets. Without it every execution samples different features and
# the result table cannot be reproduced.
random.seed(42)

results = []
for i in range(NUM_ITERATIONS):
    for add_type_columns in TYPE_COLUMN_OPTIONS:
        # Keep threshold for this run, uniform in [MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS).
        column_percentage = random.random() * (MAX_PERCENTAGE_COLUMNS - MIN_PERCENTAGE_COLUMNS) + MIN_PERCENTAGE_COLUMNS
        column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)
        if not column_names:
            # The sampler can return nothing when the type columns are
            # excluded; an estimator cannot be fitted on zero features.
            continue
        temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

        for func in TRAINING_FUNCTIONS:
            # Record which setting produced the score alongside the training output.
            results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})
    # Replace the previous progress line instead of appending a new one.
    clear_output(wait=True)
    print(int(((i+1) / NUM_ITERATIONS)*100), '% Done')

# Best-scoring runs first; this is the cell's displayed output.
pd.DataFrame(results).sort_values("score", ascending=False)

100 % Done


Unnamed: 0,columns,num_columns,score,model,add_type_columns
68,"[type_attic-flat, type_attic-room, type_castle...",46,0.635151,"(DecisionTreeRegressor(max_features=1.0, rando...",True
66,"[type_attic-flat, type_attic-room, type_castle...",50,0.631910,"(DecisionTreeRegressor(max_features=1.0, rando...",True
18,"[type_attic-flat, type_attic-room, type_castle...",47,0.631188,"(DecisionTreeRegressor(max_features=1.0, rando...",True
63,"[ForestDensityM, Latitude, Longitude, NoisePol...",24,0.624056,"(DecisionTreeRegressor(max_features=1.0, rando...",False
44,"[type_attic-flat, type_attic-room, type_castle...",53,0.623645,"(DecisionTreeRegressor(max_features=1.0, rando...",True
...,...,...,...,...,...
70,"[type_attic-flat, type_attic-room, type_castle...",37,0.416597,"(DecisionTreeRegressor(max_features=1.0, rando...",True
96,"[type_attic-flat, type_attic-room, type_castle...",38,0.402464,"(DecisionTreeRegressor(max_features=1.0, rando...",True
38,"[type_attic-flat, type_attic-room, type_castle...",34,0.395996,"(DecisionTreeRegressor(max_features=1.0, rando...",True
74,"[type_attic-flat, type_attic-room, type_castle...",38,0.395933,"(DecisionTreeRegressor(max_features=1.0, rando...",True
