# Model Tests

## General

### Imports

In [26]:
import pandas as pd
from sklearn import linear_model, model_selection, ensemble, tree
from pathlib import Path
import random

### Dataset

In [27]:
# Load the model dataset from the CSV file into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer=Path('./data/model/immoscout_robust.csv'),
)

# Show the first ten rows as a quick sanity check of the loaded table.
df.head(n=10)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.217974,0.66892,0.106747,0.0,0.71514,0.80289,1248.917696,0.0,-0.068507,-0.709981,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.045734,0.676133,0.112097,0.0,-0.744323,-0.303501,938.690859,0.091805,0.205637,-0.025255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.520198,0.672546,0.089462,0.0,-0.68395,0.136669,0.0,0.0,1.045871,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1.078117,0.686833,0.130463,0.0,-1.131549,0.009053,12.018725,0.011871,0.991351,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,-0.214762,0.660603,0.106216,103.826366,0.676815,0.18526,0.0,0.0,-0.153521,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.763675,0.67232,0.122047,6.816164,0.881928,0.140491,258.421309,0.034402,-0.014097,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.417501,0.666867,0.089861,0.0,-0.237389,0.00648,0.0,0.0,0.881834,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### Helpers

In [28]:
def train_test_split(X, y, train_size=0.6, random_state=42):
    """Split features and target into train and test partitions.

    Thin wrapper around ``model_selection.train_test_split`` that pins the
    notebook's default split ratio and seed while still letting a caller
    override them.

    Args:
        X: Feature table to split.
        y: Target values to split alongside ``X``.
        train_size: Forwarded to ``model_selection.train_test_split``;
            defaults to 0.6.
        random_state: Seed forwarded to ``model_selection.train_test_split``;
            defaults to 42 so repeated runs produce the same partitions.

    Returns:
        The result of ``model_selection.train_test_split`` for ``X`` and ``y``
        (unpacked by the notebook as ``X_train, X_test, y_train, y_test``).
    """
    return model_selection.train_test_split(
        X,
        y,
        train_size=train_size,
        random_state=random_state,
    )

In [29]:
def get_random_column_names(columns: list, percentage: float, add_type_columns: bool) -> list:
    """Pick a random subset of column names.

    Columns whose name starts with ``"type_"`` are handled as a group: they
    are either all included or all excluded, depending on
    ``add_type_columns``. Every other ("regular") column is included
    independently when a fresh ``random.random()`` draw is below
    ``percentage``.

    Args:
        columns: Iterable of column names (strings).
        percentage: Inclusion threshold compared against ``random.random()``
            for each regular column.
        add_type_columns: Whether to include all ``"type_"`` columns.

    Returns:
        The selected names, ``"type_"`` columns first, then the sampled
        regular columns, each group in its original order. If the sampling
        would select nothing at all and at least one regular column exists,
        one regular column is chosen at random instead, so that callers never
        receive an empty feature set in that situation. The list is empty
        only when there is nothing eligible to choose from.
    """
    type_columns = []
    regular_candidates = []
    for name in columns:
        if name.startswith("type_"):
            if add_type_columns:
                type_columns.append(name)
        else:
            regular_candidates.append(name)

    # One random draw per regular column, in column order.
    regular_columns = [name for name in regular_candidates if random.random() < percentage]

    selected = [*type_columns, *regular_columns]
    if not selected and regular_candidates:
        # Fall back to a single regular column rather than returning an empty
        # selection that a model cannot be fitted on.
        selected = [random.choice(regular_candidates)]
    return selected

## Model Functions

### Linear Models

In [30]:
# Linear Regression
def train_linear_regression(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a default ``LinearRegression`` and summarise the run.

    Args:
        X_train: Training features; its column names are recorded in the result.
        X_test: Features used for scoring.
        y_train: Training target.
        y_test: Target used for scoring.

    Returns:
        A dict with the keys ``"type"``, ``"columns"``, ``"num_columns"``,
        ``"score"`` (the value of ``model.score(X_test, y_test)``) and
        ``"model"`` (the fitted estimator).
    """
    regressor = linear_model.LinearRegression()
    regressor.fit(X_train, y_train)

    feature_names = list(X_train.columns)
    test_score = regressor.score(X_test, y_test)

    summary = dict(
        type="LinearRegression",
        columns=feature_names,
        num_columns=len(feature_names),
        score=test_score,
        model=regressor,
    )
    return summary

In [31]:
# Ridge
def train_ridge(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a default ``Ridge`` model and summarise the run.

    Args:
        X_train: Training features; its column names are recorded in the result.
        X_test: Features used for scoring.
        y_train: Training target.
        y_test: Target used for scoring.

    Returns:
        A dict with the keys ``"type"``, ``"columns"``, ``"num_columns"``,
        ``"score"`` (the value of ``model.score(X_test, y_test)``) and
        ``"model"`` (the fitted estimator).
    """
    regressor = linear_model.Ridge()
    # NOTE(review): the third argument of Ridge.fit is ``sample_weight``, so
    # 100 is a uniform per-sample weight here, not the regularisation
    # strength ``alpha`` -- confirm this is what was intended.
    regressor.fit(X_train, y_train, sample_weight=100)

    feature_names = list(X_train.columns)
    test_score = regressor.score(X_test, y_test)

    summary = dict(
        type="Ridge",
        columns=feature_names,
        num_columns=len(feature_names),
        score=test_score,
        model=regressor,
    )
    return summary

### Ensemble

In [32]:
# Random Forest
def train_random_forest(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series, random_state: int = 42) -> dict:
    """Fit a ``RandomForestRegressor`` and summarise the run.

    The forest is seeded so that repeated runs on the same columns give the
    same score; differences between runs then come from the feature subset
    rather than from the forest's own randomness.

    Args:
        X_train: Training features; its column names are recorded in the result.
        X_test: Features used for scoring.
        y_train: Training target.
        y_test: Target used for scoring.
        random_state: Seed forwarded to ``RandomForestRegressor``. Defaults to
            42; pass ``None`` for an unseeded forest.

    Returns:
        A dict with the keys ``"type"``, ``"columns"``, ``"num_columns"``,
        ``"score"`` (the value of ``model.score(X_test, y_test)``) and
        ``"model"`` (the fitted estimator).
    """
    model = ensemble.RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)

    column_names = list(X_train.columns)
    return {
        "type": "RandomForest",
        "columns": column_names,
        "num_columns": len(column_names),
        "score": model.score(X_test, y_test),
        "model": model
    }

## Execution

In [33]:
# Constants
# Number of outer repetitions of the random column search.
NUM_ITERATIONS = 250
# Lower and upper bound of the per-run column inclusion probability.
MIN_PERCENTAGE_COLUMNS = 0.2
MAX_PERCENTAGE_COLUMNS = 1
# Training functions evaluated on every sampled column subset.
TRAINING_FUNCTIONS = [train_random_forest]

# The factor 2 accounts for the two add_type_columns settings (True / False)
# that the search loop runs per iteration.
print("Total number of iterations:", 2 * len(TRAINING_FUNCTIONS) * NUM_ITERATIONS)

Total number of iterations: 500


In [34]:
# Data preparation
# The "price" column is the target; all remaining columns are features.
y = df["price"]
X = df.drop(columns="price")

# Single fixed split, reused by every run of the column search.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [35]:
# Seed the stdlib RNG inside the cell so the sampled column subsets are the
# same on every run (and on every re-run of this cell).
random.seed(42)

results = []
for _ in range(NUM_ITERATIONS):
    # Each iteration is evaluated once with and once without the "type_" columns.
    for add_type_columns in [True, False]:
        # Per-run inclusion probability for the regular (non-"type_") columns.
        column_percentage = random.uniform(MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS)
        column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)
        if not column_names:
            # Nothing was sampled; a model cannot be fitted on zero features.
            continue
        temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

        for func in TRAINING_FUNCTIONS:
            results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})

# Best-scoring runs first.
pd.DataFrame(results).sort_values("score", ascending=False)

Unnamed: 0,type,columns,num_columns,score,model,add_type_columns
52,RandomForest,"[type_attic-flat, type_attic-room, type_castle...",46,0.640561,"(DecisionTreeRegressor(max_features=1.0, rando...",True
146,RandomForest,"[type_attic-flat, type_attic-room, type_castle...",44,0.638864,"(DecisionTreeRegressor(max_features=1.0, rando...",True
22,RandomForest,"[type_attic-flat, type_attic-room, type_castle...",51,0.634190,"(DecisionTreeRegressor(max_features=1.0, rando...",True
474,RandomForest,"[type_attic-flat, type_attic-room, type_castle...",53,0.634056,"(DecisionTreeRegressor(max_features=1.0, rando...",True
144,RandomForest,"[type_attic-flat, type_attic-room, type_castle...",50,0.633358,"(DecisionTreeRegressor(max_features=1.0, rando...",True
...,...,...,...,...,...,...
167,RandomForest,"[Latitude, gde_area_agriculture_percentage, gd...",13,0.203601,"(DecisionTreeRegressor(max_features=1.0, rando...",False
419,RandomForest,"[ForestDensityM, Latitude, Longitude, NoisePol...",20,0.197769,"(DecisionTreeRegressor(max_features=1.0, rando...",False
361,RandomForest,"[ForestDensityM, NoisePollutionRoadM, gde_area...",13,0.194111,"(DecisionTreeRegressor(max_features=1.0, rando...",False
55,RandomForest,"[ForestDensityM, Latitude, NoisePollutionRailw...",24,0.190709,"(DecisionTreeRegressor(max_features=1.0, rando...",False
