# Model Tests

## General

### Imports

In [55]:
import pandas as pd
from sklearn import linear_model, model_selection
from pathlib import Path
import random

### Dataset

In [56]:
# Load the cleaned, model-ready dataset and preview its first rows.
model_data_path = Path('./data/clean/immoscount_model.csv')
df = pd.read_csv(model_data_path)

df.head(10)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,1,0,0,0,0,0,0,0,0,0
1,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,0,1,0
2,0.21574,0.66892,0.106747,0.0,0.715909,0.80415,1893.128856,0.0,-0.071577,-0.709981,...,1,0,0,0,0,0,0,0,0,0
3,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,0,0,0
4,1.035876,0.676133,0.112097,0.0,-0.74281,-0.302241,1422.882194,0.091805,0.202417,-0.025255,...,0,0,0,0,0,0,0,0,0,0
5,0.515181,0.672546,0.089462,0.0,-0.682468,0.13793,0.0,0.0,1.042194,-0.029724,...,0,0,0,0,0,0,0,0,0,0
6,1.06796,0.686833,0.130463,0.0,-1.129839,0.010313,18.218171,0.011871,0.987703,-0.102607,...,0,0,0,0,0,0,0,0,1,0
7,-0.21301,0.660603,0.106216,103.826366,0.677604,0.18652,0.0,0.0,-0.156544,-0.709981,...,0,0,0,0,0,0,0,0,0,0
8,1.747204,0.67232,0.122047,6.816164,0.882613,0.141751,391.719036,0.034402,-0.017196,-0.709981,...,0,0,0,0,0,0,0,0,0,0
9,0.413429,0.666867,0.089861,0.0,-0.236134,0.00774,0.0,0.0,0.878246,-0.029724,...,0,0,0,0,0,0,1,0,0,0


### Helpers

In [57]:
def train_test_split(X, y):
    """Split features and target into train and test parts.

    Delegates to ``model_selection.train_test_split`` with 60% of the rows
    assigned to the training part and a fixed ``random_state`` so the split
    is the same on every call.

    Returns whatever ``model_selection.train_test_split`` returns for the two
    inputs; the caller in this notebook unpacks it as
    ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"train_size": 0.6, "random_state": 42}
    return model_selection.train_test_split(X, y, **split_options)

In [58]:
def get_random_column_names(columns: list, percentage: float, add_type_columns: bool) -> list:
    """Pick a random subset of column names.

    Columns whose name starts with ``"type_"`` are treated as one group: they
    are all kept when ``add_type_columns`` is true and all dropped otherwise.
    Every other column is kept independently with probability ``percentage``
    (one ``random.random()`` draw per non-type column).

    Args:
        columns: Iterable of column names to choose from.
        percentage: Probability of keeping each non-type column.
        add_type_columns: Whether to include all ``type_*`` columns.

    Returns:
        The kept ``type_*`` columns followed by the kept regular columns, each
        group in its original order. The list may be empty.
    """
    kept_type_columns = []
    kept_regular_columns = []
    for name in columns:
        if name.startswith("type_"):
            # Type columns are all-or-nothing; no random draw is consumed.
            if add_type_columns:
                kept_type_columns.append(name)
        elif random.random() < percentage:
            kept_regular_columns.append(name)
    return kept_type_columns + kept_regular_columns

## Model Functions

### Linear Models

In [59]:
# Linear Regression
def train_linear_regression(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series) -> dict:
    """Fit a ``LinearRegression`` on the training split and score it on the test split.

    Args:
        X_train: Training features.
        X_test: Test features (same columns as ``X_train``).
        y_train: Training target.
        y_test: Test target.

    Returns:
        A dict with the model ``type``, the feature ``columns`` used, their
        count ``num_columns`` and the test ``score`` as returned by
        ``model.score`` (R^2 for scikit-learn regressors).
    """
    feature_columns = X_train.columns

    regressor = linear_model.LinearRegression()
    regressor.fit(X_train, y_train)
    test_score = regressor.score(X_test, y_test)

    return {
        "type": "LinearRegression",
        "columns": feature_columns,
        "num_columns": len(feature_columns),
        "score": test_score,
    }

In [60]:
# Ridge
def train_ridge(X_train: pd.DataFrame, X_test: pd.DataFrame, y_train: pd.Series, y_test: pd.Series,
                alpha: float = 1.0, sample_weight: float = 100) -> dict:
    """Fit a ``Ridge`` regression on the training split and score it on the test split.

    The original code called ``model.fit(X_train, y_train, 100)``. The third
    positional argument of ``Ridge.fit`` is ``sample_weight``, so the bare
    ``100`` was a uniform per-sample weight, not a hyperparameter such as
    ``alpha``. It is now passed by keyword and exposed as a parameter; the
    defaults reproduce the previous behaviour exactly.

    NOTE(review): a uniform sample weight only rescales the data term relative
    to the ``alpha`` penalty. If ``100`` was meant as the regularisation
    strength, call with ``alpha=100, sample_weight=None`` instead - confirm
    the intent.

    Args:
        X_train: Training features.
        X_test: Test features (same columns as ``X_train``).
        y_train: Training target.
        y_test: Test target.
        alpha: Regularisation strength forwarded to ``Ridge`` (1.0 is the
            scikit-learn default, which the original relied on).
        sample_weight: Forwarded to ``Ridge.fit``; a scalar weights every
            sample equally, ``None`` disables weighting.

    Returns:
        A dict with the model ``type``, the feature ``columns`` used, their
        count ``num_columns`` and the test ``score`` as returned by
        ``model.score`` (R^2 for scikit-learn regressors).
    """
    model = linear_model.Ridge(alpha=alpha)
    # Keyword form makes it explicit that this value is a sample weight.
    model.fit(X_train, y_train, sample_weight=sample_weight)

    return {
        "type": "Ridge",
        "columns": X_train.columns,
        "num_columns": len(X_train.columns),
        "score": model.score(X_test, y_test)
    }

## Execution function

In [61]:
# Constants
# Number of random feature subsets to evaluate in the search loop.
NUM_ITERATIONS = 10_000
# Lower / upper bound of the per-iteration probability with which each
# non-type column is kept (the probability is drawn uniformly between them).
MIN_PERCENTAGE_COLUMNS = 0.2
MAX_PERCENTAGE_COLUMNS = 1
# Probability that an iteration includes the one-hot "type_*" columns.
PERCENTAGE_TYPE_COLUMNS = 0.5
# Every function listed here is trained and scored on each feature subset.
TRAINING_FUNCTIONS = [train_linear_regression, train_ridge]
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
VERBOSE = True

In [62]:
# TODO(review): dropping every row that has any missing value should be reviewed.
df = df.dropna(axis="index")

# Data preparation: "price" is the target, all remaining columns are features.
y = df["price"]
X = df.drop(columns=["price"])

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [63]:
# Seed the stdlib RNG so the randomly drawn feature subsets - and therefore the
# ranking below - are reproducible when the notebook is re-run. The value
# mirrors the random_state used for the train/test split.
random.seed(42)

results = []
for _ in range(NUM_ITERATIONS):
    # Per-iteration probability of keeping each non-type column.
    column_percentage = random.uniform(MIN_PERCENTAGE_COLUMNS, MAX_PERCENTAGE_COLUMNS)
    add_type_columns = random.random() < PERCENTAGE_TYPE_COLUMNS
    column_names = get_random_column_names(X_train.columns, column_percentage, add_type_columns)

    # Without the type columns the random draw can select nothing at all;
    # an estimator cannot be fitted on zero features, so skip such draws
    # instead of aborting the whole search.
    if not column_names:
        continue

    temp_X_train, temp_X_test = X_train[column_names], X_test[column_names]

    for func in TRAINING_FUNCTIONS:
        results.append({**func(temp_X_train, temp_X_test, y_train, y_test), "add_type_columns": add_type_columns})

# Best-scoring feature subsets first.
pd.DataFrame(results).sort_values("score", ascending=False)

Unnamed: 0,type,columns,num_columns,score,add_type_columns
1269,Ridge,"Index(['ForestDensityM', 'Longitude', 'NoisePo...",23,0.533139,False
1268,LinearRegression,"Index(['ForestDensityM', 'Longitude', 'NoisePo...",23,0.533103,False
16384,LinearRegression,"Index(['ForestDensityM', 'NoisePollutionRailwa...",24,0.530126,False
16385,Ridge,"Index(['ForestDensityM', 'NoisePollutionRailwa...",24,0.530124,False
14669,Ridge,"Index(['ForestDensityM', 'Latitude', 'Longitud...",22,0.525661,False
...,...,...,...,...,...
19660,LinearRegression,"Index(['type_attic-flat', 'type_attic-room', '...",43,-3.121479,True
16690,LinearRegression,"Index(['type_attic-flat', 'type_attic-room', '...",42,-3.504914,True
16870,LinearRegression,"Index(['type_attic-flat', 'type_attic-room', '...",44,-3.696579,True
708,LinearRegression,"Index(['type_attic-flat', 'type_attic-room', '...",43,-4.575784,True
