# Solution 05 Immoscout
Since we will train and evaluate several models below and repeat the associated data preprocessing each time, the functionality has been encapsulated in reusable functions, so that every model can use it with little code duplication. In addition, the plots have been removed to keep the code clear.

In [4]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn import linear_model

from warnings import simplefilter

# Silence pandas' PerformanceWarning for the whole session (pandas raises it,
# for example, when many columns are inserted into a DataFrame one at a time).
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

## Load the data

In [5]:
# Read the listings and (per its file name) the description of their columns.
# Both paths are relative to the current working directory.
df = pd.read_csv("../Data/immo_data.csv")
desc = pd.read_csv("../Data/immo_data_column_description.csv")

## Methods for data preprocessing and model evaluation

In [6]:
def drop_columns(df):
    """Return a new DataFrame without the columns regarded as unimportant.

    The input frame is left untouched. Every listed column must be present
    in ``df``; pandas raises a ``KeyError`` otherwise.
    """
    unused_columns = [
        "scoutId",
        "houseNumber",
        "geo_bln",
        "geo_krs",
        "geo_plz",
        "date",
        "street",
        "streetPlain",
        "description",
        "facilities",
        "regio3",
        "firingTypes",
        "telekomHybridUploadSpeed",
        "totalRent",
        "baseRentRange",
    ]
    return df.drop(columns=unused_columns)


def remove_outliers(df, lower_limit=0.005, upper_limit=0.995):
    """Return a copy of ``df`` without rows that contain an outlier.

    For each monitored column a value is an outlier when it lies below the
    ``lower_limit`` quantile or above the ``upper_limit`` quantile of that
    column; values equal to a bound are kept. The quantiles are taken from
    the unfiltered ``df``. Missing values are never treated as outliers, so
    rows with NaNs survive this step and have to be handled later.
    """
    monitored = [
        "serviceCharge",
        "yearConstructed",
        "noParkSpaces",
        "baseRent",
        "livingSpace",
        "noRooms",
        "floor",
        "numberOfFloors",
        "heatingCosts",
        "lastRefurbish",
    ]
    values = df[monitored]
    lower_bounds = values.quantile(lower_limit)
    upper_bounds = values.quantile(upper_limit)

    # Cell-wise check against the per-column bounds (aligned on column names).
    within_bounds = values.ge(lower_bounds, axis="columns") & values.le(
        upper_bounds, axis="columns"
    )
    # A row stays only if every monitored cell is in range or missing.
    keep_row = (within_bounds | values.isna()).all(axis="columns")
    return df.loc[keep_row].copy()


def remove_rows_with_NaN_target(df):
    """Return only the rows of ``df`` that have a label.

    The label is the ``baseRent`` column; rows in which it is missing are
    dropped. The index of the remaining rows is preserved.
    """
    return df[df["baseRent"].notna()]


def impute_NaNs(df):
    """Return a copy of ``df`` in which missing values have been replaced.

    Non-numeric columns are filled with their most frequent value, numeric
    columns with their mean. The replacement values are computed from the
    frame that is passed in. A column group that does not occur in ``df``
    (for example no non-numeric columns at all) is skipped, because
    ``SimpleImputer`` refuses to fit on input without any feature.
    """
    dfc = df.copy()

    categorical_columns = dfc.select_dtypes(exclude=np.number).columns
    if len(categorical_columns) > 0:
        imp_freq = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
        dfc[categorical_columns] = imp_freq.fit_transform(dfc[categorical_columns])

    numeric_columns = dfc.select_dtypes(include=np.number).columns
    if len(numeric_columns) > 0:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy="mean")
        dfc[numeric_columns] = imp_mean.fit_transform(dfc[numeric_columns])

    return dfc


def _regression_metrics(y_true, y_pred):
    """Return the tuple ``(R², RMSE, MAE)`` for one set of predictions."""
    return (
        r2_score(y_true, y_pred),
        math.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
    )


def print_evaluation(pipeline_or_model, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred, feature_names):
    """Print R², RMSE and MAE for the training and the test set.

    Besides the metrics, the table lists the number of rows and columns of
    ``X_train`` and ``X_test``. ``pipeline_or_model`` is only used for the
    title line (via its string representation).

    ``feature_names`` is accepted for compatibility with existing callers
    but is currently not used.
    """
    header = f"{'':6} {'R²':>10} | {'RMSE':>14} | {'MAE':>10} | {'rows':>8} | {'columns':>8}"
    lines = [f"{pipeline_or_model} Evaluation:", header]

    splits = (
        ("Train", X_train, y_train, y_train_pred),
        ("Test", X_test, y_test, y_test_pred),
    )
    for label, X, y_true, y_pred in splits:
        r2, rmse, mae = _regression_metrics(y_true, y_pred)
        lines.append(
            f"{label:6} {r2:10.5f} | {rmse:14.2f} | {mae:10.2f} | {X.shape[0]:8} | {X.shape[1]:8}"
        )

    # The trailing "\n" keeps the blank line that separates consecutive reports.
    print("\n".join(lines) + "\n")

## 1. Model: The same as in Exercise 06

In [7]:
# Preprocessing: discard unused columns, trim outliers, require a label,
# fill the remaining gaps and one-hot encode the non-numeric columns.
df_reduced = (
    drop_columns(df)
    .pipe(remove_outliers)
    .pipe(remove_rows_with_NaN_target)
    .pipe(impute_NaNs)
    .pipe(pd.get_dummies)
)
y = df_reduced.pop("baseRent")

# Hold out 20 % of the rows as test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df_reduced, y, test_size=0.2, random_state=0
)

# Fit the linear regression and predict on both splits.
model_lr = linear_model.LinearRegression().fit(X_train, y_train)
y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

# Report the metrics for both splits.
print_evaluation(
    pipeline_or_model=model_lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    y_train_pred=y_train_pred,
    y_test_pred=y_test_pred,
    feature_names=df_reduced.columns,
)

LinearRegression() Evaluation:
               R² |           RMSE |        MAE |     rows |  columns
Train     0.84047 |         163.46 |     106.33 |   207773 |      518
Test      0.84079 |         164.57 |     107.07 |    51944 |      518


## 2. Model: Without outlier removal

In [8]:
# Preprocessing as before, but the outlier filter is deliberately left out.
df_reduced = (
    drop_columns(df)
    .pipe(remove_rows_with_NaN_target)
    .pipe(impute_NaNs)
    .pipe(pd.get_dummies)
)
y = df_reduced.pop("baseRent")

# Hold out 20 % of the rows as test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df_reduced, y, test_size=0.2, random_state=42
)

# Fit the linear regression and predict on both splits.
model_lr = linear_model.LinearRegression().fit(X_train, y_train)
y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

# Report the metrics for both splits.
print_evaluation(
    pipeline_or_model=model_lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    y_train_pred=y_train_pred,
    y_test_pred=y_test_pred,
    feature_names=df_reduced.columns,
)

LinearRegression() Evaluation:
               R² |           RMSE |        MAE |     rows |  columns
Train     0.01653 |       21659.03 |     334.93 |   215080 |      518
Test    -47.56742 |        3638.43 |     297.75 |    53770 |      518


If the outliers are not removed, the model is extremely poor even on the training data. Since some outliers here are several orders of magnitude above the "normal" range (15 million EUR rent!), a few points are enough to make the model completely unsuitable for forecasting. 

## 3. Model: Regionally restricted

In [9]:
# Preprocessing on the listings of a single federal state only.
# Alternative with an even smaller region:
#   df[df["regio2"] == "Karlsruhe"]
df_reduced = (
    df[df["regio1"] == "Baden_Württemberg"]
    .pipe(drop_columns)
    .pipe(remove_outliers)
    .pipe(remove_rows_with_NaN_target)
    .pipe(impute_NaNs)
    .pipe(pd.get_dummies)
)
y = df_reduced.pop("baseRent")

# Hold out 20 % of the rows as test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df_reduced, y, test_size=0.2, random_state=42
)

# Fit the linear regression and predict on both splits.
model_lr = linear_model.LinearRegression().fit(X_train, y_train)
y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

# Report the metrics for both splits.
print_evaluation(
    pipeline_or_model=model_lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    y_train_pred=y_train_pred,
    y_test_pred=y_test_pred,
    feature_names=df_reduced.columns,
)

LinearRegression() Evaluation:
               R² |           RMSE |        MAE |     rows |  columns
Train     0.76241 |         214.60 |     154.29 |    12447 |      127
Test      0.74666 |         222.64 |     158.49 |     3112 |      127


The overall model quality decreases somewhat. If the region is chosen too small, there is too little data and we observe overfitting (result on test data significantly worse than on training data). Since there are fewer data points, the training is significantly faster.

## 4. Model: Restriction to $K$ features

In [10]:
# Data pre-processing
df_reduced = drop_columns(df)
df_reduced = remove_outliers(df_reduced)
df_reduced = remove_rows_with_NaN_target(df_reduced)
df_reduced = impute_NaNs(df_reduced)
df_reduced = pd.get_dummies(df_reduced)
y = df_reduced.pop("baseRent")

# Training-Test-Split
X_train, X_test, y_train, y_test = train_test_split(df_reduced, y, test_size=0.2, random_state=42)

# By SelectKBest we choose the 50 most promising features.
# The selector is fitted on the training split only and then applied
# unchanged to the test split.
feature_selection = SelectKBest(f_regression, k=50)
X_train = feature_selection.fit_transform(X_train, y_train)
X_test = feature_selection.transform(X_test)

# Training
model_lr = linear_model.LinearRegression()
model_lr.fit(X_train, y_train)
y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

# Evaluation
# Pass only the names of the selected features, so that they match the
# columns of X_train / X_test (get_support() is the boolean selection mask).
print_evaluation(model_lr, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred,
                 feature_names=df_reduced.columns[feature_selection.get_support()])

LinearRegression() Evaluation:
               R² |           RMSE |        MAE |     rows |  columns
Train     0.79481 |         185.31 |     123.01 |   207773 |       50
Test      0.79633 |         186.40 |     123.68 |    51944 |       50


The R² value drops from 0.84 to 0.79, but we only need about 10% of the features. The smaller $K$ is chosen, the worse the model becomes overall. Note: `SelectKBest` is only a *heuristic*; it is not guaranteed that the resulting model with 50 features is the best of all models with 50 features.

## 5. Model: Additional features through combination of existing features

In [11]:
# Data pre-processing
df_reduced = drop_columns(df)
df_reduced = remove_outliers(df_reduced)
df_reduced = remove_rows_with_NaN_target(df_reduced)
df_reduced = impute_NaNs(df_reduced)
df_reduced = pd.get_dummies(df_reduced)
y = df_reduced.pop("baseRent")

# A transformer that generates interaction features of (up to) degree 2
pf = PolynomialFeatures(degree=2, interaction_only=True)
# Interaction features: kreis x living Space
# Only districts that still have a dummy column after preprocessing are used;
# a district whose rows were all filtered out (or a missing regio2 value)
# would otherwise lead to a KeyError below.
kreis_columns = [
    f"regio2_{kreis}"
    for kreis in df["regio2"].dropna().unique()
    if f"regio2_{kreis}" in df_reduced.columns
]
interaction_features = {}
for col in kreis_columns:
    # Output columns of pf: bias, col, livingSpace, col * livingSpace
    features = pf.fit_transform(df_reduced[[col, "livingSpace"]])
    interaction_features[col + "_livingSpace"] = features[:, -1]
# Append all new columns in one step instead of inserting them one by one,
# which would fragment the DataFrame.
df_reduced = pd.concat(
    [df_reduced, pd.DataFrame(interaction_features, index=df_reduced.index)],
    axis=1,
)

# Training-Test-Split
X_train, X_test, y_train, y_test = train_test_split(df_reduced, y, test_size=0.2, random_state=42)

# Training
model_lr = linear_model.LinearRegression()
model_lr.fit(X_train, y_train)
y_train_pred = model_lr.predict(X_train)
y_test_pred = model_lr.predict(X_test)

# Evaluation
print_evaluation(model_lr, X_train, X_test, y_train, y_test, y_train_pred, y_test_pred,
                 feature_names=df_reduced.columns)

LinearRegression() Evaluation:
               R² |           RMSE |        MAE |     rows |  columns
Train     0.86526 |         150.16 |      95.47 |   207773 |      937
Test      0.86528 |         151.60 |      96.48 |    51944 |      937


With further useful features (here: an interaction feature between district and living space, i.e. the additional rent per additional square metre of living space, where this coefficient can vary from district to district), the model quality can be increased somewhat. We now have 937 features, and training takes significantly longer than with 518 features.