# Backward Feature Elimination

## Imports

In [1]:
import pandas as pd

from helpers.training import *
from pathlib import Path
from sklearn import model_selection

## Dataset

In [2]:
# Read the dataset from CSV.
df = pd.read_csv(Path('./data/model/immoscout_robust.csv'))

# "price" is the prediction target; every other column is used as a feature.
y = df["price"]
X = df.drop(columns="price")

# 60 % of the rows go to training, the rest to the test set.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=42,
)

# Preview the first ten rows.
df.head(10)

Unnamed: 0,ForestDensityM,Latitude,Longitude,NoisePollutionRailwayM,NoisePollutionRoadM,PopulationDensityM,RiversAndLakesM,RiversAndLakesS,distanceToTrainStation,gde_area_agriculture_percentage,...,type_penthouse,type_rustico,type_secondary-suite,type_semi-detached-house,type_single-room,type_stepped-apartment,type_stepped-house,type_studio,type_terrace-house,type_villa
0,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.19858,0.66892,0.106747,0.0,0.71514,0.80289,35.34003,0.0,-0.078321,-0.709981,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.673461,0.676133,0.112097,0.0,-0.744323,-0.303501,30.638062,0.302994,0.208265,-0.025255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.402338,0.672546,0.089462,0.0,-0.68395,0.136669,0.0,0.0,0.844996,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.688099,0.686833,0.130463,0.0,-1.131549,0.009053,3.466803,0.108954,0.809961,-0.102607,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,-0.473383,0.660603,0.106216,10.189522,0.676815,0.18526,0.0,0.0,-0.184291,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.963411,0.67232,0.122047,2.610778,0.881928,0.140491,16.075488,0.185477,-0.015675,-0.709981,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.338853,0.666867,0.089861,0.0,-0.237389,0.00648,0.0,0.0,0.737623,-0.029724,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Setup

In [3]:
# Training routines the elimination loop below is run with, one pass per entry.
# Each is called as func(X_train, X_test, y_train, y_test).
# NOTE(review): train_random_forest presumably comes from the
# `helpers.training` star-import — confirm.
TRAINING_FUNCTIONS = [
    train_random_forest,
]

## Backward feature elimination

In [4]:
# Greedy backward feature elimination, run once per training function.
#
# Starting from all features, each round retrains the model with every single
# remaining column left out, then permanently removes the column whose
# absence gives the largest score change, as long as that change is >= 0
# (i.e. dropping the column does not lower the score). The loop stops as soon
# as every possible removal lowers the score.
#
# `func` is called as func(X_train, X_test, y_train, y_test) and must return a
# dict-like result with at least a 'score' entry; a higher score is treated as
# better.
#
# NOTE(review): presumably `func` computes 'score' on the test split it is
# given. If so, features are selected on the same data the final score is
# reported on, which makes that score optimistic — consider a separate
# validation split or cross-validation for the selection step.
for func in TRAINING_FUNCTIONS:
    temp_X_train, temp_X_test = X_train.copy(), X_test.copy()

    # Baseline: model trained on the full feature set.
    best_model = func(temp_X_train, temp_X_test, y_train, y_test)

    # Keep at least one feature; a frame without columns presumably cannot be
    # trained on.
    while len(temp_X_train.columns) > 1:
        # Score every "leave one column out" candidate against the current best.
        score_map = {}
        for column in temp_X_train.columns:
            result = func(
                temp_X_train.drop(column, axis=1),
                temp_X_test.drop(column, axis=1),
                y_train,
                y_test
            )
            result['score_diff'] = result['score'] - best_model['score']
            score_map[column] = result

        # Candidate with the largest score change. max() returns the first
        # maximal entry, so a candidate that merely ties the current best
        # (score_diff == 0) is selected instead of falling through to an
        # invalid sentinel key.
        best_column = max(score_map, key=lambda name: score_map[name]['score_diff'])
        if score_map[best_column]['score_diff'] < 0:
            # Every removal lowers the score: the current feature set is final.
            break

        print('removing ', best_column)
        best_model = score_map[best_column]
        temp_X_train = temp_X_train.drop(best_column, axis=1)
        temp_X_test = temp_X_test.drop(best_column, axis=1)

    display(best_model)


removing  type_detached-secondary-suite
removing  gde_politics_pda


{'columns': ['ForestDensityM',
  'Latitude',
  'Longitude',
  'NoisePollutionRailwayM',
  'NoisePollutionRoadM',
  'PopulationDensityM',
  'RiversAndLakesM',
  'RiversAndLakesS',
  'distanceToTrainStation',
  'gde_area_agriculture_percentage',
  'gde_area_forest_percentage',
  'gde_area_nonproductive_percentage',
  'gde_average_house_hold',
  'gde_empty_apartments',
  'gde_foreigners_percentage',
  'gde_new_homes_per_1000',
  'gde_politics_bdp',
  'gde_politics_cvp',
  'gde_politics_evp',
  'gde_politics_fdp',
  'gde_politics_glp',
  'gde_politics_gps',
  'gde_politics_rights',
  'gde_politics_sp',
  'gde_politics_svp',
  'gde_pop_per_km2',
  'gde_population',
  'gde_social_help_quota',
  'gde_tax',
  'gde_workers_sector1',
  'rooms',
  'Floor space_m2',
  'Plot area_m2',
  'living_space_m2',
  'type_attic-flat',
  'type_attic-room',
  'type_castle',
  'type_chalet',
  'type_detached-house',
  'type_duplex-maisonette',
  'type_farmhouse',
  'type_flat',
  'type_furnished-residential-pr