# Develop a simple regression model to be used in the web service

### Imports

In [1]:
import __main__ as main

from helpers.paths import Paths
from joblib import load, dump
from helpers.is_interactive import is_interactive
from functools import reduce

### Run dependency notebooks

In [2]:
# Execute the two upstream notebooks, but only when `is_interactive(main)`
# is truthy.
# NOTE(review): presumably this guard prevents the upstream notebooks from
# being re-run when this notebook is itself executed via %run from another
# notebook, and the upstream notebooks presumably produce the artifacts
# loaded further down — confirm against helpers.is_interactive and the
# notebooks themselves.
if is_interactive(main):
    %run 01_0_data_wrangling.ipynb -p
    %run 02_0_scaling.ipynb -p

Running previous notebooks...


### Load Dataframe

In [3]:
# Read the stored data-wrangling artifact and keep a handle on the full
# payload in `data`.
data = load(Paths.REGRESSOR_DATA_WRANGLING_DATA)

# Work on the stored dataset without its 'price_cleaned' column.
df = data['dataset'].drop(columns='price_cleaned')

# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,Living space,Plot area,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,gde_area_nonproductive_percentage,...,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa,NoisePollutionRailway,NoisePollutionRoad,PopulationDensity,RiversAndLakes,WorkplaceDensity,ForestDensity
0,100.0,1282.323307,4.0,47.415927,8.08584,5023.0,3.038467,30.676329,51.449275,4.589372,...,0.0,0.0,0.0,0.0,0.0,0.076399,0.223039,0.031951,0.060414,0.296178
1,156.0,222.0,2.75,47.415927,8.08584,5023.0,3.038467,30.676329,51.449275,4.589372,...,0.0,0.0,1.0,0.0,0.0,0.076399,0.223039,0.031951,0.060414,0.296178
2,93.0,1198.982216,2.0,47.397416,8.04315,5000.0,0.909587,11.35442,32.197891,7.137064,...,0.0,0.0,0.0,0.0,0.0,0.33793,0.451622,0.114168,0.167442,0.08705
3,154.0,370.0,0.0,47.415927,8.08584,5023.0,3.038467,30.676329,51.449275,4.589372,...,0.0,0.0,0.0,0.0,0.0,0.076399,0.223039,0.031951,0.060414,0.296178
4,142.0,462.537377,0.0,47.40487,8.052781,5022.0,1.460245,33.13709,49.705635,1.17746,...,0.0,0.0,0.0,0.0,0.0,0.16745,0.177506,0.114288,0.046822,0.252992
5,190.0,1063.0,0.0,47.401163,8.012034,5018.0,3.147978,32.994924,53.19797,1.218274,...,0.0,0.0,0.0,0.0,0.0,0.158337,0.317534,0.0,0.068993,0.129393
6,124.0,200.0,2.75,47.415927,8.08584,5023.0,3.038467,30.676329,51.449275,4.589372,...,0.0,0.0,1.0,0.0,0.0,0.076399,0.223039,0.031951,0.060414,0.296178
7,112.0,94.942325,3.0,47.388821,8.042194,5000.0,0.738825,11.35442,32.197891,7.137064,...,0.0,0.0,0.0,0.0,0.05245,0.387507,0.26742,8.4e-05,0.303656,0.017111
8,75.0,41.432719,1.620893,47.400929,8.070691,5032.0,1.018878,11.35442,32.197891,7.137064,...,0.0,0.0,0.0,0.0,0.008914,0.370294,0.313341,0.036454,0.090869,0.347674
9,110.0,2116.0,0.0,47.395295,8.012752,5018.0,2.818486,32.994924,53.19797,1.218274,...,1.0,0.0,0.0,0.0,0.0,0.191687,0.229131,0.0,0.067679,0.145702


## Prepare the metadata for the simple model

In [4]:
# Features that are NOT derived from per-zip-code means further down in this
# notebook. The order of this list is kept exactly as originally defined.
asked_features = [
    'Living space', 'Plot area',
    'Zip',
    'rooms',
    'type_unified',
    'Floor',
]

## Find mean values per zip code to complete the data needed for the model

In [5]:
# Columns that get a per-zip-code mean: everything that is neither an asked
# feature nor a one-hot 'type_unified_*' indicator. The selection does not
# depend on the zip code, so it is computed once up front.
mean_columns = [
    col for col in df.columns
    if col not in asked_features
    and not col.startswith('type_unified_')
]

# One grouped aggregation replaces the nested `reduce` calls, which
# re-filtered the whole frame for every (zip code, column) pair and copied
# the accumulated dict on every step.
# Result: {zip_code: {column: mean of that column over rows with that zip}}.
# Rows are grouped by the integer-cast Zip, matching the integer keys the
# original produced.
feature_means = (
    df[mean_columns]
    .groupby(df.Zip.astype('int'))
    .mean()
    .to_dict(orient='index')
)
feature_means

{5023: {'gde_politics_fdp': 18.3553597651,
  'NoisePollutionRoad': 0.07639862980718953,
  'RiversAndLakes': 0.031950760416377635,
  'distanceToTrainStation': 3.0384665928009302,
  'gde_workers_sector3': 308.0,
  'gde_area_settlement_percentage': 13.2850241546,
  'gde_foreigners_percentage': 9.25566343,
  'gde_population': 1545.0,
  'gde_new_homes_per_1000': 4.739336492900001,
  'NoisePollutionRailway': 0.0,
  'gde_politics_gps': 7.066813509499999,
  'gde_area_nonproductive_percentage': 4.5893719807,
  'gde_empty_apartments': 1.9946808511,
  'gde_pop_per_km2': 376.829268293,
  'gde_workers_sector1': 14.0,
  'PopulationDensity': 0.22303903568912334,
  'Longitude': 8.0858398,
  'gde_workers_total': 331.0,
  'gde_area_forest_percentage': 51.449275362299986,
  'gde_private_apartments': 686.0,
  'gde_politics_svp': 30.8094713656,
  'gde_politics_cvp': 4.5796622614,
  'WorkplaceDensity': 0.060413634567358564,
  'gde_average_house_hold': 2.23,
  'gde_workers_sector2': 9.0,
  'gde_area_agricult

## Find the possible options for Zip code and Type

In [6]:
# Selectable values for the two enumerated inputs:
#   - 'Zip': every distinct zip code in the dataset, as sorted integers.
#   - 'type_unified': the type names recovered from the one-hot column names.
# The type name is everything after the 'type_unified_' prefix. Taking only
# the last '_'-separated piece would truncate a type name that itself
# contains an underscore; hyphenated names such as 'stepped-house' come out
# the same either way.
type_prefix = 'type_unified_'
options = {
    'Zip': sorted(df.Zip.astype('int').unique()),
    'type_unified': [
        col[len(type_prefix):]
        for col in df.columns
        if col.startswith(type_prefix)
    ],
}
options

{'Zip': [1000,
  1003,
  1004,
  1005,
  1006,
  1007,
  1008,
  1009,
  1010,
  1012,
  1015,
  1020,
  1022,
  1023,
  1024,
  1025,
  1026,
  1028,
  1029,
  1030,
  1032,
  1033,
  1034,
  1035,
  1036,
  1037,
  1038,
  1040,
  1041,
  1042,
  1045,
  1052,
  1053,
  1054,
  1055,
  1059,
  1061,
  1063,
  1066,
  1070,
  1071,
  1072,
  1073,
  1080,
  1081,
  1082,
  1083,
  1088,
  1091,
  1092,
  1093,
  1095,
  1096,
  1098,
  1110,
  1112,
  1113,
  1114,
  1116,
  1117,
  1121,
  1122,
  1124,
  1128,
  1131,
  1132,
  1134,
  1135,
  1136,
  1142,
  1143,
  1145,
  1146,
  1148,
  1149,
  1162,
  1163,
  1164,
  1166,
  1167,
  1168,
  1169,
  1170,
  1172,
  1174,
  1176,
  1180,
  1182,
  1183,
  1184,
  1185,
  1186,
  1187,
  1188,
  1196,
  1197,
  1200,
  1201,
  1202,
  1203,
  1204,
  1206,
  1207,
  1208,
  1209,
  1212,
  1213,
  1214,
  1216,
  1217,
  1218,
  1219,
  1220,
  1222,
  1223,
  1224,
  1225,
  1226,
  1227,
  1228,
  1231,
  1232,
  1233,
  1234,
 

In [7]:
# Load the artifact stored at Paths.REGRESSOR_SCALING_DATA. In the cells
# visible here only its 'scaler' entry is used, when the web-service
# metadata is dumped.
scaling_data = load(Paths.REGRESSOR_SCALING_DATA)

In [8]:
# Bundle everything prepared above into a single payload and persist it to
# Paths.WEBSERVICE_META_DATA. 'power_options' is the range 2..6 (inclusive).
meta_data = dict(
    asked_features=asked_features,
    options=options,
    feature_means=feature_means,
    power_options=range(2, 7),
    scaler=scaling_data['scaler'],
)
dump(meta_data, Paths.WEBSERVICE_META_DATA)

['data/web_service/01_0_meta_data.dump']