# Data Wrangling

## Imports

In [11]:
import pandas as pd

from sklearn.impute import KNNImputer
from zipfile import ZipFile
from joblib import dump
from helpers.paths import Paths

## ZIP Import

In [12]:
source_df = pd.read_csv(ZipFile(Paths.IMMOSCOUT_SOURCE_DATA).open("immo_data_202208_v2.csv"), sep=',', index_col = 0, low_memory=False)
clean_df = source_df.copy()

### Delete Unnamed Columns

In [13]:
clean_df = clean_df.loc[:, ~clean_df.columns.str.contains('^Unnamed')]
clean_df.head()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,features,description_detailed,Floor space:,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned
0,Biberstein,100 m²,,,On request,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
1,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
2,,,,,,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,
3,Biberstein,154 m²,370 m²,257 m²,On request,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
4,Küttigen,142 m²,,,On request,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,https://www.immoscout24.ch//en/d/flat-buy-romb...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,


### There are Column which we can merge

In [14]:
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['detail_responsive#municipality'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Gemeinde'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Commune'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Comune'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Municipality_merged'])

clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['detail_responsive#surface_living'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Wohnfläche'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Surface habitable'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Superficie abitabile'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Living_space_merged'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Living_area_unified'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Space extracted'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Surface living:'])

clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['detail_responsive#surface_property'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Grundstücksfläche'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Surface du terrain'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Superficie del terreno'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Plot_area_merged'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Plot_area_unified'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Land area:'])

clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['detail_responsive#surface_usable'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Nutzfläche'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Surface utile'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Superficie utile'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Floor_space_merged'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Floor space:'])

clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['detail_responsive#floor'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Stockwerk'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Étage'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Piano'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Floor_merged'])

clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['detail_responsive#available_from'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Verfügbarkeit'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Disponibilité'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Disponibilità'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Availability_merged'])

clean_df = clean_df.drop(['detail_responsive#municipality', 'Gemeinde', 'Commune', 'Comune', 'Municipality_merged',
                            'detail_responsive#surface_living', 'Wohnfläche', 'Surface habitable', 'Superficie abitabile', 'Living_space_merged', 'Living_area_unified', 'Space extracted', 'Surface living:',
                            'detail_responsive#surface_property', 'Grundstücksfläche', 'Surface du terrain', 'Superficie del terreno', 'Plot_area_merged', 'Plot_area_unified', 'Land area:',
                            'detail_responsive#surface_usable', 'Nutzfläche', 'Surface utile', 'Superficie utile', 'Floor_space_merged', 'Floor space:',
                            'detail_responsive#floor', 'Stockwerk', 'Étage', 'Piano', 'Floor_merged',
                            'detail_responsive#available_from', 'Verfügbarkeit', 'Disponibilité', 'Disponibilità', 'Availability_merged'], axis=1)

clean_df.head()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,Year built:,features,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned
0,Biberstein,100 m²,,,On request,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
1,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
2,Küttigen,93 m²,,,Immediately,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,
3,Biberstein,154 m²,370 m²,257 m²,On request,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
4,Küttigen,142 m²,,,On request,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,https://www.immoscout24.ch//en/d/flat-buy-romb...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,


### Take Informations from Column "details", and put the Informations into "rooms" and "Living space" if they are nan or 0

In [15]:
clean_df['rooms_from_details'] = (clean_df['details'].str.extract(r'(\d+) rooms')).astype(float)
clean_df['space_from_details'] = (clean_df['details'].str.extract(r'(\d+) m²')).astype(float)

clean_df['rooms'] = clean_df['rooms'].mask(clean_df['rooms'] == 0.0, clean_df['rooms_from_details'])
clean_df['rooms'] = clean_df['rooms'].fillna(clean_df['rooms_from_details'])

clean_df['Living space'] = clean_df['Living space'].fillna(clean_df['space_from_details'])

clean_df = clean_df.drop(['details', 'rooms_from_details', 'space_from_details'], axis=1)

clean_df.head()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,Year built:,features,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned
0,Biberstein,100 m²,,,On request,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
1,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
2,Küttigen,93 m²,,,Immediately,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,
3,Biberstein,154 m²,370 m²,257 m²,On request,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
4,Küttigen,142 m²,,,On request,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,https://www.immoscout24.ch//en/d/flat-buy-romb...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,


### Remove m² from Columns

In [16]:
clean_df['Living space'] = clean_df['Living space'].astype(str)
clean_df['Living space'] = clean_df['Living space'].str.replace('m²', '')
clean_df['Living space'] = clean_df['Living space'].str.replace('m2', '')
clean_df['Living space'] = clean_df['Living space'].astype(float)

clean_df['Plot area'] = clean_df['Plot area'].astype(str)
clean_df['Plot area'] = clean_df['Plot area'].str.replace('m²', '')
clean_df['Plot area'] = clean_df['Plot area'].str.replace(',', '')
clean_df['Plot area'] = clean_df['Plot area'].astype(float)

clean_df['Floor space'] = clean_df['Floor space'].astype(str)
clean_df['Floor space'] = clean_df['Floor space'].str.replace('m²', '')
clean_df['Floor space'] = clean_df['Floor space'].str.replace('m2', '')
clean_df['Floor space'] = clean_df['Floor space'].astype(float)

clean_df['Floor'] = clean_df['Floor'].astype(str)
clean_df['Floor'] = clean_df['Floor'].str.replace('Ground floor', '0')
clean_df['Floor'] = clean_df['Floor'].str.replace('GF', '0')
clean_df['Floor'] = clean_df['Floor'].str.replace('. floor', '', regex=False)
clean_df['Floor'] = clean_df['Floor'].str.replace(r'(\d+)\. Basement', lambda x: str(int(x.group(1)) * -1), regex=True)
clean_df['Floor'] = clean_df['Floor'].astype(float)

clean_df['rooms'] = clean_df['rooms'].astype(str)
clean_df['rooms'] = clean_df['rooms'].str.replace('rm', '')
clean_df['rooms'] = clean_df['rooms'].astype(float)

clean_df.head()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,Year built:,features,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned
0,Biberstein,100.0,,,On request,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
1,Biberstein,156.0,222.0,242.0,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
2,Küttigen,93.0,,,Immediately,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,
3,Biberstein,154.0,370.0,257.0,On request,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5023.0,,,,
4,Küttigen,142.0,,,On request,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,https://www.immoscout24.ch//en/d/flat-buy-romb...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,,,5022.0,,,,


### How many NAN's has each column

In [17]:
(clean_df.isnull().sum() / len(clean_df)) * 100

Municipality             1.023086
Living space             2.668920
Plot area               63.480272
Floor space             74.609670
Availability            40.594280
                          ...    
plz                      0.075619
Number of toilets:      99.675281
Gross yield:            99.937725
Minimum floor space:    99.991104
space_cleaned           59.405720
Length: 95, dtype: float64

### Remove redundant columns

In [18]:
columns = ['Municipality',#unbrauchbar wegen long und lat
            'Availability',#zu wenig Daten
            'location',#long und lat
            'location_parsed',#long und lat
            'description',#unbrauchbar         
            'detailed_description',#unbrauchbar
            'url',#unbrauchbar
            'table',#unbrauchbar
            'Gross return',#zu wenig Daten
            'title',#unbrauchbar
            'address',#unbrauchbar wegen long und lat
            'price',#gibt price_cleaned
            'link',#unbrauchbar
            'details_structured',#unbrauchbar
            'lat',#gibt latitute
            'lon',#gibt longitude
            'index',#unbrauchbar
            'Locality',#long und lat
            'plz_parsed',#gibt ZIP
            'type',#gibt type_unified
            'Floor_unified',#gibt Floor
            'provider',#unbrauchbar
            'space',#gibt Living space
            'price_s',#gibt price_cleaned
            'address_s',#unbrauchbar wegen long und lat
            'No. of rooms:',#gibt schon rooms
            'Number of apartments:',#unbrauchbar
            'Room height:',#zu wenig Daten
            'Last refurbishment:',#zu wenig Daten
            'Year built:',#zu wenig Daten
            'features',#unbrauchbar
            'description_detailed',#unbrauchbar
            'Number of floors:',#zu wenig Daten
            'Volume:',#unbrauchbar
            'plz',#gibt ZIP
            'Number of toilets:',#zu wenig Daten
            'Gross yield:',#zu wenig Daten
            'Minimum floor space:',#zu wenig Daten
            'space_cleaned',#gibt Living space
            'gde_politics_bdp',#zu wenig Daten
            'gde_politics_evp',#zu wenig Daten
            'gde_politics_glp',#zu wenig Daten
            'gde_politics_pda',#zu wenig Daten
            'gde_politics_rights'#zu wenig Daten
]

clean_df = clean_df.drop(columns, axis=1)
clean_df.head()

Unnamed: 0,Living space,Plot area,Floor space,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
0,100.0,,,4.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,5.0,penthouse
1,156.0,222.0,242.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,5.0,terrace-house
2,93.0,,,2.0,0.163362,0.095877,0.001911,47.397416,8.04315,0.0,...,10149.0,3.54901,6.05,37.0,3092.0,30364.0,33493.0,720000.0,5.0,penthouse
3,154.0,370.0,257.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,5.0,detached-house
4,142.0,,,0.0,0.333865,0.279276,0.145835,47.40487,8.052781,0.0,...,2638.0,1.708126,6.3,65.0,349.0,941.0,1355.0,995000.0,5.0,flat


### How many NAN's has each column

In [19]:
(clean_df.isnull().sum() / len(clean_df)) * 100

Living space                          2.668920
Plot area                            63.480272
Floor space                          74.609670
Floor                                54.383702
ForestDensityL                        0.000000
ForestDensityM                        0.000000
ForestDensityS                        0.000000
Latitude                              0.000000
Longitude                             0.000000
NoisePollutionRailwayL                0.000000
NoisePollutionRailwayM                0.000000
NoisePollutionRailwayS                0.000000
NoisePollutionRoadL                   0.000000
NoisePollutionRoadM                   0.000000
NoisePollutionRoadS                   0.000000
PopulationDensityL                    0.000000
PopulationDensityM                    0.000000
PopulationDensityS                    0.000000
RiversAndLakesL                       0.000000
RiversAndLakesM                       0.000000
RiversAndLakesS                       0.000000
WorkplaceDens

### Check for every columns if there are absurd values, which are not possible
### e.g. negative prices, negative living space, negative plot area, etc.

In [20]:
stats_df = pd.DataFrame(columns=["min", "0.25 quantile", "mean", "median", "0.75 quantile", "max"])
clean_df_numeric = clean_df.copy().drop(['type_unified'], axis=1)
stats_df["min"] = clean_df_numeric.min()
stats_df["0.25 quantile"] = clean_df_numeric.quantile(0.25)
stats_df["mean"] = clean_df_numeric.mean()
stats_df["median"] = clean_df_numeric.median()
stats_df["0.75 quantile"] = clean_df_numeric.quantile(0.75)
stats_df["max"] = clean_df_numeric.max()
stats_df = stats_df.round(2)
stats_df = stats_df[~stats_df.index.str.startswith("type_")]
stats_df

Unnamed: 0,min,0.25 quantile,mean,median,0.75 quantile,max
Living space,0.0,97.0,176.42,130.0,185.0,9681.0
Plot area,1.0,347.25,1377.37,646.0,1137.0,350917.0
Floor space,1.0,101.0,201.05,148.0,234.0,7798.0
Floor,-4.0,0.0,4.44,1.0,2.0,1011.0
ForestDensityL,0.0,0.02,0.19,0.11,0.3,0.9
ForestDensityM,0.0,0.0,0.12,0.03,0.19,1.0
ForestDensityS,0.0,0.0,0.08,0.0,0.05,1.0
Latitude,45.83,46.22,46.64,46.5,47.06,47.79
Longitude,5.97,6.93,7.58,7.36,8.29,10.4
NoisePollutionRailwayL,0.0,0.0,0.01,0.0,0.02,0.26


Living space hat einen komischen min Wert

Plot area hat einen komischen min Wert

Floor space Hat einen komischen min Wert

Floor hat komische max Werte

price_cleaned hat einen komischen min Wert

In [23]:
floor_space_cutoff = 5
clean_df[clean_df['Floor space'] <= floor_space_cutoff]

Unnamed: 0,Living space,Plot area,Floor space,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
2203,68.0,,4.0,,0.290155,0.249285,0.152095,46.68832,7.844555,0.000488,...,2717.0,2.88225,8.09,56.0,517.0,2389.0,2962.0,560000.0,5.0,flat
4238,74.0,,5.0,,0.390434,0.167669,0.067203,46.808449,9.2592,0.0,...,873.0,1.731602,4.32,23.0,121.0,1243.0,1387.0,695000.0,5.0,flat
13210,76.0,,4.0,3.0,0.286686,0.139504,0.0,47.246667,8.607851,0.0,...,8762.0,2.548622,4.15,87.0,1660.0,7636.0,9383.0,730000.0,5.0,flat
20135,75.0,,4.0,8.0,0.019785,0.0,0.0,47.09324,6.814634,0.033235,...,18259.0,11.936237,10.29,217.0,10364.0,15056.0,25637.0,250000.0,3.5,flat
22111,195.0,578.0,1.0,,0.298248,0.155929,0.01441,47.332588,8.339168,0.0,...,3437.0,1.839228,6.16,55.0,999.0,3300.0,4354.0,1400000.0,8.5,detached-house


In [24]:
floor_cutoff = 20
clean_df[clean_df['Floor'] > floor_cutoff]

Unnamed: 0,Living space,Plot area,Floor space,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
3133,96.0,,,100.0,0.4595,0.283642,0.064622,46.559132,7.081714,0.0,...,554.0,2.013943,6.38,41.0,203.0,195.0,439.0,550000.0,5.0,flat
5754,138.0,,,21.0,0.100651,0.000672,0.0,47.6828,8.615092,0.022311,...,4872.0,4.711128,6.49,38.0,2017.0,3674.0,5729.0,1320000.0,5.0,flat
10857,162.0,,,999.0,0.032747,0.0,0.0,46.422879,6.260065,0.001111,...,5324.0,3.488646,8.44,49.0,655.0,5519.0,6223.0,1160000.0,5.0,flat
10884,147.0,,,999.0,0.598226,0.324451,0.239227,46.447648,6.13341,0.0,...,1177.0,5.034793,8.37,18.0,44.0,246.0,308.0,1050000.0,5.0,flat
11113,159.0,,,999.0,0.005083,0.0,0.0,46.472305,6.832813,0.039872,...,1019.0,2.71639,8.56,36.0,37.0,347.0,420.0,1740000.0,5.0,flat
11308,219.0,,,999.0,0.009171,0.0,0.0,46.43688,6.911776,0.033026,...,12024.0,7.35066,8.56,72.0,1288.0,11020.0,12380.0,3900000.0,5.0,flat
11323,180.0,,,999.0,0.009171,0.0,0.0,46.43688,6.911776,0.033026,...,12024.0,7.35066,8.56,72.0,1288.0,11020.0,12380.0,2590000.0,5.0,flat
12081,113.0,,113.0,23.0,0.0,0.0,0.0,46.107754,7.076626,0.04508,...,8000.0,3.552207,4.16,217.0,2176.0,10485.0,12878.0,555000.0,5.0,flat
12214,156.0,,156.0,23.0,0.091187,0.0,0.0,46.315045,6.901379,0.0,...,1123.0,2.055336,4.3,33.0,488.0,331.0,852.0,790000.0,5.0,duplex-maisonette
13475,97.0,,,999.0,0.103021,0.096711,0.051118,46.552803,6.597701,0.0,...,522.0,1.909477,8.29,0.0,45.0,82.0,127.0,1750000.0,3.5,flat


In [25]:
price_cleaned_cutoff = 10_000
clean_df[clean_df['price_cleaned'] < price_cleaned_cutoff]

Unnamed: 0,Living space,Plot area,Floor space,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
3614,,,,,0.061377,0.065835,0.0,46.179057,6.119311,0.005745,...,12461.0,5.213814,3.24,23.0,2072.0,17171.0,19266.0,1600.0,,flat
3811,,,,,0.034854,0.02574,0.0,46.262419,6.214042,0.0,...,688.0,1.289043,3.02,10.0,34.0,258.0,302.0,3500.0,5.0,flat
3906,140.0,,,,0.262601,0.387136,0.151863,46.166641,6.170931,0.0,...,3717.0,2.128981,3.08,38.0,197.0,1870.0,2105.0,2.0,7.0,detached-house
4398,75.0,,,,0.17494,0.091614,0.0,47.406451,7.040699,0.0,...,323.0,1.295337,9.88,28.0,154.0,64.0,246.0,1100.0,5.0,flat
5021,45.0,,,8.0,0.161326,0.300538,0.280474,47.053017,6.753855,0.0,...,4830.0,9.661049,9.98,83.0,5371.0,2812.0,8266.0,745.0,2.0,flat
5026,110.0,,,2.0,0.161326,0.300538,0.280474,47.053017,6.753855,0.0,...,4830.0,9.661049,9.98,83.0,5371.0,2812.0,8266.0,1275.0,5.0,flat
10213,170.0,,,1.0,0.024808,0.028412,0.0,46.522587,6.635715,0.0,...,67913.0,8.784994,9.07,145.0,6470.0,111090.0,117705.0,4450.0,5.0,flat
10530,19.0,,,,0.016314,0.007802,0.0,46.529557,6.560561,0.001126,...,5015.0,3.728427,8.42,14.0,2668.0,13165.0,15847.0,950.0,,single-room
10690,,,,,0.001743,0.0,0.0,46.487053,6.421055,0.0,...,1061.0,1.361007,8.37,95.0,393.0,2246.0,2734.0,1.0,,terrace-house
10798,,,,,0.200541,0.047633,0.0,46.47533,6.336503,0.0,...,1005.0,1.031322,8.46,98.0,43.0,354.0,495.0,1850.0,3.0,flat


In [26]:
clean_df = clean_df[(clean_df['Floor space'] > floor_space_cutoff) | (clean_df['Floor space'].isnull())]
clean_df = clean_df[(clean_df['Floor'] < floor_cutoff) | (clean_df['Floor'].isnull())]
clean_df = clean_df[(clean_df['price_cleaned'] > price_cleaned_cutoff) | (clean_df['price_cleaned'].isnull())]
clean_df.head()

Unnamed: 0,Living space,Plot area,Floor space,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
0,100.0,,,4.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,5.0,penthouse
1,156.0,222.0,242.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,5.0,terrace-house
2,93.0,,,2.0,0.163362,0.095877,0.001911,47.397416,8.04315,0.0,...,10149.0,3.54901,6.05,37.0,3092.0,30364.0,33493.0,720000.0,5.0,penthouse
3,154.0,370.0,257.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,5.0,detached-house
4,142.0,,,0.0,0.333865,0.279276,0.145835,47.40487,8.052781,0.0,...,2638.0,1.708126,6.3,65.0,349.0,941.0,1355.0,995000.0,5.0,flat


In [27]:
clean_df = clean_df.drop(["Floor space"], axis = 1)
clean_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
0,100.0,,4.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,5.0,penthouse
1,156.0,222.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,5.0,terrace-house
2,93.0,,2.0,0.163362,0.095877,0.001911,47.397416,8.04315,0.0,0.0,...,10149.0,3.54901,6.05,37.0,3092.0,30364.0,33493.0,720000.0,5.0,penthouse
3,154.0,370.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,5.0,detached-house
4,142.0,,0.0,0.333865,0.279276,0.145835,47.40487,8.052781,0.0,0.0,...,2638.0,1.708126,6.3,65.0,349.0,941.0,1355.0,995000.0,5.0,flat


### Imputation

In [28]:
clean_df = clean_df[clean_df.type_unified.notna()]

In [29]:
nan_percentage = (clean_df.isnull().sum() / len(clean_df)) * 100

cols = nan_percentage[(nan_percentage < 20) & (nan_percentage > 0)].index
cols

Index(['Living space', 'gde_politics_cvp', 'gde_politics_fdp',
       'gde_politics_gps', 'gde_politics_sp', 'gde_politics_svp',
       'price_cleaned', 'rooms'],
      dtype='object')

In [30]:
TYPES_WITH_NO_FLOOR = ['villa', 'farmhouse', 'castle', 'chalet', 'detached-house', 'detached-house'] 

for type_ in TYPES_WITH_NO_FLOOR:
    clean_df.loc[clean_df.type_unified == type_, 'Floor'] = clean_df[clean_df.type_unified == type_].Floor.fillna(0.)

clean_df[clean_df.type_unified == TYPES_WITH_NO_FLOOR[2]]

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
1795,2111.0,2111.0,0.0,0.014403,0.0,0.0,46.976675,7.482068,0.0,0.0,...,5184.0,6.883231,7.22,18.0,921.0,11495.0,12434.0,,2.0,castle
2612,,,0.0,0.025704,0.0,0.0,47.466193,7.814191,0.041077,0.000443,...,2957.0,1.871981,5.94,115.0,1370.0,2584.0,4069.0,510000.0,,castle
3294,685.0,7548.0,0.0,0.059019,0.0,0.0,46.754509,7.146302,0.0,0.0,...,414.0,1.798561,6.32,46.0,63.0,123.0,232.0,,7.0,castle
3570,900.0,7623.0,0.0,0.275512,0.156081,0.210352,46.496443,6.871439,0.0,0.0,...,1555.0,5.140866,8.64,26.0,667.0,732.0,1425.0,2950000.0,3.0,castle
7445,721.0,77314.0,0.0,0.717481,0.709,0.589752,46.154143,8.637526,0.0,0.0,...,544.0,2.307692,4.03,35.0,73.0,302.0,410.0,,4.0,castle
9911,275.0,302.0,0.0,0.000167,0.0,0.0,46.593586,6.569423,0.0,0.0,...,383.0,2.164502,8.8,23.0,19.0,57.0,99.0,2350000.0,8.0,castle
10528,465.0,2917.0,0.0,0.08874,0.104456,0.019483,46.502319,6.725259,0.001658,0.0,...,2274.0,1.107082,8.37,337.0,186.0,1328.0,1851.0,,6.0,castle
10873,1034.0,38780.0,0.0,0.019735,0.052701,0.049039,46.405333,6.174877,0.0,0.0,...,473.0,0.572363,7.98,38.0,106.0,130.0,274.0,,5.0,castle
10939,700.0,10519.0,0.0,0.089547,0.091108,0.062574,46.42909,6.249294,0.0,0.0,...,401.0,0.0,8.72,18.0,293.0,353.0,664.0,,0.0,castle
13617,275.0,302.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,383.0,2.164502,8.8,23.0,19.0,57.0,99.0,2350000.0,8.0,castle


In [31]:
types = clean_df.type_unified.unique()

imputers = {}

columns = clean_df.columns

for type_ in types:
    indexes = clean_df.type_unified == type_
    df_type = clean_df[indexes].copy()
    type_unified = df_type.type_unified
    df_type = df_type.drop(['type_unified'], axis=1)

    imputer = KNNImputer(n_neighbors=15, weights='distance', keep_empty_features=True)

    df_type[df_type.columns] = imputer.fit_transform(df_type)
    df_type.insert(2, 'type_unified', type_unified.values)

    clean_df.loc[indexes, columns] = df_type
    imputers[type_] = imputer

clean_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,rooms,type_unified
0,100.0,964.462853,4.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,5.0,penthouse
1,156.0,222.0,2.92054,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,5.0,terrace-house
2,93.0,903.562312,2.0,0.163362,0.095877,0.001911,47.397416,8.04315,0.0,0.0,...,10149.0,3.54901,6.05,37.0,3092.0,30364.0,33493.0,720000.0,5.0,penthouse
3,154.0,370.0,0.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,5.0,detached-house
4,142.0,1149.735642,0.0,0.333865,0.279276,0.145835,47.40487,8.052781,0.0,0.0,...,2638.0,1.708126,6.3,65.0,349.0,941.0,1355.0,995000.0,5.0,flat


In [32]:
#take from "NoisePollutionRailwayS" and "NoisePollutionRailwayM" "NoisePollutionRailwayL" the mean
clean_df["NoisePollutionRailway"] = clean_df[["NoisePollutionRailwayS", "NoisePollutionRailwayM", "NoisePollutionRailwayL"]].mean(axis=1)
clean_df = clean_df.drop(["NoisePollutionRailwayS", "NoisePollutionRailwayM", "NoisePollutionRailwayL"], axis = 1)

clean_df["NoisePollutionRoad"] = clean_df[["NoisePollutionRoadS", "NoisePollutionRoadM", "NoisePollutionRoadL"]].mean(axis=1)
clean_df = clean_df.drop(["NoisePollutionRoadS", "NoisePollutionRoadM", "NoisePollutionRoadL"], axis = 1)

clean_df["PopulationDensity"] = clean_df[["PopulationDensityS", "PopulationDensityM", "PopulationDensityL"]].mean(axis=1)
clean_df = clean_df.drop(["PopulationDensityS", "PopulationDensityM", "PopulationDensityL"], axis = 1)

clean_df["RiversAndLakes"] = clean_df[["RiversAndLakesS", "RiversAndLakesM", "RiversAndLakesL"]].mean(axis=1)
clean_df = clean_df.drop(["RiversAndLakesS", "RiversAndLakesM", "RiversAndLakesL"], axis = 1)

clean_df["WorkplaceDensity"] = clean_df[["WorkplaceDensityS", "WorkplaceDensityM", "WorkplaceDensityL"]].mean(axis=1)
clean_df = clean_df.drop(["WorkplaceDensityS", "WorkplaceDensityM", "WorkplaceDensityL"], axis = 1)

clean_df["ForestDensity"] = clean_df[["ForestDensityS", "ForestDensityM", "ForestDensityL"]].mean(axis=1)
clean_df = clean_df.drop(["ForestDensityS", "ForestDensityM", "ForestDensityL"], axis = 1)

In [33]:
dump({
    'dataset': clean_df,
    'imputers': imputers
}, Paths.CLASSIFIER_DATA_WRANGLING_DATA)

['data/classifier/01_0_data_wrangling.dump']