# Data Wrangling

### Imports

In [13]:
import pandas as pd
import __main__ as main

from pathlib import Path
from joblib import load, dump
from helpers.is_interactive import is_interactive

### Run dependency notebooks

In [14]:
if is_interactive(main):
    %run 01_0_data_wrangling.ipynb -p

Running previous notebooks...


## ZIP Import

In [15]:
#import data/kaggle/test_data-Kaggle-v0.10.csv.zip as a pandas dataframe
source_df = pd.read_csv('data/kaggle/test_data-Kaggle-v0.10.csv.zip', compression='zip', header=0, sep=',', quotechar='"')
clean_df = source_df.copy()

  source_df = pd.read_csv('data/kaggle/test_data-Kaggle-v0.10.csv.zip', compression='zip', header=0, sep=',', quotechar='"')


### Delete Unnamed Columns

In [16]:
clean_df = clean_df.loc[:, ~clean_df.columns.str.contains('^Unnamed')]
clean_df.head()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,Floor space:,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
0,Suhr,220 m²,733 m²,,On request,"Galeggenweg 95034 Suhr, AG","6.5 rooms, 220 m²«Landhausvilla mit einzigarti...",DescriptionVilla lädt zum Träumen ein Wir ver...,https://www.immoscout24.ch//en/d/villa-buy-suh...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,220.0,,
1,Suhr,230 m²,702 m²,,On request,"Hofstattmattenweg 195034 Suhr, AG","7.5 rooms, 230 m²«Grosszügiges Anwesen mit tra...","DescriptionLicht, Glas und Holz - ein freisteh...",https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,230.0,,
2,Biberstein,131 m²,,,On request,"Gheldweg 105023 Biberstein, AG","4.5 rooms, 131 m²«Terrassenhaus mit traumhafte...",DescriptionLage Das 4.5-Zimmer-Terrassenhaus ...,https://www.immoscout24.ch//en/d/stepped-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,131.0,,
3,Suhr,140 m²,206 m²,140 m²,Immediately,"Obere Dorfstrasse 275034 Suhr, AG","6.5 rooms, 140 m²«Mittelhaus in Suhr an attrak...",DescriptionReiheneinfamilienhaus (Mittelhaus) ...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,140.0,,
4,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Bezugsbereit - ruhige Lage u...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,156.0,,


### There are Column which we can merge

In [17]:
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['detail_responsive#municipality'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Gemeinde'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Commune'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Comune'])
clean_df['Municipality']    = clean_df['Municipality'].fillna(clean_df['Municipality_merged'])

clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['detail_responsive#surface_living'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Wohnfläche'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Surface habitable'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Superficie abitabile'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Living_space_merged'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Living_area_unified'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Space extracted'])
clean_df['Living space']    = clean_df['Living space'].fillna(clean_df['Surface living:'])

clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['detail_responsive#surface_property'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Grundstücksfläche'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Surface du terrain'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Superficie del terreno'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Plot_area_merged'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Plot_area_unified'])
clean_df['Plot area']       = clean_df['Plot area'].fillna(clean_df['Land area:'])

clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['detail_responsive#surface_usable'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Nutzfläche'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Surface utile'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Superficie utile'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Floor_space_merged'])
clean_df['Floor space']     = clean_df['Floor space'].fillna(clean_df['Floor space:'])

clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['detail_responsive#floor'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Stockwerk'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Étage'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Piano'])
clean_df['Floor']           = clean_df['Floor'].fillna(clean_df['Floor_merged'])

clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['detail_responsive#available_from'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Verfügbarkeit'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Disponibilité'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Disponibilità'])
clean_df['Availability']  = clean_df['Availability'].fillna(clean_df['Availability_merged'])

clean_df = clean_df.drop(['detail_responsive#municipality', 'Gemeinde', 'Commune', 'Comune', 'Municipality_merged',
                            'detail_responsive#surface_living', 'Wohnfläche', 'Surface habitable', 'Superficie abitabile', 'Living_space_merged', 'Living_area_unified', 'Space extracted', 'Surface living:',
                            'detail_responsive#surface_property', 'Grundstücksfläche', 'Surface du terrain', 'Superficie del terreno', 'Plot_area_merged', 'Plot_area_unified', 'Land area:',
                            'detail_responsive#surface_usable', 'Nutzfläche', 'Surface utile', 'Superficie utile', 'Floor_space_merged', 'Floor space:',
                            'detail_responsive#floor', 'Stockwerk', 'Étage', 'Piano', 'Floor_merged',
                            'detail_responsive#available_from', 'Verfügbarkeit', 'Disponibilité', 'Disponibilità', 'Availability_merged'], axis=1)

clean_df.head()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
0,Suhr,220 m²,733 m²,,On request,"Galeggenweg 95034 Suhr, AG","6.5 rooms, 220 m²«Landhausvilla mit einzigarti...",DescriptionVilla lädt zum Träumen ein Wir ver...,https://www.immoscout24.ch//en/d/villa-buy-suh...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,220.0,,
1,Suhr,230 m²,702 m²,,On request,"Hofstattmattenweg 195034 Suhr, AG","7.5 rooms, 230 m²«Grosszügiges Anwesen mit tra...","DescriptionLicht, Glas und Holz - ein freisteh...",https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,230.0,,
2,Biberstein,131 m²,,,On request,"Gheldweg 105023 Biberstein, AG","4.5 rooms, 131 m²«Terrassenhaus mit traumhafte...",DescriptionLage Das 4.5-Zimmer-Terrassenhaus ...,https://www.immoscout24.ch//en/d/stepped-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,131.0,,
3,Suhr,140 m²,206 m²,140 m²,Immediately,"Obere Dorfstrasse 275034 Suhr, AG","6.5 rooms, 140 m²«Mittelhaus in Suhr an attrak...",DescriptionReiheneinfamilienhaus (Mittelhaus) ...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,140.0,,
4,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Bezugsbereit - ruhige Lage u...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,156.0,,


### Take Informations from Column "details", and put the Informations into "rooms" and "Living space" if they are nan or 0

In [18]:
clean_df['rooms_from_details'] = (clean_df['details'].str.extract(r'(\d+) rooms')).astype(float)
clean_df['space_from_details'] = (clean_df['details'].str.extract(r'(\d+) m²')).astype(float)

clean_df['rooms'] = clean_df['rooms'].mask(clean_df['rooms'] == 0.0, clean_df['rooms_from_details'])
clean_df['rooms'] = clean_df['rooms'].fillna(clean_df['rooms_from_details'])

clean_df['Living space'] = clean_df['Living space'].fillna(clean_df['space_from_details'])

clean_df = clean_df.drop(['details', 'rooms_from_details', 'space_from_details'], axis=1)

clean_df.head()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
0,Suhr,220 m²,733 m²,,On request,"Galeggenweg 95034 Suhr, AG","6.5 rooms, 220 m²«Landhausvilla mit einzigarti...",DescriptionVilla lädt zum Träumen ein Wir ver...,https://www.immoscout24.ch//en/d/villa-buy-suh...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,220.0,,
1,Suhr,230 m²,702 m²,,On request,"Hofstattmattenweg 195034 Suhr, AG","7.5 rooms, 230 m²«Grosszügiges Anwesen mit tra...","DescriptionLicht, Glas und Holz - ein freisteh...",https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,230.0,,
2,Biberstein,131 m²,,,On request,"Gheldweg 105023 Biberstein, AG","4.5 rooms, 131 m²«Terrassenhaus mit traumhafte...",DescriptionLage Das 4.5-Zimmer-Terrassenhaus ...,https://www.immoscout24.ch//en/d/stepped-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,131.0,,
3,Suhr,140 m²,206 m²,140 m²,Immediately,"Obere Dorfstrasse 275034 Suhr, AG","6.5 rooms, 140 m²«Mittelhaus in Suhr an attrak...",DescriptionReiheneinfamilienhaus (Mittelhaus) ...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,140.0,,
4,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Bezugsbereit - ruhige Lage u...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,156.0,,


### Remove m² from Columns

In [19]:
clean_df['Living space'] = clean_df['Living space'].astype(str)
clean_df['Living space'] = clean_df['Living space'].str.replace('m²', '')
clean_df['Living space'] = clean_df['Living space'].str.replace('m2', '')
clean_df['Living space'] = clean_df['Living space'].astype(float)

clean_df['Plot area'] = clean_df['Plot area'].astype(str)
clean_df['Plot area'] = clean_df['Plot area'].str.replace('m²', '')
clean_df['Plot area'] = clean_df['Plot area'].str.replace(',', '')
clean_df['Plot area'] = clean_df['Plot area'].astype(float)

clean_df['Floor space'] = clean_df['Floor space'].astype(str)
clean_df['Floor space'] = clean_df['Floor space'].str.replace('m²', '')
clean_df['Floor space'] = clean_df['Floor space'].str.replace('m2', '')
clean_df['Floor space'] = clean_df['Floor space'].astype(float)

clean_df['Floor'] = clean_df['Floor'].astype(str)
clean_df['Floor'] = clean_df['Floor'].str.replace('Ground floor', '0')
clean_df['Floor'] = clean_df['Floor'].str.replace('GF', '0')
clean_df['Floor'] = clean_df['Floor'].str.replace('. floor', '')
clean_df['Floor'] = clean_df['Floor'].str.replace(r'(\d+)\. Basement', lambda x: str(int(x.group(1)) * -1))
clean_df['Floor'] = clean_df['Floor'].astype(float)

clean_df['rooms'] = clean_df['rooms'].astype(str)
clean_df['rooms'] = clean_df['rooms'].str.replace('rm', '')
clean_df['rooms'] = clean_df['rooms'].astype(float)

clean_df.head()

  clean_df['Floor'] = clean_df['Floor'].str.replace('. floor', '')
  clean_df['Floor'] = clean_df['Floor'].str.replace(r'(\d+)\. Basement', lambda x: str(int(x.group(1)) * -1))


Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
0,Suhr,220.0,733.0,,On request,"Galeggenweg 95034 Suhr, AG","6.5 rooms, 220 m²«Landhausvilla mit einzigarti...",DescriptionVilla lädt zum Träumen ein Wir ver...,https://www.immoscout24.ch//en/d/villa-buy-suh...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,220.0,,
1,Suhr,230.0,702.0,,On request,"Hofstattmattenweg 195034 Suhr, AG","7.5 rooms, 230 m²«Grosszügiges Anwesen mit tra...","DescriptionLicht, Glas und Holz - ein freisteh...",https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,230.0,,
2,Biberstein,131.0,,,On request,"Gheldweg 105023 Biberstein, AG","4.5 rooms, 131 m²«Terrassenhaus mit traumhafte...",DescriptionLage Das 4.5-Zimmer-Terrassenhaus ...,https://www.immoscout24.ch//en/d/stepped-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,131.0,,
3,Suhr,140.0,206.0,140.0,Immediately,"Obere Dorfstrasse 275034 Suhr, AG","6.5 rooms, 140 m²«Mittelhaus in Suhr an attrak...",DescriptionReiheneinfamilienhaus (Mittelhaus) ...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5034.0,,,,140.0,,
4,Biberstein,156.0,222.0,242.0,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Bezugsbereit - ruhige Lage u...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,...,,,,5023.0,,,,156.0,,


### How many NAN's has each column

In [20]:
(clean_df.isnull().sum() / len(clean_df)) * 100

Municipality             0.736791
Living space             5.633803
Plot area               61.385655
Floor space             79.406497
Availability            49.263209
                          ...    
Gross yield:            99.776113
Minimum floor space:    99.983717
space_cleaned           46.918505
Type:                   50.736791
Hall height:            99.983717
Length: 93, dtype: float64

In [21]:
clean_df.columns

Index(['Municipality', 'Living space', 'Plot area', 'Floor space',
       'Availability', 'location', 'description', 'detailed_description',
       'url', 'table', 'Floor', 'Gross return', 'location_parsed', 'title',
       'address', 'link', 'details_structured', 'lat', 'lon', 'index',
       'ForestDensityL', 'ForestDensityM', 'ForestDensityS', 'Latitude',
       'Locality', 'Longitude', 'NoisePollutionRailwayL',
       'NoisePollutionRailwayM', 'NoisePollutionRailwayS',
       'NoisePollutionRoadL', 'NoisePollutionRoadM', 'NoisePollutionRoadS',
       'PopulationDensityL', 'PopulationDensityM', 'PopulationDensityS',
       'RiversAndLakesL', 'RiversAndLakesM', 'RiversAndLakesS',
       'WorkplaceDensityL', 'WorkplaceDensityM', 'WorkplaceDensityS', 'Zip',
       'distanceToTrainStation', 'gde_area_agriculture_percentage',
       'gde_area_forest_percentage', 'gde_area_nonproductive_percentage',
       'gde_area_settlement_percentage', 'gde_average_house_hold',
       'gde_empty_apart

### Remove redundant columns

In [22]:
columns = ['Municipality',#unbrauchbar wegen long und lat
            'Availability',#zu wenig Daten
            'location',#long und lat
            'location_parsed',#long und lat
            'description',#unbrauchbar         
            'detailed_description',#unbrauchbar
            'url',#unbrauchbar
            'table',#unbrauchbar
            'Gross return',#zu wenig Daten
            'title',#unbrauchbar
            'address',#unbrauchbar wegen long und lat
            #'price',#gibt price_cleaned
            'link',#unbrauchbar
            'details_structured',#unbrauchbar
            'lat',#gibt latitute
            'lon',#gibt longitude
            'index',#unbrauchbar
            'Locality',#long und lat
            'plz_parsed',#gibt ZIP
            #'type',#gibt type_unified
            'Floor_unified',#gibt Floor
            'provider',#unbrauchbar
            'space',#gibt Living space
            #'price_s',#gibt price_cleaned
            'address_s',#unbrauchbar wegen long und lat
            'No. of rooms:',#gibt schon rooms
            'Number of apartments:',#unbrauchbar
            'Room height:',#zu wenig Daten
            'Last refurbishment:',#zu wenig Daten
            'Year built:',#zu wenig Daten
            'features',#unbrauchbar
            'description_detailed',#unbrauchbar
            'Number of floors:',#zu wenig Daten
            'Volume:',#unbrauchbar
            'plz',#gibt ZIP
            'Number of toilets:',#zu wenig Daten
            'Gross yield:',#zu wenig Daten
            'Minimum floor space:',#zu wenig Daten
            'space_cleaned',#gibt Living space
            'gde_politics_bdp',#zu wenig Daten
            'gde_politics_evp',#zu wenig Daten
            'gde_politics_glp',#zu wenig Daten
            'gde_politics_pda',#zu wenig Daten
            'gde_politics_rights',#zu wenig Daten
            'Type:',#gibt type_unified
            'Hall height:',#zu wenig Daten
]


clean_df = clean_df.drop(columns, axis=1)
clean_df.head()

Unnamed: 0,Living space,Plot area,Floor space,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_population,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,rooms,type_unified
0,220.0,733.0,,,0.164382,0.10003,0.063548,47.37211,8.075858,0.003811,...,9990.0,4212.0,4.281099,6.35,38.0,1015.0,4188.0,5241.0,5.0,villa
1,230.0,702.0,,,0.260855,0.170434,0.083253,47.371558,8.07311,0.002623,...,9990.0,4212.0,4.281099,6.35,38.0,1015.0,4188.0,5241.0,5.0,detached-house
2,131.0,,,,0.434114,0.357984,0.125505,47.413754,8.082318,0.0,...,1545.0,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,5.0,stepped-house
3,140.0,206.0,140.0,,0.14819,0.07661,0.0,47.373327,8.076892,0.005193,...,9990.0,4212.0,4.281099,6.35,38.0,1015.0,4188.0,5241.0,5.0,terrace-house
4,156.0,222.0,242.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,...,1545.0,686.0,2.234259,5.89,14.0,9.0,308.0,331.0,5.0,terrace-house


### First part of imputation

In [23]:
#groupby type_unified and take median
df_median = clean_df.groupby('type_unified').median()
#only columns "Living space", "Plot area", "Floor", "rooms"
df_median = df_median[['Living space', 'Plot area', 'Floor', 'rooms']]
#fillna with 0 
df_median = df_median.fillna(0)
df_median

Unnamed: 0_level_0,Living space,Plot area,Floor,rooms
type_unified,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
attic-flat,125.0,803.5,2.0,4.5
attic-room,95.0,144.0,2.0,5.0
castle,300.0,1912.0,0.0,8.0
chalet,160.0,851.0,2.0,5.5
detached-house,180.0,601.0,2.0,5.0
detached-secondary-suite,78.0,1100.0,0.0,6.0
duplex-maisonette,140.0,427.0,2.0,5.0
farmhouse,237.5,1476.0,3.0,5.5
flat,104.0,330.5,1.0,5.0
furnished-residential-property,80.0,482.0,2.0,4.0


In [24]:
#if plot area is null and living space is not null, plot area = living space 
clean_df['Plot area'] = clean_df['Plot area'].fillna(clean_df['Living space'])

#fill 'Living space', 'Plot area', 'Floor', 'rooms' with median of their type
for i in clean_df['type_unified'].unique():
    clean_df['Living space'] = clean_df['Living space'].mask((clean_df['Living space'].isnull()) & (clean_df['type_unified'] == i), df_median.loc[i, 'Living space'])
    clean_df['Plot area'] = clean_df['Plot area'].mask((clean_df['Plot area'].isnull()) & (clean_df['type_unified'] == i), df_median.loc[i, 'Plot area'])
    clean_df['Floor'] = clean_df['Floor'].mask((clean_df['Floor'].isnull()) & (clean_df['type_unified'] == i), df_median.loc[i, 'Floor'])
    clean_df['rooms'] = clean_df['rooms'].mask((clean_df['rooms'].isnull()) & (clean_df['type_unified'] == i), df_median.loc[i, 'rooms'])
clean_df.head()

KeyError: nan

### Divide categorical "type" variable into dummy variables

In [None]:
clean_df["type_unified"].unique()

array(['villa', 'detached-house', 'stepped-house', 'terrace-house',
       'flat', 'penthouse', 'attic-flat', 'stepped-apartment',
       'semi-detached-house', 'furnished-residential-property', 'studio',
       'duplex-maisonette', 'farmhouse', 'loft', 'chalet',
       'secondary-suite', 'castle', 'rustico', 'single-room',
       'detached-secondary-suite', 'attic-room', nan], dtype=object)

In [None]:
clean_df = pd.get_dummies(clean_df, columns=['type_unified'])

### How many NAN's has each column

In [None]:
(clean_df.isnull().sum() / len(clean_df)) * 100

#Floor und Floorspace????

Living space                       5.633803
Plot area                         61.385655
Floor space                       79.406497
Floor                             56.761378
ForestDensityL                     0.000000
                                    ...    
type_unified_stepped-apartment     0.000000
type_unified_stepped-house         0.000000
type_unified_studio                0.000000
type_unified_terrace-house         0.000000
type_unified_villa                 0.000000
Length: 70, dtype: float64

In [None]:
#Florspace Temporär rausgenommen
clean_df = clean_df.drop(["Floor space"], axis = 1)
clean_df['Plot area'] = clean_df['Plot area'].fillna(clean_df['Living space'])
clean_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
0,220.0,733.0,,0.164382,0.10003,0.063548,47.37211,8.075858,0.003811,0.0,...,0,0,0,0,0,0,0,0,0,1
1,230.0,702.0,,0.260855,0.170434,0.083253,47.371558,8.07311,0.002623,0.0,...,0,0,0,0,0,0,0,0,0,0
2,131.0,131.0,,0.434114,0.357984,0.125505,47.413754,8.082318,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
3,140.0,206.0,,0.14819,0.07661,0.0,47.373327,8.076892,0.005193,0.0,...,0,0,0,0,0,0,0,0,1,0
4,156.0,222.0,,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [None]:
clean_df.columns

Index(['Living space', 'Plot area', 'Floor', 'ForestDensityL',
       'ForestDensityM', 'ForestDensityS', 'Latitude', 'Longitude',
       'NoisePollutionRailwayL', 'NoisePollutionRailwayM',
       'NoisePollutionRailwayS', 'NoisePollutionRoadL', 'NoisePollutionRoadM',
       'NoisePollutionRoadS', 'PopulationDensityL', 'PopulationDensityM',
       'PopulationDensityS', 'RiversAndLakesL', 'RiversAndLakesM',
       'RiversAndLakesS', 'WorkplaceDensityL', 'WorkplaceDensityM',
       'WorkplaceDensityS', 'Zip', 'distanceToTrainStation',
       'gde_area_agriculture_percentage', 'gde_area_forest_percentage',
       'gde_area_nonproductive_percentage', 'gde_area_settlement_percentage',
       'gde_average_house_hold', 'gde_empty_apartments',
       'gde_foreigners_percentage', 'gde_new_homes_per_1000',
       'gde_politics_cvp', 'gde_politics_fdp', 'gde_politics_gps',
       'gde_politics_sp', 'gde_politics_svp', 'gde_pop_per_km2',
       'gde_population', 'gde_private_apartments', 'gde_soci

### Second part of imputation

In [None]:
nan_percentage = (clean_df.isnull().sum() / len(clean_df)) * 100

cols = nan_percentage[(nan_percentage < 20) & (nan_percentage > 0)].index
cols

Index(['Living space', 'Plot area', 'gde_politics_cvp', 'gde_politics_fdp',
       'gde_politics_gps', 'gde_politics_sp', 'gde_politics_svp', 'rooms'],
      dtype='object')

In [None]:
source_path = Path('./data/model/01_clean_data.pkl')
imputer = load(source_path)['imputer']

In [None]:
clean_df = pd.DataFrame(
    imputer.transform(clean_df),
    columns=clean_df.columns)
clean_df.head()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
0,220.0,733.0,1.0,0.164382,0.10003,0.063548,47.37211,8.075858,0.003811,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,230.0,702.0,1.0,0.260855,0.170434,0.083253,47.371558,8.07311,0.002623,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,131.0,131.0,1.0,0.434114,0.357984,0.125505,47.413754,8.082318,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,140.0,206.0,1.0,0.14819,0.07661,0.0,47.373327,8.076892,0.005193,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,156.0,222.0,1.0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
target_path = Path('./data/kaggle/01_clean_data.pkl')
dump({
  'dataset': clean_df,
  'imputer': imputer
}, target_path)

['data/kaggle/01_clean_data.pkl']