# Data Wrangling

### Imports

In [1]:
import pandas as pd
import __main__ as main

from helpers.paths import Paths
from joblib import load, dump
from helpers.is_interactive import is_interactive

### Run dependency notebooks

In [2]:
# Run the preceding notebook of the pipeline, but only when the project helper
# `is_interactive` reports that this notebook itself is the entry point.
# NOTE(review): presumably this guarantees that the artefacts loaded further
# down (e.g. the fitted imputers) exist — confirm against the target notebook.
if is_interactive(main):
    # NOTE(review): `-p` follows the file name, so it is handed to the target
    # notebook rather than to %run itself — confirm how the target reads it.
    %run 01_0_data_wrangling.ipynb -p

Running previous notebooks...


## ZIP Import

In [3]:
# Read the zipped Kaggle CSV located at Paths.KAGGLE_SOURCE_DATA into a DataFrame.
# `source_df` stays untouched; every cleaning step below works on the copy `clean_df`.
source_df = pd.read_csv(
    Paths.KAGGLE_SOURCE_DATA,
    sep=',',
    header=0,
    quotechar='"',
    compression='zip',
    low_memory=False,
)
clean_df = source_df.copy()

### Delete Unnamed Columns

In [4]:
# Persist the 'Unnamed: 0.1' column before it is removed together with the
# other "Unnamed" columns.
# NOTE(review): presumably these are the ids of the rows to predict — confirm.
dump(clean_df['Unnamed: 0.1'], Paths.KAGGLE_IDS_TO_PREDICT_DATA)

# Keep only the columns whose name does not start with "Unnamed".
is_unnamed = clean_df.columns.str.contains('^Unnamed')
clean_df = clean_df.loc[:, ~is_unnamed]
clean_df.index.name = None
clean_df.tail()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,Floor space:,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
24551,Wildhaus,,,,,"Hof 2, 9658 Wildhaus",1.5 Zimmerwohnung an zentraler Lage mit schöne...,"Description\n""Schöne Kleinwohnung mit Cheminée...",https://www.homegate.ch/buy/3002217323,,...,,,,9658.0,,,,36.0,Apartment,
24552,Wildhaus,,,,,9658 Wildhaus,OBJEKT IST BEREITS RESERVIERT!!Sie suchen eine...,"Description\n""BEREITS RESERVIERT!""\nOBJEKT IST...",https://www.homegate.ch/buy/3002219541,,...,,6.0,,9658.0,,,,,Apartment,
24553,Wildhaus,,,,,9658 Wildhaus,Mitten im Ober - Toggenburg ist ein 4 Zimmer C...,"Description\n""Chalet mit grossem Grundstück""\n...",https://www.homegate.ch/buy/3002218959,,...,,1.0,,9658.0,,,,800.0,Single house,
24554,Wildhaus,,,,,9658 Wildhaus,Zu verkaufen ist hier in Wildhaus eine schön u...,"Description\n""5½ Zimmer Ferienwohnung mit Gara...",https://www.homegate.ch/buy/3002218961,,...,,4.0,,9658.0,,,,,Apartment,
24555,Wildhaus,,,,,"Steinrütistrasse 28, 9658 Wildhaus",1-Zi-Whg. (EG) / 4-Zi-Whg. (OG) / 2 1/2-Zi-Whg...,"Description\n""Haus mit 2 Einliegerwohnungen in...",https://www.homegate.ch/buy/3001967293,,...,,,1463 m3,9658.0,,,,,Single house,


### Merge columns that contain the same information

In [5]:
# Several columns carry the same information under different names (scraper
# variants and what look like German/French/Italian labels). For every target
# column the listed source columns are used, in the given priority order, to
# fill the values that are still missing in the target.
# The same mapping drives the drop afterwards, so the fill list and the drop
# list cannot drift apart.
MERGE_SOURCES = {
    'Municipality': [
        'detail_responsive#municipality', 'Gemeinde', 'Commune', 'Comune',
        'Municipality_merged',
    ],
    'Living space': [
        'detail_responsive#surface_living', 'Wohnfläche', 'Surface habitable',
        'Superficie abitabile', 'Living_space_merged', 'Living_area_unified',
        'Space extracted', 'Surface living:',
    ],
    'Plot area': [
        'detail_responsive#surface_property', 'Grundstücksfläche', 'Surface du terrain',
        'Superficie del terreno', 'Plot_area_merged', 'Plot_area_unified', 'Land area:',
    ],
    'Floor space': [
        'detail_responsive#surface_usable', 'Nutzfläche', 'Surface utile',
        'Superficie utile', 'Floor_space_merged', 'Floor space:',
    ],
    'Floor': [
        'detail_responsive#floor', 'Stockwerk', 'Étage', 'Piano', 'Floor_merged',
    ],
    'Availability': [
        'detail_responsive#available_from', 'Verfügbarkeit', 'Disponibilité',
        'Disponibilità', 'Availability_merged',
    ],
}

for target_col, source_cols in MERGE_SOURCES.items():
    for source_col in source_cols:
        # Earlier sources win: a later source only fills what is still NaN.
        clean_df[target_col] = clean_df[target_col].fillna(clean_df[source_col])

# The source columns are now redundant.
merged_source_cols = [name for names in MERGE_SOURCES.values() for name in names]
clean_df = clean_df.drop(columns=merged_source_cols)

clean_df.tail()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
24551,Wildhaus,36.0,,,,"Hof 2, 9658 Wildhaus",1.5 Zimmerwohnung an zentraler Lage mit schöne...,"Description\n""Schöne Kleinwohnung mit Cheminée...",https://www.homegate.ch/buy/3002217323,,...,"Description\n""Schöne Kleinwohnung mit Cheminée...",,,9658.0,,,,36.0,Apartment,
24552,Wildhaus,40.0,,,,9658 Wildhaus,OBJEKT IST BEREITS RESERVIERT!!Sie suchen eine...,"Description\n""BEREITS RESERVIERT!""\nOBJEKT IST...",https://www.homegate.ch/buy/3002219541,,...,"Description\n""BEREITS RESERVIERT!""\nOBJEKT IST...",6.0,,9658.0,,,,,Apartment,
24553,Wildhaus,86.0,2803.0,,,9658 Wildhaus,Mitten im Ober - Toggenburg ist ein 4 Zimmer C...,"Description\n""Chalet mit grossem Grundstück""\n...",https://www.homegate.ch/buy/3002218959,,...,"Description\n""Chalet mit grossem Grundstück""\n...",1.0,,9658.0,,,,800.0,Single house,
24554,Wildhaus,127.0,,,,9658 Wildhaus,Zu verkaufen ist hier in Wildhaus eine schön u...,"Description\n""5½ Zimmer Ferienwohnung mit Gara...",https://www.homegate.ch/buy/3002218961,,...,"Description\n""5½ Zimmer Ferienwohnung mit Gara...",4.0,,9658.0,,,,,Apartment,
24555,Wildhaus,,610.0,,,"Steinrütistrasse 28, 9658 Wildhaus",1-Zi-Whg. (EG) / 4-Zi-Whg. (OG) / 2 1/2-Zi-Whg...,"Description\n""Haus mit 2 Einliegerwohnungen in...",https://www.homegate.ch/buy/3001967293,,...,"Description\n""Haus mit 2 Einliegerwohnungen in...",,1463 m3,9658.0,,,,,Single house,


### Take information from the "details" column and put it into "rooms" (if NaN or 0) and "Living space" (if NaN)

In [6]:
# Pull the room count and the living area out of the free-text `details` column
# and use them where the dedicated columns have no usable value.
#
# The number patterns accept an optional decimal part. The previous pattern
# `(\d+) rooms` captured only the digits directly in front of " rooms", so a
# text such as "3.5 rooms" was read as 5 (likewise for "m²").
# NOTE(review): the imputers applied later were fitted elsewhere; if that
# pipeline extracts rooms/space from `details` as well, it should use the same
# patterns — confirm.
ROOMS_PATTERN = r'(\d+(?:\.\d+)?) rooms'
SPACE_PATTERN = r'(\d+(?:\.\d+)?) m²'

# expand=False returns a Series (one capture group) instead of a one-column frame.
clean_df['rooms_from_details'] = clean_df['details'].str.extract(ROOMS_PATTERN, expand=False).astype(float)
clean_df['space_from_details'] = clean_df['details'].str.extract(SPACE_PATTERN, expand=False).astype(float)

# rooms: a value of 0 is replaced first, then missing values are filled.
clean_df['rooms'] = clean_df['rooms'].mask(clean_df['rooms'] == 0.0, clean_df['rooms_from_details'])
clean_df['rooms'] = clean_df['rooms'].fillna(clean_df['rooms_from_details'])

# Living space: only missing values are filled.
clean_df['Living space'] = clean_df['Living space'].fillna(clean_df['space_from_details'])

# The helper columns and the raw text are no longer needed.
clean_df = clean_df.drop(['details', 'rooms_from_details', 'space_from_details'], axis=1)

clean_df.tail()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
24551,Wildhaus,36.0,,,,"Hof 2, 9658 Wildhaus",1.5 Zimmerwohnung an zentraler Lage mit schöne...,"Description\n""Schöne Kleinwohnung mit Cheminée...",https://www.homegate.ch/buy/3002217323,,...,"Description\n""Schöne Kleinwohnung mit Cheminée...",,,9658.0,,,,36.0,Apartment,
24552,Wildhaus,40.0,,,,9658 Wildhaus,OBJEKT IST BEREITS RESERVIERT!!Sie suchen eine...,"Description\n""BEREITS RESERVIERT!""\nOBJEKT IST...",https://www.homegate.ch/buy/3002219541,,...,"Description\n""BEREITS RESERVIERT!""\nOBJEKT IST...",6.0,,9658.0,,,,,Apartment,
24553,Wildhaus,86.0,2803.0,,,9658 Wildhaus,Mitten im Ober - Toggenburg ist ein 4 Zimmer C...,"Description\n""Chalet mit grossem Grundstück""\n...",https://www.homegate.ch/buy/3002218959,,...,"Description\n""Chalet mit grossem Grundstück""\n...",1.0,,9658.0,,,,800.0,Single house,
24554,Wildhaus,127.0,,,,9658 Wildhaus,Zu verkaufen ist hier in Wildhaus eine schön u...,"Description\n""5½ Zimmer Ferienwohnung mit Gara...",https://www.homegate.ch/buy/3002218961,,...,"Description\n""5½ Zimmer Ferienwohnung mit Gara...",4.0,,9658.0,,,,,Apartment,
24555,Wildhaus,,610.0,,,"Steinrütistrasse 28, 9658 Wildhaus",1-Zi-Whg. (EG) / 4-Zi-Whg. (OG) / 2 1/2-Zi-Whg...,"Description\n""Haus mit 2 Einliegerwohnungen in...",https://www.homegate.ch/buy/3001967293,,...,"Description\n""Haus mit 2 Einliegerwohnungen in...",,1463 m3,9658.0,,,,,Single house,


### Remove units (m², rm, floor labels) from columns and convert them to numbers

In [7]:
def _drop_tokens(column, tokens):
    """Return `column` as strings with every literal token in `tokens` removed.

    Missing values become the text 'nan'; a following `astype(float)` turns
    that text back into NaN.
    """
    text = column.astype(str)
    for token in tokens:
        # The tokens are plain literals. regex=False states that explicitly and
        # avoids depending on the default of `regex`, which differs between
        # pandas versions.
        text = text.str.replace(token, '', regex=False)
    return text


# Area columns: strip the unit (and, for the plot area, the ',' separator).
clean_df['Living space'] = _drop_tokens(clean_df['Living space'], ['m²', 'm2']).astype(float)
clean_df['Plot area'] = _drop_tokens(clean_df['Plot area'], ['m²', ',']).astype(float)
clean_df['Floor space'] = _drop_tokens(clean_df['Floor space'], ['m²', 'm2']).astype(float)

# Floor: map the textual labels to numbers.
floor_text = clean_df['Floor'].astype(str)
floor_text = floor_text.str.replace('Ground floor', '0', regex=False)
floor_text = floor_text.str.replace('GF', '0', regex=False)
# "3. floor" -> "3"
floor_text = floor_text.str.replace('. floor', '', regex=False)
# "2. Basement" -> "-2": basement levels are counted as negative floors.
floor_text = floor_text.str.replace(r'(\d+)\. Basement', lambda match: str(-int(match.group(1))), regex=True)
clean_df['Floor'] = floor_text.astype(float)

# rooms: strip the "rm" suffix.
clean_df['rooms'] = _drop_tokens(clean_df['rooms'], ['rm']).astype(float)

clean_df.tail()

Unnamed: 0,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,url,table,...,description_detailed,Number of floors:,Volume:,plz,Number of toilets:,Gross yield:,Minimum floor space:,space_cleaned,Type:,Hall height:
24551,Wildhaus,36.0,,,,"Hof 2, 9658 Wildhaus",1.5 Zimmerwohnung an zentraler Lage mit schöne...,"Description\n""Schöne Kleinwohnung mit Cheminée...",https://www.homegate.ch/buy/3002217323,,...,"Description\n""Schöne Kleinwohnung mit Cheminée...",,,9658.0,,,,36.0,Apartment,
24552,Wildhaus,40.0,,,,9658 Wildhaus,OBJEKT IST BEREITS RESERVIERT!!Sie suchen eine...,"Description\n""BEREITS RESERVIERT!""\nOBJEKT IST...",https://www.homegate.ch/buy/3002219541,,...,"Description\n""BEREITS RESERVIERT!""\nOBJEKT IST...",6.0,,9658.0,,,,,Apartment,
24553,Wildhaus,86.0,2803.0,,,9658 Wildhaus,Mitten im Ober - Toggenburg ist ein 4 Zimmer C...,"Description\n""Chalet mit grossem Grundstück""\n...",https://www.homegate.ch/buy/3002218959,,...,"Description\n""Chalet mit grossem Grundstück""\n...",1.0,,9658.0,,,,800.0,Single house,
24554,Wildhaus,127.0,,,,9658 Wildhaus,Zu verkaufen ist hier in Wildhaus eine schön u...,"Description\n""5½ Zimmer Ferienwohnung mit Gara...",https://www.homegate.ch/buy/3002218961,,...,"Description\n""5½ Zimmer Ferienwohnung mit Gara...",4.0,,9658.0,,,,,Apartment,
24555,Wildhaus,,610.0,,,"Steinrütistrasse 28, 9658 Wildhaus",1-Zi-Whg. (EG) / 4-Zi-Whg. (OG) / 2 1/2-Zi-Whg...,"Description\n""Haus mit 2 Einliegerwohnungen in...",https://www.homegate.ch/buy/3001967293,,...,"Description\n""Haus mit 2 Einliegerwohnungen in...",,1463 m3,9658.0,,,,,Single house,


### Percentage of NaN values per column

In [8]:
# Share of missing values per column, in percent.
clean_df.isnull().sum().div(len(clean_df)).mul(100)

Municipality             0.737091
Living space             5.623880
Plot area               61.378075
Floor space             79.414400
Availability            49.250692
                          ...    
Gross yield:            99.776022
Minimum floor space:    99.983711
space_cleaned           46.905033
Type:                   50.749308
Hall height:            99.983711
Length: 93, dtype: float64

In [9]:
# List the column names that are left after the merge and unit clean-up.
clean_df.columns

Index(['Municipality', 'Living space', 'Plot area', 'Floor space',
       'Availability', 'location', 'description', 'detailed_description',
       'url', 'table', 'Floor', 'Gross return', 'location_parsed', 'title',
       'address', 'link', 'details_structured', 'lat', 'lon', 'index',
       'ForestDensityL', 'ForestDensityM', 'ForestDensityS', 'Latitude',
       'Locality', 'Longitude', 'NoisePollutionRailwayL',
       'NoisePollutionRailwayM', 'NoisePollutionRailwayS',
       'NoisePollutionRoadL', 'NoisePollutionRoadM', 'NoisePollutionRoadS',
       'PopulationDensityL', 'PopulationDensityM', 'PopulationDensityS',
       'RiversAndLakesL', 'RiversAndLakesM', 'RiversAndLakesS',
       'WorkplaceDensityL', 'WorkplaceDensityM', 'WorkplaceDensityS', 'Zip',
       'distanceToTrainStation', 'gde_area_agriculture_percentage',
       'gde_area_forest_percentage', 'gde_area_nonproductive_percentage',
       'gde_area_settlement_percentage', 'gde_average_house_hold',
       'gde_empty_apart

### Remove redundant columns

In [10]:
# Columns that are removed from the frame, each with the reason for removal.
# Entries that are commented out are kept for reference only.
columns = [
    'Municipality',           # unusable: longitude and latitude are available
    'Availability',           # too little data
    'location',               # covered by longitude and latitude
    'location_parsed',        # covered by longitude and latitude
    'description',            # unusable
    'detailed_description',   # unusable
    'url',                    # unusable
    'table',                  # unusable
    'Gross return',           # too little data
    'title',                  # unusable
    'address',                # unusable: longitude and latitude are available
    # 'price',                # price_cleaned exists
    'link',                   # unusable
    'details_structured',     # unusable
    'lat',                    # Latitude exists
    'lon',                    # Longitude exists
    'index',                  # unusable
    'Locality',               # covered by longitude and latitude
    'plz_parsed',             # Zip exists
    # 'type',                 # type_unified exists
    'Floor_unified',          # Floor exists
    'provider',               # unusable
    'space',                  # Living space exists
    # 'price_s',              # price_cleaned exists
    'address_s',              # unusable: longitude and latitude are available
    'No. of rooms:',          # rooms exists already
    'Number of apartments:',  # unusable
    'Room height:',           # too little data
    'Last refurbishment:',    # too little data
    'Year built:',            # too little data
    'features',               # unusable
    'description_detailed',   # unusable
    'Number of floors:',      # too little data
    'Volume:',                # unusable
    'plz',                    # Zip exists
    'Number of toilets:',     # too little data
    'Gross yield:',           # too little data
    'Minimum floor space:',   # too little data
    'space_cleaned',          # Living space exists
    'gde_politics_bdp',       # too little data
    'gde_politics_evp',       # too little data
    'gde_politics_glp',       # too little data
    'gde_politics_pda',       # too little data
    'gde_politics_rights',    # too little data
    'Type:',                  # type_unified exists
    'Hall height:',           # too little data
]

clean_df = clean_df.drop(columns=columns)
clean_df.tail()

Unnamed: 0,Living space,Plot area,Floor space,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,...,gde_population,gde_private_apartments,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,rooms,type_unified
24551,36.0,,,2.0,0.186881,0.018827,0.0,47.204125,9.353275,0.0,...,2667.0,1162.0,1.087364,6.58,259.0,270.0,969.0,1498.0,1.5,flat
24552,40.0,,,3.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,...,2667.0,1162.0,1.087364,6.58,259.0,270.0,969.0,1498.0,1.5,flat
24553,86.0,2803.0,,,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,...,2667.0,1162.0,1.087364,6.58,259.0,270.0,969.0,1498.0,4.0,detached-house
24554,127.0,,,2.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,...,2667.0,1162.0,1.087364,6.58,259.0,270.0,969.0,1498.0,5.5,flat
24555,,610.0,,,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,...,2667.0,1162.0,1.087364,6.58,259.0,270.0,969.0,1498.0,,detached-house


### Divide categorical "type" variable into dummy variables

In [11]:
# Show the distinct values of `type_unified` (including NaN) before encoding.
clean_df["type_unified"].unique()

array(['villa', 'detached-house', 'stepped-house', 'terrace-house',
       'flat', 'penthouse', 'attic-flat', 'stepped-apartment',
       'semi-detached-house', 'furnished-residential-property', 'studio',
       'duplex-maisonette', 'farmhouse', 'loft', 'chalet',
       'secondary-suite', 'castle', 'rustico', 'single-room',
       'detached-secondary-suite', 'attic-room', nan], dtype=object)

In [12]:
# Collect the index labels of all rows that have no `type_unified` value and
# persist them at Paths.KAGGLE_IDS_TYPE_NONE_DATA.
type_is_missing = clean_df['type_unified'].isnull()
indexNames = clean_df.loc[type_is_missing].index.tolist()
target_path = Paths.KAGGLE_IDS_TYPE_NONE_DATA
dump(indexNames, target_path)

['data/kaggle/01_1_no_type_ids_id.dump']

In [13]:
# Replace `type_unified` by one indicator column per value ("type_unified_<value>").
# NOTE(review): only values present in this data set produce a column; the
# imputers applied later presumably expect the columns seen at fit time — confirm.
clean_df = pd.get_dummies(clean_df, columns=['type_unified'])

### Percentage of NaN values per column

In [14]:
# Share of missing values per column, in percent.
# Open question: how should Floor and Floor space be handled?
clean_df.isnull().sum().div(len(clean_df)).mul(100)

Living space                       5.623880
Plot area                         61.378075
Floor space                       79.414400
Floor                             56.755986
ForestDensityL                     0.000000
                                    ...    
type_unified_stepped-apartment     0.000000
type_unified_stepped-house         0.000000
type_unified_studio                0.000000
type_unified_terrace-house         0.000000
type_unified_villa                 0.000000
Length: 70, dtype: float64

In [15]:
# Floor space is taken out temporarily.
clean_df = clean_df.drop(columns=["Floor space"])

# Rows without a plot area fall back to the living space of the same row.
living_space = clean_df['Living space']
clean_df['Plot area'] = clean_df['Plot area'].fillna(living_space)
clean_df.tail()

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
24551,36.0,36.0,2.0,0.186881,0.018827,0.0,47.204125,9.353275,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
24552,40.0,40.0,3.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
24553,86.0,2803.0,,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
24554,127.0,127.0,2.0,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
24555,,610.0,,0.141426,0.003706,0.011718,47.203747,9.350731,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# List the column names after the type encoding and the removal of Floor space.
clean_df.columns

Index(['Living space', 'Plot area', 'Floor', 'ForestDensityL',
       'ForestDensityM', 'ForestDensityS', 'Latitude', 'Longitude',
       'NoisePollutionRailwayL', 'NoisePollutionRailwayM',
       'NoisePollutionRailwayS', 'NoisePollutionRoadL', 'NoisePollutionRoadM',
       'NoisePollutionRoadS', 'PopulationDensityL', 'PopulationDensityM',
       'PopulationDensityS', 'RiversAndLakesL', 'RiversAndLakesM',
       'RiversAndLakesS', 'WorkplaceDensityL', 'WorkplaceDensityM',
       'WorkplaceDensityS', 'Zip', 'distanceToTrainStation',
       'gde_area_agriculture_percentage', 'gde_area_forest_percentage',
       'gde_area_nonproductive_percentage', 'gde_area_settlement_percentage',
       'gde_average_house_hold', 'gde_empty_apartments',
       'gde_foreigners_percentage', 'gde_new_homes_per_1000',
       'gde_politics_cvp', 'gde_politics_fdp', 'gde_politics_gps',
       'gde_politics_sp', 'gde_politics_svp', 'gde_pop_per_km2',
       'gde_population', 'gde_private_apartments', 'gde_soci

### Imputation

In [17]:
# Share of missing values per column, in percent.
nan_percentage = clean_df.isnull().sum().div(len(clean_df)).mul(100)

# Names of the columns that still contain at least one missing value.
has_missing = nan_percentage > 0
cols = nan_percentage.loc[has_missing].index
cols

Index(['Living space', 'Plot area', 'Floor', 'gde_politics_cvp',
       'gde_politics_fdp', 'gde_politics_gps', 'gde_politics_sp',
       'gde_politics_svp', 'rooms'],
      dtype='object')

In [18]:
# Indicator columns of the property types whose missing floor is set to 0.
TYPES_WITH_NO_FLOOR = [
    'type_unified_villa',
    'type_unified_farmhouse',
    'type_unified_castle',
    'type_unified_chalet',
    'type_unified_detached-house',
    'type_unified_semi-detached-house',
]

for type_col in TYPES_WITH_NO_FLOOR:
    # Only rows of this type without a floor value are touched; known floors stay.
    floor_unknown = clean_df[type_col].eq(1.) & clean_df['Floor'].isna()
    clean_df.loc[floor_unknown, 'Floor'] = 0.

# Show the rows of the third listed type as a spot check.
clean_df[clean_df[TYPES_WITH_NO_FLOOR[2]] == 1.]

Unnamed: 0,Living space,Plot area,Floor,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,...,type_unified_penthouse,type_unified_rustico,type_unified_secondary-suite,type_unified_semi-detached-house,type_unified_single-room,type_unified_stepped-apartment,type_unified_stepped-house,type_unified_studio,type_unified_terrace-house,type_unified_villa
4113,450.0,2850.0,0.0,0.062448,0.00447,0.0,46.755407,7.149263,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4538,900.0,7623.0,0.0,0.039354,0.034158,0.0,46.510841,6.849226,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
12239,275.0,302.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
12251,503.0,2214.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
12257,228.0,1912.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
12285,300.0,302.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
12288,228.0,1912.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
13443,228.0,1912.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
13444,228.0,1912.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
13447,275.0,302.0,0.0,0.000801,0.0,0.0,46.593945,6.565308,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Load the bundle stored at Paths.REGRESSOR_DATA_WRANGLING_DATA and take the
# per-type imputers and the base imputer out of it.
# joblib files are pickle-based: only load files produced by this project.
source_data = load(Paths.REGRESSOR_DATA_WRANGLING_DATA)
base_imputer, imputers = source_data['base_imputer'], source_data['imputers']

In [20]:
# Fill the remaining missing values with the loaded imputers:
# first one imputer per indicator column (the keys of `imputers` are column
# names of `clean_df`), applied to the rows where that indicator is 1, then
# the base imputer on the whole frame.
columns = clean_df.columns

for col in imputers.keys():
    rows_of_type = clean_df[col] == 1.
    if not rows_of_type.any():
        # No row of this type in the data: there is nothing to impute, and
        # scikit-learn-style transformers reject an input with zero rows.
        # NOTE(review): assumes the imputers follow that API — confirm.
        continue
    clean_df.loc[rows_of_type, columns] = imputers[col].transform(clean_df.loc[rows_of_type])

# The base imputer handles everything the per-type imputers did not cover.
clean_df[columns] = base_imputer.transform(clean_df)

In [21]:
# Each of these features exists in three variants with the suffixes S, M and L
# (NOTE(review): presumably small/medium/large radius — confirm). The variants
# are replaced by their row-wise mean, stored under the name without suffix.
SML_FEATURES = [
    "NoisePollutionRailway",
    "NoisePollutionRoad",
    "PopulationDensity",
    "RiversAndLakes",
    "WorkplaceDensity",
    "ForestDensity",
]

for feature in SML_FEATURES:
    variant_cols = [feature + suffix for suffix in ("S", "M", "L")]
    clean_df[feature] = clean_df[variant_cols].mean(axis=1)
    clean_df = clean_df.drop(columns=variant_cols)

In [22]:
# Persist the cleaned frame together with the imputers that were applied to it
# in a single file at Paths.KAGGLE_DATA_WRANGLING_DATA.
dump(
    {'dataset': clean_df, 'base_imputer': base_imputer, 'imputers': imputers},
    Paths.KAGGLE_DATA_WRANGLING_DATA,
)

['data/kaggle/01_0_data_wrangling.dump']