# Data Wrangling

## Imports

In [1]:
import pandas as pd
from pathlib import Path

## CSV Import

In [2]:
# Load the raw export once; `source_df` stays untouched and all cleaning is
# done on the independent copy `clean_df`.
source_path = Path('data/original/immoscout.csv')
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning pandas raises for them).
source_df = pd.read_csv(source_path, low_memory=False)
clean_df = source_df.copy()

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Preview the first 30 raw rows to get a feel for the columns.
source_df.head(n=30)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Municipality,Living space,Plot area,Floor space,Availability,location,description,detailed_description,...,gde_social_help_quota,gde_tax,gde_workers_sector1,gde_workers_sector2,gde_workers_sector3,gde_workers_total,price_cleaned,type,Space extracted,rooms
0,0,0,Biberstein,100 m²,,,On request,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1150000.0,penthouse,100.0,5.0
1,1,1,Biberstein,156 m²,222 m²,242 m²,On request,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1420000.0,terrace-house,156.0,5.0
2,2,2,,,,,,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",...,3.54901,6.05,37.0,3092.0,30364.0,33493.0,720000.0,penthouse,93.0,5.0
3,3,3,Biberstein,154 m²,370 m²,257 m²,On request,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,...,2.234259,5.89,14.0,9.0,308.0,331.0,1430000.0,detached-house,154.0,5.0
4,4,4,Küttigen,142 m²,,,On request,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,...,1.708126,6.3,65.0,349.0,941.0,1355.0,995000.0,flat,142.0,5.0
5,5,5,Erlinsbach (AG),190 m²,1063 m²,220 m²,On request,"Buchhalde 365018 Erlinsbach, AG","5.5 rooms, 190 m²«Modernes, grosszügiges Maiso...",DescriptionDer Blick in die Weite vermittelt R...,...,2.54216,6.16,64.0,73.0,829.0,966.0,2160000.0,detached-house,190.0,5.0
6,6,6,Biberstein,124 m²,200 m²,,Immediately,"5023 Biberstein, AG","4.5 rooms, 124 m²«Kompakt, doch geräumiges 4½ ...",DescriptionZum Objekt:Kompakt und doch sehr ge...,...,2.234259,5.89,14.0,9.0,308.0,331.0,550000.0,terrace-house,124.0,5.0
7,7,7,Aarau,,,,On request,"5004 Aarau, AG",4.5 rooms«Preishit! Grossräumige Wohnung mitte...,DescriptionNaturnah und doch am Zentrum diese ...,...,3.54901,6.05,37.0,3092.0,30364.0,33493.0,590000.0,flat,,5.0
8,8,8,Aarau,75 m²,,,On request,"Siebenmatten 495032 Aarau Rohr, AG","3.5 rooms, 75 m²«Gepflegte 3.5 Zimmer Dachwohn...",DescriptionDie Überbauung Siebenmatten in Aara...,...,3.54901,6.05,37.0,3092.0,30364.0,33493.0,547000.0,flat,75.0,5.0
9,9,9,Erlinsbach (AG),110 m²,2116 m²,,On request,"5018 Erlinsbach, AG","4.5 rooms, 110 m²«Renovierte 4.5 Zimmer Terras...","DescriptionTreten Sie ein, in Ihr neues, liebe...",...,2.54216,6.16,64.0,73.0,829.0,966.0,1125000.0,stepped-house,110.0,5.0


In [4]:
# Number of non-missing values per column of the raw data.
source_df.notna().sum()

Unnamed: 0           13378
Unnamed: 0.1         13378
Municipality         12446
Living space         11634
Plot area             4696
                     ...  
gde_workers_total    13378
price_cleaned        12362
type                 13378
Space extracted      12308
rooms                12799
Length: 108, dtype: int64

### Helpers

In [5]:
def delete_clean_df_columns(columns):
    """Remove the given columns from the module-level ``clean_df``.

    Labels that are not present in ``clean_df`` are skipped silently, so the
    call is safe to repeat.

    Parameters
    ----------
    columns : iterable
        Column labels to remove.

    Returns
    -------
    None
        The global ``clean_df`` is rebound to the reduced frame.
    """
    global clean_df
    # One drop call instead of one full-frame copy per column;
    # errors="ignore" keeps the original "skip labels that are missing" rule.
    clean_df = clean_df.drop(columns=list(columns), errors="ignore")

### Delete Unnamed Columns

In [6]:
# Flag every column whose label begins with "Unnamed" and keep the rest.
is_unnamed = clean_df.columns.str.contains('^Unnamed')
clean_df = clean_df.loc[:, ~is_unnamed]

### Delete Mostly Empty Columns

In [7]:
# Columns treated as mostly empty and therefore dropped. The labels look like
# `detail_responsive#*` fields plus German, French and Italian variants of the
# listing attributes.
mostly_empty_columns = [
    "detail_responsive#municipality", "detail_responsive#surface_living",
    "detail_responsive#floor", "detail_responsive#available_from",
    "Gemeinde", "Wohnfläche", "Stockwerk",
    "Nutzfläche", "Verfügbarkeit", "Grundstücksfläche",
    "detail_responsive#surface_property", "detail_responsive#surface_usable",
    "Commune", "Surface habitable", "Surface du terrain",
    "Surface utile", "Disponibilité", "Étage",
    "Comune", "Superficie abitabile", "Disponibilità",
    "Gross return",
    "Piano", "Superficie del terreno", "Superficie utile",
]

delete_clean_df_columns(mostly_empty_columns)

### Extract information from the "details" column

In [8]:
# Pull the room count and the living space out of the free-text "details"
# column (presumably formatted like "3.5 rooms, 100 m²" -- TODO confirm).
#
# The number pattern accepts an optional decimal part: a bare r'(\d+)' would
# capture only the digits after the dot, turning "3.5 rooms" into 5.
# expand=False returns a Series for the single capture group, which is what a
# single-column assignment expects.
clean_df["rooms_from_details"] = (
    clean_df["details"]
    .str.extract(r'(\d+(?:\.\d+)?) rooms', expand=False)
    .astype(float)
)
clean_df["space_from_details"] = (
    clean_df["details"]
    .str.extract(r'(\d+(?:\.\d+)?) m²', expand=False)
    .astype(float)
)

### Fill "rooms" and "Space extracted" from "details" where they are NaN or 0.0

In [9]:
# For both target columns, a value of NaN or 0.0 counts as "missing" and is
# replaced by the number parsed from "details". Previously the 0.0 case was
# only handled for "rooms", not for "Space extracted".
#
# NOTE(review): the raw preview shows rooms == 5.0 for listings described as
# "3.5 rooms" / "4.5 rooms", so the upstream "rooms" values may themselves be
# truncated -- confirm whether the details-derived value should take priority.
for target, fallback in (
    ("rooms", "rooms_from_details"),
    ("Space extracted", "space_from_details"),
):
    needs_fill = clean_df[target].isna() | (clean_df[target] == 0.0)
    clean_df[target] = clean_df[target].mask(needs_fill, clean_df[fallback])

### Move m2 to column header

In [10]:
# Columns whose values end in a three-character unit suffix; the unit moves
# into the column name ("<name>_m2") and is cut off the values.
m2_columns = ["Floor_space_merged", "Plot_area_merged"]

for source_name in m2_columns:
    # Values stay strings here; only the last three characters are removed.
    clean_df[f"{source_name}_m2"] = clean_df[source_name].str.slice(stop=-3)

### Extract floor information

In [11]:
# Turn the textual floor description into a bare floor number string:
# "Ground" becomes "0." so it has the same shape as the numbered floors, then
# the trailing seven characters are cut off.
floor_col = (
    clean_df["Floor_merged"]
    .str.replace("Ground", "0.")
    .str.slice(stop=-7)
)

clean_df["floor"] = floor_col

### Remove redundant columns

In [12]:
# Columns that are superseded by merged/cleaned counterparts or were only
# needed as intermediate helpers during extraction.
redundant_columns = [
    "Municipality", "Living space", "Plot area", "Floor space",
    "Availability", "Floor",
    "Living_space_merged", "location_parsed",
    "details", "price", "details_structured", "index",
    "Floor_space_merged", "Plot_area_merged",
    "lat", "lon",
    "Floor_merged",
    "space_from_details", "rooms_from_details",
]

delete_clean_df_columns(redundant_columns)

### Make naming more consistent

In [13]:
# [current name, new name] pairs; pairs whose current name is absent are skipped.
name_mapping = [
    ["Space extracted", "living_space_m2"],
    ["Floor_space_merged_m2", "floor_space_m2"],
    ["Plot_area_merged_m2", "plot_area_m2"],
    ["Availability_merged", "availability"],
    ["Municipality_merged", "municipality"],
    ["price_cleaned", "price"],
]

for old_name, new_name in name_mapping:
    if old_name in clean_df.columns:
        # Copy-then-drop (rather than rename) places the renamed column at the
        # end of the frame, which fixes the column order of the saved CSV.
        clean_df[new_name] = clean_df[old_name].copy()
        clean_df = clean_df.drop(columns=old_name)

In [14]:
# Inspect extreme outliers: listings with a price above 40,000,000.
test = clean_df.loc[clean_df["price"] > 40_000_000]
test

Unnamed: 0,location,description,detailed_description,url,table,title,address,link,ForestDensityL,ForestDensityM,...,gde_workers_total,type,rooms,floor,living_space_m2,floor_space_m2,plot_area_m2,availability,municipality,price
10680,"1162 St-Prex, VD",10 rooms«Somptueuse propriété pieds dans l eau...,DescriptionCette propriété de maître ####pieds...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,Somptueuse propriété pieds dans l'eau à l'abri...,"1162 St-Prex, VD",/en/d/detached-house-buy-st-prex/7242923,0.001743,0.0,...,2734.0,detached-house,10.0,,5000.0,,5000,On request,Saint-Prex,45000000.0


## Preview clean data

In [15]:
# Preview the first 50 cleaned rows before writing them to disk.
clean_df.head(n=50)

Unnamed: 0,location,description,detailed_description,url,table,title,address,link,ForestDensityL,ForestDensityM,...,gde_workers_total,type,rooms,floor,living_space_m2,floor_space_m2,plot_area_m2,availability,municipality,price
0,"5023 Biberstein, AG","3.5 rooms, 100 m²«Luxuriöse Attika-Wohnung mit...",DescriptionLuxuriöse Attika-Wohnung direkt an ...,https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,Luxuriöse Attika-Wohnung mit herrlicher Aussicht,"5023 Biberstein, AG",/en/d/penthouse-buy-biberstein/7255200,0.511176,0.286451,...,331.0,penthouse,5.0,4.0,100.0,,,On request,Biberstein,1150000.0
1,"Buhldenstrasse 8d5023 Biberstein, AG","4.5 rooms, 156 m²«Stilvolle Liegenschaft - ruh...",DescriptionStilvolle Liegenschaft an ruhiger L...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,"Stilvolle Liegenschaft - ruhige Lage, unverbau...","Buhldenstrasse 8d, 5023 Biberstein, AG",/en/d/terrace-house-buy-biberstein/7266694,0.511176,0.286451,...,331.0,terrace-house,5.0,,156.0,242.0,222.0,On request,Biberstein,1420000.0
2,"5022 Rombach, AG","2.5 rooms, 93 m²«Moderne, lichtdurchflutete At...","detail_responsive#description_title2,5 Zimmerw...",https://www.immoscout24.ch//en/d/penthouse-buy...,b <article class=####Box-cYFBPY hKrxoH####><h2...,"Moderne, lichtdurchflutete Attikawohnung mit E...","5022 Rombach, AG",/en/d/penthouse-buy-rombach/7261389,0.163362,0.095877,...,33493.0,penthouse,5.0,,93.0,,,,,720000.0
3,"Buhaldenstrasse 8A5023 Biberstein, AG","4.5 rooms, 154 m²«AgentSelly - Luxuriöses Eckh...",DescriptionDieses äusserst grosszügige Minergi...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,AgentSelly - Luxuriöses Eckhaus an toller Süd-...,"Buhaldenstrasse 8A, 5023 Biberstein, AG",/en/d/detached-house-buy-biberstein/7047212,0.511176,0.286451,...,331.0,detached-house,5.0,,154.0,257.0,370.0,On request,Biberstein,1430000.0
4,"5022 Rombach, AG","4.5 rooms, 142 m²«MIT GARTENSITZPLATZ UND VIEL...",DescriptionAus ehemals zwei Wohnungen wurde ei...,https://www.immoscout24.ch//en/d/flat-buy-romb...,b <article class=####Box-cYFBPY hKrxoH####><h2...,MIT GARTENSITZPLATZ UND VIELEN EXTRAS,"5022 Rombach, AG",/en/d/flat-buy-rombach/7293107,0.333865,0.279276,...,1355.0,flat,5.0,0.0,142.0,,,On request,Küttigen,995000.0
5,"Buchhalde 365018 Erlinsbach, AG","5.5 rooms, 190 m²«Modernes, grosszügiges Maiso...",DescriptionDer Blick in die Weite vermittelt R...,https://www.immoscout24.ch//en/d/detached-hous...,b <article class=####Box-cYFBPY hKrxoH####><h2...,"Modernes, grosszügiges Maisonette Terrassenhau...","Buchhalde 36, 5018 Erlinsbach, AG",/en/d/detached-house-buy-erlinsbach/7257554,0.190581,0.162838,...,966.0,detached-house,5.0,,190.0,220.0,1063.0,On request,Erlinsbach (AG),2160000.0
6,"5023 Biberstein, AG","4.5 rooms, 124 m²«Kompakt, doch geräumiges 4½ ...",DescriptionZum Objekt:Kompakt und doch sehr ge...,https://www.immoscout24.ch//en/d/terrace-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,"Kompakt, doch geräumiges 4½ Zimmer-Mittelhaus ...","5023 Biberstein, AG",/en/d/terrace-house-buy-biberstein/7048767,0.511176,0.286451,...,331.0,terrace-house,5.0,,124.0,,200.0,Immediately,Biberstein,550000.0
7,"5004 Aarau, AG",4.5 rooms«Preishit! Grossräumige Wohnung mitte...,DescriptionNaturnah und doch am Zentrum diese ...,https://www.immoscout24.ch//en/d/flat-buy-aara...,b <article class=####Box-cYFBPY hKrxoH####><h2...,Preishit! Grossräumige Wohnung mitten in Aarau,"5004 Aarau, AG",/en/d/flat-buy-aarau/7262713,0.051334,0.0,...,33493.0,flat,5.0,3.0,,,,On request,Aarau,590000.0
8,"Siebenmatten 495032 Aarau Rohr, AG","3.5 rooms, 75 m²«Gepflegte 3.5 Zimmer Dachwohn...",DescriptionDie Überbauung Siebenmatten in Aara...,https://www.immoscout24.ch//en/d/flat-buy-aara...,b <article class=####Box-cYFBPY hKrxoH####><h2...,Gepflegte 3.5 Zimmer Dachwohnung mit Balkon in...,"Siebenmatten 49, 5032 Aarau Rohr, AG",/en/d/flat-buy-aarau-rohr/7269563,0.277149,0.438344,...,33493.0,flat,5.0,,75.0,,,On request,Aarau,547000.0
9,"5018 Erlinsbach, AG","4.5 rooms, 110 m²«Renovierte 4.5 Zimmer Terras...","DescriptionTreten Sie ein, in Ihr neues, liebe...",https://www.immoscout24.ch//en/d/stepped-house...,b <article class=####Box-cYFBPY hKrxoH####><h2...,Renovierte 4.5 Zimmer Terrassenwohnung mit tra...,"5018 Erlinsbach, AG",/en/d/stepped-house-buy-erlinsbach/7214319,0.140373,0.140085,...,966.0,stepped-house,5.0,,110.0,,2116.0,On request,Erlinsbach (AG),1125000.0


## Save clean data

In [16]:
# Write the cleaned frame (including the text columns) without the index.
target_path = Path('data/clean/immoscout.csv')
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it already does).
target_path.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(target_path, index=False)

## Remove non-numeric columns

In [17]:
# Free-text / URL / HTML columns that are removed for the numeric export.
non_numeric_columns = [
    "location", "description", "detailed_description",
    "url", "table", "title",
    "address", "link", "Locality",
]

delete_clean_df_columns(non_numeric_columns)

In [18]:
# Preview the first 50 rows after the text columns were removed.
clean_df.head(n=50)

Unnamed: 0,ForestDensityL,ForestDensityM,ForestDensityS,Latitude,Longitude,NoisePollutionRailwayL,NoisePollutionRailwayM,NoisePollutionRailwayS,NoisePollutionRoadL,NoisePollutionRoadM,...,gde_workers_total,type,rooms,floor,living_space_m2,floor_space_m2,plot_area_m2,availability,municipality,price
0,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,0.0,0.058298,0.067048,...,331.0,penthouse,5.0,4.0,100.0,,,On request,Biberstein,1150000.0
1,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,0.0,0.058298,0.067048,...,331.0,terrace-house,5.0,,156.0,242.0,222.0,On request,Biberstein,1420000.0
2,0.163362,0.095877,0.001911,47.397416,8.04315,0.0,0.0,0.0,0.334957,0.381257,...,33493.0,penthouse,5.0,,93.0,,,,,720000.0
3,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,0.0,0.058298,0.067048,...,331.0,detached-house,5.0,,154.0,257.0,370.0,On request,Biberstein,1430000.0
4,0.333865,0.279276,0.145835,47.40487,8.052781,0.0,0.0,0.0,0.133498,0.132933,...,1355.0,flat,5.0,0.0,142.0,,,On request,Küttigen,995000.0
5,0.190581,0.162838,0.034759,47.401163,8.012034,0.0,0.0,0.0,0.194222,0.143205,...,966.0,detached-house,5.0,,190.0,220.0,1063.0,On request,Erlinsbach (AG),2160000.0
6,0.511176,0.286451,0.090908,47.415927,8.08584,0.0,0.0,0.0,0.058298,0.067048,...,331.0,terrace-house,5.0,,124.0,,200.0,Immediately,Biberstein,550000.0
7,0.051334,0.0,0.0,47.388821,8.042194,0.04845,0.057053,0.051848,0.348125,0.374736,...,33493.0,flat,5.0,3.0,,,,On request,Aarau,590000.0
8,0.277149,0.438344,0.327528,47.400929,8.070691,0.022997,0.003745,0.0,0.400723,0.409636,...,33493.0,flat,5.0,,75.0,,,On request,Aarau,547000.0
9,0.140373,0.140085,0.15665,47.395295,8.012752,0.0,0.0,0.0,0.200754,0.219187,...,966.0,stepped-house,5.0,,110.0,,2116.0,On request,Erlinsbach (AG),1125000.0


In [19]:
# Write the reduced frame without the index.
# NOTE(review): the preview of this frame still shows text columns ("type",
# "availability", "municipality"), so the file is not strictly numeric --
# confirm whether these should be encoded or dropped before modelling.
target_path = Path('data/clean/immoscout_only_numeric.csv')
# to_csv does not create missing directories, so make sure the target folder
# exists before writing (no-op when it already does).
target_path.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_csv(target_path, index=False)