# Data Cleaning

#### Importing raw data 

In [None]:
import pandas as pd

In [None]:
immo24_all=pd.read_csv("immo24_warm.csv")

In [None]:
immo24_all.head()

#### Dropping useless information

In [None]:
immo24=immo24_all.drop(["onlySmall","result-list-entry__brand-title-container href", "result-list-entry__brand-logo-container href","font-s href", "result-list-entry__brand-logo-container href 2", "slick-slide href", "image-index-label", "gallery__count","total-media-count-label","result-list-entry__criteria href", "block 2", "result-list-entry__new-flag","onlySmall 2", "font-tabular 3" ],  axis=1)

In [None]:
immo24

#### Getting the unique values of multiple columns

In [None]:
# Ahmet hat die uniquen values verwickelt (values.ravel)
pd.unique(immo24[["margin-top-none", "margin-top-none 2", "margin-top-none 3"]].values.ravel('K'))
# the letter 'K' is a shorthand for the order 'K' or 'F', which stands for 'Fortran-style', 
# meaning the elements are arranged in column-major order
# in this case, passing 'K' as an argument to ravel() ensures that the elements are flattened 
# in column-major order, meaning the elements of each column are concatenated before moving to the next column. 
# since the input is a dataframe, this ensures that values from the same column are concatenated together 
# in the flattened array.







#### Adding new empty columns

In [None]:
immo24["Balkon/Terrasse"] = ""

In [None]:
immo24['Einbauküche'], immo24['Keller'], immo24['Garten'] = "", "", ""

In [None]:
immo24['Aufzug'], immo24['Gäste-WC'], immo24['WG-geeignet'] = "", "", ""

In [None]:
immo24['Stufenlos']=""

#### Creating dummy-value (0 or 1) for features

In [None]:
immo24.tail()

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "Balkon/Terrasse":
        immo24.iloc[i,9]=1
    elif immo24.iloc[i,7]== "Balkon/Terrasse":
        immo24.iloc[i,9]=1
    elif immo24.iloc[i,8]== "Balkon/Terrasse":
        immo24.iloc[i,9]=1
    else:
        immo24.iloc[i,9]=0
        

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "Einbauküche" or immo24.iloc[i,7]== "Einbauküche" or immo24.iloc[i,8]== "Einbauküche":
        immo24.iloc[i,10]=1
    else:
        immo24.iloc[i,10]=0

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "Keller" or immo24.iloc[i,7]== "Keller" or immo24.iloc[i,8]== "Keller":
        immo24.iloc[i,11]=1
    else:
        immo24.iloc[i,11]=0

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "Garten" or immo24.iloc[i,7]== "Garten" or immo24.iloc[i,8]== "Garten":
        immo24.iloc[i,12]=1
    else:
        immo24.iloc[i,12]=0

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "Aufzug" or immo24.iloc[i,7]== "Aufzug" or immo24.iloc[i,8]== "Aufzug":
        immo24.iloc[i,13]=1
    else:
        immo24.iloc[i,13]=0

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "Gäste-WC" or immo24.iloc[i,7]== "Gäste-WC" or immo24.iloc[i,8]== "Gäste-WC":
        immo24.iloc[i,14]=1
    else:
        immo24.iloc[i,14]=0

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "WG-geeignet" or immo24.iloc[i,7]== "WG-geeignet" or immo24.iloc[i,8]== "WG-geeignet":
        immo24.iloc[i,15]=1
    else:
        immo24.iloc[i,15]=0

In [None]:
for i in range (len(immo24)):
    if immo24.iloc[i,6]== "Stufenlos" or immo24.iloc[i,7]== "Stufenlos" or immo24.iloc[i,8]== "Stufenlos":
        immo24.iloc[i,16]=1
    else:
        immo24.iloc[i,16]=0

In [None]:
immo24.tail()

#### Getting the specific district from address

In [None]:
Stadtteil=[]
for i in range (len(immo24)):
    elem= immo24["result-list-entry__map-link"][i].split(",")[(immo24["result-list-entry__map-link"][i].split(",").index(" Berlin"))-1]
    elem= elem.lstrip()
    elem=elem.replace("(Ortsteil)","")
    elem= elem.rstrip()
    Stadtteil.append(elem)

In [None]:
len(Stadtteil)


In [None]:
immo24["Stadtteil"]= Stadtteil

In [None]:
len(immo24["Stadtteil"].unique())

In [None]:
immo24["Stadtteil"].unique()

#### Checking NaN's

In [None]:
immo24["onlyLarge"].value_counts()

In [None]:
immo24["onlyLarge"].isna().sum()

In [None]:
immo24.info()

In [None]:
immo24["font-highlight"].unique()

In [None]:
immo24["font-highlight"].isna().sum()

#### Renaming and rearranging columns

In [None]:
immo24.columns


In [None]:
new_column_names = {"result-list-entry__brand-title":"description","block":"landlord","result-list-entry__map-link":"address","font-highlight":"total rent","font-highlight 2":"area","onlyLarge":"number of rooms","margin-top-none":"feature 1","margin-top-none 2":"feature 2","margin-top-none 3":"feature 3","Balkon/Terrasse":"balcony","Einbauküche":"built-in kitchen","Keller":"basement","Garten":"garden","Aufzug":"elevator","Gäste-WC":"guest toilet","WG-geeignet":"flat share possible","Stufenlos":"stepless","Stadtteil":"district"}

In [None]:
df = immo24.rename(columns=new_column_names)

In [None]:
#df["district"]=df["district"].str.lower()

In [None]:
df = df[["district","address","description","landlord","total rent","area","number of rooms","balcony","built-in kitchen","basement","garden","elevator","stepless","guest toilet","flat share possible"]]

In [None]:
df.landlord.unique()

In [None]:
# replace values in landlord column based on conditions
df["landlord"] = df["landlord"].fillna("").astype(str)
df["landlord"] = df["landlord"].apply(lambda x: "estate agent" if "Frau" in x or "Herr" in x else x)
df["landlord"] = df["landlord"].apply(lambda x: "private offer" if "Privatangebot" in x or "privat" in x or "Privat" in x else x)

df.landlord.unique()

In [None]:
pd.set_option('display.max_rows', 1000)
df.landlord.value_counts()

#### Deleting, useless characters and changing , to .

In [None]:
for i in range (len(df)):
    df["total rent"][i]= df["total rent"][i].replace("€","").replace("~","").rstrip().replace(".","").replace(",",".")
    

In [None]:
for i in range (len(df)):
    df["area"][i]= df["area"][i].replace("m²","").replace("~","").rstrip().replace(".","").replace(",",".")

In [None]:
df.info()

In [None]:
df["number of rooms"].unique()

#### Dropping rows with incomplete information

In [None]:
df["number of rooms"].isna().sum()

In [None]:
df.drop(df.loc[df["number of rooms"].isna()==True].index, inplace=True)
    

In [None]:
df.reset_index(inplace=True)

In [None]:
df["number of rooms"] = df["number of rooms"].astype(str)

In [None]:
for i in range (len(df)):
    df["number of rooms"][i]= df["number of rooms"][i].replace(",",".")

#### Adapting datatypes

In [None]:
df.info()

In [None]:
df[["number of rooms", "total rent","area"]] = df[["number of rooms", "total rent","area"]].astype(float)

In [None]:
df[['balcony', 'built-in kitchen', 'basement', 'garden','elevator', 'stepless', 'guest toilet', 'flat share possible']]=df[['balcony', 'built-in kitchen', 'basement', 'garden','elevator', 'stepless', 'guest toilet', 'flat share possible']].astype(int)

In [None]:
df.info()

In [None]:
df.drop("index", axis=1, inplace=True)

In [None]:
#df.to_csv("df_cleaned.csv",index=False)
#saved as df_cleaned in ordner 
#continued to manually clean in excel with excel df_cleaned (names of landlord, combined to estate agent, private offer...)

## Creating numeric df as basis for machine learning

In [1]:
import pandas as pd
# uploading cleaned df
data = pd.read_csv("df_cleaned.csv")

In [2]:
data = pd.get_dummies(data, columns = ["district","landlord"], drop_first= True)

In [3]:
data.columns = data.columns.str.lower()

In [4]:
pd.set_option("display.max_columns", None)
data.columns
data.tail()

Unnamed: 0,address,description,total rent,area,number of rooms,balcony,built-in kitchen,basement,garden,elevator,stepless,guest toilet,flat share possible,district_alt-hohenschönhausen,district_alt-treptow,district_altglienicke,district_baumschulenweg,district_biesdorf,district_blankenburg,district_bohnsdorf,district_borsigwalde,district_britz,district_buch,district_buckow,district_charlottenburg,district_charlottenburg-nord,district_dahlem,district_falkenberg,district_falkenhagener feld,district_fennpfuhl,district_französisch buchholz,district_friedenau,district_friedrichsfelde,district_friedrichshagen,district_friedrichshain,district_gatow,district_gesundbrunnen,district_gropiusstadt,district_grunewald,district_grünau,district_hakenfelde,district_halensee,district_hansaviertel,district_haselhorst,district_heiligensee,district_heinersdorf,district_hellersdorf,district_hermsdorf,district_johannisthal,district_karlshorst,district_karow,district_kaulsdorf,district_konradshöhe,district_kreuzberg,district_köpenick,district_lankwitz,district_lichtenberg,district_lichtenrade,district_lichterfelde,district_lübars,district_mahlsdorf,district_mariendorf,district_marienfelde,district_marzahn,district_mitte,district_moabit,district_märkisches viertel,district_müggelheim,district_neu-hohenschönhausen,district_neukölln,district_niederschöneweide,district_niederschönhausen,district_nikolassee,district_oberschöneweide,district_pankow,district_plänterwald,district_prenzlauer berg,district_rahnsdorf,district_reinickendorf,district_rosenthal,district_rummelsburg,district_schmargendorf,district_schmöckwitz,district_schöneberg,district_siemensstadt,district_spandau,district_staaken,district_steglitz,district_tegel,district_tempelhof,district_tiergarten,district_wannsee,district_wedding,district_weißensee,district_westend,district_wilhelmsruh,district_wilhelmstadt,district_wilmersdorf,district_wittenau,district_zehlendorf,landlord_degewo,landlord_estate agent,landlord_housinganywhere b.v.,landlord_howoge,landlord_numa group,landlord_private offer,landlord_tauschwohnung wohnungstausch,landlord_visionapartments,landlord_wohnungsswap.de
3480,"Wilhelmsmühlenweg 12, Kaulsdorf, Berlin",3- Raumwohnung in Kaulsdorf in Bahnhofnähe,1035.0,80.0,3.0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3481,"Nollendorfstr. 17, Schöneberg, Berlin",Schöne 2 - Zimmer - Wohnung zum Wohlfühlen,1663.0,84.1,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3482,"Tegel, Berlin","Alt-Tegel, seenahe ruhige Maisonette-Whg, 72m²...",1260.0,72.0,2.5,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3483,"Potsdamer Str. 100, Tiergarten, Berlin",Sonnendurchflutete repräsentative Wohnung / Bü...,2995.0,158.0,4.0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3484,"Kaiserdamm 6, Charlottenburg, Berlin",Grossraumwohnung ( zur Eigenrenovierung/Sanier...,1999.0,136.32,4.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3485 entries, 0 to 3484
Columns: 109 entries, address to landlord_wohnungsswap.de
dtypes: float64(3), int64(8), object(2), uint8(96)
memory usage: 680.8+ KB


In [6]:
data.columns

Index(['address', 'description', 'total rent', 'area', 'number of rooms',
       'balcony', 'built-in kitchen', 'basement', 'garden', 'elevator',
       ...
       'district_zehlendorf', 'landlord_degewo', 'landlord_estate agent',
       'landlord_housinganywhere b.v.', 'landlord_howoge',
       'landlord_numa group', 'landlord_private offer',
       'landlord_tauschwohnung wohnungstausch', 'landlord_visionapartments',
       'landlord_wohnungsswap.de'],
      dtype='object', length=109)

#### Changing datatypes of new dummy columns to integer

In [7]:
for i in range (7,-1):
    data[data.columns[i]]=data[data.columns[i]].astype(int)

In [8]:
data["number of rooms"].dtype


dtype('float64')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3485 entries, 0 to 3484
Columns: 109 entries, address to landlord_wohnungsswap.de
dtypes: float64(3), int64(8), object(2), uint8(96)
memory usage: 680.8+ KB


#### Removing Outliers

In [10]:
def remove_outliers(df, column_names):
    """ returns a df without outliers for the specified column names"""
    if isinstance(df, pd.core.frame.DataFrame):
        Q1 = df[column_names].quantile(0.25)
        Q3 = df[column_names].quantile(0.75)
        IQR = Q3 - Q1
        true_list = ~((df[column_names] < (Q1 - 1.5*IQR)) | (df[column_names] > (Q3 + 1.5*IQR)))
        return df[true_list]
    else:
        raise TypeError


In [14]:
def what_are_outliers(df, column_names):
    """ returns a df with outliers for the specified column names"""
    if isinstance(df, pd.core.frame.DataFrame):
        Q1 = df[column_names].quantile(0.25)
        Q3 = df[column_names].quantile(0.75)
        IQR = Q3 - Q1
        true_list = (df[column_names] < (Q1 - 1.5*IQR)) | (df[column_names] > (Q3 + 1.5*IQR))
        return df[true_list]
    else:
        raise TypeError


In [11]:
df_clean = remove_outliers(data, "number of rooms")

In [12]:
df_clean.tail()

Unnamed: 0,address,description,total rent,area,number of rooms,balcony,built-in kitchen,basement,garden,elevator,stepless,guest toilet,flat share possible,district_alt-hohenschönhausen,district_alt-treptow,district_altglienicke,district_baumschulenweg,district_biesdorf,district_blankenburg,district_bohnsdorf,district_borsigwalde,district_britz,district_buch,district_buckow,district_charlottenburg,district_charlottenburg-nord,district_dahlem,district_falkenberg,district_falkenhagener feld,district_fennpfuhl,district_französisch buchholz,district_friedenau,district_friedrichsfelde,district_friedrichshagen,district_friedrichshain,district_gatow,district_gesundbrunnen,district_gropiusstadt,district_grunewald,district_grünau,district_hakenfelde,district_halensee,district_hansaviertel,district_haselhorst,district_heiligensee,district_heinersdorf,district_hellersdorf,district_hermsdorf,district_johannisthal,district_karlshorst,district_karow,district_kaulsdorf,district_konradshöhe,district_kreuzberg,district_köpenick,district_lankwitz,district_lichtenberg,district_lichtenrade,district_lichterfelde,district_lübars,district_mahlsdorf,district_mariendorf,district_marienfelde,district_marzahn,district_mitte,district_moabit,district_märkisches viertel,district_müggelheim,district_neu-hohenschönhausen,district_neukölln,district_niederschöneweide,district_niederschönhausen,district_nikolassee,district_oberschöneweide,district_pankow,district_plänterwald,district_prenzlauer berg,district_rahnsdorf,district_reinickendorf,district_rosenthal,district_rummelsburg,district_schmargendorf,district_schmöckwitz,district_schöneberg,district_siemensstadt,district_spandau,district_staaken,district_steglitz,district_tegel,district_tempelhof,district_tiergarten,district_wannsee,district_wedding,district_weißensee,district_westend,district_wilhelmsruh,district_wilhelmstadt,district_wilmersdorf,district_wittenau,district_zehlendorf,landlord_degewo,landlord_estate agent,landlord_housinganywhere b.v.,landlord_howoge,landlord_numa group,landlord_private offer,landlord_tauschwohnung wohnungstausch,landlord_visionapartments,landlord_wohnungsswap.de
3480,"Wilhelmsmühlenweg 12, Kaulsdorf, Berlin",3- Raumwohnung in Kaulsdorf in Bahnhofnähe,1035.0,80.0,3.0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3481,"Nollendorfstr. 17, Schöneberg, Berlin",Schöne 2 - Zimmer - Wohnung zum Wohlfühlen,1663.0,84.1,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3482,"Tegel, Berlin","Alt-Tegel, seenahe ruhige Maisonette-Whg, 72m²...",1260.0,72.0,2.5,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3483,"Potsdamer Str. 100, Tiergarten, Berlin",Sonnendurchflutete repräsentative Wohnung / Bü...,2995.0,158.0,4.0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3484,"Kaiserdamm 6, Charlottenburg, Berlin",Grossraumwohnung ( zur Eigenrenovierung/Sanier...,1999.0,136.32,4.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [15]:
what_are_outliers(data, "number of rooms")

Unnamed: 0,address,description,total rent,area,number of rooms,balcony,built-in kitchen,basement,garden,elevator,stepless,guest toilet,flat share possible,district_alt-hohenschönhausen,district_alt-treptow,district_altglienicke,district_baumschulenweg,district_biesdorf,district_blankenburg,district_bohnsdorf,district_borsigwalde,district_britz,district_buch,district_buckow,district_charlottenburg,district_charlottenburg-nord,district_dahlem,district_falkenberg,district_falkenhagener feld,district_fennpfuhl,district_französisch buchholz,district_friedenau,district_friedrichsfelde,district_friedrichshagen,district_friedrichshain,district_gatow,district_gesundbrunnen,district_gropiusstadt,district_grunewald,district_grünau,district_hakenfelde,district_halensee,district_hansaviertel,district_haselhorst,district_heiligensee,district_heinersdorf,district_hellersdorf,district_hermsdorf,district_johannisthal,district_karlshorst,district_karow,district_kaulsdorf,district_konradshöhe,district_kreuzberg,district_köpenick,district_lankwitz,district_lichtenberg,district_lichtenrade,district_lichterfelde,district_lübars,district_mahlsdorf,district_mariendorf,district_marienfelde,district_marzahn,district_mitte,district_moabit,district_märkisches viertel,district_müggelheim,district_neu-hohenschönhausen,district_neukölln,district_niederschöneweide,district_niederschönhausen,district_nikolassee,district_oberschöneweide,district_pankow,district_plänterwald,district_prenzlauer berg,district_rahnsdorf,district_reinickendorf,district_rosenthal,district_rummelsburg,district_schmargendorf,district_schmöckwitz,district_schöneberg,district_siemensstadt,district_spandau,district_staaken,district_steglitz,district_tegel,district_tempelhof,district_tiergarten,district_wannsee,district_wedding,district_weißensee,district_westend,district_wilhelmsruh,district_wilhelmstadt,district_wilmersdorf,district_wittenau,district_zehlendorf,landlord_degewo,landlord_estate agent,landlord_housinganywhere b.v.,landlord_howoge,landlord_numa group,landlord_private offer,landlord_tauschwohnung wohnungstausch,landlord_visionapartments,landlord_wohnungsswap.de
5,"Leipziger Str. 12, Mitte (Ortsteil), Berlin",5-Zimmerwohnung mit großer Terrasse im Herzen ...,8627.22,343.41,5.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
7,"Wilmersdorf, Berlin",Belle Etage im Wohnquartier Güntzelkiez! 6-Zim...,4300.00,220.50,7.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
56,"Martin Luther Str. 84, Schöneberg, Berlin",4 Jahre befristet - Großzügiger Altbau in zent...,1853.00,150.81,5.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
62,"Rothenburgstraße 44, Steglitz, Berlin","Beeindruckendes Dachgeschoss, 5 Zimmer, 2 Bäde...",2250.00,113.00,5.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
114,"Tempelhof, Berlin",KEINE WG-Whg: Ehemals hochherschaftliche 6 Zi-...,2697.44,230.00,6.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3389,"Charlottenburg, Berlin",Herrschaftlicher Altbau / teilgewerbliche Nutz...,3483.00,162.00,5.0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3408,"Bohnsdorf, Berlin","Neubauwohnungen, geeignet für Pflegebedarf",8400.00,246.84,18.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3459,"Am Panke Park 33, Mitte (Ortsteil), Berlin",Pankepark - Townhouse mit Garten,4374.00,176.80,5.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3463,"Greifenhagener Straße 30, Prenzlauer Berg, Berlin",Neues wunderschönes Dachgeschoss / teilgewerblich,3830.00,211.00,5.0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [16]:
df_cleanest = remove_outliers(data, "total rent")

In [17]:
check = what_are_outliers(data, "total rent")
check.describe()

Unnamed: 0,total rent,area,number of rooms,balcony,built-in kitchen,basement,garden,elevator,stepless,guest toilet,flat share possible,district_alt-hohenschönhausen,district_alt-treptow,district_altglienicke,district_baumschulenweg,district_biesdorf,district_blankenburg,district_bohnsdorf,district_borsigwalde,district_britz,district_buch,district_buckow,district_charlottenburg,district_charlottenburg-nord,district_dahlem,district_falkenberg,district_falkenhagener feld,district_fennpfuhl,district_französisch buchholz,district_friedenau,district_friedrichsfelde,district_friedrichshagen,district_friedrichshain,district_gatow,district_gesundbrunnen,district_gropiusstadt,district_grunewald,district_grünau,district_hakenfelde,district_halensee,district_hansaviertel,district_haselhorst,district_heiligensee,district_heinersdorf,district_hellersdorf,district_hermsdorf,district_johannisthal,district_karlshorst,district_karow,district_kaulsdorf,district_konradshöhe,district_kreuzberg,district_köpenick,district_lankwitz,district_lichtenberg,district_lichtenrade,district_lichterfelde,district_lübars,district_mahlsdorf,district_mariendorf,district_marienfelde,district_marzahn,district_mitte,district_moabit,district_märkisches viertel,district_müggelheim,district_neu-hohenschönhausen,district_neukölln,district_niederschöneweide,district_niederschönhausen,district_nikolassee,district_oberschöneweide,district_pankow,district_plänterwald,district_prenzlauer berg,district_rahnsdorf,district_reinickendorf,district_rosenthal,district_rummelsburg,district_schmargendorf,district_schmöckwitz,district_schöneberg,district_siemensstadt,district_spandau,district_staaken,district_steglitz,district_tegel,district_tempelhof,district_tiergarten,district_wannsee,district_wedding,district_weißensee,district_westend,district_wilhelmsruh,district_wilhelmstadt,district_wilmersdorf,district_wittenau,district_zehlendorf,landlord_degewo,landlord_estate agent,landlord_housinganywhere b.v.,landlord_howoge,landlord_numa group,landlord_private offer,landlord_tauschwohnung wohnungstausch,landlord_visionapartments,landlord_wohnungsswap.de
count,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0,150.0
mean,6658.134267,164.5432,3.663333,0.506667,0.526667,0.133333,0.086667,0.133333,0.106667,0.246667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006667,0.0,0.0,0.0,0.0,0.193333,0.0,0.006667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.0,0.013333,0.0,0.013333,0.0,0.0,0.006667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006667,0.0,0.0,0.273333,0.026667,0.0,0.0,0.0,0.033333,0.006667,0.006667,0.006667,0.006667,0.0,0.0,0.053333,0.0,0.0,0.0,0.006667,0.033333,0.006667,0.04,0.0,0.02,0.0,0.0,0.006667,0.0,0.04,0.006667,0.013333,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.573333,0.2,0.006667,0.013333,0.046667,0.02,0.006667,0.133333
std,3256.124417,91.544976,2.222895,0.50163,0.500961,0.341073,0.282289,0.341073,0.309723,0.432515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08165,0.0,0.0,0.0,0.0,0.396235,0.0,0.08165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238282,0.0,0.115082,0.0,0.115082,0.0,0.0,0.08165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08165,0.0,0.0,0.447164,0.161647,0.0,0.0,0.0,0.180107,0.08165,0.08165,0.08165,0.08165,0.0,0.0,0.22545,0.0,0.0,0.0,0.08165,0.180107,0.08165,0.196616,0.0,0.140469,0.0,0.0,0.08165,0.0,0.196616,0.08165,0.115082,0.0,0.0,0.0,0.0,0.180107,0.0,0.0,0.0,0.49625,0.40134,0.08165,0.115082,0.211631,0.140469,0.08165,0.341073
min,4200.0,30.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4569.75,113.25,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5549.0,147.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7400.0,200.0,4.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,28000.0,706.0,18.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
df_numeric = df_cleanest.select_dtypes(include=["int","float"])

In [21]:
df_numeric.reset_index()

Unnamed: 0,index,total rent,area,number of rooms,balcony,built-in kitchen,basement,garden,elevator,stepless,guest toilet,flat share possible
0,0,3940.00,175.00,3.0,1,1,0,0,0,0,0,0
1,1,4030.00,147.00,4.0,1,1,0,0,0,0,0,0
2,2,1404.41,72.86,3.0,1,1,0,0,0,0,0,0
3,3,2663.13,107.89,1.0,1,1,0,0,0,0,0,0
4,4,2790.64,93.36,3.0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3330,3480,1035.00,80.00,3.0,1,0,1,1,0,0,0,0
3331,3481,1663.00,84.10,2.0,0,0,0,1,0,0,0,0
3332,3482,1260.00,72.00,2.5,0,0,1,1,0,0,0,0
3333,3483,2995.00,158.00,4.0,0,0,0,0,1,0,1,0


In [22]:
#df_numeric.to_csv("df_numeric.csv",index=False)