In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

df = pd.read_csv("house_data_price_outlier_remove.csv",index_col="Unnamed: 0" )
zips = pd.read_csv("Belgium_zip_municipalities.txt", sep="\t")
zips = zips.iloc[:,1:3]
zips.columns = ["Zip_code", "city"]
pd.options.display.float_format = '{:.0f}'.format
print(df.shape)
df = df.drop_duplicates(["Area", "Price", "State_of_building", "Zip_code","Land_surface", "Garden_surface"])
print(df.shape)
df.to_csv("house_data_ultimate_cleaned")
df.columns

(8237, 18)
(7801, 18)


Index(['Area', 'Price', 'State_of_building', 'Facades', 'Bedrooms', 'Kitchen',
       'Furnished', 'Open_fire', 'Zip_code', 'Land_surface', 'Terrace',
       'Terrace_surface', 'Swimming_pool', 'Subtype_property', 'Garden',
       'Garden_surface', 'Regions', 'Price_per_square_meter'],
      dtype='object')

In [232]:
# Dropping outlier
df.drop(df[df["Zip_code"] == 1404].index,inplace=True)

In [233]:
# Grouping into municipalities.
def rounding_down(x):
    rest = x % 10
    if rest != 0:
        return x - rest
    else:
        return x

df["municipalities"] = df["Zip_code"].apply(lambda x: rounding_down(x))
df["municipalities"]

0       3290
1       2070
2       8800
3       8000
4       8800
        ... 
8328    4280
8329    4040
8330    7390
8331    4470
8332    9290
Name: municipalities, Length: 7801, dtype: int64

In [234]:
# Dropping if only 2 valid price rows
threshold = 5
value_counts = df.municipalities.value_counts()
to_remove = list(value_counts[value_counts <= threshold].index)
df_threshold = df.copy()

for label, row in df_threshold.iterrows():
    if row.municipalities in to_remove:
        df_threshold.drop(index=label, inplace=True)

print(df.shape)




(7801, 19)


In [235]:
# Most expensive areas (By price) - Belgium

most_expensive_Belgium_zip = df_threshold.groupby(["municipalities"])["Price"].mean().nlargest(5)
new_df = df_threshold[df_threshold["municipalities"].isin(list(most_expensive_Belgium_zip.index))]
print(new_df["municipalities"].value_counts())
df_expensive_Belgium = zips[zips["Zip_code"].isin(most_expensive_Belgium_zip.index)]
print(df_expensive_Belgium)
df_expensive_Belgium = df_expensive_Belgium.drop_duplicates("Zip_code", keep="first")
df_expensive_Belgium.insert(2, "mean_price", most_expensive_Belgium_zip.values, True)
df_expensive_Belgium = df_expensive_Belgium.reset_index(drop=True)
print(df_expensive_Belgium)

1180    122
8300     86
1640     52
1050     50
1950     18
Name: municipalities, dtype: int64
     Zip_code                city
3        1050             Ixelles
17       1180               Uccle
200      1640  Rhode-Saint-Genèse
278      1950            Kraainem
962      8300        Knokke-Heist
963      8300              Knokke
   Zip_code                city  mean_price
0      1050             Ixelles     1682692
1      1180               Uccle     1537943
2      1640  Rhode-Saint-Genèse     1373209
3      1950            Kraainem     1276667
4      8300        Knokke-Heist     1238990


In [236]:
# # Most expensive areas (By price-square meter) - Belgium
most_expensive_Belgium_zip = df_threshold.groupby(["municipalities"])["Price_per_square_meter"].mean().nlargest(5)
new_df = df_threshold[df_threshold["municipalities"].isin(list(most_expensive_Belgium_zip.index))]
print(new_df["municipalities"].value_counts())
df_expensive_Belgium = zips[zips["Zip_code"].isin(most_expensive_Belgium_zip.index)]
print(df_expensive_Belgium)
df_expensive_Belgium = df_expensive_Belgium.drop_duplicates("Zip_code", keep="first")
df_expensive_Belgium.insert(2, "mean_sqm", most_expensive_Belgium_zip.values, True)
df_expensive_Belgium = df_expensive_Belgium.reset_index(drop=True)
print(df_expensive_Belgium)

1180    122
8300     86
1150     53
1640     52
1970     13
Name: municipalities, dtype: int64
     Zip_code                 city
14       1150  Woluwe-Saint-Pierre
17       1180                Uccle
200      1640   Rhode-Saint-Genèse
279      1970      Wezembeek-Oppem
962      8300         Knokke-Heist
963      8300               Knokke
   Zip_code                 city  mean_sqm
0      1150  Woluwe-Saint-Pierre      5458
1      1180                Uccle      4131
2      1640   Rhode-Saint-Genèse      3883
3      1970      Wezembeek-Oppem      3848
4      8300         Knokke-Heist      3785


In [246]:
# Making a function out of it.

def make_top_5_mean(df, postal_column:str, on_what:str) -> str:
    most_expensive= df.groupby([postal_column])[on_what].mean().nlargest(5)

    print(most_expensive)
    new_df = df[df[postal_column].isin(list(most_expensive.index))]
    print(new_df[postal_column].value_counts())
    df_expensive = zips[zips["Zip_code"].isin(most_expensive.index)]
    df_expensive_Belgium = df_expensive.drop_duplicates("Zip_code", keep="first")

    dummy = pd.Series(list(most_expensive.index), name = df_expensive_Belgium["Zip_code"]).to_frame()
    df_expensive_Belgium = pd.merge(dummy, df, on = "Zip_code", how = 'left')

    df_expensive_Belgium.insert(2, "mean_sqm", list(most_expensive.values), True)
    df_expensive= df_expensive_Belgium.reset_index(drop=True)
    return df_expensive

print(make_top_5_mean(df, "municipalities", "Price_per_square_meter"))

municipalities
3900   6250
8300   5458
8660   5003
1180   4131
1640   3883
Name: Price_per_square_meter, dtype: float64
1180    122
8300     86
1640     52
8660      4
3900      1
Name: municipalities, dtype: int64


TypeError: Series.name must be a hashable type