In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [None]:
# Load the raw Immoweb listings export into the working DataFrame `df`.
df = pd.read_csv("../data/raw/raw_immoweb_data0403.csv")

In [None]:
# A listing is unusable without a price, a surface and a region:
# drop every row in which any of the three is missing, then report the new size.
df = df.dropna(axis=0, how='any', subset=['price_main', 'surface', 'region'])
df.shape

In [None]:
# Keep only listings whose coordinates fall inside a bounding box around Belgium
# (latitude 49.5..51.5, longitude 2.5..6.4, bounds inclusive).
# Rows with a missing coordinate fail the test and are dropped too.
df = df[df['latitude'].between(49.5, 51.5) & df['longitude'].between(2.5, 6.4)]

In [None]:
# Count the missing values per column once, then show only the columns
# that actually have gaps, smallest count first.
missing = df.isna().sum()
missing[missing > 0].sort_values()

In [None]:
# Add two convenience columns:
#   price_sqm - price per square metre (price_main divided by surface)
#   url       - link to the listing page, handy for checking a lot by hand
df = df.assign(
    price_sqm=df.price_main / df.surface,
    url=df['id'].map(lambda listing_id: f"https://www.immoweb.be/en/classified/{listing_id}"),
)

In [None]:
# Show the names of all columns currently in df.
df.columns

In [None]:

# Summary statistics of df, used as a sanity check before outlier removal.
df.describe()

In [None]:
# Let's get rid of the outliers.
# We use the IQR method with deliberately lenient fences (multipliers of 3 below Q1
# and 9 above Q3, instead of the usual 1.5 and 3.0) because we want to keep as much
# data as possible and the data is not normally distributed (see the plots below).
def remove_outliers(df, column_name, lower_factor=3.0, upper_factor=9.0):
    """Drop the rows of ``df`` whose ``column_name`` value lies outside IQR fences.

    The fences are ``Q1 - lower_factor * IQR`` and ``Q3 + upper_factor * IQR``
    (both inclusive), where Q1 and Q3 are the 25th and 75th percentiles of the
    column. Rows whose value is NaN fail both comparisons and are dropped too.
    A one-line summary of how many rows were removed is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column_name : str
        Name of the numeric column the fences are computed on.
    lower_factor : float, default 3.0
        IQR multiplier for the lower fence.
    upper_factor : float, default 9.0
        IQR multiplier for the upper fence.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that lie within the fences.
    """
    total_records = len(df)

    if total_records:
        values = df[column_name]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - lower_factor * iqr
        upper_bound = q3 + upper_factor * iqr
        filtered_df = df[(values >= lower_bound) & (values <= upper_bound)]
    else:
        # Nothing to filter; skipping the quantile maths also avoids the
        # division by zero in the percentage below.
        filtered_df = df.copy()

    # Report how many records were removed.
    removed_records = total_records - len(filtered_df)
    removed_percentage = (removed_records / total_records) * 100 if total_records else 0.0

    print(f"{column_name} removed: {removed_records} из {total_records} ({removed_percentage:.2f}%)")

    return filtered_df
# Filter one column after another; each pass only sees the rows
# that survived the previous pass.
for column in ('price_sqm', 'surface', 'price_main'):
    df = remove_outliers(df, column)

In [None]:
# Distribution of surface for each property subtype.
sns.violinplot(x='subtype', y='surface', data=df)

In [None]:

# Joint KDE of surface against price_main, with one density per property type.
sns.set_theme(style="ticks")
g = sns.jointplot(data=df, x="surface", y="price_main", kind="kde", hue="type")
# The returned grid is the cell's last expression, so it is what the cell displays.
g.set_axis_labels("Surface sqm", "Price €")


In [None]:
sns.set_theme(style="ticks")

# Scatter plot of surface against price_main with one fitted regression line
# per property type.
# legend=False stops lmplot from drawing its own legend, so the titled legend
# added below is the only one on the figure (otherwise two legends overlap).
g = sns.lmplot(data=df, x="surface", y="price_main", hue="type", aspect=0.7, height=5, legend=False)

# Label the axes and add the single legend.
g.set_axis_labels("Surface sqm", "Price €")
g.add_legend(title="Type")

In [None]:
# Joint plot of surface against price_main, split by property type.
# NOTE(review): `color` is presumably ignored once `hue` is given — confirm against the seaborn docs.
g = sns.jointplot(data=df, x="surface", y="price_main", hue="type", color="#4CB391")

In [None]:
# Joint plot of surface against price per square metre, split by property type.
# NOTE(review): `color` is presumably ignored once `hue` is given — confirm against the seaborn docs.
g = sns.jointplot(data=df, x="surface", y="price_sqm", hue="type", color="#4CB391")

In [None]:
# Geolocated subset: listings that have both a latitude and a longitude,
# plus one view per property type for the maps below.
df_geo = df.dropna(subset=['latitude', 'longitude'])

df_geo_houses = df_geo.loc[df_geo['type'] == "HOUSE"]
df_geo_apartments = df_geo.loc[df_geo['type'] == "APARTMENT"]
df_geo.shape

In [None]:
# Hex-bin joint plot of longitude against latitude for the geolocated listings.
sns.jointplot(data=df_geo, x="longitude", y="latitude", kind="hex", color="#4CB391")

In [None]:

# Map of all geolocated listings: one dot per listing, orange at or below the
# price-per-sqm threshold and red above it.
import folium

# Named `price_map` rather than `map` so the builtin map() is not shadowed
# for the rest of the kernel session.
price_map = folium.Map(location=[50.8503, 4.3517], zoom_start=12)
price_threshold = 4400  # price_sqm cut-off between orange and red markers
heat_df = df_geo[['latitude', 'longitude', 'price_sqm']]

# itertuples yields plain (lat, lon, price) tuples without building a Series per row.
for lat, lon, price in heat_df.itertuples(index=False, name=None):
    if price <= price_threshold:
        marker_style = dict(color='orange', fill_color='orange', opacity=0.3)
    elif price > price_threshold:
        # NOTE(review): the fill is 'blue' while the outline is red — confirm this is intended.
        marker_style = dict(color='red', fill_color='blue')
    else:
        # A NaN price matches neither comparison; no marker is drawn for it.
        continue
    folium.CircleMarker([lat, lon], radius=0.01, fill=True, **marker_style).add_to(price_map)

price_map

In [None]:
# Map for houses: one dot per listing, orange at or below the
# price-per-sqm threshold and red above it.
import folium

# Named `price_map` rather than `map` so the builtin map() is not shadowed
# for the rest of the kernel session.
price_map = folium.Map(location=[50.8503, 4.3517], zoom_start=12)
price_threshold = 4400  # price_sqm cut-off between orange and red markers
heat_df = df_geo_houses[['latitude', 'longitude', 'price_sqm']]

# itertuples yields plain (lat, lon, price) tuples without building a Series per row.
for lat, lon, price in heat_df.itertuples(index=False, name=None):
    if price <= price_threshold:
        marker_style = dict(color='orange', fill_color='orange', opacity=0.3)
    elif price > price_threshold:
        # NOTE(review): the fill is 'blue' while the outline is red — confirm this is intended.
        marker_style = dict(color='red', fill_color='blue')
    else:
        # A NaN price matches neither comparison; no marker is drawn for it.
        continue
    folium.CircleMarker([lat, lon], radius=0.01, fill=True, **marker_style).add_to(price_map)

price_map

In [None]:
# Map for apartments: one dot per listing, orange at or below the
# price-per-sqm threshold and red above it.
import folium

# Named `price_map` rather than `map` so the builtin map() is not shadowed
# for the rest of the kernel session.
price_map = folium.Map(location=[50.8503, 4.3517], zoom_start=12)
price_threshold = 4400  # price_sqm cut-off between orange and red markers
heat_df = df_geo_apartments[['latitude', 'longitude', 'price_sqm']]

# itertuples yields plain (lat, lon, price) tuples without building a Series per row.
for lat, lon, price in heat_df.itertuples(index=False, name=None):
    if price <= price_threshold:
        marker_style = dict(color='orange', fill_color='orange', opacity=0.3)
    elif price > price_threshold:
        # NOTE(review): the fill is 'blue' while the outline is red — confirm this is intended.
        marker_style = dict(color='red', fill_color='blue')
    else:
        # A NaN price matches neither comparison; no marker is drawn for it.
        continue
    folium.CircleMarker([lat, lon], radius=0.01, fill=True, **marker_style).add_to(price_map)

price_map

In [None]:
# Pairwise correlations between the float64/int64 columns, rounded to two decimals.
df_numbers = df.select_dtypes(include=['float64', 'int64'])
correlation_matrix= df_numbers.corr().round(2)
correlation_matrix

In [None]:
# Heatmap of the correlation matrix on a fixed [-1, 1] colour scale centred at 0.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, cmap='viridis', vmin=-1, vmax=1, center=0)
plt.show()

In [None]:
# Frequency tables (NaN included) for the condition and floodZone columns.
# A notebook cell only renders its last expression, so the first table is
# printed explicitly; otherwise it would be silently discarded.
print(df.condition.value_counts(dropna=False))
df.floodZone.value_counts(dropna=False)

In [None]:

# For every column with fewer than 100 distinct values (NaN counts as a value),
# print the number of distinct values followed by the frequency table.
for col in df.columns:
    n_unique = len(df[col].unique())
    if n_unique >= 100:
        continue
    print(f"----- Column '{col}' has {n_unique} unique values. ------")
    print(df[col].value_counts(dropna=False))

# print(df.columns[df.nunique() < 100])

# df.columns[df.nunique() < 100].map(lambda col: print(f"Column '{col}' has {df[col].nunique()} unique values."))