In [None]:
# Creating data for analysis
"""Remove outliers and fill in missing values using a multivariate approach.

Requires scipy and scikit-learn (imported as sklearn): pip install scipy scikit-learn
Run the imputation after the outliers are removed."""

import re
import urllib.parse
from typing import List

import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

#df for houses and apartments: 
df = pd.read_csv("clean_data.csv")

df["Primary energy consumption"] = np.where((df["Primary energy consumption"] > 5000) | (df["Primary energy consumption"] < 20), 0, df["Primary energy consumption"])

df['Primary energy consumption'] = df['Primary energy consumption'].replace({-1: np.nan,
0 : np.nan, 1:np.nan
})

df['Garden surface'] = df['Garden surface'].replace({-1: np.nan,
 1:np.nan
})

df['Terrace surface'] = df['Terrace surface'].replace({-1: np.nan,
 1:np.nan
})

df['Building Cond. values'] = df['Building Cond. values'].replace({-1: np.nan,
 1:np.nan
})



df_for_houses = df[df['Type of property'] == 'house']
df_for_apartments = df[df['Type of property'] == 'apartment']

columns_houses = ['Price','Price of square meter', 'Living area','Number of rooms','Garden surface','Terrace surface','Open fire','Surface of the land','Number of facades','Swimming pool','Building Cond. values','Kitchen values','Primary energy consumption', 'Energy efficiency']
columns_apartments = ['Price','Price of square meter', 'Living area','Number of rooms','Furnished', 'Garden surface', 'Terrace surface', 'Open fire','Building Cond. values','Kitchen values','Primary energy consumption', 'Energy efficiency']

df_houses = df_for_houses[columns_houses]
df_apartments = df_for_apartments[columns_apartments]
df_all = df[columns_houses]



# Remove outliers
def remove_outliers(df: pd.DataFrame, columns: List[str], n_std: int) -> pd.DataFrame:
    for col in columns:
        mean = df[col].mean()
        sd = df[col].std()
        
        df = df[(df[col] <= mean+(n_std*sd))]
        
    return df

clean_houses = remove_outliers(df_houses, ['Price', 'Living area'], 3)
clean_apartments = remove_outliers(df_apartments, ['Price', 'Living area'], 3)
clean_all = remove_outliers(df, ['Price', 'Living area'], 3)
clean_numeric_all = remove_outliers(df_all, ['Price', 'Living area'], 3)

columns_h = columns_houses
columns_a = columns_apartments

#impute_it = IterativeImputer()
#np_array_houses = impute_it.fit_transform(clean_houses).astype(int)
#np_array_apartments = impute_it.fit_transform(clean_apartments).astype(int)
#np_array_all = impute_it.fit_transform(clean_all)

from sklearn.impute import KNNImputer
impute_knn = KNNImputer(n_neighbors=5)
k_apartments = impute_knn.fit_transform(clean_apartments)
k_houses = impute_knn.fit_transform(clean_houses)

complete_houses = pd.DataFrame(k_houses, columns = columns_h)
complete_apartments = pd.DataFrame(k_apartments, columns = columns_a)

#complete_houses = pd.DataFrame(np_array_houses,columns = columns_h)
#complete_apartments = pd.DataFrame(np_array_apartments, columns = columns_a)

#complete_houses.to_csv('complete_houses.csv')
#complete_apartments.to_csv('complete_apartments.csv')

In [None]:
# Display the imputed houses frame to inspect the result of the KNN imputation.
complete_houses

In [None]:
# Largest primary energy consumption among the outlier-filtered apartments.
clean_apartments['Primary energy consumption'].max()

In [None]:
# Primary energy consumption of apartments after imputation.
complete_apartments['Primary energy consumption']

In [None]:
# Smallest garden surface among houses after imputation.
# NOTE(review): presumably a sanity check that the -1 / 1 placeholders are gone - confirm.
complete_houses['Garden surface'].min()

In [None]:
#corr_complete_houses = complete_houses.corr(method = 'spearman').sort_values(['Price'], ascending=False)
#corr_complete_apartments = complete_apartments.corr(method = 'spearman').sort_values(['Price'], ascending=False)

In [None]:
# Spearman rank-correlation matrices of the imputed feature sets.
corr_complete_houses, corr_complete_apartments = (
    frame.corr(method='spearman')
    for frame in (complete_houses, complete_apartments)
)

In [None]:

# Discrete colour scale: correlation bands of width 0.2 between -1 and 1.
levels = [-1, -0.8, -0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
# With extend="max" one colour is needed per band plus one for values above
# the last level, i.e. as many colours as there are levels (11).
colors = sns.color_palette('coolwarm', len(levels))
cmap, norm = matplotlib.colors.from_levels_and_colors(levels, colors, extend="max")

plt.figure(figsize = (16,5))
sns.heatmap(
    corr_complete_houses,
    cmap = cmap,
    norm = norm,
    annot = True,
    linewidths = .5,
).set_title('Correlation heatmap for houses')

In [None]:
# Same discrete colour scale as the houses heatmap (cmap and norm come from that cell).
plt.figure(figsize = (16,5))
sns.heatmap(
    corr_complete_apartments,
    cmap = cmap,
    norm = norm,
    annot = True,
    linewidths = .5,
).set_title('Correlation heatmap for apartments')

In [None]:
# Mean price per province, truncated to whole numbers, highest first.
price_per_province1 = (
    clean_all.groupby(['Province'], as_index=True)['Price']
    .mean()
    .astype(int)
    .sort_values(ascending = False)
)
# Median price per province, truncated to whole numbers, highest first.
price_per_province2 = (
    clean_all.groupby(['Province'], as_index=True)['Price']
    .median()
    .astype(int)
    .sort_values(ascending = False)
)
# Mean price per square metre per province, truncated to whole numbers, highest first.
ppm = (
    clean_all.groupby(['Province'], as_index=True)['Price of square meter']
    .mean()
    .astype(int)
    .sort_values(ascending = False)
)

In [None]:
# Not used by the plot below. NOTE(review): looks like intended y-axis ticks - confirm or remove.
y= [0, 2000, 3000, 4000, 5000, 6000, 7000, 10000, 100000, 200000, 300000, 400000, 500000]

# The average and median Series are each sorted by their own values, so their
# province order can differ. Drawing them as two separate bar plots on the same
# axes places bars by position and keeps only the last plot's tick labels, so
# bars can end up under the wrong province and hide each other. Aligning both
# Series on the province index first gives one grouped bar chart with
# correctly labelled, side-by-side bars.
province_prices = pd.DataFrame({
    'Average price': price_per_province1,
    'Median price': price_per_province2,
}).sort_values('Average price', ascending=False)

# As before, ax1 and ax2 hold the Legend object returned by .legend(), not the Axes.
ax1 = province_prices.plot(
    kind='bar',
    rot=90,
    colormap='jet',
    title = 'Average and median prices by province',
).legend(['Average price','Median price'])
ax2 = ax1
plt.ylabel('Price in €')
plt.xlabel('Province')
plt.show()

In [None]:
# Bar chart of the mean price per square metre for each province.
# ax3 holds the Legend object returned by .legend(), not the Axes.
ax3 = ppm.plot(
    x='Province ',
    y='Price in €',
    kind='bar',
    colormap ='jet',
    rot=90,
    title = 'Average price per sqm by province',
).legend()
plt.gca().set(ylabel='Price in €', xlabel='Province')
plt.show()

In [None]:
# Re-derive clean_houses from the unfiltered house features, this time keeping
# only houses whose land surface is at most one standard deviation above the
# mean. This replaces the clean_houses built earlier in the notebook.
clean_houses = remove_outliers(df_houses, columns=['Surface of the land'], n_std=1)
clean_houses['Surface of the land'].mean()

In [None]:
# Distribution of land surface for the filtered houses.
# The bare `.mean()` expression that used to sit in the middle of this cell was
# removed: its value was discarded, so it had no effect.
clean_houses.hist(column='Surface of the land', bins=100)
plt.title('Number of houses by surface of the land')
plt.ylabel('Number of houses')
plt.xlabel('Surface in m2')
plt.show()

In [None]:
# Keep apartments whose living area is at most one standard deviation above the
# mean. The previous version filtered df_houses and reported the mean of
# clean_houses, so the apartment figures were actually computed from house data.
# This replaces the clean_apartments built earlier in the notebook.
clean_apartments = remove_outliers(df_apartments, ['Living area'], 1)
clean_apartments['Living area'].mean()

In [None]:
# Histogram of the living area in clean_apartments, 100 bins.
clean_apartments.hist(column='Living area', bins=100)

# DataFrame.hist leaves its single subplot as the current axes; label it in one call.
plt.gca().set(
    title='Number of apartments by living area',
    ylabel='Number of apartments',
    xlabel='Living area in m2',
)
plt.show()

In [None]:
#Plotting outliers
# One panel per property type: price histogram with a red line at mean + 3 std,
# the cut-off used for the price outlier filter.
plt.figure(figsize=(10,2))
panels = [
    (df_houses, 'Outliers for houses'),
    (df_apartments, 'Outliers for apartments'),
]
for position, (frame, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    prices = frame['Price']
    sns.histplot(prices, kde=False)
    plt.axvline(x=prices.mean() + 3*prices.std(), color ='red')
    plt.title(panel_title)
    plt.legend(['3 std'])
plt.show()

In [None]:
# Drop listings whose energy efficiency class is -1 or 0, in two steps so both
# intermediate frames stay available.
# NOTE(review): presumably -1 and 0 encode "unknown" - confirm.
clean_per_m = clean_all.loc[lambda frame: frame['Energy efficiency'].ne(-1)]
clean_per_m2 = clean_per_m.loc[lambda frame: frame['Energy efficiency'].ne(0)]

# Mean price per square metre for each energy efficiency class, truncated to whole numbers.
energy_per = (
    clean_per_m2.groupby('Energy efficiency', as_index=True)['Price of square meter']
    .mean()
    .astype(int)
)


In [None]:
# Imputed houses whose primary energy consumption is not the -1 placeholder,
# followed by the largest remaining value.
energy_consumption = complete_houses.loc[
    lambda frame: frame['Primary energy consumption'].ne(-1)
]
energy_consumption['Primary energy consumption'].max()

In [None]:
# Mean living area of apartments after imputation.
complete_apartments['Living area'].mean()

In [None]:

# Histogram (20 bins) of the energy efficiency values of apartments after imputation.
complete_apartments['Energy efficiency'].hist(bins=20)

In [None]:
# Mean price per square metre for each energy efficiency class.
# energy_per is computed from clean_all, which is not restricted to houses, so
# the title no longer says "for houses". The plot has a single line, so the
# legend gets one label; the old second label was never matched to a line.
energy_per.plot(kind='line', title = 'Energy efficiency and price per sq meter')
plt.legend(['Average price per m2'])
plt.ylabel('Price in € per m2')
plt.xlabel('Energy efficiency class from 7 (<100 kWh/m²), to 1 (>600kWh/m²)')
plt.show()