In [84]:
import pandas as pd
import urllib.parse
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [85]:
df = pd.read_csv('clean_data.csv')
df = df.drop_duplicates(subset='id')
df = df.drop(df[df["Type"].isin(["house group", "apartment group"])].index)

In [86]:
duplicate_columns = df.columns[df.columns.duplicated()]
df = df.loc[:, ~df.columns.duplicated()]

filtered_columns = ["id","location","Zip","Type","Subtype",
    "Price","Transaction Type","Bedrooms","Living area","Kitchen type",
    "Furnished","How many fireplaces?","Terrace","Terrace surface",
    "Garden","Garden surface","Surface of the plot","Number of frontages",
    "Swimming pool","Building condition","Primary energy consumption"]
df = df[filtered_columns]

In [87]:
df = df.rename(columns={
    'location' :'Locality',
    'Transaction Type' : 'Type of sale',
    'Type' :'Type of property',
    'Subtype' : 'Subtype of property',
    'Number of frontages': 'Number of facades',
    'Bedrooms':'Number of rooms',
    'Surface of the plot' :'Surface of the land',
    'Kitchen type' : 'Fully equipped kitchen',
    'How many fireplaces?' : 'Open fire'
})

In [88]:
#put the columns in a specific order
df = df.reindex(columns=['id', 'Locality',"Zip", 'Type of property', 'Subtype of property',
                         'Type of sale', 'Price', 'Number of facades', 'Number of rooms', 
                         'Living area', 'Fully equipped kitchen', 'Furnished','Primary energy consumption',
                         'Surface of the land', 'Terrace', 'Terrace surface',
                         'Garden','Garden surface', 'Open fire', 
                         'Swimming pool'])

In [89]:
def clean_and_convert(column):
    column = column.apply(lambda x: re.sub('\D+', '', str(x)))
    column = column.replace('', np.nan).fillna(0).astype(int)
    return column

In [90]:
df['Locality'] = df['Locality'].apply(urllib.parse.unquote)

df['Living area'] = clean_and_convert(df['Living area'])
df['Terrace'] = clean_and_convert(df['Terrace'])
df['Garden'] = clean_and_convert(df['Garden'])
df['Surface of the land'] = clean_and_convert(df['Surface of the land'])

In [91]:
df_houses = df[df['Type of property'] == 'house']
df_apartments = df[df['Type of property'] == 'apartment']

In [92]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']

apartdf = df_apartments.select_dtypes(include=numerics)
apartdf = df_apartments.reset_index()
apartdf = apartdf.drop('id', axis=1)
apartdf.corr(method = 'spearman')

ValueError: could not convert string to float: 'deinze'

In [None]:
housedf = df_houses.select_dtypes(include=numerics)
housedf.corr(method = 'spearman')

Unnamed: 0,id,Zip,Price,Number of facades,Number of rooms,Living area,Furnished,Primary energy consumption,Surface of the land,Terrace,Terrace surface,Garden,Garden surface,Open fire,Swimming pool
id,1.0,0.077907,0.058341,0.044601,-0.032367,-0.014029,0.006527,-0.065872,0.026976,-0.004208,-0.019704,-0.00694,-0.016981,0.02798,0.004181
Zip,0.077907,1.0,-0.333915,0.032301,-0.192857,-0.156865,0.024478,0.029041,0.022687,-0.091142,-0.159534,-0.043823,-0.097548,-0.055901,-0.083975
Price,0.058341,-0.333915,1.0,0.303484,0.526821,0.708732,0.009614,-0.332605,0.378663,0.140733,0.190916,0.056912,0.106758,0.188728,0.261026
Number of facades,0.044601,0.032301,0.303484,1.0,0.130046,0.22932,0.002399,0.041344,0.692427,0.083282,0.148873,0.072593,0.207808,0.198667,0.215351
Number of rooms,-0.032367,-0.192857,0.526821,0.130046,1.0,0.653241,0.008842,-0.141006,0.241682,0.06542,0.134844,0.044316,0.097155,0.100345,0.158382
Living area,-0.014029,-0.156865,0.708732,0.22932,0.653241,1.0,-0.010246,-0.214583,0.388784,0.076016,0.143626,0.011211,0.07616,0.16694,0.222109
Furnished,0.006527,0.024478,0.009614,0.002399,0.008842,-0.010246,1.0,-0.016623,-0.01625,0.010078,0.013108,-0.001857,0.006647,-0.012366,0.036683
Primary energy consumption,-0.065872,0.029041,-0.332605,0.041344,-0.141006,-0.214583,-0.016623,1.0,0.052492,-0.096575,-0.143077,0.013637,0.021712,-0.039273,-0.104212
Surface of the land,0.026976,0.022687,0.378663,0.692427,0.241682,0.388784,-0.01625,0.052492,1.0,0.093392,0.170641,0.177862,0.350529,0.184405,0.247502
Terrace,-0.004208,-0.091142,0.140733,0.083282,0.06542,0.076016,0.010078,-0.096575,0.093392,1.0,0.899386,0.124422,0.199821,0.022171,0.08636


In [None]:
df.corr(method='spearman')

ValueError: could not convert string to float: 'house'

In [None]:
# How are variables correlated to each other? (Why?)


In [None]:
# Which variables have the greatest influence on the price?


In [None]:
# Which variables have the least influence on the price?


In [None]:
# How many qualitative and quantitative variables are there? How would you transform these values into numerical values?


In [None]:
# Percentage of missing values per column?
