In [20]:
import pandas as pd
import urllib.parse
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [21]:
df = pd.read_csv('clean_data.csv')
df = df.drop_duplicates(subset='id')
df = df.drop(df[df["Type"].isin(["house group", "apartment group"])].index)

In [22]:
duplicate_columns = df.columns[df.columns.duplicated()]
df = df.loc[:, ~df.columns.duplicated()]

filtered_columns = ["id","location","Zip","Type","Subtype",
    "Price","Transaction Type","Bedrooms","Living area","Kitchen type",
    "Furnished","How many fireplaces?","Terrace","Terrace surface",
    "Garden","Garden surface","Surface of the plot","Number of frontages",
    "Swimming pool","Building condition","Primary energy consumption"]
df = df[filtered_columns]

In [23]:
df = df.rename(columns={
    'location' :'Locality',
    'Transaction Type' : 'Type of sale',
    'Type' :'Type of property',
    'Subtype' : 'Subtype of property',
    'Number of frontages': 'Number of facades',
    'Bedrooms':'Number of rooms',
    'Surface of the plot' :'Surface of the land',
    'Kitchen type' : 'Fully equipped kitchen',
    'How many fireplaces?' : 'Open fire'
})

In [24]:
#put the columns in a specific order
df = df.reindex(columns=['id', 'Locality',"Zip", 'Type of property', 'Subtype of property',
                         'Type of sale', 'Price', 'Number of facades', 'Number of rooms', 
                         'Living area', 'Fully equipped kitchen', 'Furnished','Primary energy consumption',
                         'Surface of the land', 'Terrace', 'Terrace surface',
                         'Garden','Garden surface', 'Open fire', 
                         'Swimming pool'])

In [25]:
def clean_and_convert(column):
    column = column.apply(lambda x: re.sub('\D+', '', str(x)))
    column = column.replace('', np.nan).fillna(0).astype(int)
    return column

In [26]:
df['Locality'] = df['Locality'].apply(urllib.parse.unquote)

df['Living area'] = clean_and_convert(df['Living area'])
df['Terrace'] = clean_and_convert(df['Terrace'])
df['Garden'] = clean_and_convert(df['Garden'])
df['Surface of the land'] = clean_and_convert(df['Surface of the land'])

In [28]:
df_houses = df[df['Type of property'] == 'house']
df_apartments = df[df['Type of property'] == 'apartment']

In [29]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', 'bool']

apartdf = df_apartments.select_dtypes(include=numerics)

apartdf.corr(method = 'spearman')

Unnamed: 0,id,Zip,Price,Number of facades,Number of rooms,Living area,Furnished,Primary energy consumption,Surface of the land,Terrace,Terrace surface,Garden,Garden surface,Open fire,Swimming pool
id,1.0,0.015282,-0.013872,-0.020924,-0.01371,-0.027655,0.011486,0.016911,,0.001775,-0.030703,0.032881,-0.021537,0.012972,-0.030666
Zip,0.015282,1.0,-0.112569,0.033472,-0.034669,-0.076875,0.078544,-0.102078,,-0.017797,0.030453,0.102821,-0.001751,-0.064431,-0.00756
Price,-0.013872,-0.112569,1.0,0.053173,0.523953,0.641057,-0.00096,-0.270222,,0.250717,0.413173,0.096298,0.179086,0.095401,0.061163
Number of facades,-0.020924,0.033472,0.053173,1.0,0.112799,0.129197,-0.07793,-0.110323,,0.075554,0.177435,0.071902,0.16157,0.032966,0.028295
Number of rooms,-0.01371,-0.034669,0.523953,0.112799,1.0,0.750262,-0.095896,-0.055888,,0.151225,0.27096,0.049928,0.226179,0.074241,0.034503
Living area,-0.027655,-0.076875,0.641057,0.129197,0.750262,1.0,-0.183953,-0.152357,,0.205571,0.435751,0.097927,0.31909,0.11157,0.045659
Furnished,0.011486,0.078544,-0.00096,-0.07793,-0.095896,-0.183953,1.0,0.053933,,-0.051795,-0.052248,-0.033344,-0.035693,-0.027393,0.033344
Primary energy consumption,0.016911,-0.102078,-0.270222,-0.110323,-0.055888,-0.152357,0.053933,1.0,,-0.219076,-0.172439,-0.077292,0.108751,0.016708,0.003262
Surface of the land,,,,,,,,,,,,,,,
Terrace,0.001775,-0.017797,0.250717,0.075554,0.151225,0.205571,-0.051795,-0.219076,,1.0,,0.056691,0.038051,0.020704,0.014001


In [35]:
housedf = df_houses.select_dtypes(include=numerics)
housedf.corr(method = 'spearman')

Unnamed: 0,id,Zip,Price,Number of facades,Number of rooms,Living area,Furnished,Primary energy consumption,Surface of the land,Terrace,Terrace surface,Garden,Garden surface,Open fire,Swimming pool
id,1.0,0.077907,0.058341,0.044601,-0.032367,-0.014029,0.006527,-0.065872,0.026976,-0.004208,0.014289,-0.00694,0.028175,0.02798,0.004181
Zip,0.077907,1.0,-0.333915,0.032301,-0.192857,-0.156865,0.024478,0.029041,0.022687,-0.091142,0.002416,-0.043823,0.082546,-0.055901,-0.083975
Price,0.058341,-0.333915,1.0,0.303484,0.526821,0.708732,0.009614,-0.332605,0.378663,0.140733,0.241106,0.056912,0.227332,0.188728,0.261026
Number of facades,0.044601,0.032301,0.303484,1.0,0.130046,0.22932,0.002399,0.041344,0.692427,0.083282,0.328768,0.072593,0.623637,0.198667,0.215351
Number of rooms,-0.032367,-0.192857,0.526821,0.130046,1.0,0.653241,0.008842,-0.141006,0.241682,0.06542,0.240713,0.044316,0.168505,0.100345,0.158382
Living area,-0.014029,-0.156865,0.708732,0.22932,0.653241,1.0,-0.010246,-0.214583,0.388784,0.076016,0.321971,0.011211,0.269662,0.16694,0.222109
Furnished,0.006527,0.024478,0.009614,0.002399,0.008842,-0.010246,1.0,-0.016623,-0.01625,0.010078,0.005371,-0.001857,0.009411,-0.012366,0.036683
Primary energy consumption,-0.065872,0.029041,-0.332605,0.041344,-0.141006,-0.214583,-0.016623,1.0,0.052492,-0.096575,-0.062156,0.013637,0.079059,-0.039273,-0.104212
Surface of the land,0.026976,0.022687,0.378663,0.692427,0.241682,0.388784,-0.01625,0.052492,1.0,0.093392,0.423842,0.177862,0.871792,0.184405,0.247502
Terrace,-0.004208,-0.091142,0.140733,0.083282,0.06542,0.076016,0.010078,-0.096575,0.093392,1.0,,0.124422,0.087938,0.022171,0.08636


In [36]:
df.corr(method='spearman')

ValueError: could not convert string to float: 'house'

In [None]:
# What is the correlation between the variables and the price?


In [None]:
# How are variables correlated to each other? (Why?)


In [None]:
# Which variables have the greatest influence on the price?


In [None]:
# Which variables have the least influence on the price?


In [None]:
# How many qualitative and quantitative variables are there? How would you transform these values into numerical values?


In [None]:
# Percentage of missing values per column?
