In [None]:
import pandas as pd
import urllib.parse
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [None]:
# Load the cleaned dataset and keep a single row per 'id'.
df = pd.read_csv('clean_data.csv').drop_duplicates(subset='id')

# Discard rows whose Type is "house group" or "apartment group".
df = df[~df["Type"].isin(["house group", "apartment group"])]

In [None]:
# Column labels that occur more than once in the loaded frame.
duplicate_columns = df.columns[df.columns.duplicated()]

# Columns retained for the analysis (names as they appear in the CSV).
filtered_columns = [
    "id", "location", "Zip", "Type", "Subtype",
    "Price", "Transaction Type", "Bedrooms", "Living area", "Kitchen type",
    "Furnished", "How many fireplaces?", "Terrace", "Terrace surface",
    "Garden", "Garden surface", "Surface of the plot", "Number of frontages",
    "Swimming pool", "Building condition", "Primary energy consumption",
]

# Keep only the first occurrence of every column label, then narrow the
# frame down to the selected columns (a missing label raises KeyError).
df = df.loc[:, ~df.columns.duplicated()].loc[:, filtered_columns]

In [None]:
# Map the raw CSV column names to the names used in the rest of the notebook.
# NOTE(review): 'Kitchen type' -> 'Fully equipped kitchen' and
# 'How many fireplaces?' -> 'Open fire' only rename the columns; the values
# themselves are not converted here — confirm they match the new names.
df = df.rename(
    columns={
        'location': 'Locality',
        'Type': 'Type of property',
        'Subtype': 'Subtype of property',
        'Transaction Type': 'Type of sale',
        'Bedrooms': 'Number of rooms',
        'Kitchen type': 'Fully equipped kitchen',
        'How many fireplaces?': 'Open fire',
        'Surface of the plot': 'Surface of the land',
        'Number of frontages': 'Number of facades',
    }
)

In [None]:
# Put the columns in a specific order.
# reindex() silently drops every column that is not named in this list, so
# each column that must survive the reordering has to appear here. That
# includes 'Building condition', which is part of the earlier column selection.
df = df.reindex(columns=[
    'id', 'Locality', 'Zip', 'Type of property', 'Subtype of property',
    'Type of sale', 'Price', 'Number of facades', 'Number of rooms',
    'Living area', 'Fully equipped kitchen', 'Furnished',
    'Building condition', 'Primary energy consumption',
    'Surface of the land', 'Terrace', 'Terrace surface',
    'Garden', 'Garden surface', 'Open fire',
    'Swimming pool',
])

In [None]:
def clean_and_convert(column):
    """Convert a Series of numeric-looking values to integers.

    Each element is handled on its own:

    * real numbers (Python/NumPy ints and floats) are converted directly,
      truncating any fractional part toward zero; NaN and +/-inf become 0;
    * everything else is rendered with ``str()``, every non-digit character
      is removed, and the remaining digits are parsed as an integer; a value
      with no digits at all (empty string, ``None``, text, booleans) becomes 0.

    Parameters
    ----------
    column : pandas.Series
        Values to clean.

    Returns
    -------
    pandas.Series
        Integer-typed Series with the same index as ``column``.
    """
    def _to_int(value):
        # Numbers must not take the str()/strip route: str(120.0) is "120.0",
        # and removing the "." would turn it into 1200.
        if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
            return int(value)
        if isinstance(value, (float, np.floating)):
            return int(value) if np.isfinite(value) else 0
        # Raw string: '\D' in a plain string literal is an invalid escape.
        digits = re.sub(r'\D+', '', str(value))
        return int(digits) if digits else 0

    # astype(int) pins the dtype even when the Series is empty.
    return column.apply(_to_int).astype(int)

In [None]:
# Decode percent-encoded locality names (e.g. "%20" -> " "). Missing values
# are skipped: urllib.parse.unquote raises TypeError when handed a NaN.
df['Locality'] = df['Locality'].map(urllib.parse.unquote, na_action='ignore')

# Convert the numeric-looking columns to integers with clean_and_convert.
# NOTE(review): 'Terrace' and 'Garden' sit next to 'Terrace surface' and
# 'Garden surface'; if they hold yes/no style values rather than numbers,
# every row becomes 0 here — confirm against the data.
for numeric_column in ['Living area', 'Terrace', 'Garden', 'Surface of the land']:
    df[numeric_column] = clean_and_convert(df[numeric_column])

In [None]:
# Split the listings by property type; rows with any other type are in neither frame.
df_houses = df.loc[df['Type of property'].eq('house')]
df_apartments = df.loc[df['Type of property'].eq('apartment')]

In [None]:
# Dtypes treated as numeric when building the correlation tables.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
    'bool',
]

# Spearman rank correlation between the numeric apartment columns.
apartdf = df_apartments.select_dtypes(include=numerics)
apartdf.corr(method='spearman')

In [None]:
# Spearman rank correlation between the numeric house columns
# (uses the `numerics` dtype list defined in the apartment cell).
housedf = df_houses.select_dtypes(include=numerics)
housedf.corr(method="spearman")

In [None]:
# What is the correlation between the variables and the price?


In [None]:
# How are variables correlated to each other? (Why?)


In [None]:
# Which variables have the greatest influence on the price?


In [None]:
# Which variables have the least influence on the price?


In [None]:
# How many qualitative and quantitative variables are there? How would you transform these values into numerical values?


In [None]:
# Percentage of missing values per column?
