This notebook cleans and preprocesses the property data used for model training.

In [11]:
##Import Libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from math import radians
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

In [12]:
##Read the working properties CSV into the raw dataframe
dataset = '../../data/properties_working.csv'
df = pd.read_csv(filepath_or_buffer=dataset)

In [13]:
##Remove every column that is not used as a modeling feature.
##The raw dataframe `df` is left untouched; the reduced copy is `df_working`.
columns_to_drop = [
    'immowebCode', 'priceString', 'country', 'locality', 'street',
    'availableDate', 'neighborhoodOrLocality', 'floor', 'numberOfFloors',
    'outdoorParkingSpaces', 'surroundingsType', 'livingRoomSurface',
    'kitchenSurface', 'bedroom3Surface', 'bedroom4Surface', 'bedroom5Surface',
    'basement', 'furnished', 'terraceSurface', 'elevator',
    'disabilityAccess', 'tvCable', 'visioPhone', 'swimmingPool', 'internet',
    'EPCReportReference', 'CO2Emission', 'yearlyEnergyConsumption',
    'doubleGlazing', 'thermicSolarPanels', 'PVSolarPanels',
    'planningPermission', 'subdivisionPermit', 'floodZoneType',
    'landUseDesignation', 'cadastralIncome', 'tenementBuilding',
    'streetFrontageWidth', 'coveredParkingSpaces', 'basementSurface',
    'monthlyCharges', 'diningRoom', 'showerRooms', 'intercom', 'armoredDoor',
    'propertyName', 'laundryRoom', 'terraceOrientation', 'caretaker',
    'commonWaterHeater', 'livingRoom', 'terrace', 'attic', 'officeSurface',
    'office', 'alarm', 'eLevel', 'heatPump', 'plotSurface', 'lotWidth',
    'gasWaterElectric', 'gardenSurface', 'gardenOrientation', 'dressingRoom',
    'professionalSpace', 'sewerConnection', 'jacuzzi', 'sauna',
    'buildingType', 'fireplaces', 'garden', 'airConditioning',
    'professionalSpaceSurface', 'atticSurface', 'isolated', 'streetFacing',
    'woodedLand', 'rearPlot', 'flatLand',
]
df_working = df.drop(columns=columns_to_drop)

In [14]:
##Encode the property type as a binary flag: Apartment -> 0, House -> 1
##Any other value in the column is left as it is.
df_working['type'] = df_working['type'].replace({'Apartment': 0, 'House': 1})

In [None]:
##Cast postal codes to integers first, then to their string representation
df_working['postalCode'] = df_working['postalCode'].astype(int).astype(str)

In [15]:
##Drop every row that holds a missing-value placeholder in any required column.
##Each entry maps a column to the placeholder value that marks it as empty.
empty_markers = {
    'latitude': 99.0,
    'constructionYear': 'None',
    'buildingCondition': 0,
    'numberOfFrontages': 'None',
    'livingArea': 'None',
    'kitchenType': 0,
    'bedrooms': 'None',
    'bedroom1Surface': 'None',
    'bedroom2Surface': 'None',
    'bathrooms': 'None',
    'toilets': 'None',
    'energyConsumption': 0.0,
    'energyClass': 'None',
    'heatingType': 'None',
}

##A row is flagged as soon as one of its columns equals that column's placeholder
is_empty = np.zeros(len(df_working), dtype=bool)
for column, marker in empty_markers.items():
    is_empty |= (df_working[column] == marker).to_numpy()

indexEmpty = df_working.index[is_empty]
df_filtered = df_working.drop(indexEmpty)

In [16]:
##Cast the listed columns to integer dtype
columns_int = [
    'constructionYear',
    'numberOfFrontages',
    'livingArea',
    'bedrooms',
    'bedroom1Surface',
    'bedroom2Surface',
    'bathrooms',
    'toilets',
]
df_filtered = df_filtered.astype({column: int for column in columns_int})

In [17]:
##Replace the energy class letters with ordinal ranks: A -> 1, B -> 2, ..., G -> 7
##Values outside A-G are left unchanged.
energy_ranks = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
df_filtered['energyClass'] = df_filtered['energyClass'].replace(energy_ranks)

In [18]:
##Transform latitude and longitude from degrees to radians.
##np.radians converts a whole column in one vectorized call instead of invoking
##math.radians once per row through Series.apply; the float cast keeps the
##float64 result dtype that the per-row version produced.
##NOTE: the columns are overwritten, so running this cell twice converts twice.
for coordinate in ['latitude', 'longitude']:
    df_filtered[coordinate] = np.radians(df_filtered[coordinate].astype(float))

In [19]:
##Save the filtered dataframe to a new CSV file (header row written, index omitted)
filtered_dataset = '../../data/properties_filtered.csv'
df_filtered.to_csv(
    path_or_buf=filtered_dataset,
    header=True,
    index=False,
    escapechar="\\",
)

In [21]:
##Report the (rows, columns) shape of the filtered dataframe
filtered_shape = df_filtered.shape
print(filtered_shape)

(991, 20)
