In [199]:
"""This is my model training notebook"""

'This is my model training notebook'

In [200]:
##Import Libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from math import radians
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

In [313]:
##Load the dataset
dataset = '../../data/properties_working.csv'
df = pd.read_csv(dataset)

In [314]:
##Drop columns that won't be used for modeling
columns_to_drop = ['immowebCode', 'priceString', 'country', 'locality', 'street', 'availableDate', 'neighborhoodOrLocality', 'floor', 'numberOfFloors', 'outdoorParkingSpaces',
'surroundingsType', 'livingRoomSurface', 'kitchenSurface', 'bedroom3Surface', 'bedroom4Surface', 'bedroom5Surface', 'basement', 'furnished', 'terraceSurface', 'elevator',
'disabilityAccess', 'tvCable', 'visioPhone', 'swimmingPool', 'internet', 'EPCReportReference', 'CO2Emission', 'yearlyEnergyConsumption', 'doubleGlazing', 'thermicSolarPanels',
'PVSolarPanels', 'planningPermission', 'subdivisionPermit', 'floodZoneType', 'landUseDesignation', 'cadastralIncome', 'tenementBuilding', 'streetFrontageWidth',
'coveredParkingSpaces', 'basementSurface', 'monthlyCharges', 'diningRoom', 'showerRooms', 'intercom', 'armoredDoor', 'propertyName', 'laundryRoom', 'terraceOrientation',
'caretaker', 'commonWaterHeater', 'livingRoom', 'terrace', 'attic', 'officeSurface', 'office', 'alarm', 'eLevel', 'heatPump', 'plotSurface', 'lotWidth', 'gasWaterElectric',
'gardenSurface', 'gardenOrientation', 'dressingRoom', 'professionalSpace', 'sewerConnection', 'jacuzzi', 'sauna', 'buildingType', 'fireplaces', 'garden', 'airConditioning',
'professionalSpaceSurface', 'atticSurface', 'isolated', 'streetFacing', 'woodedLand', 'rearPlot', 'flatLand']
df_working = df.drop(columns_to_drop, axis=1)

In [315]:
##Transform type column to binary
df_working['type'] = df_working['type'].replace(['Apartment'], 0)
df_working['type'] = df_working['type'].replace(['House'], 1)
##Transform postal code into categorical data
df_working['postalCode'] = df_working['postalCode'].astype(int)
df_working['postalCode'] = df_working['postalCode'].astype(str)

In [316]:
##Filter out all empty values
indexEmpty = df_working[(df_working['latitude'] == 99.0)|(df_working['constructionYear'] == 'None')|(df_working['buildingCondition'] == 0)|
(df_working['numberOfFrontages'] == 'None')|(df_working['livingArea'] == 'None')|(df_working['kitchenType'] == 0)|(df_working['bedrooms'] == 'None')|
(df_working['bedroom1Surface'] == 'None')|(df_working['bedroom2Surface'] == 'None')|(df_working['bathrooms'] == 'None')|(df_working['toilets'] == 'None')|
(df_working['energyConsumption'] == 0.0)|(df_working['energyClass'] == 'None')|(df_working['heatingType'] == 'None')].index
df_filtered = df_working.drop(indexEmpty)

In [317]:
#Transform columns to integer type
columns_int = ['constructionYear', 'numberOfFrontages', 'livingArea', 'bedrooms', 'bedroom1Surface', 'bedroom2Surface', 'bathrooms', 'toilets']
df_filtered[columns_int] = df_filtered[columns_int].astype(int)

In [318]:
##Transform energyClass column to ranked values
df_filtered['energyClass'] = df_filtered['energyClass'].replace(['A'], 1)
df_filtered['energyClass'] = df_filtered['energyClass'].replace(['B'], 2)
df_filtered['energyClass'] = df_filtered['energyClass'].replace(['C'], 3)
df_filtered['energyClass'] = df_filtered['energyClass'].replace(['D'], 4)
df_filtered['energyClass'] = df_filtered['energyClass'].replace(['E'], 5)
df_filtered['energyClass'] = df_filtered['energyClass'].replace(['F'], 6)
df_filtered['energyClass'] = df_filtered['energyClass'].replace(['G'], 7)

In [319]:
##Set y value
y = df_filtered['price'].values

In [320]:
##Run One Hot Encoder on categorical data
column_type = df_filtered['type'].values
columns_ohe = ['province', 'district', 'postalCode', 'heatingType']
ohe = OneHotEncoder()
cat_ohe = ohe.fit_transform(df_filtered[columns_ohe]).toarray()
cat_data = np.column_stack((column_type, cat_ohe))

In [321]:
print(df_filtered.columns)

Index(['type', 'price', 'province', 'district', 'postalCode', 'latitude',
       'longitude', 'constructionYear', 'buildingCondition',
       'numberOfFrontages', 'livingArea', 'kitchenType', 'bedrooms',
       'bedroom1Surface', 'bedroom2Surface', 'bathrooms', 'toilets',
       'energyConsumption', 'energyClass', 'heatingType'],
      dtype='object')


In [322]:
X = cat_data

In [323]:
##Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [324]:
##Declare instance of model
regressor = LinearRegression()
##Fit the model
regressor.fit(X_train, y_train)
print(regressor.score(X_train, y_train))

0.5867004218969594


In [325]:
##Evaluate the model
y_predict = regressor.predict(X_test)
print(regressor.score(X_test, y_test))
print(regressor.score(X_test, y_predict))

-1.416707119590685e+25
1.0


This section is for running a K neighbors regressor using the latitude and longitude data

In [196]:
##Run trial model (lat, long)
empty_lat_long = df[df['latitude'] == 99.0].index
df_lat_long = df_working.drop(empty_lat_long)

df_lat_long['latitude'] = df_lat_long['latitude'].apply(radians)
df_lat_long['longitude'] = df_lat_long['longitude'].apply(radians)

X = df_lat_long[['latitude', 'longitude']].values
y = df_lat_long['price'].values

In [197]:
##Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [198]:
##Declare instance of model
nbrs = KNeighborsRegressor(weights='distance', metric='haversine')
##Fit the model
nbrs.fit(X_train, y_train)
print(nbrs.score(X_train, y_train))
##Evaluate the model
y_pred = nbrs.predict(X_test)
print(nbrs.score(X_test, y_test))
print(nbrs.score(X_test, y_pred))

0.3506687097317104
0.2700067249120899
1.0
