This notebook cleans the properties dataset so that it can be used for model training.

In [1]:
"""Import Libraries"""

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from math import radians
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor

In [2]:
"""Function to load the dataset"""

def load_dataset(dataset='../../data/properties_working.csv'):
    """Read the properties CSV into a DataFrame.

    Parameters
    ----------
    dataset : str or path-like, optional
        Location of the CSV file to read. Defaults to the working
        properties file this notebook has always used, so existing
        zero-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    return pd.read_csv(dataset)

In [3]:
"""Function to drop columns that won't be used for modeling"""

def drop_columns(df):
    """Return a copy of ``df`` without the columns that are not used for modeling.

    Every listed column must be present in ``df``; pandas raises ``KeyError``
    otherwise. The input frame itself is not modified.
    """
    unused_columns = [
        'immowebCode', 'priceString', 'country', 'locality', 'street',
        'availableDate', 'neighborhoodOrLocality', 'floor', 'numberOfFloors',
        'outdoorParkingSpaces', 'surroundingsType', 'livingRoomSurface',
        'kitchenSurface', 'bedroom3Surface', 'bedroom4Surface',
        'bedroom5Surface', 'basement', 'furnished', 'terraceSurface',
        'elevator', 'disabilityAccess', 'tvCable', 'visioPhone',
        'swimmingPool', 'internet', 'EPCReportReference', 'CO2Emission',
        'yearlyEnergyConsumption', 'doubleGlazing', 'thermicSolarPanels',
        'PVSolarPanels', 'planningPermission', 'subdivisionPermit',
        'floodZoneType', 'landUseDesignation', 'cadastralIncome',
        'tenementBuilding', 'streetFrontageWidth', 'coveredParkingSpaces',
        'basementSurface', 'monthlyCharges', 'diningRoom', 'showerRooms',
        'intercom', 'armoredDoor', 'propertyName', 'laundryRoom',
        'terraceOrientation', 'caretaker', 'commonWaterHeater', 'livingRoom',
        'terrace', 'attic', 'officeSurface', 'office', 'alarm', 'eLevel',
        'heatPump', 'plotSurface', 'lotWidth', 'gasWaterElectric',
        'gardenSurface', 'gardenOrientation', 'dressingRoom',
        'professionalSpace', 'sewerConnection', 'jacuzzi', 'sauna',
        'buildingType', 'fireplaces', 'garden', 'airConditioning',
        'professionalSpaceSurface', 'atticSurface', 'isolated',
        'streetFacing', 'woodedLand', 'rearPlot', 'flatLand',
    ]
    return df.drop(columns=unused_columns)

In [4]:
"""Function to transform type column to binary, transform postal code to string type and filter out all empty values"""

def filter_dataset(df_working):
    """Encode the property type, normalise postal codes and drop incomplete rows.

    Steps, applied to a copy so the caller's frame is left untouched:

    * ``type``: ``'Apartment'`` becomes 0 and ``'House'`` becomes 1; any other
      value is left as it is.
    * ``postalCode``: cast to int and then to str.
    * Rows are removed when any of the inspected columns holds the value that
      this function treats as "missing" for that column (see
      ``missing_markers`` below).

    Parameters
    ----------
    df_working : pandas.DataFrame
        Frame containing at least ``type``, ``postalCode`` and every column
        named in ``missing_markers``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving rows; original index labels are kept.
    """
    # Copy first: assigning columns on the argument would silently change the
    # caller's frame and make re-running the cell operate on modified data.
    df_working = df_working.copy()
    df_working['type'] = df_working['type'].replace({'Apartment': 0, 'House': 1})
    df_working['postalCode'] = df_working['postalCode'].astype(int).astype(str)

    # Value that marks a missing entry, per column.
    missing_markers = {
        'latitude': 99.0,
        'constructionYear': 'None',
        'buildingCondition': 0,
        'numberOfFrontages': 'None',
        'livingArea': 'None',
        'kitchenType': 0,
        'bedrooms': 'None',
        'bedroom1Surface': 'None',
        'bedroom2Surface': 'None',
        'bathrooms': 'None',
        'toilets': 'None',
        'energyConsumption': 0.0,
        'energyClass': 'None',
        'heatingType': 'None',
    }

    # Accumulate a positional boolean mask instead of collecting index labels:
    # dropping by label would also remove valid rows that happen to share a
    # label with an incomplete one when the index is not unique.
    has_missing = np.zeros(len(df_working), dtype=bool)
    for column, marker in missing_markers.items():
        has_missing |= (df_working[column] == marker).to_numpy()

    df_filtered = df_working.loc[~has_missing]
    return df_filtered

In [5]:
"""Function to transform certain columns to integer type, transform energy class to ranked numerical values and transform latitude and longitude to radians"""

def transform_filtered(df_filtered):
    """Cast count/surface columns to int, rank the energy class and convert coordinates to radians.

    The work is done on a copy. Mutating the argument in place would change
    the caller's frame, and calling the function a second time on the same
    object would convert the coordinates to radians twice.

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Frame containing the columns listed in ``columns_int`` plus
        ``energyClass``, ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which

        * the ``columns_int`` columns are cast with ``astype(int)``;
        * ``energyClass`` letters ``'A'``..``'G'`` are replaced by 1..7
          (values outside that set are left unchanged);
        * ``latitude`` and ``longitude`` are converted from degrees to radians.
    """
    columns_int = ['constructionYear', 'numberOfFrontages', 'livingArea', 'bedrooms',
                   'bedroom1Surface', 'bedroom2Surface', 'bathrooms', 'toilets']
    energy_rank = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}

    df_transformed = df_filtered.copy()
    df_transformed[columns_int] = df_transformed[columns_int].astype(int)
    df_transformed['energyClass'] = df_transformed['energyClass'].replace(energy_rank)

    # Vectorised conversion; the float cast keeps this working when a
    # coordinate column was read with object or integer dtype.
    for coordinate in ('latitude', 'longitude'):
        df_transformed[coordinate] = np.radians(df_transformed[coordinate].astype(float))
    return df_transformed

In [6]:
"""Function to save filtered dataframe to a new csv"""

def save_filtered(df_filtered, filtered_dataset='../../data/properties_filtered.csv'):
    """Write the cleaned frame to CSV and print a short summary.

    Parameters
    ----------
    df_filtered : pandas.DataFrame
        Frame to persist.
    filtered_dataset : str or path-like, optional
        Destination CSV file. Defaults to the location this notebook has
        always written to, so existing one-argument calls are unaffected.

    Returns
    -------
    None
        The function is used for its side effects: the file is written
        (header row included, index omitted, backslash as escape character)
        and the frame's shape and first rows are printed.
    """
    df_filtered.to_csv(filtered_dataset, index=False, header=True, escapechar="\\")
    print(df_filtered.shape, df_filtered.head())

In [7]:
"""Function to run everything"""

def model_training_cleaning():
    """Run the whole cleaning pipeline from raw CSV to saved, filtered CSV.

    Stages, in order: load the dataset, drop the unused columns, filter out
    incomplete rows, transform the remaining columns, and save the result.
    Returns nothing.
    """
    raw = load_dataset()
    without_unused = drop_columns(raw)
    cleaned = transform_filtered(filter_dataset(without_unused))
    save_filtered(cleaned)

In [8]:
# Execute the full pipeline: load, drop columns, filter, transform and save.
model_training_cleaning()

(991, 20)     type     price       province  district postalCode  latitude  longitude  \
0      0  375000.0  East Flanders      Gent       9000  0.890642   0.064926   
1      0  195000.0        Antwerp   Antwerp       2610  0.893260   0.077144   
5      0  339000.0  East Flanders      Gent       9000  0.890642   0.064926   
7      0  208000.0  East Flanders      Gent       9000  0.890642   0.064926   
10     0  395000.0       Brussels  Brussels       1050  0.887151   0.076271   

    constructionYear  buildingCondition  numberOfFrontages  livingArea  \
0               2022                  1                  2          91   
1               1958                  3                  2          90   
5               2021                  1                  4         101   
7               2004                  3                  3         105   
10              2022                  1                  2         107   

    kitchenType  bedrooms  bedroom1Surface  bedroom2Surface  bathrooms