## London House Sales Prediction

###### Given data about houses in London, let's try to predict how many houses will be sold in a given month and area.

### Getting Started

In [134]:
import numpy as np
import pandas as pd

import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [135]:
# Load the monthly London housing table. The path is relative, so the CSV is
# expected to sit in the notebook's working directory.
data= pd.read_csv('housing_in_london_monthly_variables.csv')

In [136]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,date,area,average_price,code,houses_sold,no_of_crimes,borough_flag
0,1995-01-01,city of london,91449,E09000001,17.0,,1
1,1995-02-01,city of london,82203,E09000001,7.0,,1
2,1995-03-01,city of london,79121,E09000001,14.0,,1
3,1995-04-01,city of london,77101,E09000001,7.0,,1
4,1995-05-01,city of london,84409,E09000001,10.0,,1


In [137]:
# Check dtypes and non-null counts; this is where the missing values in
# 'houses_sold' and 'no_of_crimes' become visible.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13549 entries, 0 to 13548
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           13549 non-null  object 
 1   area           13549 non-null  object 
 2   average_price  13549 non-null  int64  
 3   code           13549 non-null  object 
 4   houses_sold    13455 non-null  float64
 5   no_of_crimes   7439 non-null   float64
 6   borough_flag   13549 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 741.1+ KB


### Preprocessing

In [138]:
def preprocess_inputs(df, train_size=0.7, random_state=1):
    """Turn the raw monthly housing table into scaled train/test splits.

    The input frame is not modified; all work happens on a copy.

    Steps:
      1. Drop the 'code' and 'no_of_crimes' columns.
      2. Discard rows whose 'houses_sold' target is missing.
      3. Replace 'date' with numeric 'year' and 'month' features.
      4. One-hot encode 'area'.
      5. Split into train/test sets (shuffled).
      6. Standardise the features with statistics from the training set only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with at least the columns 'code', 'no_of_crimes',
        'houses_sold', 'date' and 'area'.
    train_size : float, default 0.7
        Fraction of rows assigned to the training set.
    random_state : int, default 1
        Seed for the shuffled split, so the split is reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised feature matrices; row index and column names are kept.
    y_train, y_test : pandas.Series
        The matching 'houses_sold' targets (unscaled).

    Raises
    ------
    KeyError
        If one of the expected columns is absent from ``df``.
    """
    df = df.copy()

    # 'code' is treated as redundant; 'no_of_crimes' is dropped because a large
    # share of its values is missing (more than 25%).
    df = df.drop(columns=['code', 'no_of_crimes'])

    # Rows without a target value cannot be used for supervised training.
    df = df.dropna(subset=['houses_sold']).reset_index(drop=True)

    # Keep only year and month from the date; no day feature is created.
    dates = pd.to_datetime(df['date'])
    df['year'] = dates.dt.year
    df['month'] = dates.dt.month
    df = df.drop(columns='date')

    # One-hot encode the 'area' column
    area_dummies = pd.get_dummies(df['area'], prefix='area')
    df = pd.concat([df, area_dummies], axis=1).drop(columns='area')

    # Split df into X, y
    y = df['houses_sold']
    X = df.drop(columns='houses_sold')

    # Train-Test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Fit the scaler on the training rows only so that no information from the
    # test set leaks into the scaling statistics.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [139]:
# Build the scaled train/test splits from the raw frame (70/30 shuffled split
# with a fixed seed, as defined in preprocess_inputs).
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [140]:
# Inspect the standardised training features; the one-hot 'area_*' columns are
# scaled along with the numeric ones.
X_train

Unnamed: 0,average_price,borough_flag,year,month,area_barking and dagenham,area_barnet,area_bexley,area_brent,area_bromley,area_camden,...,area_south east,area_south west,area_southwark,area_sutton,area_tower hamlets,area_waltham forest,area_wandsworth,area_west midlands,area_westminster,area_yorks and the humber
10752,-0.707342,-1.657173,1.542734,1.315615,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
10236,-0.663007,-1.657173,-0.963307,1.024950,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
4512,-1.004551,0.603437,-1.380980,-1.300369,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
9208,0.430416,0.603437,0.985836,1.024950,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,6.754536,-0.152846,-0.152117,-0.152482,-0.151385
2672,1.130422,0.603437,1.542734,-0.428374,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,-1.051000,0.603437,-1.659429,0.734285,-0.145406,-0.150649,6.720615,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
5192,-0.306718,0.603437,-0.545633,1.606280,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
12172,-0.347968,-1.657173,0.707387,0.443620,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385
235,2.670032,0.603437,0.985836,0.443620,-0.145406,-0.150649,-0.148796,-0.155015,-0.152482,-0.148049,...,-0.148796,-0.156801,-0.152482,-0.148423,-0.146165,-0.148049,-0.152846,-0.152117,-0.152482,-0.151385


In [141]:
# Inspect the training target ('houses_sold'), which is left unscaled.
y_train

10752     3824.0
10236     6593.0
4512       280.0
9208       334.0
2672       220.0
          ...   
905        281.0
5192       478.0
12172     7823.0
235         27.0
13349    37892.0
Name: houses_sold, Length: 9418, dtype: float64

### Training

In [142]:
# Fixed seed for every estimator that has a stochastic component, so that
# Restart & Run All reproduces the same metrics. It matches the seed used for
# the train/test split.
SEED = 1

# Candidate regressors, all with default hyper-parameters. The keys are
# left-padded to a common width so the printed reports line up.
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(random_state=SEED),
    "Support Vector Machine (Linear Kernel)": LinearSVR(random_state=SEED),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(random_state=SEED),
    "                         Random Forest": RandomForestRegressor(random_state=SEED),
    "                     Gradient Boosting": GradientBoostingRegressor(random_state=SEED),
    "                               XGBoost": XGBRegressor(random_state=SEED),
    "                              LightGBM": LGBMRegressor(random_state=SEED)
}

# Fit every model on the same scaled training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.
                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.


### Results

In [143]:
# Root-mean-squared error of every fitted model on the held-out split.
# `y_pred` is kept as a notebook-level name because a later cell reads it.
for name, model in models.items():
    y_pred = model.predict(X_test)
    residuals = y_test - y_pred
    rmse = np.sqrt(np.mean(residuals ** 2))
    print(name + " RMSE: {:.4f}".format(rmse))

                     Linear Regression RMSE: 3336.2738
 Linear Regression (L2 Regularization) RMSE: 3336.2725
 Linear Regression (L1 Regularization) RMSE: 3336.4065
                   K-Nearest Neighbors RMSE: 2264.7955
                        Neural Network RMSE: 3322.8646
Support Vector Machine (Linear Kernel) RMSE: 10322.4358
   Support Vector Machine (RBF Kernel) RMSE: 12501.5496
                         Decision Tree RMSE: 1872.9250
                         Random Forest RMSE: 1516.1120
                     Gradient Boosting RMSE: 1741.3415
                               XGBoost RMSE: 1742.9113
                              LightGBM RMSE: 1410.0693


In [144]:
# Coefficient of determination (R^2) of every fitted model on the held-out split.
for name, model in models.items():
    r2 = model.score(X_test, y_test)
    print(name + " R2: {:.4f}".format(r2))

                     Linear Regression R2: 0.9237
 Linear Regression (L2 Regularization) R2: 0.9237
 Linear Regression (L1 Regularization) R2: 0.9237
                   K-Nearest Neighbors R2: 0.9649
                        Neural Network R2: 0.9243
Support Vector Machine (Linear Kernel) R2: 0.2698
   Support Vector Machine (RBF Kernel) R2: -0.0710
                         Decision Tree R2: 0.9760
                         Random Forest R2: 0.9842
                     Gradient Boosting R2: 0.9792
                               XGBoost R2: 0.9792
                              LightGBM R2: 0.9864


In [145]:
# Plot the held-out predictions of one explicitly chosen model instead of
# relying on whatever `y_pred` the last loop iteration of an earlier cell left
# behind. The dictionary keys are space-padded, so match on the stripped name.
plot_model_name = "LightGBM"
plot_model = next(
    candidate
    for label, candidate in models.items()
    if label.strip() == plot_model_name
)
y_pred = plot_model.predict(X_test)

fig = px.scatter(
    x=y_pred,
    y=y_test,
    labels={'x': "Predicted", 'y': "Actual"},
    # Name the model in the title so the figure stands on its own.
    title="Actual vs. Predicted Values ({})".format(plot_model_name),
    width=700,
    height=700
)
fig.show()