In [1]:
# All imports needed
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import sklearn.metrics as metrics

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures 

### Dataset Visualizations

Dataset used: [Real Estate dataset](http://archive.ics.uci.edu/ml/datasets/Real+estate+valuation+data+set)

##### Features:
* transcation date: the transaction date (for example, 2013.250=2013 March, 2013.500=2013 June, etc.)
* house age: the house age (unit: year)
* MRT distance: the distance to the nearest MRT station (unit: meter)
* number of stores: the number of convenience stores in the living circle on foot (integer)
* latitude: the geographic coordinate, latitude. (unit: degree)
* longitude: the geographic coordinate, longitude. (unit: degree)

The output is as follow
* price: house price of unit area (10000 New Taiwan Dollar/Ping, where Ping is a local unit, 1 Ping = 3.3 meter squared)

In [2]:
# Read data from file
file_names = ['transaction date', 'house age', 'MRT distance', 'number of stores', 'latitude', 'longitude', 'price']
real_estate = pd.read_csv("../data/estate/Real estate valuation data set.csv", header = 0, names = file_names, usecols = range(1,8)) 
original_data = real_estate.copy()

#### Data exploration:

In [6]:
# original_data.info()
# original_data.describe()
# original_data.hist(figsize=(8,8))
# original_data.boxplot()
# original_data.shape

(414, 7)

#### Preprocessing:
As the data set has a column of dates which is a float value, we decided it might increas the prediction quality if we separate according to the month and year. The value after the decimal point represents the month. Taking 100 divided by 12 (12 months) its values is the corresponding month if the valued is diveded by 100/12.
We are going to test regression with different preprocessed datasets:

* original_data: imported from the csv file, without any preprocessing
* real_estate: transaction date splitted into transaction year and month (additional columns)

To try out scaling, we decided (due to the fact that not all values are normally distributed) to try RobustScaling and MinMaxScaling.

In [5]:
# preprocessing date
transaction_date = real_estate['transaction date'].astype(str).str.split('.', n = 1, expand = True)
real_estate['transaction year'] = real_estate['transaction date'].apply(np.floor)
real_estate['transaction month'] = (((abs(real_estate['transaction date']-real_estate['transaction year']))*1000)/83).round()
real_estate = real_estate.drop(columns=['transaction date'])
real_estate.head()

Unnamed: 0,house age,MRT distance,number of stores,latitude,longitude,price,transaction year,transaction month
0,32.0,84.87882,10,24.98298,121.54024,37.9,2012.0,11.0
1,19.5,306.5947,9,24.98034,121.53951,42.2,2012.0,11.0
2,13.3,561.9845,5,24.98746,121.54391,47.3,2013.0,7.0
3,13.3,561.9845,5,24.98746,121.54391,54.8,2013.0,6.0
4,5.0,390.5684,5,24.97937,121.54245,43.1,2012.0,10.0


#### Modeling:

In [6]:
def polynomial_regression(X_train, X_test, y_train, degree = 2):
    poly = PolynomialFeatures(degree)
    x_poly = poly.fit_transform(X_train)
    
    model = LinearRegression()
    model.fit(x_poly, y_train)
    
    y_pred = model.predict(poly.fit_transform(X_test))
    return y_pred

def plot_scatter(y_test, y_pred):
    # Plot outputs
    plt.scatter(y_test, y_pred)

    plt.xticks(())
    plt.yticks(())

    plt.show()
    
def print_performance(y_test, y_pred):
    # Mean Squared Error
    print("MSE: ", metrics.mean_squared_error(y_test,y_pred))
    
    print("RMSE: ", metrics.mean_squared_error(y_test,y_pred, squared=False))
    
    # R2 is between 0 and 100 percent
    # 0 indicates that the model explains none of the variability of the response data around its mean.
    # 100 indicates that the model explains all the variability of the response data around its mean.
    print("R2: ", metrics.r2_score(y_test,y_pred) * 100)

In [7]:
# setting parameters for testing
# regression models
alpha = [0.1,0.9,1,2,4]
normalize = [True, False]
degrees = [2,3]

# knn
n_neighbour = [2, 5, 8, 12] 
weights = ['uniform', 'distance']

# trees
tree_max_depth = [6, 9, 12, 15]
tree_min_split = [3, 6, 9, 12]
forest_n_estimators = [10, 50, 75]
forest_max_depth = [6, 9, 12, 15, 20]
forest_min_split = [2, 5] 

#### Linear Models:

In [8]:
X_train, X_test, y_train, y_test = train_test_split(real_estate.drop('price', axis=1),real_estate['price'], test_size=0.2)

# preprocessing with scaler
# scaler = RobustScaler()
scaler = MinMaxScaler()
scaler_X_train = scaler.fit_transform(X_train)
scaler_X_test = scaler.transform(X_test)

# Linear Regression
print('Linear Regression')
model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print_performance(y_test, y_pred)

# Lasso Regression
print('\nLasso Regression')
grid = GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': alpha,
                        'normalize':normalize})
grid.fit(X_train, y_train)
print(grid.best_params_)
y_pred = grid.predict(X_test)
print_performance(y_test, y_pred)

# Ridge Regression
print('\nRidge Regression')
grid = GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': alpha,
                        'normalize':normalize})
grid.fit(X_train, y_train)
print(grid.best_params_)
y_pred = grid.predict(X_test)
print_performance(y_test, y_pred)

# Polynomial Regression
print('\nPolynomial Regression Degree 2')
y_pred2 = polynomial_regression(X_train, X_test, y_train)
print_performance(y_test, y_pred2)
print('\nPolynomial Regression Degree 3')
y_pred3 = polynomial_regression(X_train, X_test, y_train, 3)
print_performance(y_test, y_pred3)

Linear Regression
MSE:  92.5843449983826
RMSE:  9.622075919383644
R2:  54.159581531017764

Lasso Regression
{'alpha': 1, 'normalize': False}
MSE:  112.18865178706046
RMSE:  10.59191445334886
R2:  44.453084962910026

Ridge Regression
{'alpha': 0.1, 'normalize': True}
MSE:  93.11227090629585
RMSE:  9.649469980589393
R2:  53.89819452720212

Polynomial Regression Degree 2
MSE:  67.81739636332706
RMSE:  8.235131836426607
R2:  66.42220854048213

Polynomial Regression Degree 3
MSE:  149.86981362015567
RMSE:  12.242132723514954
R2:  25.79636468415555


#### Non-parametric Model:

In [9]:
# knn regression
print('KNN')
grid = GridSearchCV(estimator=neighbors.KNeighborsRegressor(),
             param_grid={'n_neighbors': n_neighbour,
                        'weights': weights})
grid.fit(X_train, y_train)
print(grid.best_params_)
y_pred = grid.predict(X_test)
print_performance(y_test, y_pred)

KNN
{'n_neighbors': 12, 'weights': 'distance'}
MSE:  68.59485768502593
RMSE:  8.282201258423145
R2:  66.03727140741988


#### Tree Models:

In [10]:
# regression tree
print('Regression Tree')
grid = GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': tree_max_depth,
                        'min_samples_leaf': tree_min_split})
grid.fit(X_train, y_train)
print(grid.best_params_)
y_pred = grid.predict(X_test)
print_performance(y_test, y_pred)

# random forest
print('\nRandom Forest')
grid = GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'n_estimators': forest_n_estimators,
                         'max_depth': forest_max_depth,
                         'min_samples_leaf': forest_min_split})
grid.fit(X_train, y_train)
print(grid.best_params_)
y_pred = grid.predict(X_test)
print_performance(y_test, y_pred)

Regression Tree
{'max_depth': 9, 'min_samples_leaf': 12}
MSE:  65.99054648582755
RMSE:  8.123456560222843
R2:  67.3267195878525

Random Forest
{'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 50}
MSE:  53.14183570954541
RMSE:  7.289844697217178
R2:  73.68838125734949
