Import Relevant Libraries

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.1.0


Inspect Data

In [16]:
resale_registration_full = pd.read_csv("data/resale-flat-prices_HDB/resale-on-registration-full.csv", index_col=0)

In [17]:
resale_approval_full = pd.read_csv("data/resale-flat-prices_HDB/resale-on-approval-full.csv")

Our raw dataset looks as such:

In [18]:
resale_registration_full.head()

Unnamed: 0,block,flat_model,flat_type,floor_area_sqm,lease_commence_date,month,remaining_lease,resale_price,storey_range,street_name,town
0,172,Improved,2 ROOM,45.0,1986,2012-03,,250000.0,06 TO 10,ANG MO KIO AVE 4,ANG MO KIO
1,510,Improved,2 ROOM,44.0,1980,2012-03,,265000.0,01 TO 05,ANG MO KIO AVE 8,ANG MO KIO
2,610,New Generation,3 ROOM,68.0,1980,2012-03,,315000.0,06 TO 10,ANG MO KIO AVE 4,ANG MO KIO
3,474,New Generation,3 ROOM,67.0,1984,2012-03,,320000.0,01 TO 05,ANG MO KIO AVE 10,ANG MO KIO
4,604,New Generation,3 ROOM,67.0,1980,2012-03,,321000.0,06 TO 10,ANG MO KIO AVE 5,ANG MO KIO


In [19]:
resale_approval_full.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,IMPROVED,1977,6000.0
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,8000.0
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,IMPROVED,1977,6000.0
4,1990-01,ANG MO KIO,3 ROOM,216,ANG MO KIO AVE 1,04 TO 06,73.0,NEW GENERATION,1976,47200.0


A look of resale by registration

In [12]:
resale_registration_full.describe()

Unnamed: 0.1,Unnamed: 0,floor_area_sqm,lease_commence_date,resale_price
count,157517.0,157517.0,157517.0,157517.0
mean,27778.789026,97.074167,1991.671089,445799.0
std,17550.452399,24.487027,11.554481,140120.1
min,0.0,31.0,1966.0,140000.0
25%,13126.0,74.0,1984.0,345000.0
50%,26252.0,95.0,1989.0,420000.0
75%,40492.0,112.0,2001.0,515000.0
max,68160.0,280.0,2016.0,1205000.0


In [13]:
resale_approval_full.describe()

Unnamed: 0,floor_area_sqm,lease_commence_date,resale_price
count,656851.0,656851.0,656851.0
mean,95.171843,1985.895573,254281.174816
std,26.43294,8.285107,123254.959795
min,28.0,1966.0,5000.0
25%,70.0,1980.0,163000.0
50%,92.0,1985.0,239000.0
75%,115.0,1992.0,330000.0
max,307.0,2012.0,903000.0


### Data Processing

In [21]:
resale_registration_full.count()

block                  157517
flat_model             157517
flat_type              157517
floor_area_sqm         157517
lease_commence_date    157517
month                  157517
remaining_lease        105314
resale_price           157517
storey_range           157517
street_name            157517
town                   157517
dtype: int64

In [22]:
resale_approval_full.count()

month                  656851
town                   656851
flat_type              656851
block                  656851
street_name            656851
storey_range           656851
floor_area_sqm         656851
flat_model             656851
lease_commence_date    656851
resale_price           656851
dtype: int64

Looking at the `NaN` values in `resale_registration_full`:

From above, we can see that some fields are not clean. For example, in `resale_registration_full.remaining_lease`, `NaN` values are present. These `NaN` appear as a result of the lack of data in part of the dataset

- We choose to convert these values to 0. If lease is unknown, we should treat it asif they aren't present. Furthermore, the dataset is of a older timeframe and as such, treating it as relatively lower than the rest is acceptable

In [3]:
## Import necessary ML functions/modules
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer

In [4]:
## A function that would be flexible enough to transform categorical datasets.
## Note: Some preprocessing probably needs to be done, to convert some string variables to float

def process_data(df: pd.DataFrame, lb_cols: list=[], le_cols:list =[]) -> pd.DataFrame:
    ''' Args
    df - the data frame to process. Should be a pd.DataFrame
    lb_cols - columns to apply label binarizing to. Uses sklearn's LabelBinarizer. 
    le_cols - columns to apply label encoding to. Uses sklearn's LabelEncoder. 
    
    Note: le_cols is good for ordinal data while lb_cols is used similar to one-hot
    '''
    final_df = df.copy()
    lb = LabelBinarizer()
    for lb_col in lb_cols:
        lb.fit(df[lb_col])
        to_merge_df = pd.DataFrame(lb.transform(df[lb_col]), columns=lb.classes_)
        final_df = pd.merge(final_df, to_merge_df, left_index=True, right_index=True)
    
    final_df= final_df.drop(lb_cols, axis=1)
    
    le = LabelEncoder()
    for le_col in le_cols:
        le.fit(df[lb_col])
        final_df[le_col] = le.fit_transform(final_df[le_col])
        
    return final_df


Preprocessing resale_registration_full

In [5]:
## Analysis using resale_registration_full
## 1. Process data
resale_registration_full = pd.read_csv("data/resale-flat-prices_HDB/resale-on-registration-full.csv", index_col=0)
resale_registration_full = resale_registration_full.fillna(0)

def convert_remaining_lease(s):
    if s == 0:
        return 0
    elif type(s) == float:
        return s
    elif type(s) == str:
        list_s = s.split(' ')
        if len(list_s) == 2:  ## X years
            return int(list_s[0])
        elif len(list_s) == 4:  ## X years Y months
            return(int(list_s[0]) + float(list_s[2]))
        elif len(list_s) == 1:  ## 78
            return(float(s))
    elif type(s == int):
        return float(s)
    raise Exception(f"Weird remaining_lease format: {s}, type: {type(s)}")
            
def to_lower_alpha(s:str):
    return(s.lower())

def months_since2012(s:str):
    ## months are in this format: YYYY-MM
    YYMM =s.split('-')
    return (12 * (int(YYMM[0]) - 2012) + (int(YYMM[1]))) 
        
resale_registration_full['flat_model'] = resale_registration_full['flat_model'].apply(to_lower_alpha)
resale_registration_full['remaining_lease'] = resale_registration_full['remaining_lease'].apply(convert_remaining_lease)
resale_registration_full['month'] = resale_registration_full['month'].apply(months_since2012)

lb_cols = ["flat_model", "flat_type", "town", "street_name"]
le_cols = ["storey_range"]
resale_registration_full = process_data(resale_registration_full, lb_cols, le_cols)

assert resale_registration_full.shape == (157517, 606)

# resale_registration_fullX = resale_registration_full.reset_index(drop=True)
X = resale_registration_full.drop(['resale_price','block'], axis=1)
y = resale_registration_full['resale_price']

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
## Split into train and test -- to used later during model fitting
def split_df(X, y, month_threshold=80):
    X_train = X[X['month'] <= month_threshold]
    X_test = X[X['month'] > month_threshold]
    y_train = y.iloc[:X_train.shape[0]]
    y_test = y.iloc[X_train.shape[0]:]
    X_train = X_train.reset_index(drop=True)
    
    return X_train, X_test, y_train, y_test

# X_train, X_test, y_train, y_test = split_df(X, y)

Preprocessing data on resale_approval_full

In [7]:
def commence_date_to_remaining(year: int):
    import datetime
    return 99 - (datetime.datetime.now().year - year)

## using process_data() on resale_approval_full
resale_approval_full = pd.read_csv("data/resale-flat-prices_HDB/resale-on-approval-full.csv")
resale_approval_full['month'] = resale_approval_full['month'].apply(months_since2012)
resale_approval_full['lease_remaining'] = resale_approval_full['lease_commence_date'].apply(commence_date_to_remaining)
resale_approval_full = resale_approval_full.drop(["block", "lease_commence_date"], axis=1)

lb_cols = ["flat_model", "street_name", "town"]
le_cols = ["flat_type", "storey_range"]
X_approval = process_data(resale_approval_full, lb_cols= lb_cols, le_cols=le_cols)
resale_approval_full.columns

Index(['month', 'town', 'flat_type', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'resale_price', 'lease_remaining'],
      dtype='object')

### Model fitting - ML Techniques

In [8]:
## functions to fit ML_models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, BayesianRidge
from sklearn.model_selection import GridSearchCV

## code is in housing/ML.py
from housing.ML import ML_Model

In [9]:
## ML techniques - Lasso
lasso = Lasso()
lasso_model = ML_Model(lasso, X, y, split_df)
lasso_model.fit()
lasso_model.predict()
lasso_model.get_metrics()
# lasso.fit(X_train,y_train)
# print(lasso.score(X_train, y_train))

  positive)


Adding some metrics: mse, R_squared, Explained variance
{'MSE': 20094647916.15412, 'Explained Variance': 0.05792120689732261, 'R^2': 0.05705361836435108, 'MAE': 104636.60118171675}


In [10]:
## ML techniques - Bayesian Ridge
br = BayesianRidge()
br_model = ML_Model(br, X, y, split_df)
br_model.fit()
br_model.predict()
br_model.get_metrics()

Adding some metrics: mse, R_squared, Explained variance
{'MSE': 19813117567.9257, 'Explained Variance': 0.07078195564484402, 'R^2': 0.07026450040071675, 'MAE': 103872.2911893091}


RFC - it takes really long to run this so I haven't

In [None]:
rfr = RandomForestRegressor(n_estimators=1000)
rfr.fit(X_train,y_train)
print(rfr.score(X_train, y_train))

Rolling predictions

build models with data on year $n$ to predict data on year $n+1$. We concat all predicted values.

In [11]:
lasso_model.rolling_predict(12)

(21475, 604) (21475,)


  positive)


(15773, 604) (15773,)


  positive)


(16344, 604) (16344,)


  positive)


(18166, 604) (18166,)


  positive)


(19411, 604) (19411,)


  positive)


(20472, 604) (20472,)


  positive)


(22246, 604) (22246,)


  positive)


Adding some metrics: mse, R_squared, Explained variance


In [12]:
lasso_model.get_metrics()

{'MSE': 20094647916.15412, 'Explained Variance': 0.05792120689732261, 'R^2': 0.05705361836435108, 'MAE': 104636.60118171675, '(Rolling mean) MAE': 67371.65460751504, '(Rolling mean) MSE': 8973238101.097103, '(Rolling mean) R^2': 0.2781672652508492, '(Rolling mean) Explained Variance': 0.26178754793839254}


In [13]:
lasso_model.rolling_predict(6)
lasso_model.get_metrics()

(12485, 604) (12485,)


  positive)


(8990, 604) (8990,)


  positive)


(8664, 604) (8664,)


  positive)


(7109, 604) (7109,)


  positive)


(8380, 604) (8380,)


  positive)


(7964, 604) (7964,)


  positive)


(9401, 604) (9401,)


  positive)


(8765, 604) (8765,)


  positive)


(10602, 604) (10602,)


  positive)


(8809, 604) (8809,)


  positive)


(10986, 604) (10986,)


  positive)


(9486, 604) (9486,)


  positive)


(12144, 604) (12144,)
(10102, 604) (10102,)


  positive)


(11717, 604) (11717,)
Adding some metrics: mse, R_squared, Explained variance
{'MSE': 20094647916.15412, 'Explained Variance': 0.05792120689732261, 'R^2': 0.05705361836435108, 'MAE': 104636.60118171675, '(Rolling mean) MAE': 62715.11415974779, '(Rolling mean) MSE': 8246972046.424003, '(Rolling mean) R^2': 0.3635206861364498, '(Rolling mean) Explained Variance': 0.36048740456647604}


  positive)


It seems that rolling predictions work better. This might likely be due to structural information that 
might be lost but persisted in the future due to models taking averages over the whole time period to
minimise loss functions.

### Explained Variance
Lasso on processed resale_registration_full : 0.05792120689732261

Bayesian Ridge on processed resale_registration_full : 0.07078195564484402

Rolling predictions, Lasso on processed resale_registration_full : 0.2625062541461246

Rolling predictions,Bayesian Ridge on processed resale_registration_full : 



### Deep Learning Techniques

#### ANN

In [15]:
X_train, X_test, y_train, y_test = split_df(X, y)
df = tf.data.Dataset.from_tensor_slices((X_train.values, y_train.values)).batch(16)
# for feat, targ in df.take(5):
#   print ('Features: {}, Target: {}'.format(feat, targ))

model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=[X_train.shape[1]]),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1),
])
model.compile(optimizer='adam',
                loss=tf.keras.losses.MeanSquaredError(),
                metrics=['mae', 'mse'])

In [16]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               77440     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                650       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 86,357
Trainable params: 86,357
Non-trainable params: 0
_________________________________________________________________


In [17]:
EPOCHS = 100

history = model.fit(
  X_train, y_train,
  epochs=EPOCHS, validation_split = 0.2, verbose=2)

Train on 99328 samples, validate on 24832 samples
Epoch 1/100
99328/99328 - 11s - loss: 26822138366.6804 - mae: 117333.6562 - mse: 26822150144.0000 - val_loss: 19306000658.6392 - val_mae: 108330.2031 - val_mse: 19305998336.0000
Epoch 2/100
99328/99328 - 10s - loss: 10210779513.6907 - mae: 72519.3359 - mse: 10210778112.0000 - val_loss: 25677464936.4948 - val_mae: 122406.4141 - val_mse: 25677461504.0000
Epoch 3/100
99328/99328 - 10s - loss: 8722616750.1856 - mae: 66338.1641 - mse: 8722622464.0000 - val_loss: 25838952521.2371 - val_mae: 121450.2969 - val_mse: 25838956544.0000
Epoch 4/100
99328/99328 - 10s - loss: 8370510096.7423 - mae: 65280.2266 - mse: 8370511872.0000 - val_loss: 25809805947.5464 - val_mae: 121576.8750 - val_mse: 25809799168.0000
Epoch 5/100
99328/99328 - 10s - loss: 8151257794.7217 - mae: 64471.9336 - mse: 8151251456.0000 - val_loss: 25628049442.6392 - val_mae: 121626.7656 - val_mse: 25628061696.0000
Epoch 6/100
99328/99328 - 10s - loss: 8016004370.9691 - mae: 63951.535

Epoch 48/100
99328/99328 - 11s - loss: 6110940308.6186 - mae: 52900.7773 - mse: 6110944256.0000 - val_loss: 30736287227.7113 - val_mae: 131979.7500 - val_mse: 30736300032.0000
Epoch 49/100
99328/99328 - 11s - loss: 6092094646.7835 - mae: 52763.0352 - mse: 6092100096.0000 - val_loss: 31804368640.3299 - val_mae: 134609.9219 - val_mse: 31804401664.0000
Epoch 50/100
99328/99328 - 11s - loss: 6080424666.4330 - mae: 52710.5430 - mse: 6080423424.0000 - val_loss: 31502715848.1649 - val_mae: 133844.7656 - val_mse: 31502729216.0000
Epoch 51/100
99328/99328 - 10s - loss: 6063836526.4742 - mae: 52614.6875 - mse: 6063840768.0000 - val_loss: 31606513698.0619 - val_mae: 133345.9531 - val_mse: 31606530048.0000
Epoch 52/100
99328/99328 - 10s - loss: 6046874725.3608 - mae: 52468.2070 - mse: 6046878208.0000 - val_loss: 30774963328.4948 - val_mae: 131702.2031 - val_mse: 30774972416.0000
Epoch 53/100
99328/99328 - 10s - loss: 6029732275.1134 - mae: 52356.1758 - mse: 6029735424.0000 - val_loss: 30128614815.

Epoch 95/100
99328/99328 - 7s - loss: 5487305980.4948 - mae: 48984.6289 - mse: 5487310848.0000 - val_loss: 29869519359.1753 - val_mae: 128774.6719 - val_mse: 29869518848.0000
Epoch 96/100
99328/99328 - 7s - loss: 5470660550.8247 - mae: 48812.4062 - mse: 5470660608.0000 - val_loss: 31220181426.3093 - val_mae: 132404.0469 - val_mse: 31220183040.0000
Epoch 97/100
99328/99328 - 7s - loss: 5474408253.8969 - mae: 48880.7266 - mse: 5474402816.0000 - val_loss: 30146758356.9485 - val_mae: 129335.6250 - val_mse: 30146768896.0000
Epoch 98/100
99328/99328 - 8s - loss: 5464859912.9897 - mae: 48836.5586 - mse: 5464859648.0000 - val_loss: 30931341974.1031 - val_mae: 131213.7031 - val_mse: 30931333120.0000
Epoch 99/100
99328/99328 - 10s - loss: 5454236464.0619 - mae: 48678.8750 - mse: 5454239232.0000 - val_loss: 32282619020.7835 - val_mae: 133965.0781 - val_mse: 32282617856.0000
Epoch 100/100
99328/99328 - 7s - loss: 5452008378.4124 - mae: 48684.2422 - mse: 5452008960.0000 - val_loss: 32122065539.2165

In [18]:
## save model
model.save_weights("./checkpoints/ann-epoch-100")

In [20]:
## prediction
y_pred = model.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score
print(mean_squared_error(y_pred, y_test))
print(mean_absolute_error(y_pred, y_test))
print(explained_variance_score(y_pred, y_test))

22509405236.107265
107906.52360259842
-0.5796333655503159


### Time-series

Using a subset of data (certain street), we try to model data as a time-series, 
hopefully it'll improve eval metrics  

**Exogeneous Variables**

These are variables that would be similar throughout the time series and values dont change with time

- storey_range
- floor_area_sqm
- flat_model
- flat_type
- lease_commence_date -> remaining_lease

**Problems:**

We need one model per flat, which takes up alot of storage space and 
having so many models might be inefficient

In [26]:
time_series_registration = pd.read_csv("data/resale-flat-prices_HDB/resale-on-approval-full.csv")
time_series_registration.drop_duplicates(inplace=True)
time_series_registration["lease_remaining"] = time_series_registration["lease_commence_date"].apply(commence_date_to_remaining)
time_series_registration["flat_model"] = time_series_registration["flat_model"].apply(lambda x: x.lower())
time_series_registration.drop("lease_commence_date", axis=1, inplace=True)
print(time_series_registration.columns)
time_series_registration.shape

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'resale_price', 'lease_remaining'],
      dtype='object')


(655512, 10)

1. Lag the values

In [27]:
time_series_registration.sort_values(by=["street_name", "block", "month", "storey_range"], inplace=True)
sorted_time_series = time_series_registration.reset_index(drop=True)
sorted_time_series["last_resale_price"] = sorted_time_series["resale_price"].shift(1)
sorted_time_series.dropna(inplace=True)
sorted_time_series

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,resale_price,lease_remaining,last_resale_price
1,2004-07,SEMBAWANG,5 ROOM,353A,ADMIRALTY DR,07 TO 09,110.0,improved,294000.0,81,325000.0
2,2005-01,SEMBAWANG,4 ROOM,353A,ADMIRALTY DR,13 TO 15,93.0,model a,255000.0,81,294000.0
3,2005-09,SEMBAWANG,4 ROOM,353A,ADMIRALTY DR,07 TO 09,93.0,model a,253000.0,81,255000.0
4,2005-09,SEMBAWANG,4 ROOM,353A,ADMIRALTY DR,13 TO 15,93.0,model a,244000.0,81,253000.0
5,2005-09,SEMBAWANG,4 ROOM,353A,ADMIRALTY DR,13 TO 15,93.0,model a,250000.0,81,244000.0
...,...,...,...,...,...,...,...,...,...,...,...
655507,2004-08,BUKIT MERAH,3 ROOM,91,ZION RD,10 TO 12,59.0,improved,170000.0,61,187000.0
655508,2004-09,BUKIT MERAH,3 ROOM,91,ZION RD,13 TO 15,59.0,improved,176000.0,61,170000.0
655509,2005-08,BUKIT MERAH,3 ROOM,91,ZION RD,04 TO 06,67.0,improved,202000.0,61,176000.0
655510,2005-09,BUKIT MERAH,3 ROOM,91,ZION RD,13 TO 15,59.0,improved,180000.0,61,202000.0


In [28]:
sorted_time_series["month"] = sorted_time_series["month"].apply(lambda x: int(x[:4])*12 + int(x[-2:])-1)
sorted_time_series.drop(["block", "town"],axis=1, inplace=True)
lb_cols = ["street_name", "flat_type", "flat_model"]
le_cols=["storey_range"]
ts_df = process_data(sorted_time_series,lb_cols=lb_cols, le_cols=le_cols)
ts_df.head(10)

Unnamed: 0,month,storey_range,floor_area_sqm,resale_price,lease_remaining,last_resale_price,ADMIRALTY DR,ADMIRALTY LINK,AH HOOD RD,ALEXANDRA RD,...,model a,model a-maisonette,model a2,multi generation,new generation,premium apartment,premium maisonette,simplified,standard,terrace
1,24054,2,110.0,294000.0,81,325000.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,24060,4,93.0,255000.0,81,294000.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,24068,2,93.0,253000.0,81,255000.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,24068,4,93.0,244000.0,81,253000.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,24068,4,93.0,250000.0,81,244000.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,24069,2,110.0,310000.0,81,250000.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,24069,3,93.0,244000.0,81,310000.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,24071,4,93.0,245000.0,81,244000.0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,24073,1,93.0,250000.0,81,245000.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,24073,2,110.0,311000.0,81,250000.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# np.amax(ts_df.month) ##24145
# np.amin(ts_df.month)  ## 23880
X = ts_df.drop("resale_price", axis=1)
y = ts_df["resale_price"]
# X_train, X_test, y_train, y_test = split_df(X, y, month_threshold=24110)
# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)

In [30]:
def tf_split(X, y):
    return split_df(X,y, month_threshold=24110)

br = BayesianRidge()
br_model = ML_Model(br, X, y, tf_split)
br_model.fit()
br_model.predict()
br_model.get_metrics()

Adding some metrics: mse, R_squared, Explained variance
{'MSE': 15768776911.158035, 'Explained Variance': -0.5954210504417528, 'R^2': -0.6056233903565931, 'MAE': 100481.18267924953}


In [31]:
br_model.rolling_predict(12)

(12175, 563) (12175,)
(12736, 563) (12736,)
(14389, 563) (14389,)
(19602, 563) (19602,)
(25735, 563) (25735,)
(28443, 563) (28443,)
(34086, 563) (34086,)
(32066, 563) (32066,)
(52830, 563) (52830,)
(55198, 563) (55198,)
(35192, 563) (35192,)
(38551, 563) (38551,)
(35359, 563) (35359,)
(28924, 563) (28924,)
(29179, 563) (29179,)
(29298, 563) (29298,)
(27639, 563) (27639,)
(26955, 563) (26955,)
(26952, 563) (26952,)
(31263, 563) (31263,)
(34123, 563) (34123,)
(21692, 563) (21692,)
Adding some metrics: mse, R_squared, Explained variance


In [32]:
br_model.get_metrics()

{'MSE': 15768776911.158035, 'Explained Variance': -0.5954210504417528, 'R^2': -0.6056233903565931, 'MAE': 100481.18267924953, '(Rolling mean) MAE': 25000.00606635096, '(Rolling mean) MSE': 1226947852.25971, '(Rolling mean) R^2': 0.8809032025415567, '(Rolling mean) Explained Variance': 0.8329001530126152}


### Checking Model Assumptions

In this section, we look at whether the fitted model satifies assumptions in the underlying model. 

For example, regression requires data to be homoscedastic. We observe this in a residual plot or QQ-plot, where residuals should look like a random variable with mean 0 while QQ-plot should show a straight line.