# Importing / sense-checking

In [4]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Get multiple outputs in the same cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


# Import the CSV file with Pandas.
# data = pd.read_csv('Data/raw_sales.csv', index_col=['datesold'], parse_dates=['datesold'])
# NOTE: 'Data/raw_sales.csv' does not exist, so the file is read from the working directory instead. Please confirm this is the correct path.
data = pd.read_csv('raw_sales.csv', index_col=['datesold'], parse_dates=['datesold'])

# View the DataFrame.
print(data.shape)
print(data.dtypes)
data

(29580, 4)
postcode         int64
price            int64
propertyType    object
bedrooms         int64
dtype: object


Unnamed: 0_level_0,postcode,price,propertyType,bedrooms
datesold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2007-02-07,2607,525000,house,4
2007-02-27,2906,290000,house,3
2007-03-07,2905,328000,house,3
2007-03-09,2905,380000,house,4
2007-03-21,2906,310000,house,3
...,...,...,...,...
2019-07-25,2900,500000,unit,3
2019-07-25,2612,560000,unit,2
2019-07-26,2912,464950,unit,2
2019-07-26,2601,589000,unit,2


In [None]:
# Checking the number of distinct values in each column

data.nunique()

In [None]:
# Checking for missing values

data.isna().sum()

# The dataset does not contain any missing (NA) values

In [None]:
# Plotting house prices over time

prices_over_time = data.reset_index()

# Create and size the figure *before* drawing. The original
# `plt.plot(figsize=(12, 4))` call had no data, so it drew nothing and
# never resized the figure.
plt.figure(figsize=(12, 4))

# ci=None is the documented way to switch the confidence band off
# (ci=False is not a documented value).
sns.lineplot(x='datesold', y='price', ci=None, data=prices_over_time)

plt.title('House prices over time')
plt.xlabel('Date Sold')
plt.ylabel('Price')

In [None]:
# Bar chart of how many sales there are for each bedroom count

# as_index=False keeps 'bedrooms' as a regular column, so no reset_index() is needed.
bedroom_count = data.groupby('bedrooms', as_index=False).count()

# Every non-key column holds the per-group row count; 'postcode' is used as the bar height.
ax = sns.barplot(data=bedroom_count, x='bedrooms', y='postcode')
ax.set_title('Count of bedrooms')
ax.set_xlabel('Number of bedrooms')
ax.set_ylabel('Bedroom count')

In [None]:
# One subset of the sales data per bedroom count (0 to 5)

bedroom_0 = data.loc[data['bedrooms'].eq(0)]
bedroom_1 = data.loc[data['bedrooms'].eq(1)]
bedroom_2 = data.loc[data['bedrooms'].eq(2)]
bedroom_3 = data.loc[data['bedrooms'].eq(3)]
bedroom_4 = data.loc[data['bedrooms'].eq(4)]
bedroom_5 = data.loc[data['bedrooms'].eq(5)]


In [None]:
# Plotting the price distribution of every bedroom subset (0 to 5) as histograms

fig, axes = plt.subplots(2, 3, figsize=(20, 20))

bedroom_subsets = [bedroom_0, bedroom_1, bedroom_2, bedroom_3, bedroom_4, bedroom_5]

# axes.flat walks the 2x3 grid row by row, which keeps the original panel order.
for n_bedrooms, (subset, ax) in enumerate(zip(bedroom_subsets, axes.flat)):
    sns.histplot(subset['price'], ax=ax)
    # Title each panel; without it the six histograms cannot be told apart.
    ax.set_title('{} bedroom(s)'.format(n_bedrooms))


# Analyzing spread of values 

In [None]:
# Creating a boxplot to identify outliers for 1 bedroom

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_1, ax=ax)
# Title the figure so it can be told apart from the other bedroom boxplots.
ax.set_title('Price distribution: 1-bedroom properties')


In [None]:
# Creating a boxplot to identify outliers for 2 bedrooms

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_2, ax=ax)
# Title the figure so it can be told apart from the other bedroom boxplots.
ax.set_title('Price distribution: 2-bedroom properties')


In [None]:
# Creating a boxplot to identify outliers for 3 bedrooms

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_3, ax=ax)
# Title the figure so it can be told apart from the other bedroom boxplots.
ax.set_title('Price distribution: 3-bedroom properties')


In [None]:
# Creating a boxplot to identify outliers for 4 bedrooms

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_4, ax=ax)
# Title the figure so it can be told apart from the other bedroom boxplots.
ax.set_title('Price distribution: 4-bedroom properties')


In [None]:
# Creating a boxplot to identify outliers for 5 bedrooms

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_5, ax=ax)
# Title the figure so it can be told apart from the other bedroom boxplots.
ax.set_title('Price distribution: 5-bedroom properties')


# Identifying and removing outliers

In [None]:
# Define function to filter out outliers
def remove_price_outlier(df, k=1.5):
    """Return the rows of ``df`` whose ``price`` lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1/Q3 are the
    25th/75th percentiles of ``df['price']`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``price`` column.
    k : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``lower_limit <= price <= upper_limit``;
        the original index is preserved.
    """
    prices = df['price']

    # Quartiles, IQR and the resulting fences
    q1 = prices.quantile(0.25)
    q3 = prices.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - k * iqr
    upper_limit = q3 + k * iqr

    # The bounds are inclusive: a value sitting exactly on a fence is not an
    # outlier. With strict comparisons a group whose IQR is 0 (e.g. all prices
    # identical) would lose every row, because both fences equal that price.
    return df[prices.between(lower_limit, upper_limit)]

In [None]:
# Identifying and removing outliers: bedroom_1

bedroom_1_non_outlier = remove_price_outlier(bedroom_1)
bedroom_1_non_outlier

# Boxplot of the 1-bedroom prices after outlier removal

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_1_non_outlier, ax=ax)
ax.set_title('Price distribution after outlier removal: 1-bedroom properties')


In [None]:
# Identifying and removing outliers: bedroom_2

bedroom_2_non_outlier = remove_price_outlier(bedroom_2)
bedroom_2_non_outlier

# Boxplot of the 2-bedroom prices after outlier removal
# (the original comment was copy-pasted and said "1 bedroom")

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_2_non_outlier, ax=ax)
ax.set_title('Price distribution after outlier removal: 2-bedroom properties')


In [None]:
# Identifying and removing outliers: bedroom_3

bedroom_3_non_outlier = remove_price_outlier(bedroom_3)
bedroom_3_non_outlier

# Boxplot of the 3-bedroom prices after outlier removal
# (the original comment was copy-pasted and said "1 bedroom")

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_3_non_outlier, ax=ax)
ax.set_title('Price distribution after outlier removal: 3-bedroom properties')


In [None]:
# Identifying and removing outliers: bedroom_4

bedroom_4_non_outlier = remove_price_outlier(bedroom_4)
bedroom_4_non_outlier

# Boxplot of the 4-bedroom prices after outlier removal
# (the original comment was copy-pasted and said "1 bedroom")

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_4_non_outlier, ax=ax)
ax.set_title('Price distribution after outlier removal: 4-bedroom properties')


In [None]:
# Identifying and removing outliers: bedroom_5

bedroom_5_non_outlier = remove_price_outlier(bedroom_5)
bedroom_5_non_outlier

# Boxplot of the 5-bedroom prices after outlier removal
# (the original comment was copy-pasted and said "1 bedroom")

fig, ax = plt.subplots(figsize=(18, 4))
sns.boxplot(x='price', data=bedroom_5_non_outlier, ax=ax)
ax.set_title('Price distribution after outlier removal: 5-bedroom properties')


# Time-series-Forecasting: 1 Bedroom

In [None]:
# Time series of the 1-bedroom sale prices (outliers already removed)

bedroom_1_non_outlier

# Series.plot returns the Axes it drew on; legend and title are set on it directly.
ax = bedroom_1_non_outlier['price'].plot(figsize=(12, 4))
ax.legend(loc='best')
ax.set_title('Time series plot for house with 1 bedroom')
plt.show(block=False)

In [None]:
# Resampling the dataset to monthly means

# Average the numeric columns only: 'propertyType' is text and cannot be
# averaged (recent pandas versions raise instead of silently dropping it).
bedroom_1_res = bedroom_1_non_outlier.select_dtypes('number').resample('M').mean()
bedroom_1_res.isna().sum()

# Drop the rows with missing values (reassigned instead of mutated in place,
# so re-running the cell is idempotent).
bedroom_1_res = bedroom_1_res.dropna()
bedroom_1_res.isna().sum()


# Plot the time series data.

plt.figure(figsize=(12, 4))
sns.lineplot(x='datesold', y='price', data=bedroom_1_res)


In [2]:
# Resampling the 2-bedroom dataset to monthly means

# Average the numeric columns only: 'propertyType' is text and cannot be
# averaged (recent pandas versions raise instead of silently dropping it).
bedroom_2_res = bedroom_2_non_outlier.select_dtypes('number').resample('M').mean()
bedroom_2_res.isna().sum()

# Drop the rows with missing values (reassigned instead of mutated in place,
# so re-running the cell is idempotent).
bedroom_2_res = bedroom_2_res.dropna()
bedroom_2_res.isna().sum()


# Plot the time series data.
# The original plotted bedroom_1_res here, which is not what this cell computes.

plt.figure(figsize=(12, 4))
sns.lineplot(x='datesold', y='price', data=bedroom_2_res)


NameError: name 'bedroom_2_non_outlier' is not defined

In [None]:
# Discussed in tutorial video
# Function to calculate and plot the simple moving average 
def plot_moving_average(series, window, plot_intervals=False, scale=1.96):
    """Plot the simple moving average of ``series`` alongside the actual values.

    Parameters
    ----------
    series : pandas.Series
        Values to smooth (this notebook passes a resampled ``price`` series).
    window : int
        Number of consecutive points averaged for each smoothed value.
    plot_intervals : bool, default False
        If True, also draw an upper and a lower bound around the moving
        average, computed as ``rolling_mean +/- (MAE + scale * std)`` where
        MAE and std are taken over ``actual - rolling_mean``.
    scale : float, default 1.96
        Multiplier applied to the standard deviation of the residuals.

    Returns
    -------
    None
        The function only draws on a new matplotlib figure.
    """
    rolling_mean = series.rolling(window=window).mean()

    # The rolling mean is NaN for the first ``window - 1`` positions, so the
    # first smoothed value sits at position ``window - 1`` (slicing from
    # ``window`` skipped one valid point).
    first_valid = max(window - 1, 0)
    actual = series.iloc[first_valid:]
    smoothed = rolling_mean.iloc[first_valid:]

    plt.figure(figsize=(12, 4))
    plt.title("Moving average\n window size = {}".format(window))
    plt.plot(rolling_mean, 'g', label='Simple moving average trend')

    # Plot confidence intervals for smoothed values.
    # Skipped when the window is longer than the series: there is nothing to
    # compare, and mean_absolute_error raises on empty input.
    if plot_intervals and len(actual) > 0:
        mae = mean_absolute_error(actual, smoothed)
        deviation = np.std(actual - smoothed)
        margin = mae + scale * deviation
        lower_bound = rolling_mean - margin
        upper_bound = rolling_mean + margin
        plt.plot(upper_bound, 'r--', label='Upper bound / Lower bound')
        plt.plot(lower_bound, 'r--')

    plt.plot(actual, label='Actual values')
    plt.legend(loc='best')
    plt.grid(True)
    
# Smooth the 1-bedroom monthly prices over 5, 30 and 90 data points
for window_size in (5, 30, 90):
    plot_moving_average(bedroom_1_res.price, window_size, plot_intervals=True)

# Time-series-Forecasting: 2 Bedroom

In [None]:
# Time series of the 2-bedroom sale prices (outliers already removed)

bedroom_2_non_outlier

# Series.plot returns the Axes it drew on; legend and title are set on it directly.
ax = bedroom_2_non_outlier['price'].plot(figsize=(12, 4))
ax.legend(loc='best')
ax.set_title('Time series plot for house with 2 bedroom')
plt.show(block=False)

In [None]:
# Resampling the dataset to monthly means

# Average the numeric columns only: 'propertyType' is text and cannot be
# averaged (recent pandas versions raise instead of silently dropping it).
bedroom_2_res = bedroom_2_non_outlier.select_dtypes('number').resample('M').mean()
bedroom_2_res.isna().sum()

# Drop the rows with missing values (reassigned instead of mutated in place,
# so re-running the cell is idempotent).
bedroom_2_res = bedroom_2_res.dropna()
bedroom_2_res.isna().sum()


# Plot the time series data.

plt.figure(figsize=(12, 4))
sns.lineplot(x='datesold', y='price', data=bedroom_2_res)


In [None]:
# plot_moving_average() is already defined in the 1-bedroom section above.
# The identical copy that used to live in this cell only shadowed that
# definition, so the existing function is reused here.

# Smooth the 2-bedroom monthly prices over 5, 30 and 90 data points
for window_size in (5, 30, 90):
    plot_moving_average(bedroom_2_res.price, window_size, plot_intervals=True)

# Time-series-Forecasting: 3 Bedroom


In [None]:
# Time series of the 3-bedroom sale prices (outliers already removed)

bedroom_3_non_outlier

# Series.plot returns the Axes it drew on; legend and title are set on it directly.
ax = bedroom_3_non_outlier['price'].plot(figsize=(12, 4))
ax.legend(loc='best')
ax.set_title('Time series plot for house with 3 bedrooms')
plt.show(block=False)

In [None]:
# Resampling the dataset to monthly means

# Average the numeric columns only: 'propertyType' is text and cannot be
# averaged (recent pandas versions raise instead of silently dropping it).
bedroom_3_res = bedroom_3_non_outlier.select_dtypes('number').resample('M').mean()
bedroom_3_res.isna().sum()

# Drop the rows with missing values (reassigned instead of mutated in place,
# so re-running the cell is idempotent).
bedroom_3_res = bedroom_3_res.dropna()
bedroom_3_res.isna().sum()


# Plot the time series data.

plt.figure(figsize=(12, 4))
sns.lineplot(x='datesold', y='price', data=bedroom_3_res)


In [None]:
# plot_moving_average() is already defined in the 1-bedroom section above.
# The identical copy that used to live in this cell only shadowed that
# definition, so the existing function is reused here.

# Smooth the 3-bedroom monthly prices over 5, 30 and 90 data points
for window_size in (5, 30, 90):
    plot_moving_average(bedroom_3_res.price, window_size, plot_intervals=True)

# Time-series-Forecasting: 4 Bedroom


In [None]:
# Time series of the 4-bedroom sale prices (outliers already removed)

bedroom_4_non_outlier

# Series.plot returns the Axes it drew on; legend and title are set on it directly.
ax = bedroom_4_non_outlier['price'].plot(figsize=(12, 4))
ax.legend(loc='best')
# The title previously said "3 bedrooms" (copy-paste slip); this is the 4-bedroom series.
ax.set_title('Time series plot for house with 4 bedrooms')
plt.show(block=False)

In [None]:
# Resampling the dataset to monthly means

# Average the numeric columns only: 'propertyType' is text and cannot be
# averaged (recent pandas versions raise instead of silently dropping it).
bedroom_4_res = bedroom_4_non_outlier.select_dtypes('number').resample('M').mean()
bedroom_4_res.isna().sum()

# Drop the rows with missing values (reassigned instead of mutated in place,
# so re-running the cell is idempotent).
bedroom_4_res = bedroom_4_res.dropna()
bedroom_4_res.isna().sum()


# Plot the time series data.

plt.figure(figsize=(12, 4))
sns.lineplot(x='datesold', y='price', data=bedroom_4_res)


In [None]:
# plot_moving_average() is already defined in the 1-bedroom section above.
# The identical copy that used to live in this cell only shadowed that
# definition, so the existing function is reused here.

# Smooth the 4-bedroom monthly prices over 5, 30 and 90 data points
for window_size in (5, 30, 90):
    plot_moving_average(bedroom_4_res.price, window_size, plot_intervals=True)

# Time-series-Forecasting: 5 Bedroom


In [None]:
# Time series of the 5-bedroom sale prices (outliers already removed)

bedroom_5_non_outlier

# Series.plot returns the Axes it drew on; legend and title are set on it directly.
ax = bedroom_5_non_outlier['price'].plot(figsize=(12, 4))
ax.legend(loc='best')
ax.set_title('Time series plot for house with 5 bedrooms')
plt.show(block=False)

In [None]:
# Resampling the dataset to monthly means

# Average the numeric columns only: 'propertyType' is text and cannot be
# averaged (recent pandas versions raise instead of silently dropping it).
bedroom_5_res = bedroom_5_non_outlier.select_dtypes('number').resample('M').mean()
bedroom_5_res.isna().sum()

# Drop the rows with missing values (reassigned instead of mutated in place,
# so re-running the cell is idempotent).
bedroom_5_res = bedroom_5_res.dropna()
bedroom_5_res.isna().sum()


# Plot the time series data.
# The original plotted bedroom_4_res here, so the 5-bedroom section showed the 4-bedroom series.

plt.figure(figsize=(12, 4))
sns.lineplot(x='datesold', y='price', data=bedroom_5_res)


In [None]:
# plot_moving_average() is already defined in the 1-bedroom section above.
# The identical copy that used to live in this cell only shadowed that
# definition, so the existing function is reused here.

# Smooth the 5-bedroom monthly prices over 5, 30 and 90 data points
for window_size in (5, 30, 90):
    plot_moving_average(bedroom_5_res.price, window_size, plot_intervals=True)