#### Imports

In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
from ta import add_all_ta_features

# Statistics
from statsmodels.tsa.stattools import adfuller

# Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold

import matplotlib.pyplot as plt

### Data Ingestion

In [3]:
# Load the house-price records from the CSV expected in the working directory,
# report how many rows were read, and preview the first few of them.
csv_path = "SydneyHousePrices.csv"
df = pd.read_csv(csv_path)
print(f"Length of Data: {len(df)}")
df.head()

Length of Data: 199504


Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house


In [4]:
# Interpret data: summarize each column's non-null count and dtype.
# NOTE(review): in the recorded output, `bed` and `car` have fewer non-null
# values than the row count (missing data to handle), and `Date` is stored
# as object rather than a datetime dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199504 entries, 0 to 199503
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Date        199504 non-null  object 
 1   Id          199504 non-null  int64  
 2   suburb      199504 non-null  object 
 3   postalCode  199504 non-null  int64  
 4   sellPrice   199504 non-null  int64  
 5   bed         199350 non-null  float64
 6   bath        199504 non-null  int64  
 7   car         181353 non-null  float64
 8   propType    199504 non-null  object 
dtypes: float64(2), int64(4), object(3)
memory usage: 13.7+ MB


# Feature Engineering - Common Tasks

#### Handle Non-Numerical Data

In [7]:
# Cardinality check for the suburb column: collect the distinct values and
# report how many there are; the second message records the chosen encoding.
suburb_text_unique = pd.unique(df["suburb"])
print("Unique suburbs: ", len(suburb_text_unique))
print("Perform label encoding")

Unique suburbs:  685


In [9]:
# Cardinality check for the propType column: collect the distinct values and
# report how many there are; the second message records the chosen encoding.
prop_text_unique = pd.unique(df["propType"])
print("Unique Prop Types: ", len(prop_text_unique))
print("One hot encoding")

Unique Prop Types:  8


In [10]:
# Label Encoding
# Learn the set of suburb names, then translate every row's suburb into its
# integer code. The fitted encoder and the code array stay in scope under
# their original names; the codes are also stored as a new column on df.
labelencoder = LabelEncoder()
labelencoder.fit(df["suburb"])
encoded_suburbs = labelencoder.transform(df["suburb"])
df["suburbs_encoded"] = encoded_suburbs
df.head()

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburbs_encoded
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house,22
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house,22
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house,654
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house,22
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house,654


In [None]:
# One hot encoding
#
# Expand propType into indicator columns prefixed "pt_". drop_first=True omits
# the first category's column, so a row with no indicator set belongs to that
# omitted category.
onehot_encoded = pd.get_dummies(df["propType"], prefix='pt', drop_first=True)

# DataFrame.join raises ValueError when column names overlap and no suffix is
# given, so re-running this cell used to fail once the pt_* columns were
# already part of df. Dropping any earlier copies first (errors="ignore" makes
# this a no-op on the first run) keeps the cell safe to re-execute.
df = df.drop(columns=onehot_encoded.columns, errors="ignore").join(onehot_encoded)
df.tail(3)