In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge

# Function to calculate Mean Absolute Percentage Error (MAPE)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values. When it holds the same number of elements as
        ``y_true`` but in a different shape (e.g. an ``(n, 1)`` column versus
        an ``(n,)`` vector) it is reshaped to match, so the comparison stays
        element-wise instead of broadcasting into an ``n x n`` matrix.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100`` computed over the entries
        whose true value is non-zero. ``nan`` is returned when no such entry
        exists (empty input or all-zero targets).

    Notes
    -----
    Entries with ``y_true == 0`` have an undefined percentage error and are
    excluded rather than turning the whole result into ``inf``/``nan``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Same element count but different layout: align instead of broadcasting.
    if actual.size == predicted.size and actual.shape != predicted.shape:
        predicted = predicted.reshape(actual.shape)

    # Any remaining shape difference (e.g. a scalar prediction) follows the
    # normal NumPy broadcasting rules.
    actual, predicted = np.broadcast_arrays(actual, predicted)

    scorable = actual != 0
    if not scorable.any():
        return np.nan

    relative_error = (actual[scorable] - predicted[scorable]) / actual[scorable]
    return np.mean(np.abs(relative_error)) * 100

# Load the dataset
data = pd.read_csv('Final filtered copy.csv')

# Variable that was initially used to acquire a sample from cleaned dataset.
# It is an alias of `data` (not a copy), so the column edits below also show
# up on `data`.
sample_data = data

# Parse settlement_date (day/month/year strings) and keep only its year as a
# model feature.
sample_data['settlement_date'] = pd.to_datetime(sample_data['settlement_date'], format='%d/%m/%Y')
sample_data['settlement_year'] = sample_data['settlement_date'].dt.year

# Define features and target variable. The raw date is dropped because the
# derived settlement_year replaces it.
X = sample_data.drop(['purchase_price', 'settlement_date'], axis=1)
y = sample_data['purchase_price']

# Split the data into train and test sets (70% train / 30% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Define numerical and categorical features in their final form:
# settlement_year is treated as numeric and settlement_date is not a feature.
numerical_features = [
    'area',
    'median_house_price(2021)',
    'median_house_rent(per week)',
    'time_to_CBD[townhall]',
    'population',
    'settlement_year',
]
# NOTE(review): 'address' is presumably close to unique per row; one-hot
# encoding it may let the model memorise training rows (the recorded run shows
# a train score of ~0.99 against a test score of ~0.61). Consider dropping it -
# confirm before changing, since that alters the reported results.
categorical_features = [
    'council_name',
    'address',
    'locality',
    'post_code',
    'property_type',
    'area_type',
    'Primary_purpose',
    'region',
]

# To display table that will be used for training model
X

Unnamed: 0,council_name,address,locality,post_code,property_type,area,area_type,Primary_purpose,median_house_price(2021),median_house_rent(per week),region,time_to_CBD[townhall],population,settlement_year
0,BLACKTOWN,307/11 SWINSON RD,Blacktown,2148,unit,100.0,M,RESIDENCE,800000,410,Western Suburbs,40,47500,2022
1,PENRITH,101/2 C LORD SHEFFIELD CCT,Penrith,2750,unit,100.0,M,RESIDENCE,750000,410,Western Suburbs,60,13500,2022
2,LIVERPOOL,27/1 BROWNE PDE,Warwick Farm,2170,unit,100.0,M,RESIDENCE,810000,400,Western Suburbs,60,6000,2023
3,PENRITH,201/2 C LORD SHEFFIELD CCT,Penrith,2750,unit,100.0,M,RESIDENCE,750000,410,Western Suburbs,60,13500,2022
4,CAMPBELLTOWN,29/12 TYLER ST,Campbelltown,2560,unit,100.0,M,RESIDENCE,650000,400,Western Suburbs,65,13000,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135452,PENRITH,113 BENNETT RD,Londonderry,2753,house,43550.0,M,RESIDENCE,1450000,470,Western Suburbs,110,4000,2022
135453,HORNSBY,216 NEW LINE RD,Dural,2158,house,45240.0,M,RESIDENCE,2000000,700,Hills Shire,80,7750,2023
135454,PENRITH,1046 CASTLEREAGH RD,Castlereagh,2749,house,46580.0,M,RESIDENCE,1440500,700,Western Suburbs,120,1250,2022
135455,HAWKESBURY,958 GROSE VALE RD,Kurrajong,2758,house,48980.0,M,RESIDENCE,1200000,460,Northern Suburbs,115,3250,2022


In [8]:
# Create preprocessing pipeline:
#   numeric columns     -> median imputation, then standardisation
#   categorical columns -> one-hot encoding; categories unseen during fit are
#                          ignored at predict time instead of raising
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Only the listed columns are passed on to the model.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Define parameter grids for Ridge (the 'model__' prefix targets the pipeline
# step named 'model')
param_grid_ridge = {
    'model__alpha': [0.01, 0.1, 1, 10, 100]
}

# Define Ridge model
ridge_model = Ridge()

# Create a pipeline with preprocessing and Ridge model
ridge_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', ridge_model)])

# Initialize RandomizedSearchCV.
# The search space is a single list of discrete alphas, so n_iter is capped at
# the number of candidates: asking for more iterations than the space contains
# only triggers a scikit-learn warning and still evaluates each value once.
random_search = RandomizedSearchCV(
    ridge_pipeline, param_distributions=param_grid_ridge,
    n_iter=len(param_grid_ridge['model__alpha']), cv=5,
    scoring='neg_mean_absolute_percentage_error', n_jobs=-1, random_state=42
)

# Fit RandomizedSearchCV to the data
random_search.fit(X_train, y_train)

# Get the best model (refit on the full training set with the best alpha)
best_model = random_search.best_estimator_

# Evaluate the best model. For a regression pipeline `.score` is the
# coefficient of determination (R^2), not the MAPE used during the search.
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

print(f"Best Model Train Score: {train_score}")
print(f"Best Model Test Score: {test_score}")

# Predict on test set
y_pred = best_model.predict(X_test)

# Calculate MAPE. This helper reports a percentage, whereas the
# cross-validation scoring above is a negated fraction.
mape = mean_absolute_percentage_error(y_test, y_pred)
print("Mean Absolute Percentage Error:", mape)




Best Model Train Score: 0.9908505292902339
Best Model Test Score: 0.6054301116217647
Mean Absolute Percentage Error: 29.586126163239797
