In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Function to calculate Mean Absolute Percentage Error (MAPE)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: Array-like of observed target values.
        y_pred: Array-like of predicted values, broadcastable against
            ``y_true``.

    Returns:
        The mean of ``|y_true - y_pred| / |y_true|`` multiplied by 100.

    Note:
        A zero in ``y_true`` would make the ratio ``0/0`` (NaN) or ``x/0``
        (inf) and poison the whole mean. The denominator is therefore
        clamped to machine epsilon: a perfectly predicted zero contributes
        0, and a mispredicted zero contributes a very large but finite
        error. Results are unchanged when every target is non-zero.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Clamp so that zero-valued targets cannot trigger a division by zero.
    denominator = np.maximum(np.abs(y_true), np.finfo(float).eps)
    return np.mean(np.abs(y_true - y_pred) / denominator) * 100

# Read the property dataset from disk
data = pd.read_csv('Final filtered copy.csv')

# Separate the regression target from the predictor columns
y = data['purchase_price']
X = data.drop(columns=['purchase_price'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# Columns that are standardised before modelling
numerical_features = [
    'area',
    'median_house_price(2021)',
    'median_house_rent(per week)',
    'time_to_CBD[townhall]',
    'population',
]

# Columns that are one-hot encoded before modelling
categorical_features = [
    'council_name',
    'address',
    'locality',
    'post_code',
    'property_type',
    'area_type',
    'settlement_date',
    'Primary_purpose',
    'region',
]

# One transformer per column group
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Support vector regressor with default settings (tuned by the grid search)
svm_model = SVR()

# Chain preprocessing and the regressor into a single estimator
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', svm_model),
])

# Define hyperparameters to tune.
# `gamma` is a kernel coefficient that the linear kernel does not use, so
# crossing it with kernel='linear' only refits identical models. The grid is
# therefore split per kernel: 3 linear + 6 rbf candidates instead of 12.
param_grid = [
    {
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10],
    },
    {
        'model__kernel': ['rbf'],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
    },
]

# NOTE(review): the target is passed to SVR unscaled; if purchase prices are
# large, this C range may underfit. Consider scaling the target - confirm.

# Perform grid search with 5-fold cross-validation, selecting on R^2
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# Pipeline selected by the grid search
best_model = grid_search.best_estimator_

# Score the selected pipeline on the training split and on the held-out split
train_score = best_model.score(X_train, y_train)
test_score = best_model.score(X_test, y_test)

print(f"Best Model Train Score: {train_score}")
print(f"Best Model Test Score: {test_score}")

# Held-out predictions, used for the percentage-error metric
y_pred = best_model.predict(X_test)

# Report the mean absolute percentage error on the test split
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)
print("Mean Absolute Percentage Error:", mape)

Best Model Train Score: 0.3383930885259956
Best Model Test Score: 0.33401405151936103
Mean Absolute Percentage Error: 31.402426335015587
