In [1]:
# Basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Import the CSV Data

In [3]:
# TODO: run the feature-engineering step upstream and load its output here instead.
# Read the processed apartment-sales CSV and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='./data/processed_scraped_apartment_sales_processed.csv',
)
df.head()

Unnamed: 0.1,Unnamed: 0,region,price_sold_sek,number_of_rooms,area_size,floor_number,has_elevator,year_built,annual_fee_sek,annual_cost_sek,...,region_processed_årsta,region_processed_östermalm,has_balcony_nej,has_balcony_unknown,cleaned_floor_number,brokerage_firm_processed_erik olsson fastighetsförmedling,brokerage_firm_processed_fastighetsbyrån stockholm - södermalm,brokerage_firm_processed_länsförsäkringar fastighetsförmedling solna,brokerage_firm_processed_notar,brokerage_firm_processed_tradition mäkleri
0,0,bagarmossen,4400000,3.0,74,2 av 3,False,1953.0,4899,9000,...,False,False,False,False,2.0,False,False,False,False,False
1,1,åby,1015000,1.0,25,4 av 4,True,1961.0,2265,3120,...,False,False,False,True,4.0,False,False,False,False,False
2,2,vasastan / hagastaden,7450000,3.0,725,unknown,False,2023.0,4428,0,...,False,False,False,False,,False,False,False,False,False
3,3,hässelby gård,1395000,1.0,40,2 av 3,True,1955.0,3250,4500,...,False,False,False,False,2.0,False,False,False,False,False
4,4,södermalm,3150000,1.0,31,1,True,1963.0,1290,6804,...,False,False,False,False,1.0,False,False,False,True,False


Reference tutorial this notebook follows (Krish Naik): https://www.youtube.com/watch?v=gqqGdu1P2FM&t=1706s&ab_channel=KrishNaik

## Split Data

In [None]:
# Features: every column except the target. The "Unnamed: *" columns are
# leftover row indices from earlier CSV round-trips (in the preview they simply
# mirror the row number), so they carry no signal and are excluded as well.
index_artifact_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
X = df.drop(columns=['price_sold_sek', *index_artifact_cols])

# Target as a 1-D Series rather than a one-column DataFrame: scikit-learn
# regressors expect a 1-D y (a column vector triggers conversion warnings and
# yields 2-D predictions for some estimators).
y = df['price_sold_sek']

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Build a ColumnTransformer with two branches:
#   - object (string) columns -> one-hot encoding
#   - all remaining columns   -> standardisation
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

numeric_transformer = StandardScaler()
# handle_unknown="ignore": a category that only appears outside the data the
# encoder was fitted on is encoded as all zeros instead of raising at
# transform time.
oh_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
  [
    ("OneHotEncoder", oh_transformer, cat_features),
    ("StandardScaler", numeric_transformer, num_features)
  ]
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Apply the preprocessor defined above. Without this step the raw string columns
# reach `model.fit` and the estimators raise on non-numeric input.
# It is fitted on the training split only, so no test-set statistics leak into
# the scaling or the encoder vocabulary.
# NOTE(review): `transform` raises on categories that were not seen during
# `fit` unless the encoder is configured with handle_unknown="ignore".
# NOTE(review): the data preview shows a missing `cleaned_floor_number`; the
# scaler passes NaN through, so estimators without native NaN support will
# still need an imputation step -- TODO confirm how missing values should be handled.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

## Train and Test Models

In [None]:
def evaluate_model(observed_values, predictions):
  """Compute, print and return regression metrics for one set of predictions.

  Args:
    observed_values: Ground-truth target values.
    predictions: Model predictions aligned with ``observed_values``.

  Returns:
    A ``(mae, rmse, r2)`` tuple, in the order the training loop unpacks it.
  """
  mse = mean_squared_error(observed_values, predictions)
  # Take the root of the MSE directly instead of passing `squared=False`, which
  # is deprecated/removed in recent scikit-learn releases. For a single target
  # the two are identical.
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(observed_values, predictions)
  r2 = r2_score(observed_values, predictions)

  print('Baseline results:')
  print(f"MSE: {mse} \nRMSE: {rmse} \nMAE: {mae} \nR-squared: {r2}")

  # Callers unpack three values; previously nothing was returned, so the
  # unpacking raised a TypeError.
  return mae, rmse, r2

In [None]:
# Candidate regressors, all with default hyper-parameters (CatBoost is silenced
# so it does not flood the cell output).
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost": AdaBoostRegressor()
}

# Parallel lists consumed by the results table: model name and its test R2.
model_list = []
r2_list = []

# Iterate over (name, estimator) pairs directly rather than rebuilding the
# key/value lists and indexing into them on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model (fits the estimator stored in `models` in place)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set:')
    print("- Root Mean Square Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print("-------------------------------------")

    print('Model performance for Test set:')
    print("- Root Mean Square Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))

    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

## Results

In [None]:
# Leaderboard of test-set R2 per model, best first. Both lists are needed:
# zipping `model_list` alone yields one-element rows, which cannot fill the two
# requested columns.
pd.DataFrame(
    {'Model Name': model_list, 'R2_Score': r2_list}
).sort_values(by=['R2_Score'], ascending=False)

# Linear Regression (or best performing model)

Reference tutorial for this section (Krish Naik): https://www.youtube.com/watch?v=gqqGdu1P2FM&t=1706s&ab_channel=KrishNaik

## Plot Predicted & Observed Values

In [None]:
# Test-set predictions of the linear-regression model fitted in the training
# loop; the following cells reuse `y_pred`. Flattened to 1-D so it lines up
# with the target regardless of whether the estimator returns a column vector.
# NOTE(review): swap the key for the best-performing model once the results
# table has been inspected.
y_pred = np.ravel(models["Linear Regression"].predict(X_test))

plt.scatter(y_test, y_pred)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Predicted vs. actual values (test set)');

In [None]:
# Actual vs. predicted values with a fitted regression line; the confidence
# band is switched off.
sns.regplot(
    x=y_test,
    y=y_pred,
    color='red',
    ci=None,
)

## Difference Between Actual & Predicted Values

In [None]:
# Flatten both sides to 1-D first: a one-column DataFrame target or a
# column-vector prediction cannot be used as a single DataFrame column, and
# subtracting mismatched shapes either fails or misaligns.
actual_values = np.ravel(y_test)
predicted_values = np.ravel(y_pred)

# Keep the test rows' original index so each row can be traced back to `df`.
pred_df = pd.DataFrame(
    {
        'Actual Value': actual_values,
        'Predicted Value': predicted_values,
        'Difference': actual_values - predicted_values,
    },
    index=y_test.index,
)
pred_df