Note: linear regression requires numerical inputs, so we drop the non-numerical `address` column and one-hot encode the categorical `flat_model` column.

In [9]:
import pandas as pd

# Load the pre-split train / test sets.
train_data = pd.read_csv('../../data/processed/train.csv')
test_data = pd.read_csv('../../data/processed/test.csv')

target_column = 'resale_price'

# Drop the address column
train_data = train_data.drop(columns=['address'])
test_data = test_data.drop(columns=['address'])

# One-hot encode 'flat_model' since it's categorical.
# The training set defines the encoding: its first category (in sorted order)
# is dropped and acts as the baseline level.
train_data = pd.get_dummies(train_data, columns=['flat_model'], drop_first=True)

# Encode *every* level present in the test set (no drop_first here). Dropping
# the first level independently could remove a different baseline than the one
# used for training whenever the two files do not contain the same categories.
test_data = pd.get_dummies(test_data, columns=['flat_model'])

# Index.difference returns the remaining column labels in sorted order.
feature_columns = train_data.columns.difference([target_column]).tolist()

# Only dummy columns may legitimately be absent from the test set (a category
# that never occurs there). Any other missing column is a data problem and
# must not be silently filled with zeros.
missing_in_test = [
    column
    for column in feature_columns + [target_column]
    if column not in test_data.columns and not column.startswith('flat_model_')
]
if missing_in_test:
    raise KeyError(f"Columns missing from test data: {missing_in_test}")

# Align the test columns to the training layout:
#   * the baseline dummy and categories unseen during training are dropped
#     (rows of an unseen category end up encoded like the baseline: all zeros),
#   * dummy columns absent from the test set are added, filled with 0,
#   * column order matches the training data.
test_data = test_data.reindex(columns=feature_columns + [target_column], fill_value=0)

X_train = train_data[feature_columns]
y_train = train_data[target_column]

X_test = test_data[feature_columns]
y_test = test_data[target_column]

In [None]:
from sklearn.linear_model import LinearRegression

# Initialize and train the Linear Regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# Report the fitted parameters so that the output shown below this cell is
# reproducible from the code on a fresh "Restart & Run All".
print("Model Intercept:", lin_reg.intercept_)
print("Model Coefficients:")
# coef_ follows the column order of the frame the model was fitted on.
for feature_name, coefficient in zip(X_train.columns, lin_reg.coef_):
    print(f"  {feature_name}: {coefficient}")



Model Intercept: -379370.24410751753
Model Coefficients:
  flat_model_3Gen: 77425.59206651816
  flat_model_Adjoined flat: 203673.3455300664
  flat_model_Apartment: 93068.84142226439
  flat_model_DBSS: 273055.07770403
  flat_model_Improved: 78363.0795099695
  flat_model_Improved-Maisonette: 261544.1550661788
  flat_model_Maisonette: 172643.90720298231
  flat_model_Model A: 72133.02057805915
  flat_model_Model A-Maisonette: 231954.4270586484
  flat_model_Model A2: 13978.795142852077
  flat_model_Multi Generation: 193153.05960149624
  flat_model_New Generation: 86995.26235843432
  flat_model_Premium Apartment: 59466.51884053476
  flat_model_Premium Apartment Loft: 353797.055967996
  flat_model_Premium Maisonette: 132313.06913847083
  flat_model_Simplified: 84818.73755323792
  flat_model_Standard: 132921.08407015604
  flat_model_Terrace: 540003.1044821681
  flat_model_Type S1: 448677.22718826105
  flat_model_Type S2: 493866.1437062395
  flat_type_encoded: 20818.259274314478
  floor_area_sq

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Score the held-out set with the fitted linear model
y_pred = lin_reg.predict(X_test)

# Attach the predictions to the test frame as a new column
test_data['predicted_price'] = y_pred

# Error metrics: MAE, and RMSE obtained as the square root of the MSE
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nEvaluation Metrics for Linear Regression:")
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE):", mae),
    ("Root Mean Squared Error (RMSE):", rmse),
):
    print(metric_label, metric_value)




Evaluation Metrics for Linear Regression:
Mean Absolute Error (MAE): 73850.2557691586
Root Mean Squared Error (RMSE): 99476.89673165572
