In [5]:
import numpy as np
import math
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, PredefinedSplit

In [6]:
# models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.svm import LinearSVR

### Loading cleaned and encoded data

In [7]:
# Read the pre-processed train and test tables in one pass.
df_train, df_test = map(
    pd.read_csv,
    ('../data/train_modelling.csv', '../data/test_modelling.csv'),
)

### Training

In [8]:
# Remove the 'Unnamed: 0' column (presumably an index column written out with
# the CSV - confirm against the export step).
# errors='ignore' keeps this cell safe to re-run: df_train is overwritten in
# place, so without it a second execution raises KeyError because the column
# is already gone.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')

# Separate the feature matrix from the regression target.
X_train = df_train.drop(columns=['price'])
y_train = df_train['price']

In [9]:
# Seed NumPy's global RNG so the shuffled row order is reproducible.
np.random.seed(0)

# Positional row indices 0..n-1, shuffled in place.
n_rows = len(X_train)
idxs = np.arange(n_rows)
np.random.shuffle(idxs)

In [10]:
# The first 20% of the shuffled positions become the validation set,
# the remainder the training set.
split = int(0.2 * X_train.shape[0])

train_idxs = idxs[split:]
valid_idxs = idxs[:split]

# Fold labels for PredefinedSplit, one per row of df_train:
#   -1 -> row is never placed in a test fold (training only)
#    0 -> row belongs to test fold 0 (validation)
# Built with a single vectorised assignment; the previous
# `x in train_idxs` membership test scanned the whole index array for every
# row, which is quadratic in the number of rows.
fold_labels = np.zeros(df_train.shape[0], dtype=int)
fold_labels[train_idxs] = -1
split_idxs = fold_labels.tolist()  # keep the original list-of-int type
# NOTE(review): these labels are positions in the full, pre-split frame, so
# `ps` lines up with the complete feature matrix rather than with the reduced
# X_train assigned below - confirm how later cells use it.
ps = PredefinedSplit(test_fold = split_idxs)

X_valid = X_train.iloc[valid_idxs]
y_valid = y_train.iloc[valid_idxs]

# X_train / y_train are overwritten with their training subset here, so this
# cell must not be re-run without re-running the cells above it first.
X_train = X_train.iloc[train_idxs]
y_train = y_train.iloc[train_idxs]

In [11]:
# Fit the standardiser on the training rows only, then apply the same
# transform to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_valid = scaler.transform(X_valid)

### Linear Regression

In [12]:
# Ordinary least-squares baseline; fit() returns the estimator itself.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [13]:
# RMSE of the linear model on the training and validation splits.
# The root is taken explicitly with np.sqrt: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same value on every version.
yt_pred = lin_reg.predict(X_train)
train_error = np.sqrt(mean_squared_error(y_train, yt_pred))
print(f'training RMSE = {train_error:.2f}')
yv_pred = lin_reg.predict(X_valid)
valid_error = np.sqrt(mean_squared_error(y_valid, yv_pred))
print(f'valid RMSE    = {valid_error:.2f}')

training RMSE = 3593510.33
valid RMSE    = 68073532.22


In [14]:
# Mean absolute percentage error of the most recent predictions, per split.
train_pct_err = (abs(yt_pred - y_train) / y_train) * 100
print(f'Percentage error for training data {train_pct_err.mean()}')
valid_pct_err = (abs(yv_pred - y_valid) / y_valid) * 100
print(f'Percentage error for validation data {valid_pct_err.mean()}')

Percentage error for training data 73.62269996977588
Percentage error for validation data 74.14818710382792


### Decision Tree

In [15]:
# Regression tree capped at depth 5; fit() returns the estimator itself.
tree_reg = DecisionTreeRegressor(max_depth=5).fit(X_train, y_train)
tree_reg

In [16]:
# RMSE of the decision tree on the training and validation splits.
# The root is taken explicitly with np.sqrt: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same value on every version.
yt_pred = tree_reg.predict(X_train)
train_error = np.sqrt(mean_squared_error(y_train, yt_pred))
print(f'training RMSE = {train_error:.2f}')
yv_pred = tree_reg.predict(X_valid)
valid_error = np.sqrt(mean_squared_error(y_valid, yv_pred))
print(f'valid RMSE    = {valid_error:.2f}')

training RMSE = 1507351.33
valid RMSE    = 84963750.75


In [17]:
# Mean absolute percentage error of the most recent predictions, per split.
train_pct_err = (abs(yt_pred - y_train) / y_train) * 100
print(f'Percentage error for training data {train_pct_err.mean()}')
valid_pct_err = (abs(yv_pred - y_valid) / y_valid) * 100
print(f'Percentage error for validation data {valid_pct_err.mean()}')

Percentage error for training data 26.34317705051243
Percentage error for validation data 27.566519835244325


### Linear SVR

In [18]:
# Linear support-vector regressor with a fixed seed; fit() returns the
# estimator itself.
# NOTE(review): the percentage errors recorded for this model are ~99% on both
# splits, which suggests it has barely fit the target - confirm whether the
# target needs scaling or the solver needs more iterations.
svm_reg = LinearSVR(random_state = 0, max_iter = 1000).fit(X_train, y_train)
svm_reg

In [19]:
# RMSE of the linear SVR on the training and validation splits.
# The root is taken explicitly with np.sqrt: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) yields the same value on every version.
yt_pred = svm_reg.predict(X_train)
train_error = np.sqrt(mean_squared_error(y_train, yt_pred))
print(f'training RMSE = {train_error:.2f}')
yv_pred = svm_reg.predict(X_valid)
valid_error = np.sqrt(mean_squared_error(y_valid, yv_pred))
print(f'valid RMSE    = {valid_error:.2f}')

training RMSE = 334392736.09
valid RMSE    = 85126648.05


In [20]:
# Mean absolute percentage error of the most recent predictions, per split.
train_pct_err = (abs(yt_pred - y_train) / y_train) * 100
print(f'Percentage error for training data {train_pct_err.mean()}')
valid_pct_err = (abs(yv_pred - y_valid) / y_valid) * 100
print(f'Percentage error for validation data {valid_pct_err.mean()}')

Percentage error for training data 98.91915438979035
Percentage error for validation data 98.90214423217678
