# Modelling v1:

In [60]:
# import libraries
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.linear_model import LinearRegression

## Data loading:
Load the preprocessed data

In [61]:
# Read the pickled preprocessing output; the cells below index it by 'X' and 'y'.
# NOTE(review): unpickling runs arbitrary code — only load files from a trusted source.
with open('../data/sampled_preprocessed_data.pickle', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

What's in the data?

In [62]:
# Pull the feature matrix and the target out of the loaded dictionary.
X, y = data['X'], data['y']

In [63]:
# Build the sparse-backed DataFrame straight from the loaded matrix rather than
# from `X` itself, so re-running this cell does not fail on an already-converted X.
X = pd.DataFrame.sparse.from_spmatrix(data['X'])
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19996,19997,19998,19999,20000,20001,20002,20003,20004,20005
0,952.0,0,89.0,62.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,1.0
1,1286.0,9.0,28.0,372.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,0
2,764.0,9.0,87.0,18.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.0,1.0
3,966.0,9.0,57.0,466.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,1.0
4,952.0,9.0,5.0,385.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49724,952.0,9.0,5.0,385.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.0,0
49725,284.0,9.0,102.0,84.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.0,0
49726,952.0,5.0,5.0,457.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.0,0
49727,47.0,9.0,57.0,466.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2.0,0


In [64]:
# Wrap the target in a Series named 'Price' so it carries a label downstream.
y = pd.Series(data=y, name='Price')
y

0        15.0
1        81.0
2        10.0
3        12.0
4        24.0
         ... 
49724    44.0
49725    32.0
49726    84.0
49727    16.0
49728    51.0
Name: Price, Length: 49729, dtype: float64

## Split data into training and validation set:

Let's split the data

In [65]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

## Build model:

Train the model

In [66]:
# Baseline: a random forest with default hyperparameters trained on every feature.
# The fixed random_state makes the forest's internal randomness repeatable, so the
# metrics reported below can be reproduced on a fresh kernel run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

Let's evaluate the baseline model

In [67]:
# Predict on the held-out split
y_pred = model.predict(X_test)

# Score the predictions; RMSE and RMSLE are the square roots of MSE and MSLE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test.values, y_pred))
r_squared = r2_score(y_test.values, y_pred)

# Report each metric on its own line
for label, value in [
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("Root Mean Squared Logarithmic Error(RMSLE):", rmsle),
    ("R-squared (R2):", r_squared),
]:
    print(label, value)

Mean Squared Error (MSE): 855.3014396353589
Root Mean Squared Error (RMSE): 29.245537089192922
Root Mean Squared Logarithmic Error(RMSLE): 0.5954098365313002
R-squared (R2): 0.2816296447098078


What do our predictions look like?

In [68]:
# Predictions of the model fitted on all features, for the held-out rows.
y_pred

array([28.68, 13.3 , 22.95, ..., 14.06, 13.35, 11.66])

In [69]:
# Actual target values of the held-out rows, for comparison with the predictions.
y_test.values

array([ 5.,  7., 10., ...,  8., 13., 28.])

## Feature selection:

Select features with `SelectFromModel`, using the random forest as the estimator.

In [70]:
# Fit the selector exactly once. SelectFromModel (prefit=False) trains its own clone
# of `model` on the training data, so a second fit via fit_transform would only
# retrain another forest and discard the first one.
selector = SelectFromModel(estimator=model).fit(X_train, y_train)

# Apply the same fitted feature mask to both splits.
X_trans = selector.transform(X_train)
X_trans_test = selector.transform(X_test)

In [71]:
# Retrain a random forest on the selected features only.
# A fixed random_state keeps this run repeatable, so metric differences against the
# all-features model are not just run-to-run noise of an unseeded forest.
model = RandomForestRegressor(random_state=42)
model.fit(X_trans, y_train)

Let's evaluate the model retrained on the selected features

In [72]:
# Predict on the held-out split, reduced to the selected features
y_pred = model.predict(X_trans_test)

# Score the predictions; RMSE and RMSLE are the square roots of MSE and MSLE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
rmsle = np.sqrt(mean_squared_log_error(y_test.values, y_pred))
r_squared = r2_score(y_test.values, y_pred)

# Report each metric on its own line
for label, value in [
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("Root Mean Squared Logarithmic Error(RMSLE):", rmsle),
    ("R-squared (R2):", r_squared),
]:
    print(label, value)

Mean Squared Error (MSE): 878.4561295547949
Root Mean Squared Error (RMSE): 29.638760594107083
Root Mean Squared Logarithmic Error(RMSLE): 0.6126228641429777
R-squared (R2): 0.26218194819809504


What do our predictions look like?

In [75]:
# Predictions of the model trained on the selected features, for the held-out rows.
y_pred

array([27.245     , 14.28333333, 27.75      , ..., 13.08      ,
       15.15      ,  9.31      ])

In [76]:
# Actual target values of the held-out rows, for comparison with the predictions.
y_test.values

array([ 5.,  7., 10., ...,  8., 13., 28.])