# Modelling v2:

In [40]:
# import libraries
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.linear_model import LinearRegression

## Data loading:
Load the preprocessed data

In [41]:
with open('../data/sample_vector_preprocessed_data.pickle','rb') as f:
    data= pickle.load(f)

What's in the data?

In [42]:
X=data['X']
y=data['y']

In [43]:
X.shape

(4974, 606)

In [44]:
X= pd.DataFrame(X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,596,597,598,599,600,601,602,603,604,605
0,324.0,0.0,79.0,37.0,-1.188922,-0.397173,0.711517,0.897947,0.594289,-2.418236,...,4.609313,-3.583575,-0.517026,1.065499,0.599137,-0.655274,-3.273834,0.590654,1.0,1.0
1,421.0,9.0,25.0,247.0,-2.331425,1.362068,0.291434,1.624530,0.519991,-0.420716,...,2.812929,-2.047805,-1.155372,1.072128,0.394988,-1.142712,-2.544832,1.192891,1.0,0.0
2,243.0,9.0,77.0,11.0,-2.244881,0.094330,0.392513,2.171628,1.157055,-1.029521,...,5.884979,-4.030570,-0.427418,2.056970,0.972062,-1.325980,-3.680802,0.746941,3.0,1.0
3,330.0,9.0,52.0,304.0,-1.287758,2.060548,-0.214825,1.679650,-0.479325,-0.184875,...,3.599762,-3.261988,0.075310,0.950053,0.469341,-0.403954,-3.331497,0.371955,1.0,1.0
4,324.0,9.0,5.0,256.0,-1.183864,1.312957,-0.879606,0.917889,0.871361,-0.388936,...,5.323649,-4.307009,-1.040022,1.116162,0.730822,-1.372428,-3.698938,0.541946,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4969,324.0,9.0,53.0,280.0,-0.933663,2.077572,-3.251820,-0.465564,0.690578,-0.173512,...,4.448956,-2.988212,-0.868217,1.913899,1.255109,-0.971755,-3.646072,0.257640,3.0,1.0
4970,324.0,1.0,21.0,67.0,-1.793618,0.642129,-0.598805,1.442434,1.123337,-0.618600,...,1.955032,-3.087261,0.309163,2.568345,-0.807026,-1.640751,-4.093068,-0.105423,1.0,1.0
4971,324.0,9.0,92.0,35.0,-1.270534,1.808835,-1.163674,0.996969,0.632005,-0.687106,...,4.419287,-3.561228,-0.790431,1.939986,1.469673,-1.691814,-3.148319,0.511977,2.0,0.0
4972,324.0,9.0,5.0,256.0,-1.433404,2.439240,-1.655477,1.107160,1.339474,0.329441,...,3.329554,-2.751823,0.290765,0.120514,0.559832,-0.460033,-4.487435,1.302876,3.0,0.0


In [45]:
y=pd.Series(y,name='Price')
y

0       15.0
1       81.0
2       10.0
3       12.0
4       24.0
        ... 
4969     9.0
4970     9.0
4971    11.0
4972    28.0
4973    33.0
Name: Price, Length: 4974, dtype: float64

## Split data into training and validation set:

Let's split the data

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Build model:

Train the model

In [47]:
model= RandomForestRegressor( )
model.fit(X_train,y_train)

Let's evaluate the baseline model

In [48]:
# Making predictions
y_pred= model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test.values, y_pred)
rmsle = np.sqrt(mean_squared_log_error(y_test.values, y_pred))

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error(RMSLE):", rmsle)
print("R-squared (R2):", r_squared)

Mean Squared Error (MSE): 1171.1278490255786
Root Mean Squared Error (RMSE): 34.22174526562868
Root Mean Squared Logarithmic Error(RMSLE): 0.8223761090958934
R-squared (R2): -0.09545252238803603


What do our predictions look like?

In [49]:
y_pred

array([33.11, 17.08, 15.15, ..., 16.14, 32.84, 26.43])

In [50]:
y_test.values

array([13.,  8., 20., ..., 17., 14., 26.])

## Feature selection:

Feature select from the model

In [51]:
selector = SelectFromModel(estimator=model).fit(X_train, y_train)
X_trans= selector.fit_transform(X_train, y_train)
X_trans_test= selector.transform(X_test)

In [52]:
model= RandomForestRegressor( )
model.fit(X_trans,y_train)

Let's evaluate the baseline model

In [53]:
# Making predictions
y_pred= model.predict(X_trans_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_test.values, y_pred)
rmsle = np.sqrt(mean_squared_log_error(y_test.values, y_pred))

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error(RMSLE):", rmsle)
print("R-squared (R2):", r_squared)

Mean Squared Error (MSE): 1163.828326552984
Root Mean Squared Error (RMSE): 34.11492820676872
Root Mean Squared Logarithmic Error(RMSLE): 0.8135497739425672
R-squared (R2): -0.0886246766396106


What do our predictions look like?

In [54]:
y_pred

array([35.58, 18.68, 14.09, ..., 15.38, 31.8 , 24.45])

In [55]:
y_test.values

array([13.,  8., 20., ..., 17., 14., 26.])