# Modelling v3:

In [1]:
# import libraries
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error
from sklearn.linear_model import LinearRegression

## Data loading:
Load the preprocessed data

In [2]:
# Deserialize the preprocessed dataset bundle from the data directory.
# NOTE(review): pickle.load can execute arbitrary code — only open trusted files.
with open('../data/sample_vector_preprocessed_data.pickle', mode='rb') as f:
    data = pickle.load(f)

What's in the data?

In [3]:
# Unpack the feature matrix and the target from the loaded bundle.
X, y = data['X'], data['y']

In [4]:
# Inspect the dimensions of the feature matrix.
X.shape

(19899, 603)

In [5]:
# Wrap the features in a DataFrame for easier inspection, then preview it.
X = pd.DataFrame(data=X)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,593,594,595,596,597,598,599,600,601,602
0,0.0,85.0,51.0,-1.188922,-0.397173,0.711517,0.897947,0.594289,-2.418236,2.019648,...,-0.301549,2.539666,4.609313,-3.583575,-0.517026,1.065499,0.599137,-0.655274,-3.273834,0.590654
1,9.0,26.0,322.0,-2.331425,1.362068,0.291434,1.624530,0.519991,-0.420716,1.469736,...,-0.153013,2.006297,2.812929,-2.047805,-1.155372,1.072128,0.394988,-1.142712,-2.544832,1.192891
2,9.0,83.0,14.0,-2.244881,0.094330,0.392513,2.171628,1.157055,-1.029521,1.712284,...,-0.740687,2.900525,5.884979,-4.030570,-0.427418,2.056970,0.972062,-1.325980,-3.680802,0.746941
3,9.0,54.0,401.0,-1.287758,2.060548,-0.214825,1.679650,-0.479325,-0.184875,1.996599,...,-0.050027,3.000441,3.599762,-3.261988,0.075310,0.950053,0.469341,-0.403954,-3.331497,0.371955
4,9.0,5.0,334.0,-1.183864,1.312957,-0.879606,0.917889,0.871361,-0.388936,1.328362,...,0.301310,3.216333,5.323649,-4.307009,-1.040022,1.116162,0.730822,-1.372428,-3.698938,0.541946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19894,3.0,52.0,234.0,-1.426734,0.594683,-0.195802,0.384977,1.081316,0.139225,1.339488,...,-0.787927,3.093393,5.060913,-3.895362,-0.920304,1.927590,1.012525,-1.022846,-3.584777,0.097837
19895,9.0,5.0,412.0,-1.359959,2.094411,3.876200,1.649275,-1.296688,-1.702773,2.887788,...,-0.673789,2.725097,3.829703,-3.024500,-1.058983,1.595870,0.258660,-1.502370,-3.555545,0.022006
19896,1.0,69.0,322.0,-1.405841,1.081867,2.286777,1.229013,0.179550,-0.939630,2.238693,...,-0.234419,2.761670,4.011301,-2.678232,-0.084800,1.692374,0.812023,-1.659527,-2.465833,0.905382
19897,9.0,98.0,433.0,-2.021770,1.619732,-0.866302,1.368436,1.106894,-0.535487,2.076682,...,0.016018,3.753549,4.969755,-3.921716,-0.519935,1.975573,1.562981,-1.278723,-3.460091,-0.157667


In [6]:
# Promote the target to a Series named 'Price' so it carries a label, then preview it.
y = pd.Series(data=y, name='Price')
y

0        15.0
1        81.0
2        10.0
3        12.0
4        24.0
         ... 
19894    27.0
19895    25.0
19896    46.0
19897    14.0
19898     6.0
Name: Price, Length: 19899, dtype: float64

In [7]:
# Largest target value — a quick look at the upper end of the price range.
y.max()

917.0

## Split data into training and test sets:

Let's hold out 33% of the rows as a test set, using a fixed random seed so the split is reproducible.

In [8]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Disabled experiment: train on a log1p-transformed target (predictions would then need np.expm1).
# y_train_log = np.log1p(y_train)

## Build model:

Train the model

In [19]:
# Hyperparameters for the gradient-boosted regressor (DART booster).
xgb_params = {
    'n_estimators': 100,
    'booster': 'dart',
    'learning_rate': 0.01,
    'max_depth': 10,
    'colsample_bytree': 0.7,
    'seed': 42,
}
model = XGBRegressor(**xgb_params)

# Fit on the training split; as the cell's last expression, the result of fit() is echoed.
model.fit(X_train, y_train)

Let's evaluate the baseline model

In [20]:
# Predict prices for the held-out rows.
y_pred = model.predict(X_test)
# y_pred= np.expm1(model.predict(X_test))  # use instead if the model is trained on log1p(y)

# Score against the true targets, using the same plain array for every metric.
y_true = y_test.values
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r_squared = r2_score(y_true, y_pred)

# RMSLE is computed on log1p of the values and scikit-learn raises a ValueError
# for negative inputs (exact bound depends on the installed version — TODO confirm).
# A squared-error regressor is not constrained to non-negative outputs, so clamp
# predictions at 0 for this metric only and report when that happens.
n_negative = int((y_pred < 0).sum())
if n_negative:
    print(f"Note: {n_negative} negative prediction(s) clipped to 0 for RMSLE")
rmsle = np.sqrt(mean_squared_log_error(y_true, np.clip(y_pred, 0, None)))

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error(RMSLE):", rmsle)
print("R-squared (R2):", r_squared)

Mean Squared Error (MSE): 1281.4942976677257
Root Mean Squared Error (RMSE): 35.79796499338651
Root Mean Squared Logarithmic Error(RMSLE): 0.7650066772929669
R-squared (R2): 0.07174958432025025


What do our predictions look like?

In [12]:
# Peek at the raw predictions.
# NOTE(review): this cell's execution count (12) is lower than the training cell's (19),
# so the displayed output may come from an earlier model — re-run after refitting.
y_pred

array([21.519194, 25.417809, 15.380769, ..., 14.241953, 12.066487,
       24.101667], dtype=float32)

In [13]:
# True targets of the held-out rows, for an eyeball comparison with y_pred.
y_test.values

array([22., 90., 11., ...,  7.,  7., 36.])