In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [13]:
!pwd

/home/marcnaweb/code/marcnaweb/car_recommendation_engine/notebooks


# Creating the data frame 

In [14]:
price_df = pd.read_csv('/home/marcnaweb/code/marcnaweb/car_recommendation_engine/raw_data/car_prices_w_prices_scaled.csv', index_col=0)

In [15]:
features_df = pd.read_csv('/home/marcnaweb/code/marcnaweb/car_recommendation_engine/raw_data/scaled_cleaned.csv')

## Merging features and prices

In [16]:
merged_df = price_df.merge(features_df, left_on="car_code", right_on="car_code", how="left")

In [17]:
merged_df.rename(columns=lambda x: x.strip(), inplace=True)

In [18]:
merged_df[merged_df.index == 87887 ]

Unnamed: 0,car_code,car_manufacturer,car_model,car_model_year,Year_x,Price_YoY,Next_YoY_Price,Next_YoY_Pr_Pred,calendar_year,Price_sd_scaled,...,Maximum power,Unit displacement,Trunk,USB connection,Gear speed transmissions,Urban,Year_y,Radio,Folding rear seat,Perimeter anti theft alarm
87887,441,Audi,A8 4.2 V8 Quattro,2003,1,0.749652,0.961737,0.917069,2004,-0.836267,...,0.167403,0.523084,0.033011,0.0,0.545455,0.062035,0.625,0.0,0.0,0.0


### making model first name feature 

In [19]:
def take_first_word(word):
    return word.split(" ")[0]

merged_df["car_model_small"] = merged_df["car_model"].map(take_first_word)
merged_df.drop(columns="car_model", inplace=True)

# Model preprocessor 

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OneHotEncoder

# Model X and y - train test validade sets

In [29]:
X = merged_df.drop( columns=['car_code' , 'car_model_year',  'Next_YoY_Price', 'Next_YoY_Pr_Pred',
       'Price_sd_scaled' ])


In [30]:

num_feat = [feature for feature in X.select_dtypes(include='number').columns.tolist() if feature not in ["Year_x", "calendar_year" , "car_code", "car_model_year"] ]
#num_feat
categorical_features = list(merged_df.select_dtypes(include='object').columns)
categorical_features = [] # intentionaly removing these features ['car_manufacturer', 'car_model_small']
year_features = ["Year_x", "calendar_year" ]
year_features
#num_feat

['Year_x', 'calendar_year']

In [31]:

# Impute then scale numerical values:
num_inputer = Pipeline([
    ('imputer', SimpleImputer(strategy="mean"))
    #('standard_scaler', StandardScaler())
])

standard_scaler = Pipeline([
    #('imputer', SimpleImputer(strategy="mean"))
    ('standard_scaler', StandardScaler())
])




# Encode categorical values
cat_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)




# Parallelize "num_transformer" and "cat_transfomer"
preprocessor = ColumnTransformer([
    ('cat_transformer', cat_transformer, categorical_features ), #, 
    ('standard_scaler', standard_scaler, year_features ) ,
    ('num_inputer', num_inputer, num_feat  )  #numerical_columns 
])


In [32]:
y = merged_df['Next_YoY_Price']
#y = merged_df["Next_Price_sd_scaled"]

In [33]:
from sklearn.model_selection import train_test_split

X = preprocessor.fit_transform(X)  #keep in mind, is not procedural good

# Split data into train, test and validation sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.3, random_state = 42  # TEST = 30%
)

# Use the same function above for the validation set
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size = 0.5, random_state = 42  # TEST = 15%
)

#  XG Boost Model

In [35]:
from xgboost import XGBRegressor

In [36]:

#X_train_transformed = preprocessor.fit_transform(X_train)
#X_val_transformed = preprocessor.transform(X_val)
X.shape

(87891, 103)

In [37]:
X_train.shape
#X_train.shape

(61523, 103)

In [38]:
X_val.shape
#X_val.shape

(13184, 103)

In [63]:
from xgboost import XGBRegressor

xgb_reg = XGBRegressor(max_depth=10, n_estimators=300, learning_rate=0.1)

xgb_reg.fit(X_train, y_train,
    # evaluate loss at each iteration
    eval_set=[(X_train, y_train), (X_val, y_val)],  
    # stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=3 
)

y_pred = xgb_reg.predict(X_val)



[0]	validation_0-rmse:0.41909	validation_1-rmse:0.41829
[1]	validation_0-rmse:0.37789	validation_1-rmse:0.37729
[2]	validation_0-rmse:0.34085	validation_1-rmse:0.34043
[3]	validation_0-rmse:0.30757	validation_1-rmse:0.30735
[4]	validation_0-rmse:0.27767	validation_1-rmse:0.27764
[5]	validation_0-rmse:0.25082	validation_1-rmse:0.25098
[6]	validation_0-rmse:0.22670	validation_1-rmse:0.22710
[7]	validation_0-rmse:0.20507	validation_1-rmse:0.20572
[8]	validation_0-rmse:0.18565	validation_1-rmse:0.18651
[9]	validation_0-rmse:0.16824	validation_1-rmse:0.16935
[10]	validation_0-rmse:0.15263	validation_1-rmse:0.15400
[11]	validation_0-rmse:0.13868	validation_1-rmse:0.14032
[12]	validation_0-rmse:0.12620	validation_1-rmse:0.12814
[13]	validation_0-rmse:0.11505	validation_1-rmse:0.11731
[14]	validation_0-rmse:0.10510	validation_1-rmse:0.10772
[15]	validation_0-rmse:0.09626	validation_1-rmse:0.09923
[16]	validation_0-rmse:0.08838	validation_1-rmse:0.09176
[17]	validation_0-rmse:0.08142	validation

In [64]:
y_pred

array([0.9267315 , 0.87350184, 0.8672882 , ..., 1.3090404 , 0.9356797 ,
       0.9174847 ], dtype=float32)

In [65]:
from sklearn.metrics import mean_squared_error

In [66]:
y_pred = xgb_reg.predict(X_test)
y_pred

array([0.8936684 , 0.88299817, 0.9107943 , ..., 0.92543316, 0.9198769 ,
       0.96319157], dtype=float32)

## comparing model with naive model

In [67]:
mean_squared_error(y_test, y_pred) ** (0.5)   #model score: mean square error --> lower is better

0.04349424852256784

In [68]:
mean_squared_error(y_test, [y_test.mean()]*len(y_test)) ** (0.5)   #naive score: mean square error 

0.11389767051634032

# Simple tester

In [77]:
car = merged_df[merged_df["car_model_year"] == 2015 ] [merged_df["calendar_year"] == 2017]#[merged_df.index == 23575 ]
car = merged_df[merged_df.index == 60263 ]
car

  car = merged_df[merged_df["car_model_year"] == 2015 ] [merged_df["calendar_year"] == 2017]#[merged_df.index == 23575 ]


Unnamed: 0,car_code,car_manufacturer,car_model_year,Year_x,Price_YoY,Next_YoY_Price,Next_YoY_Pr_Pred,calendar_year,Price_sd_scaled,Propulsion_Combustion,...,Unit displacement,Trunk,USB connection,Gear speed transmissions,Urban,Year_y,Radio,Folding rear seat,Perimeter anti theft alarm,car_model_small
60263,2005,Peugeot,2015,2,0.844462,0.966102,0.916243,2017,-1.451522,1.0,...,0.34035,0.027676,1.0,0.454545,0.111663,0.839286,1.0,1.0,0.0,207


In [78]:
car = car.drop( columns=['car_code' , 'car_model_year',  'Next_YoY_Price', 'Next_YoY_Pr_Pred', 'Price_sd_scaled' ])


#X = merged_df.drop( columns=['car_code' , 'car_model_year',  'Next_YoY_Price', 'Next_YoY_Pr_Pred',
#       'Price_sd_scaled' ])


#car["car_model_small"] = car["car_model"].map(take_first_word)
#car.drop(columns="car_model", inplace=True)
car

Unnamed: 0,car_manufacturer,Year_x,Price_YoY,calendar_year,Propulsion_Combustion,Propulsion_Electric,Propulsion_Hybrid,Propulsion_Light Hybrid,Propulsion_Plug-in hybrid,Car gearbox_Automated,...,Unit displacement,Trunk,USB connection,Gear speed transmissions,Urban,Year_y,Radio,Folding rear seat,Perimeter anti theft alarm,car_model_small
60263,Peugeot,2,0.844462,2017,1.0,0.0,0.0,0.0,0.0,0.0,...,0.34035,0.027676,1.0,0.454545,0.111663,0.839286,1.0,1.0,0.0,207


In [79]:
car_transformed = preprocessor.transform(car)

In [80]:
xgb_reg.predict(car_transformed)

array([0.9635005], dtype=float32)