In [30]:
import pandas as pd
import numpy as np


import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor

from catboost import CatBoostClassifier,CatBoostRegressor


In [31]:
# Load the pricing dataset; the relative path is resolved against the
# kernel's current working directory.
DATA_PATH = 'get_around_pricing_project.csv'
df = pd.read_csv(DATA_PATH)

In [5]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(4843, 15)

In [7]:
# Draw one histogram per column of the frame.
for col_name in df.columns:
    px.histogram(df, x=col_name).show()

In [20]:
# Number of missing values in each column.
df.isna().sum()

Unnamed: 0                   0
model_key                    0
mileage                      0
engine_power                 0
fuel                         0
paint_color                  0
car_type                     0
private_parking_available    0
has_gps                      0
has_air_conditioning         0
automatic_car                0
has_getaround_connect        0
has_speed_regulator          0
winter_tires                 0
rental_price_per_day         0
dtype: int64

In [32]:
# Group rare model_key values under a single 'Others' label.
MIN_MODEL_COUNT = 5  # model_key values with fewer rows than this become 'Others'

# Count rows per model_key directly, so the count does not depend on another
# column (previously car_type) being free of missing values.
model_counts = df['model_key'].value_counts()

# Map every model_key to itself, or to 'Others' when it is too rare.
category_mapping = {
    model: (model if count >= MIN_MODEL_COUNT else 'Others')
    for model, count in model_counts.items()
}

# Overwrite model_key in the main dataframe with the grouped categories.
df['model_key'] = df['model_key'].map(category_mapping)

In [33]:
# Drop the leftover CSV index column by name rather than by position:
# slicing df.columns[1:] would remove a real feature (model_key) each time
# this cell is re-run. errors='ignore' makes the cell safe to run repeatedly.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

df.head()

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires,rental_price_per_day
0,Citroën,140411,100,diesel,black,convertible,True,True,False,False,True,True,True,106
1,Citroën,13929,317,petrol,grey,convertible,True,True,False,False,False,True,True,264
2,Citroën,183297,120,diesel,white,convertible,False,False,False,False,True,False,True,101
3,Citroën,128035,135,diesel,red,convertible,True,True,False,False,True,True,True,158
4,Citroën,97097,160,diesel,silver,convertible,True,True,False,False,False,True,True,183


In [34]:
# Column to predict; every column except the last one is used as a feature.
target_variable = 'rental_price_per_day'
features_list = df.columns[:-1].tolist()

# Split the frame into the feature matrix X and the target vector Y.
X = df[features_list]
Y = df[target_variable]

In [35]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [36]:
# Separate the columns of X by dtype.
# Real dtype checks replace substring matching on the dtype name, which
# misses nullable dtypes ('Int64' / 'Float64' contain no lowercase 'int' /
# 'float') and wrongly matches 'interval[...]'.
# Boolean columns are deliberately kept with the categorical features.
numeric_features = []
categorical_features = []
for column_name, column_dtype in X.dtypes.items():
    is_numeric = (
        pd.api.types.is_numeric_dtype(column_dtype)
        and not pd.api.types.is_bool_dtype(column_dtype)
    )
    if is_numeric:
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)



Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


In [37]:
# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [38]:
# Display the training features as produced by train_test_split.
X_train

Unnamed: 0,model_key,mileage,engine_power,fuel,paint_color,car_type,private_parking_available,has_gps,has_air_conditioning,automatic_car,has_getaround_connect,has_speed_regulator,winter_tires
4550,BMW,132485,135,diesel,white,suv,True,True,False,False,False,False,True
1237,Citroën,131121,135,diesel,black,estate,False,True,False,False,True,False,True
3158,Renault,209216,135,diesel,grey,sedan,True,True,False,False,True,False,True
900,Peugeot,148986,100,diesel,black,estate,True,True,False,False,False,False,True
933,Citroën,170500,135,diesel,black,estate,True,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,Peugeot,153721,100,diesel,brown,estate,True,True,False,False,False,False,True
3264,Citroën,172907,120,diesel,white,sedan,False,True,False,False,True,False,False
1653,Renault,138489,120,diesel,blue,estate,True,True,False,False,True,False,True
2607,Renault,219417,110,diesel,black,sedan,True,True,False,False,False,False,True


In [39]:
# NOTE(review): X_train / X_test are overwritten with their transformed
# versions below, so this cell is meant to run once per kernel session.

# Fit the preprocessor on the training features and transform them.
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
# X_train is no longer a pandas DataFrame after the transform (array-like
# output), so it is sliced positionally instead of with .head().
print(X_train[:5])
print()

# Apply the already-fitted preprocessor to the test features (no refit).
print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(X_test)
print('...Done.')
print(X_test[:5, :])
print()

Performing preprocessings on train set...
     model_key  mileage  engine_power    fuel paint_color car_type  \
4550       BMW   132485           135  diesel       white      suv   
1237   Citroën   131121           135  diesel       black   estate   
3158   Renault   209216           135  diesel        grey    sedan   
900    Peugeot   148986           100  diesel       black   estate   
933    Citroën   170500           135  diesel       black   estate   

      private_parking_available  has_gps  has_air_conditioning  automatic_car  \
4550                       True     True                 False          False   
1237                      False     True                 False          False   
3158                       True     True                 False          False   
900                        True     True                 False          False   
933                        True     True                 False          False   

      has_getaround_connect  has_speed_regulator  

## CatBoost model

In [40]:
# NOTE(review): mean, std, make_regression, cross_val_score, RepeatedKFold and
# pyplot are not used by any cell visible here; they are kept so that nothing
# relying on them breaks. CatBoostRegressor is now imported only once.
from numpy import mean, std
from sklearn.datasets import make_regression
from sklearn.model_selection import RepeatedKFold, cross_val_score
from matplotlib import pyplot

from catboost import CatBoostRegressor

# CatBoost regression model trained with RMSE as the loss function.
# The seed is stated explicitly so the training run is reproducible.
model = CatBoostRegressor(loss_function='RMSE', random_seed=0)

# Fit on the preprocessed training data, logging progress every 100 iterations.
model.fit(X_train, Y_train, verbose=100)



Learning rate set to 0.050712
0:	learn: 32.8621328	total: 20.3ms	remaining: 20.3s
100:	learn: 16.7342451	total: 493ms	remaining: 4.39s
200:	learn: 15.3379798	total: 1.1s	remaining: 4.38s
300:	learn: 14.5351015	total: 1.52s	remaining: 3.52s
400:	learn: 13.7701797	total: 1.94s	remaining: 2.89s
500:	learn: 13.1743759	total: 2.35s	remaining: 2.34s
600:	learn: 12.6998725	total: 2.73s	remaining: 1.81s
700:	learn: 12.1987814	total: 3.28s	remaining: 1.4s
800:	learn: 11.7827632	total: 3.8s	remaining: 943ms
900:	learn: 11.4043567	total: 4.33s	remaining: 475ms
999:	learn: 11.0274347	total: 4.84s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2499f5da390>

In [41]:
from sklearn.metrics import mean_squared_error as mse

# Predictions of the fitted model on the training and held-out features.
y_train = model.predict(X_train)
y_val = model.predict(X_test)

# RMSE is the square root of the mean squared error.
train_rmse = np.sqrt(mse(Y_train, y_train))
val_rmse = np.sqrt(mse(Y_test, y_val))

print("Training RMSE: ", train_rmse)
print("Validation RMSE: ", val_rmse)


Training RMSE:  11.027434672753916
Validation RMSE:  15.071486041169166


In [42]:
# Score the fitted model on both splits first, then report the two values.
train_r2 = model.score(X_train, Y_train)
test_r2 = model.score(X_test, Y_test)
print("R2 score on training set : ", train_r2)
print("R2 score on test set : ", test_r2)

R2 score on training set :  0.8930249412481734
R2 score on test set :  0.7908250893608215


In [46]:
# Names of the transformed columns: first fitted transformer's outputs,
# followed by the second fitted transformer's outputs.
first_fitted = preprocessor.transformers_[0][1]
second_fitted = preprocessor.transformers_[1][1]
col_names = [
    *first_fitted.get_feature_names_out(),
    *second_fitted.get_feature_names_out(),
]

# Per-feature importance scores of the fitted CatBoost model (raw, not prettified).
coefficient = model.get_feature_importance(prettified=False)


In [47]:
# Pair each transformed column name with its importance score.
importance_rows = list(zip(col_names, coefficient))
coefs = pd.DataFrame(importance_rows, columns=['category', 'coefficient']).reset_index()

# Bar chart of the importance of every transformed feature.
fig = px.bar(coefs, x="category", y="coefficient")
fig.update_layout(width=1200, height=800)
fig.show()

## Gradient Boosting

In [48]:
# Gradient boosting regressor. random_state pins the estimator's internal
# randomness so repeated runs give the same model (same seed as the split).
gbr = GradientBoostingRegressor(
    learning_rate=0.1,
    max_depth=8,
    max_leaf_nodes=30,
    min_samples_leaf=4,
    n_estimators=200,
    random_state=0,
)
gbr.fit(X_train, Y_train)

train_pred = gbr.predict(X_train)
test_pred = gbr.predict(X_test)

# r2_score is the coefficient of determination (R2), not an F1 score, so the
# printed labels now name the metric that is actually computed.
print("R2 score on train set : ", r2_score(Y_train, train_pred))
print("R2 score on test set : ", r2_score(Y_test, test_pred))

f1-score on train set :  0.9255164225764835
f1-score on test set :  0.7714394837766856


In [49]:
# Names of the transformed columns, in the same order as the model's inputs.
col_names = list(preprocessor.transformers_[0][1].get_feature_names_out()) + list(preprocessor.transformers_[1][1].get_feature_names_out())

# Index the importances by feature name BEFORE sorting so that every score
# keeps its own label.
coefficient = pd.Series(gbr.feature_importances_, index=col_names).sort_values(ascending=False)

# Build the plotting frame from the sorted Series itself. Zipping the sorted
# values with the unsorted col_names would attach scores to the wrong
# features, and the frame must be named `coefs` so the chart below does not
# silently reuse a frame left over from an earlier cell.
coefs = pd.DataFrame({'category': coefficient.index, 'coefficient': coefficient.values})

fig = px.bar(coefs, x="category", y="coefficient")
fig.update_layout(
    width=1200,
    height=800,
)
fig.show()