In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
# from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor


In [16]:
# Read the pricing dataset; the first CSV column is used as the row index.
df = pd.read_csv(
    'data/get_around_pricing_project.csv',
    index_col=0,
)

# Plain-text preview of the first five rows.
print(df.head(5))

  model_key  mileage  engine_power    fuel paint_color     car_type   
0   Citroën   140411           100  diesel       black  convertible  \
1   Citroën    13929           317  petrol        grey  convertible   
2   Citroën   183297           120  diesel       white  convertible   
3   Citroën   128035           135  diesel         red  convertible   
4   Citroën    97097           160  diesel      silver  convertible   

   private_parking_available  has_gps  has_air_conditioning  automatic_car   
0                       True     True                 False          False  \
1                       True     True                 False          False   
2                      False    False                 False          False   
3                       True     True                 False          False   
4                       True     True                 False          False   

   has_getaround_connect  has_speed_regulator  winter_tires   
0                   True                 

In [17]:
# DataFrame.info() writes its report (columns, non-null counts, dtypes)
# straight to stdout and returns None, so it must not be wrapped in print():
# doing so appends a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4843 entries, 0 to 4842
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   model_key                  4843 non-null   object
 1   mileage                    4843 non-null   int64 
 2   engine_power               4843 non-null   int64 
 3   fuel                       4843 non-null   object
 4   paint_color                4843 non-null   object
 5   car_type                   4843 non-null   object
 6   private_parking_available  4843 non-null   bool  
 7   has_gps                    4843 non-null   bool  
 8   has_air_conditioning       4843 non-null   bool  
 9   automatic_car              4843 non-null   bool  
 10  has_getaround_connect      4843 non-null   bool  
 11  has_speed_regulator        4843 non-null   bool  
 12  winter_tires               4843 non-null   bool  
 13  rental_price_per_day       4843 non-null   int64 
dtypes: bool(7), i

In [18]:
# Per-column share of missing entries, as a percentage of the row count.
print("Percentage of missing values: ")
display(df.isna().sum().mul(100).div(len(df)))

Percentage of missing values: 


model_key                    0.0
mileage                      0.0
engine_power                 0.0
fuel                         0.0
paint_color                  0.0
car_type                     0.0
private_parking_available    0.0
has_gps                      0.0
has_air_conditioning         0.0
automatic_car                0.0
has_getaround_connect        0.0
has_speed_regulator          0.0
winter_tires                 0.0
rental_price_per_day         0.0
dtype: float64

In [19]:
# Features: every column except the rental price we want to predict.
X = df.drop(columns='rental_price_per_day')

# Target: the daily rental price.
y = df['rental_price_per_day']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Partition the feature columns by dtype. Boolean flags are grouped with the
# string columns, so they go through the categorical (one-hot) branch.
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Numeric branch: fill gaps with the column mean, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
# NOTE(review): drop='first' is combined with handle_unknown='ignore'; per the
# scikit-learn docs an unseen category is then encoded as all zeros, i.e. the
# same as the dropped first level - confirm this is intended.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore')),
])

# Route each column group through its own branch (numeric columns come first
# in the transformed output).
preprocessor = ColumnTransformer([
    ("numerical_transformer", numerical_transformer, numerical_features),
    ("categorical_transformer", categorical_transformer, categorical_features),
])

# The preprocessor is deliberately left unfitted here: it is used as the first
# step of the model pipeline, so it is fitted on training data only when that
# pipeline is fitted, rather than being applied to X_train / X_test up front.

In [21]:
# Candidate estimators. models[k] is tuned with param_grids[k].
models = [
    XGBRegressor()
]

# One hyper-parameter grid per model, in the same order as `models`.
# Keys carry the 'model__' prefix because the estimator is the pipeline
# step named 'model'.
param_grids = [
    {
        'model__gamma': [0],
        'model__learning_rate': [0.1],
        'model__max_depth': [10],
        'model__min_child_weight': [5],
        'model__n_estimators': [100],
    }
]

# Fail early instead of silently pairing a model with the wrong grid
# (or skipping a grid) when the two lists get out of sync.
if len(models) != len(param_grids):
    raise ValueError(
        f"Expected one param grid per model, got {len(models)} models "
        f"and {len(param_grids)} grids"
    )

# (best fitted pipeline, best mean CV score) for every model, in order.
results = []
# One dict of metrics per model; converted to results_df once after the loop
# rather than growing a DataFrame with pd.concat on every iteration.
result_rows = []

for model, param_grid in zip(models, param_grids):
    model_name = type(model).__name__

    # Preprocessing lives inside the pipeline so that it is re-fitted on the
    # training part of every CV fold.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # 5-fold grid search. The score is the negated MSE, so higher is better.
    grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)

    # With GridSearchCV's default refit=True, best_estimator_ is the pipeline
    # refitted on all of X_train using the best parameter combination.
    best_estimator = grid.best_estimator_
    best_score = grid.best_score_
    results.append((best_estimator, best_score))

    # Predictions on the training set and on the held-out test set
    y_train_pred = best_estimator.predict(X_train)
    y_test_pred = best_estimator.predict(X_test)

    # RMSE, in the same unit as the target
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    # R2 on both sets
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Record every metric for this model as one row of the summary table
    result_rows.append({
        'Model': model_name,
        'Best_Params': grid.best_params_,
        'Best_Score': best_score,
        'Train_RMSE': train_rmse,
        'Test_RMSE': test_rmse,
        'Train_R2': train_r2,
        'Test_R2': test_r2,
    })

    print(f"Best parameters for {model_name}: {grid.best_params_}")

# Build the summary table in one go; the explicit column list fixes the
# column order and keeps the schema stable even if `models` is empty.
results_df = pd.DataFrame(
    result_rows,
    columns=['Model', 'Best_Params', 'Best_Score', 'Train_RMSE', 'Test_RMSE', 'Train_R2', 'Test_R2'],
)

# Print the results DataFrame
print(results_df)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




Best parameters for XGBRegressor: {'model__gamma': 0, 'model__learning_rate': 0.1, 'model__max_depth': 10, 'model__min_child_weight': 5, 'model__n_estimators': 100}
          Model                                        Best_Params   
0  XGBRegressor  {'model__gamma': 0, 'model__learning_rate': 0....  \

   Best_Score  Train_RMSE  Test_RMSE  Train_R2   Test_R2  
0 -265.045043    8.302036  16.576924  0.939795  0.739092  


In [22]:
# Lift the column cap so every metric column is rendered, then show the table.
pd.options.display.max_columns = None
results_df

Unnamed: 0,Model,Best_Params,Best_Score,Train_RMSE,Test_RMSE,Train_R2,Test_R2
0,XGBRegressor,"{'model__gamma': 0, 'model__learning_rate': 0....",-265.045043,8.302036,16.576924,0.939795,0.739092


### In the context of the grid search, `best_score_` is the highest mean cross-validated score among all candidate parameter sets, computed with the selected scoring metric (in this case, `neg_mean_squared_error`). The value is negative because GridSearchCV always maximizes its scoring function, so error metrics such as the mean squared error are negated: a score closer to zero means a lower mean squared error and therefore a better model. For example, the best score of -265.05 reported above corresponds to a cross-validated MSE of about 265.05, i.e. an RMSE of about 16.3.