In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
df = pd.read_csv('data/get_around_pricing_project.csv', index_col=0)

# Extract the features
X = df.drop('rental_price_per_day', axis=1)

# Extract the target column
y = df.loc[:, 'rental_price_per_day']

# Train / test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size = 0.2)

# Determine categorical and numerical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object', 'bool']).columns

# Numerical Transformer
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical Transformer
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_transformer", numerical_transformer, numerical_features),
        ("categorical_transformer", categorical_transformer, categorical_features)
    ]
)

# Create a pipeline for each model with preprocessor
pipelines = [
    Pipeline([('preprocessor', preprocessor), ('model', LinearRegression())]),
    Pipeline([('preprocessor', preprocessor), ('model', DecisionTreeRegressor())]),
    Pipeline([('preprocessor', preprocessor), ('model', RandomForestRegressor())]),
    Pipeline([('preprocessor', preprocessor), ('model', GradientBoostingRegressor())])
]

# Hyperparameters for each model
param_grids = [
    {},  # Linear Regression
    {'model__max_depth': [2, 5, 10, 20], 'model__min_samples_split': [2, 5, 10]},  # Decision Tree Regressor
    {'model__n_estimators': [100, 200, 500], 'model__max_depth': [2, 5, 10, 20], 'model__min_samples_split': [2, 5, 10]},  # Random Forest Regressor
    {'model__n_estimators': [100, 200, 500], 'model__learning_rate': [0.01, 0.1, 0.2]},  # Gradient Boosting Regressor
]

best_estimators = []

for pipe, param_grid in zip(pipelines, param_grids):
    grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    best_estimators.append(grid.best_estimator_)
    print(grid.best_params_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits




{}
Fitting 5 folds for each of 12 candidates, totalling 60 fits




{'model__max_depth': 10, 'model__min_samples_split': 10}
Fitting 5 folds for each of 36 candidates, totalling 180 fits




{'model__max_depth': 20, 'model__min_samples_split': 5, 'model__n_estimators': 500}
Fitting 5 folds for each of 9 candidates, totalling 45 fits




{'model__learning_rate': 0.1, 'model__n_estimators': 500}


In [2]:
# Create an empty list to store the model results
model_results = []

# Evaluate the models using R2 score and Mean Squared Error, and save the results
for i, model in enumerate(best_estimators):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    print(f"Model {i+1}")
    print(best_estimators[i].get_params())
          
    model_results.append({
        "model": f"Model {i+1}",
        "best_params": best_estimators[i].get_params(),
        "mse_train": mse_train,
        "mse_test": mse_test,
        "r2_train": r2_train,
        "r2_test": r2_test
    })

# Convert the list of model results into a DataFrame
model_results_df = pd.DataFrame(model_results)

# Print the DataFrame
print(model_results_df)

Model 1
{'memory': None, 'steps': [('preprocessor', ColumnTransformer(transformers=[('numerical_transformer',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 ('scaler', StandardScaler())]),
                                 Index(['mileage', 'engine_power'], dtype='object')),
                                ('categorical_transformer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('encoder',
                                                  OneHotEncoder(drop='first',
                                                                handle_unknown='ignore'))]),
                                 Index(['model_key', 'fuel', 'paint_color', 'car_type',
       'private_parking_available', 'has_gps', 'has_air_conditioning',
       'automatic_car', 'has_getaround_c