In [None]:
!pip install -q pymongo
!pip install -q feature_engine

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import TransformedTargetRegressor

from xgboost import XGBRegressor

import xgboost as xg
import pickle
import warnings
import json
warnings.filterwarnings('ignore')

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.7 MB[0m [31m10.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m331.1/331.1 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.0/230.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h

#### **Load the pre-split training and test sets and separate features from the target**

In [None]:
import pandas as pd

# Read the two CSV files; train and test are stored separately on disk.
train_df, test_df = (
    pd.read_csv('/content/train.csv'),
    pd.read_csv('/content/test.csv'),
)

# For each split: every column except `price` forms the feature matrix,
# and the `price` column is the target vector (both as NumPy arrays).
train_X, train_y = (
    train_df.drop(columns=['price']).to_numpy(),
    train_df['price'].to_numpy(),
)
test_X, test_y = (
    test_df.drop(columns=['price']).to_numpy(),
    test_df['price'].to_numpy(),
)

# Report the resulting array shapes.
print("Train X shape:", train_X.shape)
print("Train y shape:", train_y.shape)
print("Test X shape:", test_X.shape)
print("Test y shape:", test_y.shape)

Train X shape: (4968, 10)
Train y shape: (4968,)
Test X shape: (552, 10)
Test y shape: (552,)


In [None]:
from sklearn.model_selection import GridSearchCV, KFold
import numpy as np

def perform_gridsearch(
    X,
    y,
    param_grids,
    model,
    cv=9,
    random_state=42,
    n_jobs=-1,
    verbose=2
):
    """Run an exhaustive, shuffled K-fold grid search and summarise the winner.

    Every candidate is scored on R2, negated MSE and negated MAE; the best
    candidate is the one with the highest mean R2, and the estimator is refit
    with those parameters (``refit='r2'``).

    Parameters
    ----------
    X, y :
        Features and targets, passed unchanged to ``GridSearchCV.fit``.
    param_grids :
        Parameter grid, passed as ``param_grid`` to ``GridSearchCV``.
    model :
        Estimator to tune.
    cv : int, default 9
        Number of folds for the shuffled ``KFold`` splitter.
    random_state : int, default 42
        Seed for the ``KFold`` shuffle.
    n_jobs : int, default -1
        Forwarded to ``GridSearchCV``.
    verbose : int, default 2
        Forwarded to ``GridSearchCV``; lower it to quieten large searches.

    Returns
    -------
    dict
        ``'best_params'``: parameters of the best candidate.
        ``'scores'``: dict with ``best_r2_score``, ``best_mse_score``,
        ``best_mae_score`` and ``best_rmse_score`` for that candidate, with
        signs flipped so the error metrics are positive. The RMSE is the square
        root of the mean cross-validated MSE, not a mean of per-fold RMSEs.
        ``'best_model'``: the refit best estimator.
    """
    kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)

    # Keys here determine the `mean_test_<key>` names read from cv_results_.
    scoring = {
        'r2': 'r2',
        'mse': 'neg_mean_squared_error',
        'mae': 'neg_mean_absolute_error'
    }

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids,
        cv=kf,
        scoring=scoring,
        refit='r2',
        n_jobs=n_jobs,
        verbose=verbose
    )

    grid_search.fit(X, y)

    results = grid_search.cv_results_
    # Index of the candidate selected by the refit metric (R2).
    best_idx = grid_search.best_index_

    # The error scorers are negated by scikit-learn, so flip the sign back.
    best_mse = -results['mean_test_mse'][best_idx]

    best_scores = {
        'best_r2_score': results['mean_test_r2'][best_idx],
        'best_mse_score': best_mse,
        'best_mae_score': -results['mean_test_mae'][best_idx],
        'best_rmse_score': np.sqrt(best_mse)
    }

    return {
        'best_params': grid_search.best_params_,
        'scores': best_scores,
        'best_model': grid_search.best_estimator_
    }


#### **Perform grid search for XGBoost**


In [None]:
# Base XGBoost regressor: squared-error objective, fixed seed.
model = xg.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
)

# Hyper-parameter grid: 3*3*3*3*2*2*2*3*3 = 5832 combinations.
param_grids = dict(
    n_estimators=[300, 500, 800],
    learning_rate=[0.01, 0.03, 0.05],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 5, 10],
    gamma=[0, 0.1],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[1, 1.5, 2],
)

# Exhaustive search over the grid with 9-fold cross-validation.
result = perform_gridsearch(
    X=train_X,
    y=train_y,
    param_grids=param_grids,
    model=model,
    cv=9,
)

Fitting 9 folds for each of 5832 candidates, totalling 52488 fits


#### **Randomized Search CV**

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized alternative to the exhaustive search: sample 80 candidates from
# the same grid and score each by R2 with 5-fold cross-validation.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grids,
    n_iter=80,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

search.fit(train_X, train_y)


#### **Evaluation**

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import pickle

save_path = '/content/xgb_model_new.pkl'

# Take the refit estimator from the grid-search `result`. `exp` is only
# assigned after the pickle round-trip below, so reading it at this point
# raises NameError on a fresh kernel.
best_model = result['best_model']

# Persist the whole result dict (best params, CV scores and fitted model).
with open(save_path, 'wb') as f:
    pickle.dump(result, f)

# Reload it so the evaluation runs on exactly what was saved.
# NOTE: pickle.load executes arbitrary code; only load files you wrote yourself.
with open(save_path, 'rb') as f:
    exp = pickle.load(f)

loaded_model = exp['best_model']
y_test_pred = loaded_model.predict(test_X)

# Metrics on the held-out test set
r2 = r2_score(test_y, y_test_pred)
mse = mean_squared_error(test_y, y_test_pred)
mae = mean_absolute_error(test_y, y_test_pred)
rmse = np.sqrt(mse)

print(f"R2-score on test set: {r2}")
print(f"MSE on test set: {mse}")
print(f"MAE on test set: {mae}")
print(f"RMSE on test set: {rmse}")

# Cross-validated scores of the best candidate, as stored by perform_gridsearch
print(f"Best parameters: {exp['best_params']}")
print(f"Best R2-score: {exp['scores']['best_r2_score']}")
print(f"Best MSE score: {exp['scores']['best_mse_score']}")
print(f"Best MAE score: {exp['scores']['best_mae_score']}")
print(f"Best RMSE score: {exp['scores']['best_rmse_score']}")


R2-score on test set: 0.6872172409571817
MSE on test set: 4.739073488764636
MAE on test set: 1.40156609315223
RMSE on test set: 2.1769413149565233
Best parameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.03, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 500, 'reg_alpha': 0.5, 'reg_lambda': 1.5, 'subsample': 0.6}
Best R2-score: 0.7259384548803822
Best MSE score: 4.1376642353500435
Best MAE score: 1.2737842804144137
Best RMSE score: 2.0341249311067506
