In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
import xgboost as xgb

In [2]:
# Load dataset (e.g. California Housing or Ames Housing)
# For this example, we'll use the California Housing dataset
from sklearn.datasets import fetch_california_housing
california = fetch_california_housing()
df = pd.DataFrame(california.data, columns=california.feature_names)
df['PRICE'] = california.target

In [3]:
# Handling missing values using an imputer
imputer = SimpleImputer(strategy='mean')
df_imputed = imputer.fit_transform(df)

In [4]:
# Scaling features using a standard scaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_imputed)

In [5]:
# Select top k features using ANOVA F-value
# Assume df_scaled is your feature array and df_imputed is your target array
X = df_scaled  # feature array
y = df_imputed  # target array

# Flatten the target array if it's 2D
if len(y.shape) > 1:
    y = np.argmax(y, axis=1)  # or y = y.ravel() depending on your needs

selector = SelectKBest(f_classif, k=5)
X_selected = selector.fit_transform(X, y)

# Convert the selected features back to a DataFrame
df_selected = pd.DataFrame(X_selected)

In [6]:
# Split data into training and testing sets
X = df_selected.iloc[:, :-1]  # access all rows and all columns except the last one
y = df_selected.iloc[:, -1]  # access all rows and the last column

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Define and train XGBoost model
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)

In [8]:
# Define and train Random Forest model
rf_model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [9]:
# Evaluate models using mean squared error and R2 score
y_pred_xgb = xgb_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f'XGBoost Model: MSE={mse_xgb:.2f}, R2={r2_xgb:.2f}')

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f'Random Forest Model: MSE={mse_rf:.2f}, R2={r2_rf:.2f}')

XGBoost Model: MSE=0.97, R2=0.03
Random Forest Model: MSE=0.91, R2=0.09


In [10]:
# Hyperparameter tuning using RandomizedSearchCV for both models
from sklearn.model_selection import RandomizedSearchCV

param_dist_xgb = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.5, 1],
    'n_estimators': [50, 100, 200]
}

random_search_xgb = RandomizedSearchCV(xgb.XGBRegressor(), param_dist_xgb, cv=5, scoring='neg_mean_squared_error', n_iter=10)
random_search_xgb.fit(X_train, y_train)

print(f'Best XGBoost Model: {random_search_xgb.best_params_}, MSE={random_search_xgb.best_score_:.2f}')

Best XGBoost Model: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}, MSE=-0.91


In [11]:
param_dist_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

random_search_rf = RandomizedSearchCV(RandomForestRegressor(), param_dist_rf, cv=5, scoring='neg_mean_squared_error', n_iter=10)
random_search_rf.fit(X_train, y_train)

print(f'Best Random Forest Model: {random_search_rf.best_params_}, MSE={random_search_rf.best_score_:.2f}')

Best Random Forest Model: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 7}, MSE=-0.91
