# 1. Dataset and Setup


In [26]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import VarianceThreshold, RFECV, SelectKBest, f_regression, SelectFromModel
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error

# Load the dataset
url = "https://drive.google.com/file/d/1Je0e4sj5uEh2f86t8SeQweFcwvO6XLjX/view?usp=drive_link"
path = "https://drive.google.com/uc?export=download&id=" + url.split("/")[-2]
data = pd.read_csv(path, index_col="Id")

# Define target and features
y = data["SalePrice"]
X = data.drop(columns=["SalePrice"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Separate numerical and categorical columns
X_cat = X_train.select_dtypes(exclude="number").copy()
X_num = X_train.select_dtypes(include="number").copy()

# Build preprocessing pipelines
numeric_pipe = make_pipeline(SimpleImputer(strategy="mean"))
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=False)
).set_output(transform="pandas")  # Preserve feature names

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipe, X_num.columns),
        ("cat_pipe", categoric_pipe, X_cat.columns),
    ],
    verbose_feature_names_out=False
).set_output(transform="pandas")  # Preserve feature names

# Preprocess and scale the data
scaler = StandardScaler()
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

X_train_scaled = scaler.fit_transform(X_train_preprocessed)
X_test_scaled = scaler.transform(X_test_preprocessed)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_preprocessed.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_preprocessed.columns)

# Initialize performances DataFrame
performances = pd.DataFrame()


# 2. Baseline Models

In [22]:
# Baseline Models
tree_pipeline = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=123))
tree_pipeline.fit(X_train, y_train)
tree_predictions = tree_pipeline.predict(X_test)
performances.loc["Baseline R²", "Decision Tree"] = r2_score(y_test, tree_predictions)

knn_pipeline = make_pipeline(preprocessor, KNeighborsRegressor(n_neighbors=1))
knn_pipeline.fit(X_train, y_train)
knn_predictions = knn_pipeline.predict(X_test)
performances.loc["Baseline R²", "KNN"] = r2_score(y_test, knn_predictions)


# 3. Feature Selection and R² Comparison

In [25]:
# Variance Threshold
selector = VarianceThreshold(threshold=0.02)
X_train_var = selector.fit_transform(X_train_scaled)
X_test_var = selector.transform(X_test_scaled)

var_tree = DecisionTreeRegressor(random_state=123)
var_tree.fit(X_train_var, y_train)
performances.loc["varThreshold_0_02", "Decision Tree"] = r2_score(y_test, var_tree.predict(X_test_var))

var_knn = KNeighborsRegressor(n_neighbors=1)
var_knn.fit(X_train_var, y_train)
performances.loc["varThreshold_0_02", "KNN"] = r2_score(y_test, var_knn.predict(X_test_var))

# Collinearity
corr_matrix = X_train_scaled.corr().abs()
correlation_threshold = 0.95

# Adjust correlation threshold
correlation_threshold = 0.99  # More lenient threshold to retain more features

# Recalculate correlation matrix
corr_matrix = pd.DataFrame(X_train_scaled, columns=X_train_preprocessed.columns).corr().abs()

# Find columns to drop
to_drop = set()
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if corr_matrix.iloc[i, j] >= correlation_threshold:
            to_drop.add(corr_matrix.columns[j])

# Drop the identified columns
X_train_corr = pd.DataFrame(X_train_scaled, columns=X_train_preprocessed.columns).drop(columns=to_drop, errors="ignore")
X_test_corr = pd.DataFrame(X_test_scaled, columns=X_train_preprocessed.columns).drop(columns=to_drop, errors="ignore")

# Ensure columns remain after dropping
if X_train_corr.shape[1] == 0 or X_test_corr.shape[1] == 0:
    raise ValueError("Too many features dropped. Adjust the correlation threshold or review feature engineering.")

print(f"Columns retained after collinearity filtering: {X_train_corr.shape[1]}")

# Fit models on the reduced dataset
col_tree = DecisionTreeRegressor(random_state=123)
col_tree.fit(X_train_corr, y_train)
performances.loc["Collinearity", "Decision Tree"] = r2_score(y_test, col_tree.predict(X_test_corr))

col_knn = KNeighborsRegressor(n_neighbors=1)
col_knn.fit(X_train_corr, y_train)
performances.loc["Collinearity", "KNN"] = r2_score(y_test, col_knn.predict(X_test_corr))

# Select KBest
KBest = SelectKBest(score_func=f_regression, k=10)
X_train_KBest = KBest.fit_transform(X_train_scaled, y_train)
X_test_KBest = KBest.transform(X_test_scaled)

kbest_tree = DecisionTreeRegressor(random_state=123)
kbest_tree.fit(X_train_KBest, y_train)
performances.loc["KBest_10", "Decision Tree"] = r2_score(y_test, kbest_tree.predict(X_test_KBest))

kbest_knn = KNeighborsRegressor(n_neighbors=1)
kbest_knn.fit(X_train_KBest, y_train)
performances.loc["KBest_10", "KNN"] = r2_score(y_test, kbest_knn.predict(X_test_KBest))

# RFE
rfe_tree = RFECV(estimator=DecisionTreeRegressor(random_state=123), step=1, cv=5, scoring="r2", n_jobs=-1)
rfe_tree.fit(X_train_scaled, y_train)
performances.loc["RFE", "Decision Tree"] = r2_score(y_test, rfe_tree.predict(X_test_scaled))

# SelectFromModel
select_model_tree = SelectFromModel(DecisionTreeRegressor(random_state=123), threshold=None)
X_train_selected = select_model_tree.fit_transform(X_train_scaled, y_train)
X_test_selected = select_model_tree.transform(X_test_scaled)

model_tree = DecisionTreeRegressor(random_state=123)
model_tree.fit(X_train_selected, y_train)
performances.loc["Model Selected", "Decision Tree"] = r2_score(y_test, model_tree.predict(X_test_selected))

model_knn = KNeighborsRegressor(n_neighbors=1)
model_knn.fit(X_train_selected, y_train)
performances.loc["Model Selected", "KNN"] = r2_score(y_test, model_knn.predict(X_test_selected))

# Display final performances
print("\nFinal Performances:")
print(performances)

Columns retained after collinearity filtering: 289

Final Performances:
                   Decision Tree       KNN
Baseline R²             0.704384  0.461212
varThreshold_0_02       0.704384  0.540624
Collinearity            0.723160  0.528993
KBest_10                0.767766  0.770957
RFE                     0.741450       NaN
Model Selected          0.702761  0.790710


# 4. Final - lets tune some parameters

In [29]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor


# Define target and features
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Preprocessing
nums = X.select_dtypes(include='number').columns
cats = X.select_dtypes(exclude='number').columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipe', make_pipeline(SimpleImputer(strategy='mean'), StandardScaler()), nums),
        ('cat_pipe', make_pipeline(SimpleImputer(strategy='constant', fill_value='N_A'),
                                   OneHotEncoder(handle_unknown='ignore')), cats)
    ]
)

# Feature Selection (KBest)
kbest = SelectKBest(score_func=f_regression, k=10)
X_train_selected = kbest.fit_transform(preprocessor.fit_transform(X_train), y_train)
X_test_selected = kbest.transform(preprocessor.transform(X_test))

# Model Pipelines
models = {
    'Decision Tree': DecisionTreeRegressor(max_depth=10, random_state=123),
    'Random Forest': RandomForestRegressor(n_estimators=100, max_depth=15, random_state=123),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, random_state=123),
    'Linear Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(n_neighbors=5)
}

# Hyperparameter Grid for Tuning
param_grids = {
    'Decision Tree': {'max_depth': [5, 10, 15], 'min_samples_split': [10, 20, 30]},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20], 'min_samples_split': [10, 20]},
    'Gradient Boosting': {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1, 0.2]}
}

# Evaluate Models and Tune Parameters
results = pd.DataFrame(columns=['Model', 'MAE', 'MAPE', 'RMSE', 'R2'])

for name, model in models.items():
    if name in param_grids:
        grid = GridSearchCV(model, param_grids[name], cv=5, scoring='r2', n_jobs=-1, verbose=1)
        grid.fit(X_train_selected, y_train)
        best_model = grid.best_estimator_
    else:
        best_model = model
        best_model.fit(X_train_selected, y_train)

    # Predictions
    y_pred = best_model.predict(X_test_selected)

    # Evaluate
    result = pd.DataFrame({
        'Model': [name],
        'MAE': [mean_absolute_error(y_test, y_pred)],
        'MAPE': [mean_absolute_percentage_error(y_test, y_pred)],
        'RMSE': [mean_squared_error(y_test, y_pred, squared=False)],
        'R2': [r2_score(y_test, y_pred)]
    })

    results = pd.concat([results, result], ignore_index=True)

# Sort Results
results = results.sort_values(by='R2', ascending=False)

print("\nFinal Results:")
results


Fitting 5 folds for each of 9 candidates, totalling 45 fits


  results = pd.concat([results, result], ignore_index=True)


Fitting 5 folds for each of 8 candidates, totalling 40 fits




Fitting 5 folds for each of 6 candidates, totalling 30 fits

Final Results:




Unnamed: 0,Model,MAE,MAPE,RMSE,R2
2,Gradient Boosting,18275.934038,0.107157,26635.585303,0.885194
1,Random Forest,18912.916214,0.108262,28371.247223,0.869745
4,KNN,20103.245205,0.115635,30521.532171,0.849252
3,Linear Regression,21125.488443,0.121032,32777.5486,0.826143
0,Decision Tree,22969.836332,0.126838,35004.001261,0.801722


### Gradient Boosting: Best performing model overall with the highest R2 (0.885) and lowest error metrics.
This model effectively balances bias and variance and is robust to noisy features.

# 4. Competiton on Kaggle
https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques

## First attempt

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load data from Google Drive
url_train = "https://drive.google.com/file/d/17_B-sKEO8xJdYJHp4Gy7RaHz_egmII-n/view?usp=drive_link"
path_train = "https://drive.google.com/uc?export=download&id=" + url_train.split("/")[-2]
train_data = pd.read_csv(path_train, index_col="Id")

url_test = "https://drive.google.com/file/d/1veq1UDvdGjv1L1d1Neasxu-B4dzh1ch9/view?usp=drive_link"
path_test = "https://drive.google.com/uc?export=download&id=" + url_test.split("/")[-2]
test_data = pd.read_csv(path_test, index_col="Id")

# Separate target and features
y = train_data['SalePrice']
X = train_data.drop(columns=['SalePrice'])
test_ids = test_data.index
X_test_final = test_data

# Log-transform the target variable
y = np.log1p(y)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
num_cols = X.select_dtypes(include=['number']).columns
cat_cols = X.select_dtypes(exclude=['number']).columns

# Define preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Models to compare
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'KNN': KNeighborsRegressor(),
    'Linear Regression': LinearRegression()
}

# Results DataFrame
results = pd.DataFrame(columns=['Model', 'MAE', 'MAPE', 'RMSE', 'R2'])

# Train and evaluate models
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict
    y_pred = pipeline.predict(X_val)

    # Evaluate
    results = pd.concat([
        results,
        pd.DataFrame({
            'Model': [name],
            'MAE': [mean_absolute_error(np.expm1(y_val), np.expm1(y_pred))],
            'MAPE': [mean_absolute_percentage_error(np.expm1(y_val), np.expm1(y_pred))],
            'RMSE': [mean_squared_error(np.expm1(y_val), np.expm1(y_pred), squared=False)],
            'R2': [r2_score(np.expm1(y_val), np.expm1(y_pred))]
        })
    ])

# Sort results by RMSE
results = results.sort_values(by='RMSE')
print("Model Comparison:")
print(results)

# Final Model: Gradient Boosting
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])

# Train on full training data
final_pipeline.fit(X, y)

# Predict on test data
test_predictions = np.expm1(final_pipeline.predict(X_test_final))

# Prepare submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})

# Save predictions to a CSV file
output_file = "submission.csv"
submission.to_csv(output_file, index=False)

print(f"Submission saved to {output_file}")

  results = pd.concat([


Model Comparison:
               Model           MAE      MAPE          RMSE        R2
0  Linear Regression  15064.954938  0.092164  22903.214167  0.931612
0  Gradient Boosting  16490.319504  0.095054  27474.538596  0.901588
0      Random Forest  17383.412602  0.102784  29027.989423  0.890145
0                KNN  22354.178099  0.129299  38787.178196  0.803862
0      Decision Tree  25694.613014  0.149597  40488.536315  0.786277
Submission saved to submission.csv


In [8]:
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**score 0.13364**

## Second attempt
 have updated your code to include improvements, including a grid search for hyperparameter tuning of the Gradient Boosting model, which optimizes the evaluation metric as per the competition's requirements.

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load data from Google Drive
url_train = "https://drive.google.com/file/d/17_B-sKEO8xJdYJHp4Gy7RaHz_egmII-n/view?usp=drive_link"
path_train = "https://drive.google.com/uc?export=download&id=" + url_train.split("/")[-2]
train_data = pd.read_csv(path_train, index_col="Id")

url_test = "https://drive.google.com/file/d/1veq1UDvdGjv1L1d1Neasxu-B4dzh1ch9/view?usp=drive_link"
path_test = "https://drive.google.com/uc?export=download&id=" + url_test.split("/")[-2]
test_data = pd.read_csv(path_test, index_col="Id")

# Separate target and features
y = train_data['SalePrice']
X = train_data.drop(columns=['SalePrice'])
test_ids = test_data.index
X_test_final = test_data

# Log-transform the target variable
y = np.log1p(y)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
num_cols = X.select_dtypes(include=['number']).columns
cat_cols = X.select_dtypes(exclude=['number']).columns

# Define preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Models to compare
models = {
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'KNN': KNeighborsRegressor(),
    'Linear Regression': LinearRegression()
}

# Results DataFrame
results = pd.DataFrame(columns=['Model', 'MAE', 'MAPE', 'RMSE', 'R2'])

# Train and evaluate models
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Predict
    y_pred = pipeline.predict(X_val)

    # Evaluate
    results = pd.concat([
        results,
        pd.DataFrame({
            'Model': [name],
            'MAE': [mean_absolute_error(np.expm1(y_val), np.expm1(y_pred))],
            'MAPE': [mean_absolute_percentage_error(np.expm1(y_val), np.expm1(y_pred))],
            'RMSE': [mean_squared_error(np.expm1(y_val), np.expm1(y_pred), squared=False)],
            'R2': [r2_score(np.expm1(y_val), np.expm1(y_pred))]
        })
    ])

# Sort results by RMSE
results = results.sort_values(by='RMSE')
print("Model Comparison:")
print(results)

# Final Model: Gradient Boosting
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])

# Grid Search for Gradient Boosting
gb_param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 1.0]
}
gb_grid_search = GridSearchCV(final_pipeline, gb_param_grid, scoring='neg_root_mean_squared_error', cv=5, verbose=2, n_jobs=-1)

gb_grid_search.fit(X, y)
print("Best Parameters for Gradient Boosting:", gb_grid_search.best_params_)

# Predict on test data using best estimator
test_predictions = np.expm1(gb_grid_search.best_estimator_.predict(X_test_final))

# Prepare submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})

# Save predictions to a CSV file
output_file = "submission.csv"
submission.to_csv(output_file, index=False)

print(f"Submission saved to {output_file}")

  results = pd.concat([


Model Comparison:
               Model           MAE      MAPE          RMSE        R2
0  Linear Regression  15064.954938  0.092164  22903.214167  0.931612
0  Gradient Boosting  16490.319504  0.095054  27474.538596  0.901588
0      Random Forest  17383.412602  0.102784  29027.989423  0.890145
0                KNN  22354.178099  0.129299  38787.178196  0.803862
0      Decision Tree  25694.613014  0.149597  40488.536315  0.786277
Fitting 5 folds for each of 54 candidates, totalling 270 fits




Best Parameters for Gradient Boosting: {'model__learning_rate': 0.05, 'model__max_depth': 3, 'model__n_estimators': 300, 'model__subsample': 0.8}
Submission saved to submission.csv


In [4]:
from google.colab import files
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Score 0.13026**

## Third attepmpt

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import StackingRegressor

# Load data from Google Drive
url_train = "https://drive.google.com/file/d/17_B-sKEO8xJdYJHp4Gy7RaHz_egmII-n/view?usp=drive_link"
path_train = "https://drive.google.com/uc?export=download&id=" + url_train.split("/")[-2]
train_data = pd.read_csv(path_train, index_col="Id")

url_test = "https://drive.google.com/file/d/1veq1UDvdGjv1L1d1Neasxu-B4dzh1ch9/view?usp=drive_link"
path_test = "https://drive.google.com/uc?export=download&id=" + url_test.split("/")[-2]
test_data = pd.read_csv(path_test, index_col="Id")

# Separate target and features
y = train_data['SalePrice']
X = train_data.drop(columns=['SalePrice'])
test_ids = test_data.index
X_test_final = test_data

# Log-transform the target variable
y = np.log1p(y)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical columns
num_cols = X.select_dtypes(include=['number']).columns
cat_cols = X.select_dtypes(exclude=['number']).columns

# Define preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='N/A')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

# Gradient Boosting Hyperparameter Tuning
param_grid = {
    'model__n_estimators': [100, 300, 500],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.05, 0.1, 0.2],
    'model__subsample': [0.8, 1.0]
}

gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(random_state=42))
])

grid_search = GridSearchCV(estimator=gb_pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Extract best parameters
best_gb_params = {
    'n_estimators': grid_search.best_params_['model__n_estimators'],
    'max_depth': grid_search.best_params_['model__max_depth'],
    'learning_rate': grid_search.best_params_['model__learning_rate'],
    'subsample': grid_search.best_params_['model__subsample']
}

print("Best Parameters for Gradient Boosting:", best_gb_params)

# Stacking Regressor
stacking_regressor = StackingRegressor(
    estimators=[
        ('linear', Pipeline([
            ('preprocessor', preprocessor),
            ('model', LinearRegression())
        ])),
        ('random_forest', Pipeline([
            ('preprocessor', preprocessor),
            ('model', RandomForestRegressor(n_estimators=100, random_state=42))
        ])),
        ('gradient_boosting', Pipeline([
            ('preprocessor', preprocessor),
            ('model', GradientBoostingRegressor(**best_gb_params, random_state=42))
        ]))
    ],
    final_estimator=GradientBoostingRegressor(n_estimators=100, random_state=42)
)

# Train the ensemble model
stacking_regressor.fit(X_train, y_train)

# Evaluate on validation data
y_val_pred = stacking_regressor.predict(X_val)

# Calculate RMSE
rmse = mean_squared_error(np.expm1(y_val), np.expm1(y_val_pred), squared=False)
print(f"Final RMSE on validation set: {rmse:.5f}")

# Predict on Test Data
stacking_regressor.fit(X, y)  # Train on the full dataset
test_predictions = np.expm1(stacking_regressor.predict(X_test_final))

# Save submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': test_predictions
})


# Save predictions to a CSV file
output_file = "submission.csv"
submission.to_csv(output_file, index=False)

print(f"Submission saved to {output_file}")


Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [None]:
from google.colab import files
files.download("submission.csv")