In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import root_mean_squared_log_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

In [32]:
# Directory holding the homework CSVs. Defined once so the notebook can be
# pointed at a different copy of the data by editing a single line.
DATA_DIR = "C:/Users/drmn_/Desktop/hw3-data"

# Three splits are read: a training split, a held-out dev split, and the test
# split for which predictions are produced at the end of the notebook.
train_data = pd.read_csv(f"{DATA_DIR}/my_train.csv")
dev_data = pd.read_csv(f"{DATA_DIR}/my_dev.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")

In [33]:
# Separate model inputs from the target. "Id" is only a row identifier and
# "SalePrice" is the quantity being predicted, so neither is used as a feature.
non_feature_cols = ["Id", "SalePrice"]

X_train = train_data.drop(columns=non_feature_cols)
X_dev = dev_data.drop(columns=non_feature_cols)
y_train, y_dev = train_data["SalePrice"], dev_data["SalePrice"]

# The test split is only stripped of "Id"; the ids are kept aside.
test_ids = test_data["Id"]
X_test = test_data.drop(columns=["Id"])

# Models are fit on the natural log of the price; predictions are mapped back
# with np.exp before scoring.
y_train_log, y_dev_log = np.log(y_train), np.log(y_dev)

In [36]:
# Partition the feature columns by dtype: int64/float64 columns go through the
# numeric branch, object columns through the categorical branch.
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Casting steps applied to the imputer output so the following step receives a
# single, uniform dtype.
num_type_transformer = FunctionTransformer(lambda values: values.astype(float))
cat_type_transformer = FunctionTransformer(lambda values: values.astype(str))

# Numeric branch: fill gaps with the column mean, cast to float, min-max scale.
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("type_cast", num_type_transformer),
    ("scaler", MinMaxScaler()),
])

# Categorical branch: fill gaps with the most frequent value, cast to str,
# one-hot encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("type_cast", cat_type_transformer),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its branch.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

# Baseline linear model; the step name "ridge" is what the grid-search
# parameter key "ridge__alpha" refers to.
ridge_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("ridge", Ridge()),
])

In [44]:
# Tune the Ridge penalty with 5-fold cross-validation.
#
# The target handed to fit() is already log(SalePrice), so ordinary RMSE on it
# is the log-scale error we want to minimise. Do not use
# 'neg_mean_squared_log_error' here: that scorer takes a logarithm of its
# inputs itself, so on a log target it would rank alphas by a log-of-log error
# that does not correspond to the dev RMSLE reported below.
ridge_params = {'ridge__alpha': [0.1, 1, 10, 50, 100]}
ridge_grid_search = GridSearchCV(
    ridge_pipeline,
    ridge_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
ridge_grid_search.fit(X_train, y_train_log)

# best_estimator_ is the pipeline refit with the winning alpha.
ridge_best_model = ridge_grid_search.best_estimator_

# Predictions are in log space; map back to prices before computing RMSLE
# against the untransformed dev target.
dev_predictions_ridge = np.exp(ridge_best_model.predict(X_dev))
dev_rmsle_ridge = root_mean_squared_log_error(y_dev, np.maximum(0, dev_predictions_ridge))
print(f"Ridge Best Alpha: {ridge_grid_search.best_params_['ridge__alpha']}")
print(f"Dev RMSLE with Ridge: {dev_rmsle_ridge}")

Ridge Best Alpha: 10
Dev RMSLE with Ridge: 0.13569028203977498


In [48]:
# Gradient boosting on top of the shared preprocessing. random_state is fixed
# so that re-running the notebook reproduces the same search result; the
# value itself is arbitrary.
gbr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('gbr', GradientBoostingRegressor(random_state=42))
])

# 3 x 3 x 3 = 27 candidate settings, each evaluated with 5-fold CV.
gbr_params = {
    'gbr__n_estimators': [100, 200, 500],
    'gbr__max_depth': [3, 5, 7],
    'gbr__learning_rate': [0.01, 0.1, 0.2]
}

# The target is already log(SalePrice), so RMSE on it is the log-scale error.
# 'neg_mean_squared_log_error' would apply a second logarithm to the log
# target and select hyperparameters on a metric unrelated to the dev RMSLE.
gbr_grid_search = GridSearchCV(
    gbr_pipeline,
    gbr_params,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
gbr_grid_search.fit(X_train, y_train_log)

# Pipeline refit with the best parameter combination.
gbr_best_model = gbr_grid_search.best_estimator_

# Back-transform log predictions to prices before scoring against y_dev.
dev_predictions_gbr = np.exp(gbr_best_model.predict(X_dev))
dev_rmsle_gbr = root_mean_squared_log_error(y_dev, np.maximum(0, dev_predictions_gbr))
print(f"Dev RMSLE with Gradient Boosting: {dev_rmsle_gbr}")

Dev RMSLE with Gradient Boosting: 0.12681002587797557


In [50]:
# LightGBM with fixed hyperparameters (no search), sharing the same
# preprocessing as the other models and trained on log prices.
# Pipeline.fit returns the fitted pipeline, so build-and-fit is chained.
lgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("lgbm", LGBMRegressor(max_depth=5, learning_rate=0.05, n_estimators=1000)),
]).fit(X_train, y_train_log)

# Map log-space predictions back to prices, then score against the raw target.
dev_predictions_lgb = np.exp(lgb_pipeline.predict(X_dev))
dev_rmsle_lgb = root_mean_squared_log_error(
    y_dev,
    np.maximum(0, dev_predictions_lgb),
)
print(f"Dev RMSLE with LightGBM: {dev_rmsle_lgb}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001613 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3280
[LightGBM] [Info] Number of data points in the train set: 1314, number of used features: 185
[LightGBM] [Info] Start training from score 12.029784
Dev RMSLE with LightGBM: 0.13039954232924225


In [54]:
# XGBoost with fixed hyperparameters (no search), sharing the same
# preprocessing as the other models and trained on log prices.
# Pipeline.fit returns the fitted pipeline, so build-and-fit is chained.
xgb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("xgb", XGBRegressor(max_depth=5, learning_rate=0.05, n_estimators=1000)),
]).fit(X_train, y_train_log)

# Map log-space predictions back to prices, then score against the raw target.
dev_predictions_xgb = np.exp(xgb_pipeline.predict(X_dev))
dev_rmsle_xgb = root_mean_squared_log_error(
    y_dev,
    np.maximum(0, dev_predictions_xgb),
)
print(f"Dev RMSLE with XGBoost: {dev_rmsle_xgb}")

Dev RMSLE with XGBoost: 0.13272748346554314


In [56]:
# Equal-weight ensemble: average the four models' dev predictions in price
# space (each array has already been back-transformed with np.exp).
dev_member_predictions = (
    dev_predictions_ridge,
    dev_predictions_gbr,
    dev_predictions_lgb,
    dev_predictions_xgb,
)
ensemble_dev_predictions = sum(dev_member_predictions) / 4

ensemble_rmsle = root_mean_squared_log_error(
    y_dev,
    np.maximum(0, ensemble_dev_predictions),
)
print(f"Dev RMSLE with Ensemble: {ensemble_rmsle}")

Dev RMSLE with Ensemble: 0.12168667535924281


In [58]:
# Candidate name -> (fitted estimator, dev RMSLE). The ensemble has no single
# estimator object, so its slot is None and it is assembled from the other
# entries when it wins.
models = {
    "Ridge": (ridge_best_model, dev_rmsle_ridge),
    "Gradient Boosting": (gbr_best_model, dev_rmsle_gbr),
    "LightGBM": (lgb_pipeline, dev_rmsle_lgb),
    "XGBoost": (xgb_pipeline, dev_rmsle_xgb),
    "Ensemble": (None, ensemble_rmsle),
}

# Lowest dev RMSLE wins.
best_model_name = min(models, key=lambda name: models[name][1])
print(f"Best Model: {best_model_name}")

# Every model predicts log(SalePrice); np.exp maps back to prices and is
# non-negative by construction, so no extra clipping is needed.
if best_model_name != "Ensemble":
    best_model = models[best_model_name][0]
    # np.exp, not a bare exp(): only numpy is imported, so the unqualified
    # name raised NameError whenever a single model was selected.
    test_predictions = np.exp(best_model.predict(X_test))
else:
    # Equal-weight average of the individual models' price predictions, in
    # the same order as they are listed in `models`.
    member_predictions = [
        np.exp(model.predict(X_test))
        for model, _ in models.values()
        if model is not None
    ]
    test_predictions = sum(member_predictions) / len(member_predictions)

Best Model: Ensemble


In [None]:
# Build the submission table (one row per test record: its Id and the
# predicted SalePrice) and write it out without the DataFrame index.
submission_columns = {"Id": test_data["Id"], "SalePrice": test_predictions}
final_predictions = pd.DataFrame(submission_columns)
final_predictions.to_csv(
    "C:/Users/drmn_/Desktop/hw3-data/final_predictions.csv",
    index=False,
)
print("Final predictions saved as final_predictions.csv")