In [56]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [58]:
train_data = pd.read_csv("C:/Users/drmn_/Desktop/hw3-data/my_train.csv")
dev_data = pd.read_csv("C:/Users/drmn_/Desktop/hw3-data/my_dev.csv")
test_data = pd.read_csv("C:/Users/drmn_/Desktop/hw3-data/test.csv")

X_train = train_data.drop(["Id", "SalePrice"], axis=1)
y_train = train_data["SalePrice"]
X_dev = dev_data.drop(["Id", "SalePrice"], axis=1)
y_dev = dev_data["SalePrice"]

test_ids = test_data["Id"]
X_test = test_data.drop(["Id"], axis=1)

In [59]:
y_train_log = np.log(y_train)
y_dev_log = np.log(y_dev) 

In [60]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

In [62]:
len(num_cols) + len(cat_cols)

79

In [65]:
num_type_transformer = FunctionTransformer(lambda x: x.astype(float))
cat_type_transformer = FunctionTransformer(lambda x: x.astype(str))

In [66]:
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("type_cast", cat_type_transformer), 
    ("onehot", OneHotEncoder(handle_unknown="ignore"))  
])

In [68]:
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")), 
    ("type_cast", num_type_transformer), 
    ("scaler", StandardScaler()),      
])

In [69]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, num_cols),  
        ("cat", categorical_transformer, cat_cols), 
    ])

In [70]:
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),           
    ("regressor", Ridge())                  
])

In [72]:
param_grid = {
    "regressor__alpha": [0.1, 1, 10, 100, 1000],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="neg_root_mean_squared_log_error")
grid_search.fit(X_train, y_train_log)

In [75]:
print("Best alpha:", grid_search.best_params_)
print("Best score (negative MRSLE):", grid_search.best_score_)

dev_score = grid_search.score(X_dev, y_dev)
print("Test set score (negative MRSLE):", dev_score)

Best alpha: {'regressor__alpha': 10}
Best score (negative MRSLE): -0.01091230282383071
Test set score (negative MRSLE): -9.414798826884612


In [76]:
predictions = grid_search.best_estimator_.predict(X_test)
print("Predictions for new set (X_test):", predictions[:10]) 


Predictions for new set (X_test): [11.64099556 11.87495746 12.03726579 12.1556195  12.21197781 12.03424478
 12.14277437 11.99646594 12.14556737 11.71849086]


In [79]:
ridge_model = grid_search.best_estimator_.named_steps['regressor']
preprocessor = grid_search.best_estimator_.named_steps['preprocessor']
numerical_features_transformed = num_cols
categorical_features_transformed = preprocessor.named_transformers_["cat"].named_steps["onehot"].get_feature_names_out(cat_cols)
feature_names = np.concatenate([numerical_features_transformed, categorical_features_transformed])
weights = ridge_model.coef_
sorted_indices = np.argsort(weights)
most_negative_features = feature_names[sorted_indices[:10]]
most_positive_features = feature_names[sorted_indices[-10:]]
print("\nTop 10 most positive features:")

top_positive_features = sorted(zip(weights, feature_names), reverse=True)[:10]
top_negative_features = sorted(zip(weights, feature_names))[:10]

for coef, feature in top_positive_features:
    print(f"{feature}")

print("\nTop 10 most negative features:")
for coef, feature in top_negative_features:
    print(f"{feature}")


Top 10 most positive features:
Neighborhood_Crawfor
Neighborhood_StoneBr
OverallQual
Neighborhood_NridgHt
Functional_Typ
Exterior1st_BrkFace
Neighborhood_Somerst
MSZoning_RL
GrLivArea
Neighborhood_NoRidge

Top 10 most negative features:
MSZoning_C (all)
RoofMatl_ClyTile
Condition2_PosN
Neighborhood_Edwards
Functional_Maj2
Neighborhood_MeadowV
SaleCondition_Abnorml
BldgType_Twnhs
SaleType_WD
Foundation_Slab


In [80]:
train_predictions_log = grid_search.predict(X_train)
train_predictions = np.expm1(train_predictions_log)
train_rmsle = root_mean_squared_log_error(y_train, train_predictions)
print("Train RMSLE:", train_rmsle)
dev_predictions_log = grid_search.predict(X_dev) 
dev_predictions = np.expm1(dev_predictions_log)
dev_rmsle = root_mean_squared_log_error(y_dev, dev_predictions)
print("Dev RMSLE:", dev_rmsle)

Train RMSLE: 0.11136467152842583
Dev RMSLE: 0.12841107483259093


In [81]:
log_base_housing_price = ridge_model.intercept_
base_housing_price = np.exp(log_base_housing_price)
print("Log Base Housing Price:", log_base_housing_price)
print("Base Housing Price:", base_housing_price)

Log Base Housing Price: 11.733620480921573
Base Housing Price: 124694.30748183945


In [82]:
test_pred_log = grid_search.predict(X_test)
test_pred = np.exp(test_pred_log)
ids = test_data['Id']
df_csv_file = pd.DataFrame({
    'Id': ids,
    'SalePrice': test_pred
})
df_csv_file.to_csv('Pred_HW3PR4_Ridge_Smart.csv', index=False)
print("Pred_HW3PR4_Ridge_Smart.csv file saved successfully")

Pred_HW3PR4_Ridge_Smart.csv file saved successfully
