In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, FunctionTransformer, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import root_mean_squared_log_error
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
# Directory that holds the homework CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line
# instead of three separate read_csv calls.
DATA_DIR = "C:/Users/drmn_/Desktop/hw3-data"

train_data = pd.read_csv(f"{DATA_DIR}/my_train.csv")
dev_data = pd.read_csv(f"{DATA_DIR}/my_dev.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")

# "SalePrice" is the regression target and "Id" is not used as a feature,
# so both are removed from the design matrices of the labelled splits.
NON_FEATURE_COLS = ["Id", "SalePrice"]

X_train = train_data.drop(columns=NON_FEATURE_COLS)
y_train = train_data["SalePrice"]
X_dev = dev_data.drop(columns=NON_FEATURE_COLS)
y_dev = dev_data["SalePrice"]

# The test file is only stripped of "Id" here (no "SalePrice" is dropped);
# the ids are kept aside for the submission file.
test_ids = test_data["Id"]
X_test = test_data.drop(columns=["Id"])

In [4]:
# Natural-log versions of the train and dev targets; the model is fitted on
# this scale and its predictions are mapped back with np.exp before scoring.
y_train_log, y_dev_log = (np.log(prices) for prices in (y_train, y_dev))

In [5]:
# Partition the feature columns by dtype family instead of by exact dtype
# name. Listing only 'int64' / 'float64' / 'object' would leave out any
# column stored with another dtype (int32, bool, category, ...), and a column
# that is in neither list never reaches the ColumnTransformer, which drops
# unassigned columns by default. include/exclude on the same selector
# guarantees every column lands in exactly one of the two lists.
num_cols = X_train.select_dtypes(include="number").columns.tolist()
cat_cols = X_train.select_dtypes(exclude="number").columns.tolist()

In [6]:
# Combined size of the two column lists, shown as the cell output.
len(num_cols + cat_cols)

79

In [7]:
def _cast_to_float(values):
    """Return ``values`` converted to ``float`` through its ``astype`` method."""
    return values.astype(float)


def _cast_to_str(values):
    """Return ``values`` converted to ``str`` through its ``astype`` method."""
    return values.astype(str)


# Named functions are used instead of lambdas: scikit-learn documents that a
# FunctionTransformer wrapping a lambda cannot be pickled, which would make
# it impossible to persist the pipeline that contains these steps.
num_type_transformer = FunctionTransformer(_cast_to_float)
cat_type_transformer = FunctionTransformer(_cast_to_str)

In [8]:
# Categorical branch: fill missing entries with the most frequent value of
# the column, cast everything to str, then one-hot encode with
# handle_unknown="ignore".
_categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("type_cast", cat_type_transformer),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(_categorical_steps)

In [9]:
# Numeric branch: fill missing entries with the column mean, cast to float,
# then rescale with MinMaxScaler.
_numerical_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("type_cast", num_type_transformer),
    ("scaler", MinMaxScaler()),
]
numerical_transformer = Pipeline(_numerical_steps)

In [10]:
# Send each column group through its own sub-pipeline. The "num" entry is
# listed before "cat"; the feature-name code further down relies on that
# order when it concatenates the two name arrays.
preprocessor = ColumnTransformer(
    [
        ("num", numerical_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

In [11]:
# End-to-end estimator: preprocessing followed by ordinary least squares.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

In [41]:
# fit_transform rather than transform: in top-to-bottom order this cell comes
# before model.fit(...), so the preprocessor is still unfitted at this point
# and a bare transform() would raise NotFittedError after Restart & Run All.
# Fitting here is harmless because model.fit() refits the same preprocessor
# on the same X_train.
transformed_features = model.named_steps['preprocessor'].fit_transform(X_train)

# Width of the design matrix after scaling and one-hot encoding.
n_features = transformed_features.shape[1]
n_features

286

In [45]:
# Number of columns routed to the "num" branch of the ColumnTransformer.
len(num_cols)

36

In [53]:
# Total number of distinct values over all categorical columns. nunique()
# uses its default settings, so missing entries are not counted.
unique_counts = int(X_train[cat_cols].nunique().sum())
unique_counts

250

In [12]:
# Fit preprocessing + regression on the training split against log prices.
model.fit(X=X_train, y=y_train_log)

In [13]:
def _evaluate_rmsle(features, prices):
    """Predict with ``model`` and score the result against ``prices``.

    Returns a tuple ``(pred_log, pred, rmsle)``: the raw model output, the
    same output passed through ``np.exp``, and the root mean squared log
    error between ``prices`` and the exponentiated predictions.
    """
    pred_log = model.predict(features)
    pred = np.exp(pred_log)
    return pred_log, pred, root_mean_squared_log_error(prices, pred)


train_pred_log, train_pred, train_rmsle = _evaluate_rmsle(X_train, y_train)
print("Train RMSLE:", train_rmsle)

dev_pred_log, dev_pred, dev_rmsle = _evaluate_rmsle(X_dev, y_dev)
print("Dev RMSLE:", dev_rmsle)

Train RMSLE: 0.09264409673402366
Dev RMSLE: 0.12393728873166943


In [14]:
# The fitted regressor's intercept lives on the log-price scale because the
# model was trained on log targets; np.exp maps it back to a price.
LR_model = model.named_steps['regressor']
log_base_housing_price = LR_model.intercept_
base_housing_price = np.exp(log_base_housing_price)

for _label, _value in (
    ("Log Base Housing Price:", log_base_housing_price),
    ("Base Housing Price:", base_housing_price),
):
    print(_label, _value)

Log Base Housing Price: 9.933571371764549
Base Housing Price: 20610.818189547794


In [15]:
# Rebuild the expanded feature names from the fitted sub-pipelines: names
# from the numeric scaler first, then names from the one-hot encoder.
_num_branch = preprocessor.named_transformers_["num"]
_cat_branch = preprocessor.named_transformers_["cat"]
numerical_features_transformed = _num_branch.named_steps["scaler"].get_feature_names_out(num_cols)
categorical_features_transformed = _cat_branch.named_steps["onehot"].get_feature_names_out(cat_cols)
feature_names = np.concatenate(
    (numerical_features_transformed, categorical_features_transformed)
)

weights = LR_model.coef_

# Index-based ranking of the coefficients (ascending).
sorted_indices = np.argsort(weights)
most_negative_features = feature_names[sorted_indices[:10]]
most_positive_features = feature_names[sorted_indices[-10:]]

# (weight, name) pairs ranked by weight; these drive the printed lists.
top_positive_features = sorted(zip(weights, feature_names), reverse=True)[:10]
top_negative_features = sorted(zip(weights, feature_names))[:10]

for _heading, _ranked in (
    ("\nTop 10 most positive features:", top_positive_features),
    ("\nTop 10 most negative features:", top_negative_features),
):
    print(_heading)
    for _coef, feature in _ranked:
        print(f"{feature}")


Top 10 most positive features:
RoofMatl_Membran
LotArea
GrLivArea
1stFlrSF
RoofMatl_Metal
Condition2_PosA
TotalBsmtSF
BsmtFinSF1
OverallQual
RoofStyle_Shed

Top 10 most negative features:
RoofMatl_ClyTile
Condition2_PosN
Condition2_RRAe
MSZoning_C (all)
GarageCond_Ex
Functional_Sev
Functional_Maj2
MiscVal
Exterior1st_BrkComm
MiscFeature_TenC


In [37]:
# Name of the submission file, shared by the write and the confirmation
# message so the two cannot drift apart.
_submission_path = 'Pred_HW3PR3_SmartBinarization.csv'

# Predict on the log scale and map back to prices with np.exp.
test_pred_log = model.predict(X_test)
test_pred = np.exp(test_pred_log)

ids = test_data['Id']
df_csv_file = pd.DataFrame({'Id': ids, 'SalePrice': test_pred})
df_csv_file.to_csv(_submission_path, index=False)
print(f"{_submission_path} file saved successfully")

Pred_HW3PR3_SmartBinarization.csv file saved successfully
