## Preprocessing

In [2]:
import pickle
from typing import Tuple
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

def load_data(file_path: str) -> Tuple[pd.DataFrame, pd.Series]:
    """Read the housing CSV and split it into features and a log-scaled target.

    Args:
        file_path: Path to a CSV file containing at least the ``Id`` and
            ``SalePrice`` columns.

    Returns:
        A ``(X, y)`` pair where ``X`` is the frame without the ``Id`` and
        ``SalePrice`` columns and ``y`` is the ``SalePrice`` column
        transformed with ``log1p`` (a Series, not a DataFrame).

    Raises:
        KeyError: If ``Id`` or ``SalePrice`` is missing from the file.
    """
    df = pd.read_csv(file_path)
    # The target is returned on the log1p scale; use np.expm1 to map
    # predictions back to the original price scale.
    y = np.log1p(df["SalePrice"])
    # Drop the row identifier and the target in one step, without mutating df.
    X = df.drop(columns=["Id", "SalePrice"])
    return X, y

In [24]:
def preprocess_data_pandas(X: pd.DataFrame) -> ColumnTransformer:
    """Build an unfitted ColumnTransformer based on the column dtypes of ``X``.

    Columns with dtype ``int64``/``float64`` are median-imputed, standardized
    and then Yeo-Johnson transformed. Columns with dtype ``object`` are imputed
    with the most frequent value and one-hot encoded into a dense array;
    categories not seen during fitting are ignored at transform time.

    Args:
        X: Frame whose dtypes decide which columns go to which branch.

    Returns:
        The assembled transformer. It is not fitted here.
    """
    numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns
    categorical_cols = X.select_dtypes(include=["object"]).columns

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("power_transform", PowerTransformer(method="yeo-johnson")),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]

    return ColumnTransformer([
        ("num", Pipeline(numeric_steps), numeric_cols),
        ("cat", Pipeline(categorical_steps), categorical_cols),
    ])

In [25]:
# Load the training CSV; y is the log1p-transformed SalePrice returned by load_data.
X, y = load_data("../data/train.csv")
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Build the dtype-based preprocessor from the training columns, then fit it on the
# training split only and keep the transformed training matrix for the model below.
preprocessor = preprocess_data_pandas(X_train)
X_train_processed = preprocessor.fit_transform(X_train)

In [27]:
# Gradient-boosted regressor with a squared-error objective, trained on the
# already-preprocessed training matrix (target is on the log1p scale).
model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train_processed, y_train)

In [28]:
# Define the model pipeline: raw features -> preprocessor -> regressor.
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", model)
])
# Fit the pipeline as a unit on the raw training split. Without this the
# pipeline only predicts because its steps happened to be fitted separately
# beforehand, which is easy to break when cells are run out of order.
model_pipeline.fit(X_train, y_train)


In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out split and report the root mean squared error.
# NOTE: y_test comes from load_data, which applies log1p to SalePrice, so this
# error is measured on the log scale rather than in price units.
y_pred = model_pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

RMSE: 0.15


## Test

In [3]:
# Reload the training CSV; y is the log1p-transformed SalePrice returned by load_data.
X, y = load_data("../data/train.csv")
# Same 80/20 split as before; the fixed random_state keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
import pickle

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

def preprocess_data(X: pd.DataFrame) -> ColumnTransformer:
    """Create an unfitted preprocessing transformer for the columns of ``X``.

    Numeric columns (dtype ``int64`` or ``float64``) go through median
    imputation, standard scaling and a Yeo-Johnson power transform.
    Categorical columns (dtype ``object``) go through most-frequent imputation
    and dense one-hot encoding that ignores categories unseen during fitting.

    Args:
        X: Frame used only to pick the numeric and categorical column names.

    Returns:
        A ColumnTransformer combining both branches; fitting is left to the caller.
    """
    numeric_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("power_transform", PowerTransformer(method="yeo-johnson")),
    ])
    categorical_branch = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    # Column selection is purely dtype-driven.
    numeric_columns = X.select_dtypes(include=["int64", "float64"]).columns
    categorical_columns = X.select_dtypes(include=["object"]).columns

    return ColumnTransformer(transformers=[
        ("num", numeric_branch, numeric_columns),
        ("cat", categorical_branch, categorical_columns),
    ])


In [5]:
# Build the dtype-based preprocessor from the training columns (not fitted yet).
preprocessor = preprocess_data(X_train)

# Baseline estimator for this experiment.
model = LinearRegression()
# Alternative estimator kept for quick swapping:
# model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42)

# Chain preprocessing and the estimator so both are fitted together on raw features.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fits the preprocessor and the model on the training split in one call.
pipeline.fit(X_train, y_train)

In [7]:
# Predictions are on the log1p(SalePrice) scale used for training (see load_data);
# apply np.expm1 to convert them back to prices.
pipeline.predict(X_test)

array([11.93834519, 12.73430031, 11.53841002, 12.03392469, 12.67760525,
       11.28597933, 12.41306258, 11.88981609, 11.24059314, 11.88137112,
       11.88798593, 11.62622806, 11.34888384, 12.26268973, 12.05656154,
       11.78610476, 12.14342162, 11.75814812, 11.64668877, 12.28192875,
       11.95067032, 12.22499299, 12.06575398, 11.75497591, 12.18482217,
       11.91945922, 12.15961044, 11.61575071, 12.10287814, 12.20678009,
       12.0717141 , 12.54461561, 12.21939745, 11.66156138, 12.44964939,
       11.91571551, 11.81008061, 12.20350586, 12.69795252, 11.57370289,
       11.7285082 , 12.27208953, 11.61362128, 12.82171652, 11.76798888,
       11.81408892, 11.5302094 , 11.80023634, 13.02621732, 11.88585872,
       11.69072348, 12.28444215, 11.57074501, 12.5595151 , 11.97976345,
       12.4164197 , 12.21338197, 11.94505161, 11.74280941, 11.44258827,
       11.21470822, 12.01906803, 12.62551139, 12.41940333, 12.61971735,
       12.22757407, 11.62593051, 12.65048904, 11.84186472, 12.05