In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [9]:
# Load datasets
train_data = pd.read_csv(r'C:\Users\nirma\dsp-nirmalkumar-murali\data\train.csv')
test_data = pd.read_csv(r'C:\Users\nirma\dsp-nirmalkumar-murali\data\test.csv')

In [10]:
# Feature Selection
continuous_features = ['LotArea', 'GrLivArea']  # Adjust as needed
categorical_features = ['MSZoning', 'Street']   # Adjust as needed



In [11]:
# Data Setup: Splitting into train and test sets
X = train_data[continuous_features + categorical_features]
y = train_data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
# Feature Processing Pipeline
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, continuous_features),
        ('cat', categorical_transformer, categorical_features)])


In [13]:
# Model Training
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor())])  # Change the regressor as needed
model.fit(X_train, y_train)

In [14]:
# Predictions
y_pred = model.predict(X_test)


In [15]:
y_pred

array([135395.        , 287298.12      , 101599.        , 174750.        ,
       220926.28      ,  79920.        , 298913.22      , 156627.3       ,
        80400.        , 158252.7       , 176974.14      , 126932.        ,
       129245.5       , 203035.48      , 192680.86      , 108500.        ,
       179716.5       , 152514.5       , 101704.        , 177656.        ,
       221721.        , 213603.82      , 180823.46      , 106213.75      ,
       182406.32      , 140172.87      , 160680.11      , 133265.        ,
       179696.93      , 186069.68      , 164325.87      , 239815.        ,
       261372.85      , 123063.        , 212500.        , 130603.        ,
       182840.        , 191579.51      , 325829.25      , 115373.        ,
       114397.        , 253182.99      , 121848.6       , 281788.55      ,
       134847.        , 142884.5       , 130803.        , 127221.2       ,
       268978.        , 198814.        , 127671.83333333, 193405.56      ,
       116349.44      , 3

In [16]:
# Model Evaluation
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

In [17]:
rmsle_score = compute_rmsle(np.log(y_test), np.log(y_pred))
print("Root-Mean-Squared-Error (RMSE) Score:", rmsle_score)

Root-Mean-Squared-Error (RMSE) Score: 0.02
