In [244]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,root_mean_squared_error

## Using only tabular Data

In [245]:
tabular_data=pd.read_csv('tabular_data.csv')
tabular_data_val=pd.read_csv('test.csv')

In [246]:
tabular_data.drop('Unnamed: 0',axis=1,inplace=True)
tabular_data_val.drop('Unnamed: 0',axis=1,inplace=True)

In [247]:
tabular_data.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront',
       'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'lat',
       'long', 'sqft_living15', 'sqft_lot15', 'log_price', 'house_age',
       'is_renovated'],
      dtype='object')

In [248]:
tabular_data_val.columns

Index(['id', 'bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront',
       'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'lat',
       'long', 'sqft_living15', 'sqft_lot15', 'house_age', 'is_renovated'],
      dtype='object')

In [249]:
y=tabular_data['log_price']
X=tabular_data.drop(['price','log_price'],axis=1)

In [250]:
id_col=tabular_data_val['id']
X_val=tabular_data_val.drop(['id'],axis=1)

In [251]:
X.shape

(16208, 16)

In [252]:
X_val.shape

(5404, 16)

In [253]:
X.columns

Index(['bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront', 'view',
       'condition', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'house_age', 'is_renovated'],
      dtype='object')

In [254]:
X_val.columns

Index(['bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront', 'view',
       'condition', 'grade', 'sqft_above', 'sqft_basement', 'lat', 'long',
       'sqft_living15', 'sqft_lot15', 'house_age', 'is_renovated'],
      dtype='object')

In [255]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=2)

In [256]:
X_train.shape

(12966, 16)

### One Hot Encoding + Scaling

In [257]:
col_to_encode=['bedrooms','floors','waterfront','view','condition','grade','is_renovated']

In [258]:
columns_to_scale=['bathrooms','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','house_age']

In [259]:

ct=ColumnTransformer([('One Hot',OneHotEncoder(handle_unknown='ignore',drop='if_binary',sparse_output=False),col_to_encode),('scaler',StandardScaler(),columns_to_scale)],remainder='passthrough')
scaled_X_train=pd.DataFrame(ct.fit_transform(X_train),columns=ct.get_feature_names_out())
scaled_X_test=pd.DataFrame(ct.transform(X_test),columns=ct.get_feature_names_out())
scaled_X_val=pd.DataFrame(ct.transform(X_val),columns=ct.get_feature_names_out())



In [260]:
print(scaled_X_train.shape)
print(scaled_X_test.shape)
print(scaled_X_val.shape)

(12966, 49)
(3242, 49)
(5404, 49)


In [261]:
scaled_X_train.columns

Index(['One Hot__bedrooms_0', 'One Hot__bedrooms_1', 'One Hot__bedrooms_2',
       'One Hot__bedrooms_3', 'One Hot__bedrooms_4', 'One Hot__bedrooms_5',
       'One Hot__bedrooms_6', 'One Hot__bedrooms_7', 'One Hot__bedrooms_8',
       'One Hot__bedrooms_9', 'One Hot__bedrooms_10', 'One Hot__floors_1.0',
       'One Hot__floors_1.5', 'One Hot__floors_2.0', 'One Hot__floors_2.5',
       'One Hot__floors_3.0', 'One Hot__floors_3.5',
       'One Hot__waterfront_True', 'One Hot__view_0', 'One Hot__view_1',
       'One Hot__view_2', 'One Hot__view_3', 'One Hot__view_4',
       'One Hot__condition_1', 'One Hot__condition_2', 'One Hot__condition_3',
       'One Hot__condition_4', 'One Hot__condition_5', 'One Hot__grade_3',
       'One Hot__grade_4', 'One Hot__grade_5', 'One Hot__grade_6',
       'One Hot__grade_7', 'One Hot__grade_8', 'One Hot__grade_9',
       'One Hot__grade_10', 'One Hot__grade_11', 'One Hot__grade_12',
       'One Hot__grade_13', 'One Hot__is_renovated_True', 'scaler__bath

In [262]:
scaled_X_val.columns

Index(['One Hot__bedrooms_0', 'One Hot__bedrooms_1', 'One Hot__bedrooms_2',
       'One Hot__bedrooms_3', 'One Hot__bedrooms_4', 'One Hot__bedrooms_5',
       'One Hot__bedrooms_6', 'One Hot__bedrooms_7', 'One Hot__bedrooms_8',
       'One Hot__bedrooms_9', 'One Hot__bedrooms_10', 'One Hot__floors_1.0',
       'One Hot__floors_1.5', 'One Hot__floors_2.0', 'One Hot__floors_2.5',
       'One Hot__floors_3.0', 'One Hot__floors_3.5',
       'One Hot__waterfront_True', 'One Hot__view_0', 'One Hot__view_1',
       'One Hot__view_2', 'One Hot__view_3', 'One Hot__view_4',
       'One Hot__condition_1', 'One Hot__condition_2', 'One Hot__condition_3',
       'One Hot__condition_4', 'One Hot__condition_5', 'One Hot__grade_3',
       'One Hot__grade_4', 'One Hot__grade_5', 'One Hot__grade_6',
       'One Hot__grade_7', 'One Hot__grade_8', 'One Hot__grade_9',
       'One Hot__grade_10', 'One Hot__grade_11', 'One Hot__grade_12',
       'One Hot__grade_13', 'One Hot__is_renovated_True', 'scaler__bath

### 1. Linear Regression

In [263]:
lr=LinearRegression()

In [264]:
lr.fit(scaled_X_train,y_train)

In [265]:
y_pred_lr=lr.predict(scaled_X_test)
y_pred_lr=np.expm1(y_pred_lr)

In [266]:
print(r2_score(y_pred_lr,np.expm1(y_test)))
print(root_mean_squared_error(y_pred_lr,np.expm1(y_test)))

0.6883177373030136
180823.3708361113


In [267]:
results_tabular = []
results_tabular.append({
        "Model": "Linear Regression",
        "R2 Score": round(r2_score(y_pred_lr,np.expm1(y_test)), 4),
        "RMSE": round(root_mean_squared_error(y_pred_lr,np.expm1(y_test)), 4)
    })

### 2. Random Forest Regressor

In [268]:
rf=RandomForestRegressor(n_estimators=150,max_depth=15,min_samples_split=10,random_state=42,n_jobs=-1)

In [269]:
rf.fit(scaled_X_train.values,y_train)

In [270]:
y_pred_rf=rf.predict(scaled_X_test.values)
y_pred_rf=np.expm1(y_pred_rf)

In [271]:
print(r2_score(y_pred_rf,np.expm1(y_test)))
print(root_mean_squared_error(y_pred_rf,np.expm1(y_test)))

0.7593495851051253
146645.09575942185


In [272]:
results_tabular.append({
        "Model": "Random Forest",
        "R2 Score": round(r2_score(y_pred_rf,np.expm1(y_test)), 4),
        "RMSE": round(root_mean_squared_error(y_pred_rf,np.expm1(y_test)), 4)
    })

### 3. XGB regressor

In [273]:
xgb=XGBRegressor(n_estimators=500,max_depth=6,learning_rate=0.05,subsample=0.8,colsample_bytree=0.8,objective="reg:squarederror",random_state=42)

In [274]:
xgb.fit(scaled_X_train.values,y_train)

In [275]:
y_pred_xgb=xgb.predict(scaled_X_test.values)
y_pred_xgb=np.expm1(y_pred_xgb)

In [276]:
print(r2_score(y_pred_xgb,np.expm1(y_test)))
print(root_mean_squared_error(y_pred_xgb,np.expm1(y_test)))

0.8530450737774338
125284.92610435728


In [215]:
predicted_price=xgb.predict(scaled_X_val)
predicted_price=np.expm1(predicted_price)

In [217]:
prediction=pd.DataFrame({
    'id':id_col,
    'predicted_price':predicted_price
})

In [224]:
prediction.to_csv('predictions.csv',index=False)

In [277]:
results_tabular.append({
        "Model": "XGB Regressor",
        "R2 Score": round(r2_score(y_pred_xgb,np.expm1(y_test)), 4),
        "RMSE": round(root_mean_squared_error(y_pred_xgb,np.expm1(y_test)), 4)
    })

In [278]:
pd.DataFrame(results_tabular)

Unnamed: 0,Model,R2 Score,RMSE
0,Linear Regression,0.6883,180823.3708
1,Random Forest,0.7593,146645.0958
2,XGB Regressor,0.853,125284.9261


## Using Tabular data + Setellite image data

In [279]:
tabular_data=pd.read_csv('tabular_data.csv')
tabular_data_val=pd.read_csv('test.csv')

In [280]:
tabular_data.drop('Unnamed: 0',axis=1,inplace=True)
tabular_data_val.drop('Unnamed: 0',axis=1,inplace=True)

In [281]:
image_data=pd.read_csv('image_data.csv')
image_data_val=pd.read_csv('image_data_val.csv')

image_data.drop('Unnamed: 0',axis=1,inplace=True)
image_data_val.drop('Unnamed: 0',axis=1,inplace=True)

In [282]:
image_data.shape

(16208, 512)

In [283]:
image_data_val.shape

(5404, 512)

In [284]:
y=tabular_data['log_price']
X=tabular_data.drop(['price','log_price'],axis=1)

In [285]:
id_col=tabular_data_val['id']
X_val=tabular_data_val.drop(['id'],axis=1)

In [286]:
image_data.shape

(16208, 512)

In [287]:
X = X.reset_index(drop=True)
image_data = image_data.reset_index(drop=True)

In [288]:
X_val = X_val.reset_index(drop=True)
image_data_val = image_data_val.reset_index(drop=True)

In [289]:
X=pd.concat([X,image_data],axis=1)

In [290]:
X_val=pd.concat([X_val,image_data_val],axis=1)

In [291]:
X_val.columns

Index(['bedrooms', 'bathrooms', 'sqft_lot', 'floors', 'waterfront', 'view',
       'condition', 'grade', 'sqft_above', 'sqft_basement',
       ...
       '502', '503', '504', '505', '506', '507', '508', '509', '510', '511'],
      dtype='object', length=528)

In [292]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=2)

In [293]:
X_train.shape

(12966, 528)

In [294]:
col_to_encode=['bedrooms','floors','waterfront','view','condition','grade','is_renovated']

In [295]:
columns_to_scale=['bathrooms','sqft_lot','sqft_above','sqft_basement','lat','long','sqft_living15','sqft_lot15','house_age']
for i in range(512):
    columns_to_scale.insert(len(columns_to_scale),str(i))


In [296]:
from sklearn.compose import ColumnTransformer
ct=ColumnTransformer([('One Hot',OneHotEncoder(handle_unknown='ignore',drop='if_binary',sparse_output=False),col_to_encode),('scaler',StandardScaler(),columns_to_scale)],remainder='passthrough')
scaled_X_train=pd.DataFrame(ct.fit_transform(X_train),columns=ct.get_feature_names_out())
scaled_X_test=pd.DataFrame(ct.transform(X_test),columns=ct.get_feature_names_out())
scaled_X_val=pd.DataFrame(ct.transform(X_val),columns=ct.get_feature_names_out())



### 1. Linear Regression

In [297]:
from sklearn.linear_model import LinearRegression
lr=LinearRegression()

In [298]:
lr.fit(scaled_X_train,y_train)

In [299]:
y_pred_lr=lr.predict(scaled_X_test)
y_pred_lr=np.expm1(y_pred_lr)

In [300]:
print(r2_score(y_pred_lr,np.expm1(y_test)))
print(root_mean_squared_error(y_pred_lr,np.expm1(y_test)))

0.6943243754898342
182527.54745139385


In [301]:
results_tab_images = []
results_tab_images.append({
        "Model": "Linear Regression",
        "R2 Score": round(r2_score(y_pred_lr,np.expm1(y_test)), 4),
        "RMSE": round(root_mean_squared_error(y_pred_lr,np.expm1(y_test)), 4)
    })

### 2. Random Forest Regressor

In [302]:
rf=RandomForestRegressor(n_estimators=150,max_depth=15,min_samples_split=10,random_state=42,n_jobs=-1)

In [303]:
rf.fit(scaled_X_train.values,y_train)

In [304]:
y_pred_rf=rf.predict(scaled_X_test.values)
y_pred_rf=np.expm1(y_pred_rf)

In [311]:
print(r2_score(y_pred_rf,np.expm1(y_test)))
print(root_mean_squared_error(y_pred_rf,np.expm1(y_test)))

0.6633688032537117
164427.07402959856


In [312]:
results_tab_images.append({
        "Model": "Random Forest",
        "R2 Score": round(r2_score(y_pred_rf,np.expm1(y_test)), 4),
        "RMSE": round(root_mean_squared_error(y_pred_rf,np.expm1(y_test)), 4)
    })

### 3. XGB Regressor

In [306]:
xgb=XGBRegressor(n_estimators=500,max_depth=6,learning_rate=0.05,subsample=0.8,colsample_bytree=0.8,objective="reg:squarederror",random_state=42,n_jobs=-1)

In [307]:
xgb.fit(scaled_X_train.values,y_train)

In [308]:
y_pred_xgb=xgb.predict(scaled_X_test.values)
y_pred_xgb=np.expm1(y_pred_xgb)

In [309]:
print(r2_score(y_pred_xgb,np.expm1(y_test)))
print(root_mean_squared_error(y_pred_xgb,np.expm1(y_test)))

0.7960355777869476
139189.90346523665


In [313]:
results_tab_images.append({
        "Model": "XGB Regressor",
        "R2 Score": round(r2_score(y_pred_xgb,np.expm1(y_test)), 4),
        "RMSE": round(root_mean_squared_error(y_pred_xgb,np.expm1(y_test)), 4)
    })

In [315]:
pd.DataFrame(results_tab_images)

Unnamed: 0,Model,R2 Score,RMSE
0,Linear Regression,0.6943,182527.5475
1,Random Forest,0.6634,164427.074
2,XGB Regressor,0.796,139189.9035


### 4. MPL Model

In [316]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [317]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [318]:
class RegressionDataset(Dataset):
    def __init__(self, X, y):
        # Convert DataFrame / Series → NumPy
        X = X.values if hasattr(X, "values") else X
        y = y.values if hasattr(y, "values") else y

        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float().view(-1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]



In [319]:
train_ds = RegressionDataset(scaled_X_train, y_train)
val_ds   = RegressionDataset(scaled_X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=256, shuffle=False)


In [320]:
class MLPRegressor(nn.Module):
    def __init__(self, input_dim):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU(),

            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.model(x)


In [321]:
model = MLPRegressor(scaled_X_train.shape[1]).to(device)


In [322]:
criterion = nn.MSELoss()

optimizer = optim.AdamW(
    model.parameters(),
    lr=1e-3,
    weight_decay=1e-4
)

scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=5
)


In [323]:
EPOCHS = 500
best_val_loss = np.inf
patience = 10
counter = 0

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # ---- Validation ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    scheduler.step(val_loss)

    print(f"Epoch [{epoch+1}/{EPOCHS}] "
          f"Train MSE: {train_loss:.4f} | Val MSE: {val_loss:.4f}")

    # ---- Early stopping ----
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), "best_mlp.pt")
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered")
            break


Epoch [1/500] Train MSE: 11.4827 | Val MSE: 1.9725
Epoch [2/500] Train MSE: 1.6055 | Val MSE: 1.3064
Epoch [3/500] Train MSE: 1.3806 | Val MSE: 0.9233
Epoch [4/500] Train MSE: 1.2825 | Val MSE: 0.5628
Epoch [5/500] Train MSE: 1.1887 | Val MSE: 0.9223
Epoch [6/500] Train MSE: 1.1166 | Val MSE: 0.4409
Epoch [7/500] Train MSE: 1.0227 | Val MSE: 0.4698
Epoch [8/500] Train MSE: 1.0169 | Val MSE: 0.4861
Epoch [9/500] Train MSE: 0.9144 | Val MSE: 0.5765
Epoch [10/500] Train MSE: 0.8776 | Val MSE: 0.6114
Epoch [11/500] Train MSE: 0.8121 | Val MSE: 0.2524
Epoch [12/500] Train MSE: 0.7560 | Val MSE: 0.2143
Epoch [13/500] Train MSE: 0.7146 | Val MSE: 0.2000
Epoch [14/500] Train MSE: 0.6387 | Val MSE: 0.2187
Epoch [15/500] Train MSE: 0.5801 | Val MSE: 0.1616
Epoch [16/500] Train MSE: 0.5309 | Val MSE: 0.1795
Epoch [17/500] Train MSE: 0.4757 | Val MSE: 0.5120
Epoch [18/500] Train MSE: 0.4476 | Val MSE: 0.1382
Epoch [19/500] Train MSE: 0.3509 | Val MSE: 0.2174
Epoch [20/500] Train MSE: 0.3216 | Val 

In [324]:
model.load_state_dict(torch.load("best_mlp.pt"))
model.eval()

val_preds = []

with torch.no_grad():
    for X_batch, _ in val_loader:
        X_batch = X_batch.to(device)
        preds = model(X_batch)
        val_preds.append(preds.cpu().numpy())

y_val_pred = np.vstack(val_preds).ravel()
y_val_pred=np.expm1(y_val_pred)

In [325]:
r2 = r2_score(np.expm1(y_test), y_val_pred)
rmse = root_mean_squared_error(np.expm1(y_test), y_val_pred)

print(f"PyTorch MLP R²  : {r2:.4f}")
print(f"PyTorch MLP RMSE: {rmse:.2f}")


results_tab_images.append({
        "Model": "MLP Regressor",
        "R2 Score": round(r2, 4),
        "RMSE": round(rmse, 4)
    })

PyTorch MLP R²  : 0.7247
PyTorch MLP RMSE: 190996.69


In [326]:
pd.DataFrame(results_tab_images)

Unnamed: 0,Model,R2 Score,RMSE
0,Linear Regression,0.6943,182527.5475
1,Random Forest,0.6634,164427.074
2,XGB Regressor,0.796,139189.9035
3,MLP Regressor,0.7247,190996.6871
