In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
train_df = pd.read_csv("train.csv")

cat_cols = ["neighbourhood_group", "room_type"]
num_cols = ["minimum_nights", "amenity_score", "number_of_reviews", "availability_365"]

# Fill missing values
for c in cat_cols:
    train_df[c] = train_df[c].fillna(train_df[c].mode()[0])

for c in num_cols:
    train_df[c] = train_df[c].fillna(train_df[c].median())

# One-hot encoding
train_encoded = pd.get_dummies(train_df, columns=cat_cols, drop_first=True)

# Convert bool → int
bool_cols = train_encoded.select_dtypes(include=["bool"]).columns
train_encoded[bool_cols] = train_encoded[bool_cols].astype(int)

# Save feature names for later
feature_names = train_encoded.drop("price_class", axis=1).columns.tolist()

# Split X and y
X = train_encoded.drop("price_class", axis=1).values
y = train_encoded["price_class"].values

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize numeric columns (first 4 columns)
scaler = StandardScaler()
X_train[:, :4] = scaler.fit_transform(X_train[:, :4])
X_val[:, :4] = scaler.transform(X_val[:, :4])

print("Shapes:", X_train.shape, X_val.shape)
print("Feature count:", len(feature_names))


Shapes: (33078, 10) (8270, 10)
Feature count: 10


In [6]:
import pandas as pd

train_df = pd.read_csv("train.csv")

cat_cols = ["neighbourhood_group", "room_type"]
num_cols = ["minimum_nights", "amenity_score", "number_of_reviews", "availability_365"]

# Fill missing values
for c in cat_cols:
    train_df[c] = train_df[c].fillna(train_df[c].mode()[0])
for c in num_cols:
    train_df[c] = train_df[c].fillna(train_df[c].median())

# One-hot encode
train_encoded = pd.get_dummies(train_df, columns=cat_cols, drop_first=True)

# Convert bool → int
bool_cols = train_encoded.select_dtypes(include=["bool"]).columns
train_encoded[bool_cols] = train_encoded[bool_cols].astype(int)

# Now get feature names
feature_names = train_encoded.drop("price_class", axis=1).columns.tolist()

print(feature_names)


['minimum_nights', 'amenity_score', 'number_of_reviews', 'availability_365', 'neighbourhood_group_Brooklyn', 'neighbourhood_group_Manhattan', 'neighbourhood_group_Queens', 'neighbourhood_group_Staten Island', 'room_type_Private room', 'room_type_Shared room']


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert numpy -> torch
Xtr_t = torch.tensor(X_train, dtype=torch.float32)
ytr_t = torch.tensor(y_train, dtype=torch.long)
Xva_t = torch.tensor(X_val, dtype=torch.float32)
yva_t = torch.tensor(y_val, dtype=torch.long)

train_loader = DataLoader(TensorDataset(Xtr_t, ytr_t), batch_size=256, shuffle=True)
val_loader   = DataLoader(TensorDataset(Xva_t, yva_t), batch_size=256, shuffle=False)

class MLP(nn.Module):
    def __init__(self, d_in, h1=32, h2=16, num_classes=4):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_in, h1),
            nn.ReLU(),
            nn.Linear(h1, h2),
            nn.ReLU(),
            nn.Linear(h2, num_classes)
        )
    def forward(self, x):
        return self.net(x)

model = MLP(d_in=X_train.shape[1], h1=32, h2=16, num_classes=4)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

def eval_acc(loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in loader:
            logits = model(xb)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    return correct / total

# Train (keep it comparable; 10-30 epochs is fine)
for epoch in range(1, 21):
    model.train()
    for xb, yb in train_loader:
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

    if epoch % 5 == 0:
        tr_acc = eval_acc(train_loader)
        va_acc = eval_acc(val_loader)
        print(f"Epoch {epoch:2d} | train_acc={tr_acc:.4f} | val_acc={va_acc:.4f}")


Epoch  5 | train_acc=0.8226 | val_acc=0.8190
Epoch 10 | train_acc=0.8267 | val_acc=0.8204
Epoch 15 | train_acc=0.8281 | val_acc=0.8180
Epoch 20 | train_acc=0.8359 | val_acc=0.8260


In [8]:
model.eval()

feature_importance = torch.zeros(X_train.shape[1])
N = 0

for xb, yb in val_loader:
    xb = xb.clone().detach().requires_grad_(True)

    logits = model(xb)
    loss = criterion(logits, yb)

    model.zero_grad()
    loss.backward()

    grads = xb.grad.detach().abs().sum(dim=0)
    feature_importance += grads
    N += xb.size(0)

feature_importance = feature_importance / N


In [9]:
imp = feature_importance.numpy()
rank_idx = np.argsort(-imp)

for i in rank_idx:
    print(feature_names[i], imp[i])


amenity_score 0.004634509
room_type_Shared room 0.0012950944
room_type_Private room 0.001006806
neighbourhood_group_Manhattan 0.0008542189
neighbourhood_group_Queens 0.00058581296
neighbourhood_group_Brooklyn 0.00051855086
minimum_nights 0.00044069
neighbourhood_group_Staten Island 0.0004104523
number_of_reviews 0.00026537434
availability_365 0.000225434


In [11]:
X_test = X_test.astype(float)


In [12]:
# Split features and target
X_test = test_encoded.drop("price_class", axis=1).values
y_test = test_encoded["price_class"].values

# Ensure numeric type
X_test = X_test.astype(float)

# Normalize numeric features
X_test[:, :4] = scaler.transform(X_test[:, :4])

# Convert to tensors
import torch
Xtest_t = torch.tensor(X_test, dtype=torch.float32)
ytest_t = torch.tensor(y_test, dtype=torch.long)


In [13]:
print(X_test.dtype)
print(np.isnan(X_test).sum())


float64
0


In [15]:
model.eval()

with torch.no_grad():
    logits = model(Xtest_t)
    preds = torch.argmax(logits, dim=1)
    test_acc = (preds == ytest_t).float().mean().item()

print("Test Accuracy:", test_acc)


Test Accuracy: 0.35110318660736084


In [16]:
print("Train feature count:", X_train.shape[1])
print("Test feature count:", X_test.shape[1])


Train feature count: 10
Test feature count: 10


In [17]:
print(train_encoded.drop("price_class", axis=1).columns.equals(
      test_encoded.drop("price_class", axis=1).columns))


True
