In [29]:
import pandas as pd
from fastai.tabular.all import *

# Load your already cleaned/train DataFrame (use what you had with SalePrice, Id, etc)
df = pd.read_csv("../../data/train.csv")

# Optionally: use your earlier data cleaning code here to match your best results!

# Set which columns are categorical and which are continuous:
cat_names = df.select_dtypes("object").columns.tolist()
cont_names = [c for c in df.columns if c not in cat_names + ["SalePrice", "Id"]]

# We'll predict the log of SalePrice
df["LogSalePrice"] = np.log1p(df["SalePrice"])

In [30]:
cat_names = df.select_dtypes("object").columns.tolist()
cont_names = df.select_dtypes(["float64", "int64"]).columns.tolist()
cont_names = [c for c in cont_names if c not in ["SalePrice", "Id"]]

In [31]:
# fastai needs to know which is the dependent variable (y), and validation set
splits = RandomSplitter(valid_pct=0.2, seed=42)(range_of(df))

to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names="LogSalePrice",
    splits=splits,
)

dls = to.dataloaders(bs=64)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [32]:
learn = tabular_learner(dls, metrics=rmse)
learn.fit_one_cycle(50)

epoch,train_loss,valid_loss,_rmse,time
0,0.897553,0.824627,0.90809,00:00
1,0.604734,0.273307,0.522788,00:00
2,0.43015,0.152347,0.390317,00:00
3,0.334912,0.130062,0.360641,00:00
4,0.263094,0.094071,0.30671,00:00
5,0.201002,0.086901,0.29479,00:00
6,0.160398,0.078674,0.280488,00:00
7,0.127139,0.057198,0.239162,00:00
8,0.106759,0.052729,0.229629,00:00
9,0.093485,0.062975,0.250949,00:00


In [33]:
# Predict on validation set
preds, targs = learn.get_preds()
val_rmse = rmse(preds, targs)
print(f"Validation RMSE: {float(val_rmse):.4f}")

Validation RMSE: 0.1600


In [35]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error

# 1. Validation set from fastai TabularPandas
valid_idx = to.splits[1]
df_valid = to.items.iloc[valid_idx].copy()
X_valid = df_valid[dls.x_names]
y_valid = df_valid["LogSalePrice"]

# 2. Baseline RMSE
dl = learn.dls.test_dl(X_valid)
preds, _ = learn.get_preds(dl=dl)
base_rmse = mean_squared_error(y_valid, preds.numpy(), squared=False)

# 3. Permutation importance
importances = []
for col in X_valid.columns:
    X_shuff = X_valid.copy()
    X_shuff[col] = np.random.permutation(X_shuff[col])
    dl_shuff = learn.dls.test_dl(X_shuff)
    preds_shuff, _ = learn.get_preds(dl=dl_shuff)
    rmse = mean_squared_error(y_valid, preds_shuff.numpy(), squared=False)
    importances.append(rmse - base_rmse)

feat_imp = pd.Series(importances, index=X_valid.columns).sort_values(ascending=False)
feat_imp.plot(
    kind="barh",
    title="Permutation Feature Importance (fastai tabular NN)",
    figsize=(8, 5),
)

AttributeError: splits

In [37]:
# Load test data
test_df = pd.read_csv("../../data/test.csv")

# Apply same processing as training set
to_test = TabularPandas(
    test_df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=None,
    splits=None,
)
test_dl = to.dataloaders(bs=64)

# Predict
test_preds = learn.get_preds(dl=test_dl)[0].squeeze().numpy()
test_preds_final = np.expm1(test_preds)  # Undo log transform

# Prepare submission
submission = pd.DataFrame({"Id": test_df["Id"], "SalePrice": test_preds_final})
submission.to_csv("submission_fastai.csv", index=False)

KeyError: "['LogSalePrice'] not in index"