In [24]:
# 1. Imports and data loading
import pandas as pd
from fastai.tabular.all import *

# Paths - adjust if different
PATH = Path("../../data")
train_df = pd.read_csv(PATH / "train.csv")
test_df = pd.read_csv(PATH / "test.csv")
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (1460, 81)
Test shape: (1459, 80)


In [25]:
# 2. Preprocessing: Identify columns, combine for consistency
# Remove target for test, but keep Id
target = "SalePrice"
train_df["is_train"] = True
test_df["is_train"] = False
test_df[target] = np.nan  # Add dummy target for test set

# Combine for encoding consistency
all_df = pd.concat([train_df, test_df], ignore_index=True)

# Find categorical and continuous columns automatically
cat_names = all_df.select_dtypes("object").columns.tolist()
cat_names = [c for c in cat_names if c not in ["Id", target]]
cont_names = all_df.select_dtypes("number").columns.difference(["Id", target]).tolist()

print("Categorical:", cat_names)
print("Continuous:", cont_names)

Categorical: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
Continuous: ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MSSubClass', 'MasVnrArea', 'MiscVal', 'MoSold', 'OpenPorchSF', 'OverallCond', 'OverallQual', 'Pool

In [26]:
# 3. Create splits for train/valid/test
from sklearn.model_selection import train_test_split

train_idx = all_df[all_df.is_train].index.tolist()
test_idx = all_df[~all_df.is_train].index.tolist()

# Fastai split: (train/valid, test)
train_idxs, valid_idxs = train_test_split(train_idx, test_size=0.2, random_state=42)
splits = (L(train_idxs), L(valid_idxs))

# For Fastai, remove helper column
all_df = all_df.drop("is_train", axis=1)

In [27]:
# 4. Setup TabularPandas object
to = TabularPandas(
    all_df,
    procs=[Categorify, FillMissing, Normalize],
    cat_names=cat_names,
    cont_names=cont_names,
    y_names=target,
    splits=splits,
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  to[n].fillna(self.na_dict[n], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [28]:
# 5. Create DataLoaders and model
dls = to.dataloaders(bs=64)
learn = tabular_learner(dls, metrics=rmse)
learn.fit_one_cycle(5)

epoch,train_loss,valid_loss,_rmse,time
0,37292347392.0,39653818368.0,199132.65625,00:00
1,38236938240.0,39653371904.0,199131.546875,00:00
2,37949476864.0,39652950016.0,199130.484375,00:00
3,38626471936.0,39652921344.0,199130.421875,00:00
4,38821421056.0,39652786176.0,199130.078125,00:00


In [None]:
# 6. Make test predictions & format submission
# Get test set dataloader
test_dl = dls.test_dl(to.items.loc[test_idx])
test_preds, _ = learn.get_preds(dl=test_dl)

# Build submission DataFrame
submission = pd.DataFrame(
    {"Id": to.items.loc[test_idx, "Id"].values, "SalePrice": test_preds.numpy().squeeze()}
)

# Save to CSV
submission.to_csv("submission.csv", index=False)
print("Submission saved! Shape:", submission.shape)

In [None]:
# 7. Troubleshooting cell: Check for shape and ID mismatches
print("Test predictions shape:", test_preds.shape)
print("Test index shape:", len(test_idx))
print("Submission shape:", submission.shape)
print("First few IDs:", submission["Id"].head())